Added utils.py, with a quality HTML stripper; removed stripHtml from other source files

This commit is contained in:
Jeremy Fincher 2003-03-27 20:10:10 +00:00
parent f355465ad6
commit fc20715427
4 changed files with 77 additions and 13 deletions

View File

@ -40,15 +40,12 @@ from baseplugin import *
import re import re
import urllib2 import urllib2
import utils
import debug import debug
import ircmsgs import ircmsgs
import ircutils import ircutils
import callbacks import callbacks
htmlStripper = re.compile(r'<[^>]+>')
def stripHtml(s):
return htmlStripper.sub('', s)
class Forums(callbacks.PrivmsgRegexp): class Forums(callbacks.PrivmsgRegexp):
threaded = True threaded = True
_ggThread = re.compile(r'from thread &quot;<b>(.*?)</b>&quot;') _ggThread = re.compile(r'from thread &quot;<b>(.*?)</b>&quot;')

View File

@ -48,17 +48,13 @@ import re
import time import time
import urllib import urllib
import urllib2 import urllib2
import htmlentitydefs
import xml.dom.minidom import xml.dom.minidom
import utils
import debug import debug
import privmsgs import privmsgs
import callbacks import callbacks
_htmlstripper = re.compile('<[^>]+>')
def stripHtml(s):
return _htmlstripper.sub('', s)
class FreshmeatException(Exception): class FreshmeatException(Exception):
pass pass
@ -132,7 +128,7 @@ class Http(callbacks.Privmsg):
text = html.split('<P>\n', 2)[1] text = html.split('<P>\n', 2)[1]
text = text.replace('.\n', '. ') text = text.replace('.\n', '. ')
text = text.replace('\n', ' ') text = text.replace('\n', ' ')
text = stripHtml(text) text = utils.htmlToText(text)
irc.reply(msg, text.strip()) irc.reply(msg, text.strip())
_gkrating = re.compile(r'<font color="#FFFF33">(\d+)</font>') _gkrating = re.compile(r'<font color="#FFFF33">(\d+)</font>')
@ -246,10 +242,8 @@ class Http(callbacks.Privmsg):
if m is None: if m is None:
irc.error(msg, 'No quote found.') irc.error(msg, 'No quote found.')
return return
quote = m.group(1) quote = utils.htmlToText(m.group(1))
quote = ' // '.join(quote.splitlines()) quote = ' // '.join(quote.splitlines())
for (entity, replacement) in htmlentitydefs.entitydefs.iteritems():
quote = quote.replace(entity, replacement)
irc.reply(msg, quote) irc.reply(msg, quote)
_acronymre = re.compile('<td[^>]*><b>[^<]+</b></td>[^<]+<td[^>]*>(\w+)') _acronymre = re.compile('<td[^>]*><b>[^<]+</b></td>[^<]+<td[^>]*>(\w+)')

View File

@ -59,6 +59,7 @@ class Relay(privmsgs.CapabilityCheckingPrivmsg):
callbacks.Privmsg.__init__(self) callbacks.Privmsg.__init__(self)
self.ircs = {} self.ircs = {}
self.started = False self.started = False
self.channels = set()
self.abbreviations = {} self.abbreviations = {}
def startrelay(self, irc, msg, args): def startrelay(self, irc, msg, args):
@ -97,6 +98,7 @@ class Relay(privmsgs.CapabilityCheckingPrivmsg):
def relayjoin(self, irc, msg, args): def relayjoin(self, irc, msg, args):
"<channel>" "<channel>"
channel = privmsgs.getArgs(args) channel = privmsgs.getArgs(args)
self.channels.add(channel)
for otherIrc in self.ircs.itervalues(): for otherIrc in self.ircs.itervalues():
if channel not in otherIrc.state.channels: if channel not in otherIrc.state.channels:
otherIrc.queueMsg(ircmsgs.join(channel)) otherIrc.queueMsg(ircmsgs.join(channel))
@ -105,6 +107,7 @@ class Relay(privmsgs.CapabilityCheckingPrivmsg):
def relaypart(self, irc, msg, args): def relaypart(self, irc, msg, args):
"<channel>" "<channel>"
channel = privmsgs.getArgs(args) channel = privmsgs.getArgs(args)
self.channels.remove(channel)
for otherIrc in self.ircs.itervalues(): for otherIrc in self.ircs.itervalues():
if channel in otherIrc.state.channels: if channel in otherIrc.state.channels:
otherIrc.queueMsg(ircmsgs.part(channel)) otherIrc.queueMsg(ircmsgs.part(channel))
@ -122,6 +125,8 @@ class Relay(privmsgs.CapabilityCheckingPrivmsg):
irc = irc.getRealIrc() irc = irc.getRealIrc()
if self.started and ircutils.isChannel(msg.args[0]): if self.started and ircutils.isChannel(msg.args[0]):
channel = msg.args[0] channel = msg.args[0]
if channel not in self.channels:
return
#debug.printf('self.abbreviations = %s' % self.abbreviations) #debug.printf('self.abbreviations = %s' % self.abbreviations)
#debug.printf('self.ircs = %s' % self.ircs) #debug.printf('self.ircs = %s' % self.ircs)
#debug.printf('irc = %s' % irc) #debug.printf('irc = %s' % irc)
@ -171,6 +176,8 @@ class Relay(privmsgs.CapabilityCheckingPrivmsg):
rAction = re.compile(r'\* \w+/(?:%s) ' % '|'.join(abbreviations)) rAction = re.compile(r'\* \w+/(?:%s) ' % '|'.join(abbreviations))
if not (rPrivmsg.match(msg.args[1]) or rAction.match(msg.args[1])): if not (rPrivmsg.match(msg.args[1]) or rAction.match(msg.args[1])):
channel = msg.args[0] channel = msg.args[0]
if channel not in self.channels:
return
abbreviation = self.abbreviations[irc] abbreviation = self.abbreviations[irc]
s = self._formatPrivmsg(irc.nick, abbreviation, msg) s = self._formatPrivmsg(irc.nick, abbreviation, msg)
for otherIrc in self.ircs.itervalues(): for otherIrc in self.ircs.itervalues():

66
src/utils.py Executable file
View File

@ -0,0 +1,66 @@
#!/usr/bin/env python
###
# Copyright (c) 2002, Jeremiah Fincher
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice,
# this list of conditions, and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions, and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of the author of this software nor the name of
# contributors to this software may be used to endorse or promote products
# derived from this software without specific prior written consent.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
###
"""
Simple utility functions.
"""
from fix import *
import sgmllib
import htmlentitydefs
class HtmlToText(sgmllib.SGMLParser):
"""Taken from some eff-bot code on c.l.p."""
entitydefs = htmlentitydefs.entitydefs
def __init__(self):
self.data = []
sgmllib.SGMLParser.__init__(self)
def unknown_starttag(self, tag, attrib):
self.data.append(" ")
def unknown_endtag(self, tag):
self.data.append(" ")
def handle_data(self, data):
self.data.append(data)
def getText(self):
text = ''.join(self.data).strip()
return ''.join(text.split()) # normalize whitespace
def htmlToText(s):
    """Return the text that an HtmlToText parser collects from the string s.

    s is fed to a fresh HtmlToText instance in one piece and the result of
    its getText() method is returned.
    """
    parser = HtmlToText()
    parser.feed(s)
    # feed() may hold back a trailing, possibly incomplete construct (such as
    # an unterminated entity reference) while waiting for more input; close()
    # tells the parser the input is finished so that text is not lost.
    parser.close()
    return parser.getText()
# vim:set shiftwidth=4 tabstop=8 expandtab textwidth=78: