Added utils.py, with a quality HTML stripper; removed stripHtml from other source files

This commit is contained in:
Jeremy Fincher 2003-03-27 20:10:10 +00:00
parent f355465ad6
commit fc20715427
4 changed files with 77 additions and 13 deletions

View File

@ -40,15 +40,12 @@ from baseplugin import *
import re
import urllib2
import utils
import debug
import ircmsgs
import ircutils
import callbacks
htmlStripper = re.compile(r'<[^>]+>')
def stripHtml(s):
return htmlStripper.sub('', s)
class Forums(callbacks.PrivmsgRegexp):
threaded = True
_ggThread = re.compile(r'from thread &quot;<b>(.*?)</b>&quot;')

View File

@ -48,17 +48,13 @@ import re
import time
import urllib
import urllib2
import htmlentitydefs
import xml.dom.minidom
import utils
import debug
import privmsgs
import callbacks
_htmlstripper = re.compile('<[^>]+>')
def stripHtml(s):
return _htmlstripper.sub('', s)
class FreshmeatException(Exception):
pass
@ -132,7 +128,7 @@ class Http(callbacks.Privmsg):
text = html.split('<P>\n', 2)[1]
text = text.replace('.\n', '. ')
text = text.replace('\n', ' ')
text = stripHtml(text)
text = utils.htmlToText(text)
irc.reply(msg, text.strip())
_gkrating = re.compile(r'<font color="#FFFF33">(\d+)</font>')
@ -246,10 +242,8 @@ class Http(callbacks.Privmsg):
if m is None:
irc.error(msg, 'No quote found.')
return
quote = m.group(1)
quote = utils.htmlToText(m.group(1))
quote = ' // '.join(quote.splitlines())
for (entity, replacement) in htmlentitydefs.entitydefs.iteritems():
quote = quote.replace(entity, replacement)
irc.reply(msg, quote)
_acronymre = re.compile('<td[^>]*><b>[^<]+</b></td>[^<]+<td[^>]*>(\w+)')

View File

@ -59,6 +59,7 @@ class Relay(privmsgs.CapabilityCheckingPrivmsg):
callbacks.Privmsg.__init__(self)
self.ircs = {}
self.started = False
self.channels = set()
self.abbreviations = {}
def startrelay(self, irc, msg, args):
@ -97,6 +98,7 @@ class Relay(privmsgs.CapabilityCheckingPrivmsg):
def relayjoin(self, irc, msg, args):
"<channel>"
channel = privmsgs.getArgs(args)
self.channels.add(channel)
for otherIrc in self.ircs.itervalues():
if channel not in otherIrc.state.channels:
otherIrc.queueMsg(ircmsgs.join(channel))
@ -105,6 +107,7 @@ class Relay(privmsgs.CapabilityCheckingPrivmsg):
def relaypart(self, irc, msg, args):
"<channel>"
channel = privmsgs.getArgs(args)
self.channels.remove(channel)
for otherIrc in self.ircs.itervalues():
if channel in otherIrc.state.channels:
otherIrc.queueMsg(ircmsgs.part(channel))
@ -122,6 +125,8 @@ class Relay(privmsgs.CapabilityCheckingPrivmsg):
irc = irc.getRealIrc()
if self.started and ircutils.isChannel(msg.args[0]):
channel = msg.args[0]
if channel not in self.channels:
return
#debug.printf('self.abbreviations = %s' % self.abbreviations)
#debug.printf('self.ircs = %s' % self.ircs)
#debug.printf('irc = %s' % irc)
@ -171,6 +176,8 @@ class Relay(privmsgs.CapabilityCheckingPrivmsg):
rAction = re.compile(r'\* \w+/(?:%s) ' % '|'.join(abbreviations))
if not (rPrivmsg.match(msg.args[1]) or rAction.match(msg.args[1])):
channel = msg.args[0]
if channel not in self.channels:
return
abbreviation = self.abbreviations[irc]
s = self._formatPrivmsg(irc.nick, abbreviation, msg)
for otherIrc in self.ircs.itervalues():

66
src/utils.py Executable file
View File

@ -0,0 +1,66 @@
#!/usr/bin/env python
###
# Copyright (c) 2002, Jeremiah Fincher
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice,
# this list of conditions, and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions, and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of the author of this software nor the name of
# contributors to this software may be used to endorse or promote products
# derived from this software without specific prior written consent.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
###
"""
Simple utility functions.
"""
from fix import *
import sgmllib
import htmlentitydefs
class HtmlToText(sgmllib.SGMLParser):
    """SGML parser that reduces an HTML fragment to its plain text.

    Every start tag and end tag is replaced by a single space so that words
    separated only by markup do not run together; character data is kept
    as it arrives.  Feed markup in with feed(), then call getText() to get
    the whitespace-normalized result.

    Taken from some eff-bot code on c.l.p.
    """
    # Table the base parser consults to translate named entity references.
    entitydefs = htmlentitydefs.entitydefs

    def __init__(self):
        # Pieces of output text, in document order; joined by getText().
        self.data = []
        sgmllib.SGMLParser.__init__(self)

    def unknown_starttag(self, tag, attrib):
        """Replace any start tag with a space (the tag itself is dropped)."""
        self.data.append(" ")

    def unknown_endtag(self, tag):
        """Replace any end tag with a space (the tag itself is dropped)."""
        self.data.append(" ")

    def handle_data(self, data):
        """Keep character data verbatim."""
        self.data.append(data)

    def getText(self):
        """Return the text collected so far with whitespace normalized.

        Leading and trailing whitespace is removed and every internal run
        of whitespace is collapsed to a single space.
        """
        # split() with no arguments discards leading/trailing whitespace and
        # splits on runs of whitespace.  The words must be re-joined with a
        # space: joining with '' would glue adjacent words together.
        return ' '.join(''.join(self.data).split())
def htmlToText(s):
    """Return the text content of the HTML string s.

    The string is run through a fresh HtmlToText parser and the result of
    its getText() method is returned.
    """
    parser = HtmlToText()
    parser.feed(s)
    # feed() holds back a trailing construct that might still be incomplete
    # (for example input ending in "&T" or an unterminated "<b"); close()
    # forces that buffered data through so the end of the input is not lost.
    parser.close()
    return parser.getText()
# vim:set shiftwidth=4 tabstop=8 expandtab textwidth=78: