diff --git a/others/BeautifulSoup.py b/others/BeautifulSoup.py
new file mode 100644
index 000000000..4fb3550fe
--- /dev/null
+++ b/others/BeautifulSoup.py
@@ -0,0 +1,449 @@
+"""Beautiful Soup
+Elixir and Tonic
+"The Screen-Scraper's Friend"
+
+The BeautifulSoup class turns arbitrarily bad HTML into a tree-like
+nested tag-soup list of Tag objects and text snippets. A Tag object
+corresponds to an HTML tag. It knows about the HTML tag's attributes,
+and contains a representation of everything contained between the
+original tag and its closing tag (if any). It's easy to extract Tags
+that meet certain criteria.
+
+A well-formed HTML document will yield a well-formed data
+structure. An ill-formed HTML document will yield a correspondingly
+ill-formed data structure. If your document is only locally
+well-formed, you can use this to process the well-formed part of it.
+
+#Example:
+#--------
+from BeautifulSoup import BeautifulSoup
+text = '''<html>
+<head><title>The Title</title></head>
+<body>
+<a class="foo" href="http://www.crummy.com/">Link <i>text (italicized)</i></a>
+<a href="http://www.foo.com/">Link text 2</a>
+</body>
+</html>'''
+soup = BeautifulSoup()
+soup.feed(text)
+print soup("a") #Returns a list of 2 Tag objects, one for each link in
+ #the source
+print soup.first("a", {'class':'foo'})['href'] #Returns http://www.crummy.com/
+print soup.first("title").contents[0] #Returns "The Title"
+print soup.first("a", {'href':'http://www.crummy.com/'}).first("i").contents[0]
+#Returns "text (italicized)"
+
+#Example of SQL-style attribute wildcards -- all four 'find' calls will
+#find the link.
+#----------------------------------------------------------------------
+soup = BeautifulSoup()
+soup.feed('''<a href="http://foo.com/">bla</a>''')
+print soup.fetch('a', {'href': 'http://foo.com/'})
+print soup.fetch('a', {'href': 'http://%'})
+print soup.fetch('a', {'href': '%.com/'})
+print soup.fetch('a', {'href': '%o.c%'})
+
+#Example with horrible HTML:
+#---------------------------
+soup = BeautifulSoup()
+soup.feed('''
+<li>Go <a class="that" href="here.html"><i>here</i></a>
+<li>or go <a href="home.html">Home</a>
+''')
+print soup.fetch('a') #Returns a list of 2 Tag objects.
+print soup.first(attrs={'href': 'here.html'})['class'] #Returns "that"
+print soup.first(attrs={'class': 'that'}).first('i').contents[0] #returns "here"
+
+This library has no external dependencies. It works with Python 1.5.2
+and up. If you can install a Python extension, you might want to use
+the ElementTree Tidy HTML Tree Builder instead:
+ http://www.effbot.org/zone/element-tidylib.htm
+
+You can use BeautifulSoup on any SGML-like substance, such as XML or a
+domain-specific language that looks like HTML but has different tag
+names. For such purposes you may want to use the BeautifulStoneSoup
+class, which knows nothing at all about HTML per se. I also reserve
+the right to make the BeautifulSoup parser smarter between releases,
+so if you want forwards-compatibility without having to think about
+it, you might want to go with BeautifulStoneSoup.
+
+Release status:
+
+(I do a new release whenever I make a change that breaks backwards
+compatibility.)
+
+Current release:
+
+ Applied patch from Richie Hindle (richie at entrian dot com) that
+ makes tag.string a shorthand for tag.contents[0].string when the tag
+ has only one string-owning child.
+
+1.2 "Who for such dainties would not stoop?" (2004/07/08): Applied
+ patch from Ben Last (ben at benlast dot com) that made
+ Tag.renderContents() correctly handle Unicode.
+
+ Made BeautifulStoneSoup even dumber by making it not implicitly
+ close a tag when another tag of the same type is encountered; only
+ when an actual closing tag is encountered. This change courtesy of
+ Fuzzy (mike at pcblokes dot com). BeautifulSoup still works as
+ before.
+
+1.1 "Swimming in a hot tureen": Added more 'nestable' tags. Changed
+ popping semantics so that when a nestable tag is encountered, tags are
+ popped up to the previously encountered nestable tag (of whatever kind).
+ I will revert this if enough people complain, but it should make
+ more people's lives easier than harder.
+
+ This enhancement was suggested by Anthony Baxter (anthony at
+ interlink dot com dot au).
+
+1.0 "So rich and green": Initial release.
+
+"""
+
+__author__ = "Leonard Richardson (leonardr@segfault.org)"
+__version__ = "1.1 $Revision$"
+__date__ = "$Date$"
+__copyright__ = "Copyright (c) 2004 Leonard Richardson"
+__license__ = "Python"
+
+from sgmllib import SGMLParser
+import string
+import types
+
+class PageElement:
+ """Contains the navigational information for some part of the page
+ (either a tag or a piece of text)"""
+
+ def __init__(self, parent=None, previous=None):
+ self.parent = parent
+ self.previous = previous
+ self.next = None
+
+class NavigableText(PageElement):
+
+ """A simple wrapper around a string that keeps track of where in
+ the document the string was found. Doesn't implement all the
+ string methods because I'm lazy. You could have this extend
+ UserString if you were using 2.2."""
+
+ def __init__(self, string, parent=None, previous=None):
+ PageElement.__init__(self, parent, previous)
+ self.string = string
+
+ def __eq__(self, other):
+ return self.string == str(other)
+
+ def __str__(self):
+ return self.string
+
+ def strip(self):
+ return self.string.strip()
+
+class Tag(PageElement):
+
+ """Represents a found HTML tag with its attributes and contents."""
+
+ def __init__(self, name, attrs={}, parent=None, previous=None):
+ PageElement.__init__(self, parent, previous)
+ self.name = name
+ self.attrs = attrs
+ self.contents = []
+ self.foundClose = 0
+
+ def get(self, key, default=None):
+ return self._getAttrMap().get(key, default)
+
+ def __call__(self, *args):
+ return apply(self.fetch, args)
+
+ def __getitem__(self, key):
+ return self._getAttrMap()[key]
+
+ def __setitem__(self, key, value):
+ self._getAttrMap()
+ self.attrMap[key] = value
+ for i in range(0, len(self.attrs)):
+ if self.attrs[i][0] == key:
+ self.attrs[i] = (key, value)
+
+ def _getAttrMap(self):
+ if not hasattr(self, 'attrMap'):
+ self.attrMap = {}
+ for (key, value) in self.attrs:
+ self.attrMap[key] = value
+ return self.attrMap
+
+ def __repr__(self):
+ return str(self)
+
+ def __ne__(self, other):
+ return not self == other
+
+ def __eq__(self, other):
+ if not isinstance(other, Tag) or self.name != other.name or self.attrs != other.attrs or len(self.contents) != len(other.contents):
+ return 0
+ for i in range(0, len(self.contents)):
+ if self.contents[i] != other.contents[i]:
+ return 0
+ return 1
+
+ def __str__(self):
+ attrs = ''
+ if self.attrs:
+ for key, val in self.attrs:
+ attrs = attrs + ' %s="%s"' % (key, val)
+ close = ''
+ closeTag = ''
+ if self.isSelfClosing():
+ close = ' /'
+ elif self.foundClose:
+            closeTag = '</%s>' % self.name
+ s = self.renderContents()
+ if not hasattr(self, 'hideTag'):
+ s = '<%s%s%s>' % (self.name, attrs, close) + s + closeTag
+ return s
+
+ def renderContents(self):
+ s='' #non-Unicode
+ for c in self.contents:
+ try:
+ s = s + str(c)
+ except UnicodeEncodeError:
+ if type(s) <> types.UnicodeType:
+ s = s.decode('utf8') #convert ascii to Unicode
+ #str() should, strictly speaking, not return a Unicode
+ #string, but NavigableText never checks and will return
+ #Unicode data if it was initialised with it.
+ s = s + str(c)
+ return s
+
+ def isSelfClosing(self):
+ return self.name in BeautifulSoup.SELF_CLOSING_TAGS
+
+ def append(self, tag):
+ self.contents.append(tag)
+
+ def first(self, name=None, attrs={}, contents=None, recursive=1):
+ r = None
+ l = self.fetch(name, attrs, contents, recursive)
+ if l:
+ r = l[0]
+ return r
+
+ def fetch(self, name=None, attrs={}, contents=None, recursive=1):
+ """Extracts Tag objects that match the given criteria. You
+ can specify the name of the Tag, any attributes you want the
+ Tag to have, and what text and Tags you want to see inside the
+ Tag."""
+ if contents and type(contents) != type([]):
+ contents = [contents]
+ results = []
+ for i in self.contents:
+ if isinstance(i, Tag):
+ if not name or i.name == name:
+ match = 1
+ for attr, value in attrs.items():
+ check = i.get(attr)
+ #By default, find the specific value called for.
+ #Use SQL-style wildcards to find substrings, prefix,
+ #suffix, etc.
+ result = (check == value)
+ if check and value:
+ if len(value) > 1 and value[0] == '%' and value[-1] == '%' and value[-2] != '\\':
+ result = (check.find(value[1:-1]) != -1)
+ elif value[0] == '%':
+                                #Leading wildcard: match when check ends with value[1:].
+ result = check.rfind(value[1:]) == len(check)-len(value)+1
+ elif value[-1] == '%':
+ result = check.find(value[:-1]) == 0
+ if not result:
+ match = 0
+ break
+ match = match and (not contents or i.contents == contents)
+ if match:
+ results.append(i)
+ if recursive:
+ results.extend(i.fetch(name, attrs, contents, recursive))
+ return results
+
+class BeautifulSoup(SGMLParser, Tag):
+
+ """The actual parser. It knows the following facts about HTML, and
+ not much else:
+
+ * Some tags have no closing tag and should be interpreted as being
+ closed as soon as they are encountered.
+
+ * Most tags can't be nested; encountering an open tag when there's
+ already an open tag of that type in the stack means that the
+ previous tag of that type should be implicitly closed. However,
+ some tags can be nested. When a nestable tag is encountered,
+ it's okay to close all unclosed tags up to the last nestable
+ tag. It might not be safe to close any more, so that's all it
+ closes.
+
+ * The text inside some tags (ie. 'script') may contain tags which
+ are not really part of the document and which should be parsed
+ as text, not tags. If you want to parse the text as tags, you can
+ always get it and parse it explicitly."""
+
+ SELF_CLOSING_TAGS = ['br', 'hr', 'input', 'img', 'meta', 'spacer',
+ 'link', 'frame']
+ NESTABLE_TAGS = ['font', 'table', 'tr', 'td', 'th', 'tbody', 'p',
+ 'div']
+ QUOTE_TAGS = ['script']
+
+ IMPLICITLY_CLOSE_TAGS = 1
+
+ def __init__(self, text=None):
+ Tag.__init__(self, '[document]')
+ SGMLParser.__init__(self)
+ self.quoteStack = []
+ self.hideTag = 1
+ self.reset()
+ if text:
+ self.feed(text)
+
+ def feed(self, text):
+ SGMLParser.feed(self, text)
+ self.endData()
+
+ def reset(self):
+ SGMLParser.reset(self)
+ self.currentData = ''
+ self.currentTag = None
+ self.tagStack = []
+ self.pushTag(self)
+
+ def popTag(self, closedTagName=None):
+ tag = self.tagStack.pop()
+ if closedTagName == tag.name:
+ tag.foundClose = 1
+
+ # Tags with just one string-owning child get the same string
+ # property as the child, so that soup.tag.string is shorthand
+ # for soup.tag.contents[0].string
+ if len(self.currentTag.contents) == 1 and \
+ hasattr(self.currentTag.contents[0], 'string'):
+ self.currentTag.string = self.currentTag.contents[0].string
+
+ #print "Pop", tag.name
+ self.currentTag = self.tagStack[-1]
+ return self.currentTag
+
+ def pushTag(self, tag):
+ #print "Push", tag.name
+ if self.currentTag:
+ self.currentTag.append(tag)
+ self.tagStack.append(tag)
+ self.currentTag = self.tagStack[-1]
+
+ def endData(self):
+ if self.currentData:
+ if not string.strip(self.currentData):
+ if '\n' in self.currentData:
+ self.currentData = '\n'
+ else:
+ self.currentData = ' '
+ o = NavigableText(self.currentData, self.currentTag, self.previous)
+ if self.previous:
+ self.previous.next = o
+ self.previous = o
+ self.currentTag.contents.append(o)
+ self.currentData = ''
+
+ def _popToTag(self, name, closedTag=0):
+ """Pops the tag stack up to and including the most recent
+ instance of the given tag. If a list of tags is given, will
+ accept any of those tags as an excuse to stop popping, and will
+ *not* pop the tag that caused it to stop popping."""
+ if self.IMPLICITLY_CLOSE_TAGS:
+ closedTag = 1
+ numPops = 0
+ mostRecentTag = None
+ oneTag = (type(name) == types.StringType)
+ for i in range(len(self.tagStack)-1, 0, -1):
+ thisTag = self.tagStack[i].name
+ if (oneTag and thisTag == name) \
+ or (not oneTag and thisTag in name):
+ numPops = len(self.tagStack)-i
+ break
+ if not oneTag:
+ numPops = numPops - 1
+
+ closedTagName = None
+ if closedTag:
+ closedTagName = name
+
+ for i in range(0, numPops):
+ mostRecentTag = self.popTag(closedTagName)
+ return mostRecentTag
+
+ def unknown_starttag(self, name, attrs):
+ if self.quoteStack:
+ #This is not a real tag.
+ #print "<%s> is not real!" % name
+ attrs = map(lambda(x, y): '%s="%s"' % (x, y), attrs)
+ self.handle_data('<%s %s>' % (name, attrs))
+ return
+ self.endData()
+ tag = Tag(name, attrs, self.currentTag, self.previous)
+ if self.previous:
+ self.previous.next = tag
+ self.previous = tag
+ if not name in self.SELF_CLOSING_TAGS:
+ if name in self.NESTABLE_TAGS:
+ self._popToTag(self.NESTABLE_TAGS)
+ else:
+ self._popToTag(name)
+ self.pushTag(tag)
+ if name in self.SELF_CLOSING_TAGS:
+ self.popTag()
+ if name in self.QUOTE_TAGS:
+ #print "Beginning quote (%s)" % name
+ self.quoteStack.append(name)
+
+ def unknown_endtag(self, name):
+ if self.quoteStack and self.quoteStack[-1] != name:
+ #This is not a real end tag.
+            #print "</%s> is not real!" % name
+            self.handle_data('</%s>' % name)
+ return
+ self.endData()
+ self._popToTag(name, 1)
+ if self.quoteStack and self.quoteStack[-1] == name:
+ #print "That's the end of %s!" % self.quoteStack[-1]
+ self.quoteStack.pop()
+
+ def handle_data(self, data):
+ self.currentData = self.currentData + data
+
+ def handle_comment(self, text):
+ "Propagate comments right through."
+        self.handle_data("<!--%s-->" % text)
+
+ def handle_charref(self, ref):
+ "Propagate char refs right through."
+        self.handle_data('&#%s;' % ref)
+
+ def handle_entityref(self, ref):
+ "Propagate entity refs right through."
+ self.handle_data('&%s;' % ref)
+
+ def handle_decl(self, data):
+ "Propagate DOCTYPEs right through."
+        self.handle_data('<!%s>' % data)
+
+class BeautifulStoneSoup(BeautifulSoup):
+
+ """A version of BeautifulSoup that doesn't know anything at all
+ about what HTML tags have special behavior. Useful for parsing
+ things that aren't HTML, or when BeautifulSoup makes an assumption
+ counter to what you were expecting."""
+
+ IMPLICITLY_CLOSE_TAGS = 0
+
+ SELF_CLOSING_TAGS = []
+ NESTABLE_TAGS = []
+ QUOTE_TAGS = []
diff --git a/plugins/Debian.py b/plugins/Debian.py
index 80e3cb1b6..f3d15d48f 100644
--- a/plugins/Debian.py
+++ b/plugins/Debian.py
@@ -36,8 +36,6 @@ This is a module to contain Debian-specific commands.
__revision__ = "$Id$"
__author__ = "James Vega (jamessan) "
-import supybot.plugins as plugins
-
import re
import gzip
import sets
@@ -47,13 +45,16 @@ import socket
import urllib
import fnmatch
import os.path
-from itertools import imap, ifilter
-import supybot.registry as registry
+import BeautifulSoup
+
+from itertools import imap, ifilter
import supybot.conf as conf
import supybot.utils as utils
+import supybot.plugins as plugins
import supybot.privmsgs as privmsgs
+import supybot.registry as registry
import supybot.webutils as webutils
import supybot.callbacks as callbacks
@@ -184,7 +185,7 @@ class Debian(callbacks.Privmsg,
_debreflags = re.DOTALL | re.IGNORECASE
_debbrre = re.compile(r']+>(.*?) \(', _debreflags)
-    _debverre = re.compile(r'<br>(?:\d+:)?(\S+):', _debreflags)
+    _debverre = re.compile(r'<br>((?:\d+:)?\S+):', _debreflags)
     _deblistre = re.compile(r'<h3>Package ([^<]+)</h3>(.*?)<hr>', _debreflags)
_debBranches = ('stable', 'testing', 'unstable', 'experimental')
def version(self, irc, msg, args):
@@ -235,12 +236,22 @@ class Debian(callbacks.Privmsg,
else:
for pkg in pkgs:
pkgMatch = pkg[0]
- brMatch = self._debbrre.findall(pkg[1])
- verMatch = self._debverre.findall(pkg[1])
- if pkgMatch and brMatch and verMatch:
- versions = zip(brMatch, verMatch)
- for version in versions:
- s = '%s (%s)' % (pkgMatch, ': '.join(version))
+ soup = BeautifulSoup.BeautifulSoup()
+ soup.feed(pkg[1])
+ liBranches = soup.fetch('li')
+ branches = []
+ versions = []
+ def branchVers(br):
+ vers = [b.next.string.strip() for b in br]
+ return [rsplit(v, ':', 1)[0] for v in vers]
+ for li in liBranches:
+ branches.append(li.first('a').string)
+ versions.append(branchVers(li.fetch('br')))
+ if branches and versions:
+ for pairs in zip(branches, versions):
+ branch = pairs[0]
+ ver = ', '.join(pairs[1])
+ s = '%s (%s)' % (pkgMatch, ': '.join([branch, ver]))
responses.append(s)
resp = '%s matches found: %s' % \
(len(responses), '; '.join(responses))