diff --git a/others/BeautifulSoup.py b/others/BeautifulSoup.py new file mode 100644 index 000000000..4fb3550fe --- /dev/null +++ b/others/BeautifulSoup.py @@ -0,0 +1,449 @@ +"""Beautiful Soup +Elixir and Tonic +"The Screen-Scraper's Friend" + +The BeautifulSoup class turns arbitrarily bad HTML into a tree-like +nested tag-soup list of Tag objects and text snippets. A Tag object +corresponds to an HTML tag. It knows about the HTML tag's attributes, +and contains a representation of everything contained between the +original tag and its closing tag (if any). It's easy to extract Tags +that meet certain criteria. + +A well-formed HTML document will yield a well-formed data +structure. An ill-formed HTML document will yield a correspondingly +ill-formed data structure. If your document is only locally +well-formed, you can use this to process the well-formed part of it. + +#Example: +#-------- +from BeautifulSoup import BeautifulSoup +text = ''' +The Title + +Link text (italicized) +Link text 2 + +''' +soup = BeautifulSoup() +soup.feed(text) +print soup("a") #Returns a list of 2 Tag objects, one for each link in + #the source +print soup.first("a", {'class':'foo'})['href'] #Returns http://www.crummy.com/ +print soup.first("title").contents[0] #Returns "The title" +print soup.first("a", {'href':'http://www.crummy.com/'}).first("i").contents[0] +#Returns "text (italicized)" + +#Example of SQL-style attribute wildcards -- all four 'find' calls will +#find the link. +#---------------------------------------------------------------------- +soup = BeautifulSoup() +soup.feed('''bla''') +print soup.fetch('a', {'href': 'http://foo.com/'}) +print soup.fetch('a', {'href': 'http://%'}) +print soup.fetch('a', {'href': '%.com/'}) +print soup.fetch('a', {'href': '%o.c%'}) + +#Example with horrible HTML: +#--------------------------- +soup = BeautifulSoup() +soup.feed(''' +Go here +or go Home +''') +print soup.fetch('a') #Returns a list of 2 Tag objects. +print soup.first(attrs={'href': 'here.html'})['class'] #Returns "that" +print soup.first(attrs={'class': 'that'}).first('i').contents[0] #returns "here" + +This library has no external dependencies. It works with Python 1.5.2 +and up. If you can install a Python extension, you might want to use +the ElementTree Tidy HTML Tree Builder instead: + http://www.effbot.org/zone/element-tidylib.htm + +You can use BeautifulSoup on any SGML-like substance, such as XML or a +domain-specific language that looks like HTML but has different tag +names. For such purposes you may want to use the BeautifulStoneSoup +class, which knows nothing at all about HTML per se. I also reserve +the right to make the BeautifulSoup parser smarter between releases, +so if you want forwards-compatibility without having to think about +it, you might want to go with BeautifulStoneSoup. + +Release status: + +(I do a new release whenever I make a change that breaks backwards +compatibility.) + +Current release: + + Applied patch from Richie Hindle (richie at entrian dot com) that + makes tag.string a shorthand for tag.contents[0].string when the tag + has only one string-owning child. + +1.2 "Who for such dainties would not stoop?" (2004/07/08): Applied + patch from Ben Last (ben at benlast dot com) that made + Tag.renderContents() correctly handle Unicode. + + Made BeautifulStoneSoup even dumber by making it not implicitly + close a tag when another tag of the same type is encountered; only + when an actual closing tag is encountered. This change courtesy of + Fuzzy (mike at pcblokes dot com). BeautifulSoup still works as + before. + +1.1 "Swimming in a hot tureen": Added more 'nestable' tags. Changed + popping semantics so that when a nestable tag is encountered, tags are + popped up to the previously encountered nestable tag (of whatever kind). + I will revert this if enough people complain, but it should make + more people's lives easier than harder. + + This enhancement was suggested by Anthony Baxter (anthony at + interlink dot com dot au). + +1.0 "So rich and green": Initial release. + +""" + +__author__ = "Leonard Richardson (leonardr@segfault.org)" +__version__ = "1.1 $Revision$" +__date__ = "$Date$" +__copyright__ = "Copyright (c) 2004 Leonard Richardson" +__license__ = "Python" + +from sgmllib import SGMLParser +import string +import types + +class PageElement: + """Contains the navigational information for some part of the page + (either a tag or a piece of text)""" + + def __init__(self, parent=None, previous=None): + self.parent = parent + self.previous = previous + self.next = None + +class NavigableText(PageElement): + + """A simple wrapper around a string that keeps track of where in + the document the string was found. Doesn't implement all the + string methods because I'm lazy. You could have this extend + UserString if you were using 2.2.""" + + def __init__(self, string, parent=None, previous=None): + PageElement.__init__(self, parent, previous) + self.string = string + + def __eq__(self, other): + return self.string == str(other) + + def __str__(self): + return self.string + + def strip(self): + return self.string.strip() + +class Tag(PageElement): + + """Represents a found HTML tag with its attributes and contents.""" + + def __init__(self, name, attrs={}, parent=None, previous=None): + PageElement.__init__(self, parent, previous) + self.name = name + self.attrs = attrs + self.contents = [] + self.foundClose = 0 + + def get(self, key, default=None): + return self._getAttrMap().get(key, default) + + def __call__(self, *args): + return apply(self.fetch, args) + + def __getitem__(self, key): + return self._getAttrMap()[key] + + def __setitem__(self, key, value): + self._getAttrMap() + self.attrMap[key] = value + for i in range(0, len(self.attrs)): + if self.attrs[i][0] == key: + self.attrs[i] = (key, value) + + def _getAttrMap(self): + if not hasattr(self, 'attrMap'): + self.attrMap = {} + for (key, value) in self.attrs: + self.attrMap[key] = value + return self.attrMap + + def __repr__(self): + return str(self) + + def __ne__(self, other): + return not self == other + + def __eq__(self, other): + if not isinstance(other, Tag) or self.name != other.name or self.attrs != other.attrs or len(self.contents) != len(other.contents): + return 0 + for i in range(0, len(self.contents)): + if self.contents[i] != other.contents[i]: + return 0 + return 1 + + def __str__(self): + attrs = '' + if self.attrs: + for key, val in self.attrs: + attrs = attrs + ' %s="%s"' % (key, val) + close = '' + closeTag = '' + if self.isSelfClosing(): + close = ' /' + elif self.foundClose: + closeTag = '' % self.name + s = self.renderContents() + if not hasattr(self, 'hideTag'): + s = '<%s%s%s>' % (self.name, attrs, close) + s + closeTag + return s + + def renderContents(self): + s='' #non-Unicode + for c in self.contents: + try: + s = s + str(c) + except UnicodeEncodeError: + if type(s) <> types.UnicodeType: + s = s.decode('utf8') #convert ascii to Unicode + #str() should, strictly speaking, not return a Unicode + #string, but NavigableText never checks and will return + #Unicode data if it was initialised with it. + s = s + str(c) + return s + + def isSelfClosing(self): + return self.name in BeautifulSoup.SELF_CLOSING_TAGS + + def append(self, tag): + self.contents.append(tag) + + def first(self, name=None, attrs={}, contents=None, recursive=1): + r = None + l = self.fetch(name, attrs, contents, recursive) + if l: + r = l[0] + return r + + def fetch(self, name=None, attrs={}, contents=None, recursive=1): + """Extracts Tag objects that match the given criteria. You + can specify the name of the Tag, any attributes you want the + Tag to have, and what text and Tags you want to see inside the + Tag.""" + if contents and type(contents) != type([]): + contents = [contents] + results = [] + for i in self.contents: + if isinstance(i, Tag): + if not name or i.name == name: + match = 1 + for attr, value in attrs.items(): + check = i.get(attr) + #By default, find the specific value called for. + #Use SQL-style wildcards to find substrings, prefix, + #suffix, etc. + result = (check == value) + if check and value: + if len(value) > 1 and value[0] == '%' and value[-1] == '%' and value[-2] != '\\': + result = (check.find(value[1:-1]) != -1) + elif value[0] == '%': + print "blah" + result = check.rfind(value[1:]) == len(check)-len(value)+1 + elif value[-1] == '%': + result = check.find(value[:-1]) == 0 + if not result: + match = 0 + break + match = match and (not contents or i.contents == contents) + if match: + results.append(i) + if recursive: + results.extend(i.fetch(name, attrs, contents, recursive)) + return results + +class BeautifulSoup(SGMLParser, Tag): + + """The actual parser. It knows the following facts about HTML, and + not much else: + + * Some tags have no closing tag and should be interpreted as being + closed as soon as they are encountered. + + * Most tags can't be nested; encountering an open tag when there's + already an open tag of that type in the stack means that the + previous tag of that type should be implicitly closed. However, + some tags can be nested. When a nestable tag is encountered, + it's okay to close all unclosed tags up to the last nestable + tag. It might not be safe to close any more, so that's all it + closes. + + * The text inside some tags (ie. 'script') may contain tags which + are not really part of the document and which should be parsed + as text, not tags. If you want to parse the text as tags, you can + always get it and parse it explicitly.""" + + SELF_CLOSING_TAGS = ['br', 'hr', 'input', 'img', 'meta', 'spacer', + 'link', 'frame'] + NESTABLE_TAGS = ['font', 'table', 'tr', 'td', 'th', 'tbody', 'p', + 'div'] + QUOTE_TAGS = ['script'] + + IMPLICITLY_CLOSE_TAGS = 1 + + def __init__(self, text=None): + Tag.__init__(self, '[document]') + SGMLParser.__init__(self) + self.quoteStack = [] + self.hideTag = 1 + self.reset() + if text: + self.feed(text) + + def feed(self, text): + SGMLParser.feed(self, text) + self.endData() + + def reset(self): + SGMLParser.reset(self) + self.currentData = '' + self.currentTag = None + self.tagStack = [] + self.pushTag(self) + + def popTag(self, closedTagName=None): + tag = self.tagStack.pop() + if closedTagName == tag.name: + tag.foundClose = 1 + + # Tags with just one string-owning child get the same string + # property as the child, so that soup.tag.string is shorthand + # for soup.tag.contents[0].string + if len(self.currentTag.contents) == 1 and \ + hasattr(self.currentTag.contents[0], 'string'): + self.currentTag.string = self.currentTag.contents[0].string + + #print "Pop", tag.name + self.currentTag = self.tagStack[-1] + return self.currentTag + + def pushTag(self, tag): + #print "Push", tag.name + if self.currentTag: + self.currentTag.append(tag) + self.tagStack.append(tag) + self.currentTag = self.tagStack[-1] + + def endData(self): + if self.currentData: + if not string.strip(self.currentData): + if '\n' in self.currentData: + self.currentData = '\n' + else: + self.currentData = ' ' + o = NavigableText(self.currentData, self.currentTag, self.previous) + if self.previous: + self.previous.next = o + self.previous = o + self.currentTag.contents.append(o) + self.currentData = '' + + def _popToTag(self, name, closedTag=0): + """Pops the tag stack up to and including the most recent + instance of the given tag. If a list of tags is given, will + accept any of those tags as an excuse to stop popping, and will + *not* pop the tag that caused it to stop popping.""" + if self.IMPLICITLY_CLOSE_TAGS: + closedTag = 1 + numPops = 0 + mostRecentTag = None + oneTag = (type(name) == types.StringType) + for i in range(len(self.tagStack)-1, 0, -1): + thisTag = self.tagStack[i].name + if (oneTag and thisTag == name) \ + or (not oneTag and thisTag in name): + numPops = len(self.tagStack)-i + break + if not oneTag: + numPops = numPops - 1 + + closedTagName = None + if closedTag: + closedTagName = name + + for i in range(0, numPops): + mostRecentTag = self.popTag(closedTagName) + return mostRecentTag + + def unknown_starttag(self, name, attrs): + if self.quoteStack: + #This is not a real tag. + #print "<%s> is not real!" % name + attrs = map(lambda(x, y): '%s="%s"' % (x, y), attrs) + self.handle_data('<%s %s>' % (name, attrs)) + return + self.endData() + tag = Tag(name, attrs, self.currentTag, self.previous) + if self.previous: + self.previous.next = tag + self.previous = tag + if not name in self.SELF_CLOSING_TAGS: + if name in self.NESTABLE_TAGS: + self._popToTag(self.NESTABLE_TAGS) + else: + self._popToTag(name) + self.pushTag(tag) + if name in self.SELF_CLOSING_TAGS: + self.popTag() + if name in self.QUOTE_TAGS: + #print "Beginning quote (%s)" % name + self.quoteStack.append(name) + + def unknown_endtag(self, name): + if self.quoteStack and self.quoteStack[-1] != name: + #This is not a real end tag. + #print " is not real!" % name + self.handle_data('' % name) + return + self.endData() + self._popToTag(name, 1) + if self.quoteStack and self.quoteStack[-1] == name: + #print "That's the end of %s!" % self.quoteStack[-1] + self.quoteStack.pop() + + def handle_data(self, data): + self.currentData = self.currentData + data + + def handle_comment(self, text): + "Propagate comments right through." + self.handle_data("" % text) + + def handle_charref(self, ref): + "Propagate char refs right through." + self.handle_data('&#%s;' % ref) + + def handle_entityref(self, ref): + "Propagate entity refs right through." + self.handle_data('&%s;' % ref) + + def handle_decl(self, data): + "Propagate DOCTYPEs right through." + self.handle_data('' % data) + +class BeautifulStoneSoup(BeautifulSoup): + + """A version of BeautifulSoup that doesn't know anything at all + about what HTML tags have special behavior. Useful for parsing + things that aren't HTML, or when BeautifulSoup makes an assumption + counter to what you were expecting.""" + + IMPLICITLY_CLOSE_TAGS = 0 + + SELF_CLOSING_TAGS = [] + NESTABLE_TAGS = [] + QUOTE_TAGS = [] diff --git a/plugins/Debian.py b/plugins/Debian.py index 80e3cb1b6..f3d15d48f 100644 --- a/plugins/Debian.py +++ b/plugins/Debian.py @@ -36,8 +36,6 @@ This is a module to contain Debian-specific commands. __revision__ = "$Id$" __author__ = "James Vega (jamessan) " -import supybot.plugins as plugins - import re import gzip import sets @@ -47,13 +45,16 @@ import socket import urllib import fnmatch import os.path -from itertools import imap, ifilter -import supybot.registry as registry +import BeautifulSoup + +from itertools import imap, ifilter import supybot.conf as conf import supybot.utils as utils +import supybot.plugins as plugins import supybot.privmsgs as privmsgs +import supybot.registry as registry import supybot.webutils as webutils import supybot.callbacks as callbacks @@ -184,7 +185,7 @@ class Debian(callbacks.Privmsg, _debreflags = re.DOTALL | re.IGNORECASE _debbrre = re.compile(r'
  • ]+>(.*?) \(', _debreflags) - _debverre = re.compile(r'
    (?:\d+:)?(\S+):', _debreflags) + _debverre = re.compile(r'
    ((?:\d+:)?\S+):', _debreflags) _deblistre = re.compile(r'

    Package ([^<]+)

    (.*?)', _debreflags) _debBranches = ('stable', 'testing', 'unstable', 'experimental') def version(self, irc, msg, args): @@ -235,12 +236,22 @@ class Debian(callbacks.Privmsg, else: for pkg in pkgs: pkgMatch = pkg[0] - brMatch = self._debbrre.findall(pkg[1]) - verMatch = self._debverre.findall(pkg[1]) - if pkgMatch and brMatch and verMatch: - versions = zip(brMatch, verMatch) - for version in versions: - s = '%s (%s)' % (pkgMatch, ': '.join(version)) + soup = BeautifulSoup.BeautifulSoup() + soup.feed(pkg[1]) + liBranches = soup.fetch('li') + branches = [] + versions = [] + def branchVers(br): + vers = [b.next.string.strip() for b in br] + return [rsplit(v, ':', 1)[0] for v in vers] + for li in liBranches: + branches.append(li.first('a').string) + versions.append(branchVers(li.fetch('br'))) + if branches and versions: + for pairs in zip(branches, versions): + branch = pairs[0] + ver = ', '.join(pairs[1]) + s = '%s (%s)' % (pkgMatch, ': '.join([branch, ver])) responses.append(s) resp = '%s matches found: %s' % \ (len(responses), '; '.join(responses))