"""Standard-library HTML parser that extracts search results for the DDG plugin.

The parser looks for the following markup:

* ``<a class="result-link" href="...">`` -- link and title of a hit;
* ``<td class="result-snippet">`` -- snippet belonging to the last title;
* ``<span class="link-text">`` -- marks the end of a hit that has no snippet;
* ``<tr class="result-sponsored">`` -- everything inside is ignored.

Usage: create a :class:`DDGHTMLParser`, pass the page text to ``feed()`` and
read the ``results`` attribute.
"""

import collections
import enum
from html.parser import HTMLParser

# One search hit.  ``link`` is the anchor's ``href`` value; ``title`` and
# ``snippet`` are the concatenated text nodes of the respective elements.
result = collections.namedtuple('result', 'link title snippet')


@enum.unique
class ParserState(enum.Enum):
    """Where the parser currently is relative to the hit being read."""
    OUTSIDE = 0        # not inside any part of a hit
    IN_TITLE = 1       # between <a class="result-link"> and its end tag
    TITLE_PARSED = 2   # title complete, snippet (if any) not reached yet
    IN_SNIPPET = 3     # inside <td class="result-snippet">


# Tags whose nesting is recorded on ``DDGHTMLParser.stack``.
STACKED_TAGS = ('table', 'tr', 'td', 'a')


class DDGHTMLParser(HTMLParser):
    """Collect ``result`` tuples from a result page.

    Attributes:
        stack: open ``STACKED_TAGS`` elements as ``(tag, classes)`` pairs,
            outermost first.
        results: the ``result`` tuples built so far, in document order.
        state: current :class:`ParserState`.
        current_link, current_title, current_snippet: pieces of the hit being
            assembled; title and snippet are lists of text fragments.

    Unexpected markup (stray end tags, a snippet with no preceding title, a
    result anchor without ``href``) is skipped instead of raising, because
    the input is third-party HTML.  Hits that have no snippet cell are not
    reported.
    """

    def __init__(self):
        super().__init__()
        self.stack = []
        self.results = []

        self.reset_current_result()

    def reset_current_result(self):
        """Forget the partially assembled hit and return to ``OUTSIDE``."""
        self.state = ParserState.OUTSIDE
        self.current_link = None
        self.current_title = None
        self.current_snippet = None

    def _in_sponsored_row(self):
        """Return True if any open ``<tr>`` carries ``result-sponsored``."""
        # Membership test on the class list, so additional classes on the
        # row do not defeat the filter.
        return any(tag == 'tr' and 'result-sponsored' in classes
                   for tag, classes in self.stack)

    def _close_tag(self, tag):
        """Pop ``stack`` down to the innermost open *tag*.

        Returns the set of tag names that were closed (the matched element
        plus anything still open inside it), or an empty set when *tag* is
        not open at all, in which case the stack is left untouched.
        """
        for index in range(len(self.stack) - 1, -1, -1):
            if self.stack[index][0] == tag:
                closed = {name for name, _ in self.stack[index:]}
                del self.stack[index:]
                return closed
        return set()

    def handle_starttag(self, tag, attrs):
        """Track nesting and start a title or snippet when one is opened."""
        attrs = dict(attrs)
        # A valueless attribute (``<td class>``) is reported with the value
        # None, so a default passed to ``get`` alone is not enough.
        classes = (attrs.get('class') or '').split()

        if tag in STACKED_TAGS:
            self.stack.append((tag, classes))

        if self._in_sponsored_row():
            # Skip sponsored results
            return

        if tag == 'a' and 'result-link' in classes:
            href = attrs.get('href')
            if href is None:
                # Nothing to link to: ignore this anchor entirely.
                return
            # Any hit that was still pending is abandoned in favour of the
            # new one.
            self.state = ParserState.IN_TITLE
            self.current_link = href
            self.current_title = []

        elif tag == 'td' and 'result-snippet' in classes:
            if self.state is not ParserState.TITLE_PARSED:
                # Snippet without a completed title: there is no hit to
                # attach it to.
                return
            self.state = ParserState.IN_SNIPPET
            self.current_snippet = []

        elif tag == 'span' and 'link-text' in classes:
            if self.state is ParserState.TITLE_PARSED:
                # No snippet: the pending title/link is dropped, no result
                # is recorded for it.
                self.state = ParserState.OUTSIDE
                self.current_snippet = []

    def handle_endtag(self, tag):
        """Unwind the tag stack and finish the title or the whole hit."""
        if tag not in STACKED_TAGS:
            return

        closed = self._close_tag(tag)

        if 'a' in closed and self.state is ParserState.IN_TITLE:
            self.state = ParserState.TITLE_PARSED
        elif 'td' in closed and self.state is ParserState.IN_SNIPPET:
            # build_result() also resets the state to OUTSIDE.
            self.build_result()

    def handle_data(self, data):
        """Append text to the title or snippet being read, if any."""
        if self.state == ParserState.IN_TITLE:
            self.current_title.append(data)
        elif self.state == ParserState.IN_SNIPPET:
            self.current_snippet.append(data)

    def build_result(self):
        """Append the assembled hit to ``results`` and reset the state."""
        self.results.append(result(
            link=self.current_link,
            title=''.join(self.current_title),
            snippet=''.join(self.current_snippet),
        ))
        self.reset_current_result()


if __name__ == '__main__':
    import sys

    # Manual check: parse a saved page (default ``ddg2.html``) and dump the
    # extracted results.
    path = sys.argv[1] if len(sys.argv) > 1 else 'ddg2.html'
    parser = DDGHTMLParser()
    with open(path, encoding='utf-8') as fd:
        parser.feed(fd.read())
    print(parser.results)
result has a preceding heading in the format - # '