From 10df0a0dd07ed6105c5ecd790fff5375a9b75b08 Mon Sep 17 00:00:00 2001
From: Valentin Lorentz
Date: Fri, 15 May 2020 21:26:44 +0200
Subject: [PATCH] DDG: Rewrite using html.parser.HTMLParser instead of BeautifulSoup.

So we don't depend on an external library.
---
 plugins/DDG/parser.py | 87 +++++++++++++++++++++++++++++++++++++++++++
 plugins/DDG/plugin.py | 84 +++++++++++++++--------------------------
 2 files changed, 117 insertions(+), 54 deletions(-)
 create mode 100644 plugins/DDG/parser.py

diff --git a/plugins/DDG/parser.py b/plugins/DDG/parser.py
new file mode 100644
index 000000000..f6488b509
--- /dev/null
+++ b/plugins/DDG/parser.py
@@ -0,0 +1,87 @@
+import enum
+import collections
+from html.parser import HTMLParser
+
+result = collections.namedtuple('result', 'link title snippet')
+
+@enum.unique
+class ParserState(enum.Enum):
+    OUTSIDE = 0
+    IN_TITLE = 1
+    TITLE_PARSED = 2
+    IN_SNIPPET = 3
+
+STACKED_TAGS = ('table', 'tr', 'td', 'a')
+
+class DDGHTMLParser(HTMLParser):
+    def __init__(self):
+        super().__init__()
+        self.stack = []
+        self.results = []
+
+        self.reset_current_result()
+
+    def reset_current_result(self):
+        self.state = ParserState.OUTSIDE
+        self.current_link = None
+        self.current_title = None
+        self.current_snippet = None
+
+    def handle_starttag(self, tag, attrs):
+        attrs = dict(attrs)
+        classes = attrs.get('class', '').split()
+
+        if tag in STACKED_TAGS:
+            self.stack.append((tag, classes))
+
+        if ('tr', ['result-sponsored']) in self.stack:
+            # Skip sponsored results
+            return
+
+        if tag == 'a' and 'result-link' in classes:
+            assert self.state == ParserState.OUTSIDE, (self.state, self.current_title)
+            self.state = ParserState.IN_TITLE
+            self.current_link = attrs['href']
+            self.current_title = []
+
+        elif tag == 'td' and 'result-snippet' in classes:
+            assert self.state == ParserState.TITLE_PARSED, self.state
+            self.state = ParserState.IN_SNIPPET
+            self.current_snippet = []
+
+        elif tag == 'span' and 'link-text' in classes:
+            if self.state == ParserState.TITLE_PARSED:
+                # No snippet
+                self.state = ParserState.OUTSIDE
+                self.current_snippet = []
+
+    def handle_endtag(self, tag):
+        if tag in STACKED_TAGS:
+            item = self.stack.pop()
+            assert item[0] == tag, (item, tag)
+
+        if tag == 'a' and self.state == ParserState.IN_TITLE:
+            self.state = ParserState.TITLE_PARSED
+        elif tag == 'td' and self.state == ParserState.IN_SNIPPET:
+            self.build_result()
+            self.state = ParserState.OUTSIDE
+
+    def handle_data(self, data):
+        if self.state == ParserState.IN_TITLE:
+            self.current_title.append(data)
+        elif self.state == ParserState.IN_SNIPPET:
+            self.current_snippet.append(data)
+
+    def build_result(self):
+        self.results.append(result(
+            link=self.current_link,
+            title=''.join(self.current_title),
+            snippet=''.join(self.current_snippet),
+        ))
+        self.reset_current_result()
+
+if __name__ == '__main__':
+    parser = DDGHTMLParser()
+    with open('ddg2.html') as fd:
+        parser.feed(fd.read())
+    print(parser.results)
diff --git a/plugins/DDG/plugin.py b/plugins/DDG/plugin.py
index 004136efd..a00c3642a 100644
--- a/plugins/DDG/plugin.py
+++ b/plugins/DDG/plugin.py
@@ -28,6 +28,10 @@
 ###
 
+import functools
+from html.parser import HTMLParser
+from urllib.parse import urlencode, parse_qs
+
 import supybot.utils as utils
 from supybot.commands import *
 import supybot.plugins as plugins
@@ -42,18 +46,7 @@ except ImportError:
     # without the i18n module
     _ = lambda x: x
 
-
-try: # Python 3
-    from urllib.parse import urlencode, parse_qs
-except ImportError: # Python 2
-    from urllib import urlencode
-    from urlparse import parse_qs
-try:
-    from bs4 import BeautifulSoup
-except ImportError:
-    raise ImportError("Beautiful Soup 4 is required for this plugin: get it"
-                      " at http://www.crummy.com/software/BeautifulSoup/bs4"
-                      "/doc/#installing-beautiful-soup")
+from .parser import DDGHTMLParser
 
 
 class DDG(callbacks.Plugin):
@@ -70,12 +63,11 @@ class DDG(callbacks.Plugin):
         real_url, data = utils.web.getUrlTargetAndContent(url)
         data = data.decode("utf-8")
-        soup = BeautifulSoup(data)
+        parser = DDGHTMLParser()
+        parser.feed(data)
 
         # Remove "sponsored link" results
-        return (url, real_url, [td for td in soup.find_all('td') if 'result-sponsored' not in
-                str(td.parent.get('class'))])
-
+        return (url, real_url, parser.results)
 
     def search_core(self, text, channel_context=None, max_results=None,
                     show_snippet=None):
         """
@@ -103,47 +95,31 @@ class DDG(callbacks.Plugin):
                 return [('', '', real_url)]
 
         for t in raw_results:
-            res = ''
-            # Each valid result has a preceding heading in the format
-            # '1. <result>', etc.
-            if t.text[0].isdigit():
-                res = t.next_sibling.next_sibling
-            if not res:
-                continue
-            try:
+            if self.registryValue("showsnippet", channel_context):
+                snippet = t.snippet.strip()
+            else:
                 snippet = ''
-                # 1) Get a result snippet.
+            title = t.title.strip()
+            origlink = link = t.link
 
-                if self.registryValue("showsnippet", channel_context):
-                    snippet = res.parent.next_sibling.next_sibling.\
-                        find_all("td")[-1]
-                    snippet = snippet.text.strip()
-                # 2) Fetch the link title.
-                title = res.a.text.strip()
-                # 3) Fetch the result link.
-                origlink = link = res.a.get('href')
+            # As of 2017-01-20, some links on DuckDuckGo's site are shown going through
+            # a redirect service. The links are in the format "/l/?kh=-1&uddg=https%3A%2F%2Fduckduckgo.com%2F"
+            # instead of simply being "https://duckduckgo.com". So, we decode these links here.
+            if link.startswith('/l/'):
+                linkparse = utils.web.urlparse(link)
+                try:
+                    link = parse_qs(linkparse.query)['uddg'][0]
+                except KeyError:
+                    # No link was given here, skip.
+                    continue
+                except IndexError:
+                    self.log.exception("DDG: failed to expand redirected result URL %s", origlink)
+                    continue
+                else:
+                    self.log.debug("DDG: expanded result URL from %s to %s", origlink, link)
 
-                # As of 2017-01-20, some links on DuckDuckGo's site are shown going through
-                # a redirect service. The links are in the format "/l/?kh=-1&uddg=https%3A%2F%2Fduckduckgo.com%2F"
-                # instead of simply being "https://duckduckgo.com". So, we decode these links here.
-                if link.startswith('/l/'):
-                    linkparse = utils.web.urlparse(link)
-                    try:
-                        link = parse_qs(linkparse.query)['uddg'][0]
-                    except KeyError:
-                        # No link was given here, skip.
-                        continue
-                    except IndexError:
-                        self.log.exception("DDG: failed to expand redirected result URL %s", origlink)
-                        continue
-                    else:
-                        self.log.debug("DDG: expanded result URL from %s to %s", origlink, link)
-
-                # Return a list of tuples in the form (link title, snippet text, link)
-                results.append((title, snippet, link))
-
-            except AttributeError:
-                continue
+            # Return a list of tuples in the form (link title, snippet text, link)
+            results.append((title, snippet, link))
         return results[:maxr]
 
     @wrap(['text'])
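
Reviewer note (not part of the patch): a minimal standalone sketch of how the new DDGHTMLParser is driven. The HTML fragment is made up for illustration, but it uses the class names the parser looks for on DuckDuckGo's "lite" pages ("result-link" on the anchor, "result-snippet" on the table cell), and it assumes parser.py from this patch is importable as a top-level module, e.g. when run from inside plugins/DDG/.

    from parser import DDGHTMLParser  # the new plugins/DDG/parser.py added above

    # Hypothetical markup mimicking one result row pair on the lite site.
    sample = """
    <table>
      <tr><td><a class="result-link" href="https://example.org">Example Domain</a></td></tr>
      <tr><td class="result-snippet">Illustrative snippet text.</td></tr>
    </table>
    """

    p = DDGHTMLParser()
    p.feed(sample)
    for r in p.results:
        # Each entry is the (link, title, snippet) namedtuple defined in parser.py.
        print(r.link, r.title.strip(), r.snippet.strip())

This mirrors what ddgurl() in plugin.py now does: feed the fetched page into the parser and hand parser.results to search_core(), which reads t.link, t.title and t.snippet instead of navigating a BeautifulSoup tree.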