mirror of
https://github.com/Mikaela/Limnoria.git
synced 2025-02-04 00:24:11 +01:00
DDG: Rewrite using html.parser.HTMLParser instead of BeautifulSoup.
So we don't depend on an external library.
This commit is contained in:
parent
37c1fa2153
commit
10df0a0dd0
87
plugins/DDG/parser.py
Normal file
87
plugins/DDG/parser.py
Normal file
@ -0,0 +1,87 @@
|
|||||||
|
import enum
|
||||||
|
import collections
|
||||||
|
from html.parser import HTMLParser
|
||||||
|
|
||||||
|
# Container for one parsed search result: the target URL, the anchor text,
# and the snippet shown below the link.
result = collections.namedtuple('result', ['link', 'title', 'snippet'])
|
||||||
|
|
||||||
|
@enum.unique
class ParserState(enum.Enum):
    """States of the result-extraction state machine in DDGHTMLParser.

    Transitions (driven by handle_starttag/handle_endtag):
    OUTSIDE -> IN_TITLE on <a class="result-link">; IN_TITLE ->
    TITLE_PARSED on </a>; TITLE_PARSED -> IN_SNIPPET on
    <td class="result-snippet">; IN_SNIPPET -> OUTSIDE on </td>
    (after the result is recorded).
    """
    OUTSIDE = 0       # not inside any search result
    IN_TITLE = 1      # collecting text of the <a class="result-link"> element
    TITLE_PARSED = 2  # title done, waiting for the snippet cell
    IN_SNIPPET = 3    # collecting text of the <td class="result-snippet"> cell
|
||||||
|
|
||||||
|
# Tags whose open/close nesting DDGHTMLParser tracks on self.stack, so the
# parser can tell when it is inside a sponsored <tr> row.
STACKED_TAGS = ('table', 'tr', 'td', 'a')
|
||||||
|
|
||||||
|
class DDGHTMLParser(HTMLParser):
    """Extract search results from DuckDuckGo's HTML results page.

    Feed the page source with feed(); each completed result is appended
    to ``self.results`` as a ``result`` namedtuple (link, title, snippet).
    Rows marked ``<tr class="result-sponsored">`` are skipped.
    """

    def __init__(self):
        super().__init__()
        # Stack of (tag, classes) pairs for tags listed in STACKED_TAGS,
        # mirroring the current open-tag nesting.
        self.stack = []
        # Completed `result` namedtuples, in page order.
        self.results = []

        self.reset_current_result()

    def reset_current_result(self):
        """Reset the state machine and the fields of the result being built."""
        self.state = ParserState.OUTSIDE
        self.current_link = None
        self.current_title = None
        self.current_snippet = None

    def _in_sponsored_row(self):
        """Return True if any enclosing <tr> carries the result-sponsored class.

        The previous check, ``('tr', ['result-sponsored']) in self.stack``,
        only matched a row whose class list was *exactly* that one class; a
        sponsored row with any additional class slipped through.
        """
        return any(tag == 'tr' and 'result-sponsored' in classes
                   for (tag, classes) in self.stack)

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        classes = attrs.get('class', '').split()

        if tag in STACKED_TAGS:
            self.stack.append((tag, classes))

        if self._in_sponsored_row():
            # Skip sponsored results
            return

        if tag == 'a' and 'result-link' in classes:
            assert self.state == ParserState.OUTSIDE, (self.state, self.current_title)
            self.state = ParserState.IN_TITLE
            # .get(): an anchor without href yields link=None instead of KeyError.
            self.current_link = attrs.get('href')
            self.current_title = []

        elif tag == 'td' and 'result-snippet' in classes:
            assert self.state == ParserState.TITLE_PARSED, self.state
            self.state = ParserState.IN_SNIPPET
            self.current_snippet = []

        elif tag == 'span' and 'link-text' in classes:
            if self.state == ParserState.TITLE_PARSED:
                # No snippet for this result
                self.state = ParserState.OUTSIDE
                self.current_snippet = []

    def handle_endtag(self, tag):
        # Guard on a non-empty stack: a stray close tag in malformed HTML
        # previously raised IndexError from pop().
        if tag in STACKED_TAGS and self.stack:
            item = self.stack.pop()
            assert item[0] == tag, (item, tag)

        if tag == 'a' and self.state == ParserState.IN_TITLE:
            self.state = ParserState.TITLE_PARSED
        elif tag == 'td' and self.state == ParserState.IN_SNIPPET:
            self.build_result()
            self.state = ParserState.OUTSIDE

    def handle_data(self, data):
        # Text content only matters while collecting a title or a snippet.
        if self.state == ParserState.IN_TITLE:
            self.current_title.append(data)
        elif self.state == ParserState.IN_SNIPPET:
            self.current_snippet.append(data)

    def build_result(self):
        """Finalize the result being built and reset for the next one."""
        self.results.append(result(
            link=self.current_link,
            title=''.join(self.current_title),
            snippet=''.join(self.current_snippet),
        ))
        self.reset_current_result()
|
||||||
|
|
||||||
|
# Ad-hoc manual check: parse a saved results page and dump what was found.
if __name__ == '__main__':
    html_parser = DDGHTMLParser()
    with open('ddg2.html') as handle:
        page = handle.read()
    html_parser.feed(page)
    print(html_parser.results)
|
@ -28,6 +28,10 @@
|
|||||||
|
|
||||||
###
|
###
|
||||||
|
|
||||||
|
import functools
|
||||||
|
from html.parser import HTMLParser
|
||||||
|
from urllib.parse import urlencode, parse_qs
|
||||||
|
|
||||||
import supybot.utils as utils
|
import supybot.utils as utils
|
||||||
from supybot.commands import *
|
from supybot.commands import *
|
||||||
import supybot.plugins as plugins
|
import supybot.plugins as plugins
|
||||||
@ -42,18 +46,7 @@ except ImportError:
|
|||||||
# without the i18n module
|
# without the i18n module
|
||||||
_ = lambda x: x
|
_ = lambda x: x
|
||||||
|
|
||||||
|
from .parser import DDGHTMLParser
|
||||||
try: # Python 3
|
|
||||||
from urllib.parse import urlencode, parse_qs
|
|
||||||
except ImportError: # Python 2
|
|
||||||
from urllib import urlencode
|
|
||||||
from urlparse import parse_qs
|
|
||||||
try:
|
|
||||||
from bs4 import BeautifulSoup
|
|
||||||
except ImportError:
|
|
||||||
raise ImportError("Beautiful Soup 4 is required for this plugin: get it"
|
|
||||||
" at http://www.crummy.com/software/BeautifulSoup/bs4"
|
|
||||||
"/doc/#installing-beautiful-soup")
|
|
||||||
|
|
||||||
|
|
||||||
class DDG(callbacks.Plugin):
|
class DDG(callbacks.Plugin):
|
||||||
@ -70,12 +63,11 @@ class DDG(callbacks.Plugin):
|
|||||||
|
|
||||||
real_url, data = utils.web.getUrlTargetAndContent(url)
|
real_url, data = utils.web.getUrlTargetAndContent(url)
|
||||||
data = data.decode("utf-8")
|
data = data.decode("utf-8")
|
||||||
soup = BeautifulSoup(data)
|
parser = DDGHTMLParser()
|
||||||
|
parser.feed(data)
|
||||||
|
|
||||||
# Remove "sponsored link" results
|
# Remove "sponsored link" results
|
||||||
return (url, real_url, [td for td in soup.find_all('td') if 'result-sponsored' not in
|
return (url, real_url, parser.results)
|
||||||
str(td.parent.get('class'))])
|
|
||||||
|
|
||||||
|
|
||||||
def search_core(self, text, channel_context=None, max_results=None, show_snippet=None):
|
def search_core(self, text, channel_context=None, max_results=None, show_snippet=None):
|
||||||
"""
|
"""
|
||||||
@ -103,25 +95,12 @@ class DDG(callbacks.Plugin):
|
|||||||
return [('', '', real_url)]
|
return [('', '', real_url)]
|
||||||
|
|
||||||
for t in raw_results:
|
for t in raw_results:
|
||||||
res = ''
|
|
||||||
# Each valid result has a preceding heading in the format
|
|
||||||
# '<td valign="top">1. </td>', etc.
|
|
||||||
if t.text[0].isdigit():
|
|
||||||
res = t.next_sibling.next_sibling
|
|
||||||
if not res:
|
|
||||||
continue
|
|
||||||
try:
|
|
||||||
snippet = ''
|
|
||||||
# 1) Get a result snippet.
|
|
||||||
|
|
||||||
if self.registryValue("showsnippet", channel_context):
|
if self.registryValue("showsnippet", channel_context):
|
||||||
snippet = res.parent.next_sibling.next_sibling.\
|
snippet = t.snippet.strip()
|
||||||
find_all("td")[-1]
|
else:
|
||||||
snippet = snippet.text.strip()
|
snippet = ''
|
||||||
# 2) Fetch the link title.
|
title = t.title.strip()
|
||||||
title = res.a.text.strip()
|
origlink = link = t.link
|
||||||
# 3) Fetch the result link.
|
|
||||||
origlink = link = res.a.get('href')
|
|
||||||
|
|
||||||
# As of 2017-01-20, some links on DuckDuckGo's site are shown going through
|
# As of 2017-01-20, some links on DuckDuckGo's site are shown going through
|
||||||
# a redirect service. The links are in the format "/l/?kh=-1&uddg=https%3A%2F%2Fduckduckgo.com%2F"
|
# a redirect service. The links are in the format "/l/?kh=-1&uddg=https%3A%2F%2Fduckduckgo.com%2F"
|
||||||
@ -141,9 +120,6 @@ class DDG(callbacks.Plugin):
|
|||||||
|
|
||||||
# Return a list of tuples in the form (link title, snippet text, link)
|
# Return a list of tuples in the form (link title, snippet text, link)
|
||||||
results.append((title, snippet, link))
|
results.append((title, snippet, link))
|
||||||
|
|
||||||
except AttributeError:
|
|
||||||
continue
|
|
||||||
return results[:maxr]
|
return results[:maxr]
|
||||||
|
|
||||||
@wrap(['text'])
|
@wrap(['text'])
|
||||||
|
Loading…
Reference in New Issue
Block a user