DDG: Rewrite using html.parser.HTMLParser instead of BeautifulSoup.

So we don't depend on an external library.
This commit is contained in:
Valentin Lorentz 2020-05-15 21:26:44 +02:00
parent 37c1fa2153
commit 10df0a0dd0
2 changed files with 117 additions and 54 deletions

87
plugins/DDG/parser.py Normal file
View File

@@ -0,0 +1,87 @@
import enum
import collections
from html.parser import HTMLParser
# Immutable record for one parsed search result entry.
result = collections.namedtuple('result', ['link', 'title', 'snippet'])
# Position of the parser within a single search-result entry.  Built with
# the functional Enum API; enum.unique() guards against duplicate values.
ParserState = enum.unique(enum.Enum('ParserState', {
    'OUTSIDE': 0,       # not inside any part of a result
    'IN_TITLE': 1,      # between <a class="result-link"> and </a>
    'TITLE_PARSED': 2,  # title captured, waiting for the snippet cell
    'IN_SNIPPET': 3,    # between <td class="result-snippet"> and </td>
}))

# Structural tags whose nesting is tracked on the parser's tag stack.
STACKED_TAGS = ('table', 'tr', 'td', 'a')
class DDGHTMLParser(HTMLParser):
    """Streaming parser for DuckDuckGo's HTML results page.

    Feed the page source to .feed(); each parsed entry is appended to
    self.results as a `result` namedtuple (link, title, snippet).
    Rows marked <tr class="result-sponsored"> are skipped entirely.
    """

    def __init__(self):
        super().__init__()
        # Stack of (tag, classes) pairs for tags in STACKED_TAGS; used to
        # detect whether we are inside a sponsored <tr> ancestor.
        self.stack = []
        # Completed `result` namedtuples, in document order.
        self.results = []
        self.reset_current_result()

    def reset_current_result(self):
        """Clear the per-result accumulators and return to OUTSIDE."""
        self.state = ParserState.OUTSIDE
        self.current_link = None
        # While collecting, these hold lists of text chunks (joined later);
        # None means "not currently collecting".
        self.current_title = None
        self.current_snippet = None

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        classes = attrs.get('class', '').split()
        # Push before the sponsored check so the check also sees the tag
        # we are entering right now (e.g. the sponsored <tr> itself).
        if tag in STACKED_TAGS:
            self.stack.append((tag, classes))

        # NOTE(review): this matches only when the <tr>'s class list is
        # exactly ['result-sponsored']; a row with extra classes would not
        # be skipped — confirm against DDG's actual markup.
        if ('tr', ['result-sponsored']) in self.stack:
            # Skip sponsored results
            return

        if tag == 'a' and 'result-link' in classes:
            # Title anchor: remember the target and start collecting title text.
            assert self.state == ParserState.OUTSIDE, (self.state, self.current_title)
            self.state = ParserState.IN_TITLE
            self.current_link = attrs['href']
            self.current_title = []
        elif tag == 'td' and 'result-snippet' in classes:
            # Snippet cell follows the title; start collecting snippet text.
            assert self.state == ParserState.TITLE_PARSED, self.state
            self.state = ParserState.IN_SNIPPET
            self.current_snippet = []
        elif tag == 'span' and 'link-text' in classes:
            if self.state == ParserState.TITLE_PARSED:
                # No snippet
                # NOTE(review): build_result() is not called here, so a
                # result without a snippet is dropped (its link/title are
                # never appended) and the accumulators are not reset —
                # confirm whether dropping such entries is intended.
                self.state = ParserState.OUTSIDE
                self.current_snippet = []

    def handle_endtag(self, tag):
        if tag in STACKED_TAGS:
            # Closing tag must match the most recent open structural tag.
            item = self.stack.pop()
            assert item[0] == tag, (item, tag)
        if tag == 'a' and self.state == ParserState.IN_TITLE:
            # End of the title anchor: title text is complete.
            self.state = ParserState.TITLE_PARSED
        elif tag == 'td' and self.state == ParserState.IN_SNIPPET:
            # End of the snippet cell: the whole result is complete.
            self.build_result()
            self.state = ParserState.OUTSIDE

    def handle_data(self, data):
        # Accumulate raw text chunks for whichever field is being collected.
        if self.state == ParserState.IN_TITLE:
            self.current_title.append(data)
        elif self.state == ParserState.IN_SNIPPET:
            self.current_snippet.append(data)

    def build_result(self):
        """Join the accumulated chunks into a `result` and reset state."""
        self.results.append(result(
            link=self.current_link,
            title=''.join(self.current_title),
            snippet=''.join(self.current_snippet),
        ))
        self.reset_current_result()
if __name__ == '__main__':
    # Ad-hoc manual check: parse a saved results page and dump what we found.
    with open('ddg2.html') as fp:
        page = fp.read()
    p = DDGHTMLParser()
    p.feed(page)
    print(p.results)

View File

@@ -28,6 +28,10 @@
###
import functools
from html.parser import HTMLParser
from urllib.parse import urlencode, parse_qs
import supybot.utils as utils
from supybot.commands import *
import supybot.plugins as plugins
@@ -42,18 +46,7 @@ except ImportError:
# without the i18n module
_ = lambda x: x
try: # Python 3
from urllib.parse import urlencode, parse_qs
except ImportError: # Python 2
from urllib import urlencode
from urlparse import parse_qs
try:
from bs4 import BeautifulSoup
except ImportError:
raise ImportError("Beautiful Soup 4 is required for this plugin: get it"
" at http://www.crummy.com/software/BeautifulSoup/bs4"
"/doc/#installing-beautiful-soup")
from .parser import DDGHTMLParser
class DDG(callbacks.Plugin):
@@ -70,12 +63,11 @@ class DDG(callbacks.Plugin):
real_url, data = utils.web.getUrlTargetAndContent(url)
data = data.decode("utf-8")
soup = BeautifulSoup(data)
parser = DDGHTMLParser()
parser.feed(data)
# Remove "sponsored link" results
return (url, real_url, [td for td in soup.find_all('td') if 'result-sponsored' not in
str(td.parent.get('class'))])
return (url, real_url, parser.results)
def search_core(self, text, channel_context=None, max_results=None, show_snippet=None):
"""
@@ -103,47 +95,31 @@
return [('', '', real_url)]
for t in raw_results:
res = ''
# Each valid result has a preceding heading in the format
# '<td valign="top">1.&nbsp;</td>', etc.
if t.text[0].isdigit():
res = t.next_sibling.next_sibling
if not res:
continue
try:
if self.registryValue("showsnippet", channel_context):
snippet = t.snippet.strip()
else:
snippet = ''
# 1) Get a result snippet.
title = t.title.strip()
origlink = link = t.link
if self.registryValue("showsnippet", channel_context):
snippet = res.parent.next_sibling.next_sibling.\
find_all("td")[-1]
snippet = snippet.text.strip()
# 2) Fetch the link title.
title = res.a.text.strip()
# 3) Fetch the result link.
origlink = link = res.a.get('href')
# As of 2017-01-20, some links on DuckDuckGo's site are shown going through
# a redirect service. The links are in the format "/l/?kh=-1&uddg=https%3A%2F%2Fduckduckgo.com%2F"
# instead of simply being "https://duckduckgo.com". So, we decode these links here.
if link.startswith('/l/'):
linkparse = utils.web.urlparse(link)
try:
link = parse_qs(linkparse.query)['uddg'][0]
except KeyError:
# No link was given here, skip.
continue
except IndexError:
self.log.exception("DDG: failed to expand redirected result URL %s", origlink)
continue
else:
self.log.debug("DDG: expanded result URL from %s to %s", origlink, link)
# As of 2017-01-20, some links on DuckDuckGo's site are shown going through
# a redirect service. The links are in the format "/l/?kh=-1&uddg=https%3A%2F%2Fduckduckgo.com%2F"
# instead of simply being "https://duckduckgo.com". So, we decode these links here.
if link.startswith('/l/'):
linkparse = utils.web.urlparse(link)
try:
link = parse_qs(linkparse.query)['uddg'][0]
except KeyError:
# No link was given here, skip.
continue
except IndexError:
self.log.exception("DDG: failed to expand redirected result URL %s", origlink)
continue
else:
self.log.debug("DDG: expanded result URL from %s to %s", origlink, link)
# Return a list of tuples in the form (link title, snippet text, link)
results.append((title, snippet, link))
except AttributeError:
continue
# Return a list of tuples in the form (link title, snippet text, link)
results.append((title, snippet, link))
return results[:maxr]
@wrap(['text'])