From 2924845de417b4b56b5459df5f981a48ffaffc16 Mon Sep 17 00:00:00 2001
From: Valentin Lorentz <progval+git@progval.net>
Date: Sun, 24 May 2020 18:57:27 +0200
Subject: [PATCH] Google: Update, with a proper HTML parser.

Hopefully it will be more robust than the regexp.
---
 plugins/Google/parser.py | 126 +++++++++++++++++++++++++++++++++++++++
 plugins/Google/plugin.py |  22 +++----
 plugins/Google/test.py   |   2 +-
 3 files changed, 136 insertions(+), 14 deletions(-)
 create mode 100644 plugins/Google/parser.py

diff --git a/plugins/Google/parser.py b/plugins/Google/parser.py
new file mode 100644
index 000000000..45fc35ced
--- /dev/null
+++ b/plugins/Google/parser.py
@@ -0,0 +1,126 @@
+###
+# Copyright (c) 2020, Valentin Lorentz
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+#   * Redistributions of source code must retain the above copyright notice,
+#     this list of conditions, and the following disclaimer.
+#   * Redistributions in binary form must reproduce the above copyright notice,
+#     this list of conditions, and the following disclaimer in the
+#     documentation and/or other materials provided with the distribution.
+#   * Neither the name of the author of this software nor the name of
+#     contributors to this software may be used to endorse or promote products
+#     derived from this software without specific prior written consent.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+
+###
+
+import enum
+import collections
+from html.parser import HTMLParser
+
+import supybot.utils as utils
+
+result = collections.namedtuple('result', 'link title snippet')  # one parsed search result
+
+@enum.unique
+class ParserState(enum.Enum):  # position of GoogleHTMLParser within a search result
+    OUTSIDE = 0  # not inside a result; also the state after every reset
+    IN_LINK = 1  # inside an <a> whose href starts with '/url?q='
+    IN_TITLE = 2  # inside a <div role="heading"> nested in that link
+    TITLE_PARSED = 3  # the title text has been captured
+    BREADCRUMBS_PARSED = 5  # the text chunk following the title has been consumed
+    LINK_PARSED = 6  # the link closed with a title captured; next text is the snippet
+
+@enum.unique
+class DomMark(enum.Enum):  # NOTE(review): no mark is ever stored on the parser stack in this file
+    """A mark on an element in the stack, to know when to change state when
+    popping the element from the stack."""
+    HEADING = 1
+    BREADCRUMBS = 2
+
+STACKED_TAGS = ('div', 'span', 'a')  # tags whose open/close pairs GoogleHTMLParser tracks
+
+class GoogleHTMLParser(HTMLParser):
+    """Collect (link, title, snippet) results from a Google results page."""
+    def __init__(self):
+        super().__init__()
+        self.stack = []  # open STACKED_TAGS elements, innermost last
+        self.results = []  # completed `result` tuples, in document order
+        self.reset_current_result()
+
+    def reset_current_result(self):  # forget the result being assembled
+        self.state = ParserState.OUTSIDE
+        self.current_link = None
+        self.current_title = None
+        self.current_snippet = None
+
+    def handle_starttag(self, tag, attrs):
+        """Track open tags; enter IN_LINK / IN_TITLE on result markup."""
+        attrs = dict(attrs)
+        # An <a> may have no href, or a valueless one (which parses as None).
+        href = attrs.get('href') or ''
+        if tag in STACKED_TAGS:
+            self.stack.append(tag)
+
+        if tag == 'a' and href.startswith('/url?q='):
+            assert self.state == ParserState.OUTSIDE, (self.state, self.current_title)
+            self.state = ParserState.IN_LINK
+            href = href[len('/url?q='):]  # what follows the '/url?q=' prefix
+            self.current_link = utils.web.urlunquote(utils.web.htmlToText(href.split('&sa')[0]))
+
+        elif tag == 'div' and 'a' in self.stack and attrs.get('role') == 'heading' \
+                and self.state == ParserState.IN_LINK:
+            self.state = ParserState.IN_TITLE
+
+    def handle_endtag(self, tag):  # pop the stack; settle the state when a link closes
+        if tag in STACKED_TAGS:
+            item = self.stack.pop()
+            assert item == tag, (item, tag)  # tracked tags must close in matching order
+
+        if tag == 'a' and self.state in (
+                ParserState.IN_LINK, ParserState.IN_TITLE, ParserState.BREADCRUMBS_PARSED):
+            if self.current_title is None:
+                # That wasn't a result
+                self.state = ParserState.OUTSIDE
+            else:
+                self.state = ParserState.LINK_PARSED
+
+    def handle_data(self, data):  # text node; its meaning depends on the current state
+        if self.state == ParserState.IN_TITLE:
+            self.current_title = data
+            self.state = ParserState.TITLE_PARSED
+        elif self.state == ParserState.TITLE_PARSED:
+            self.state = ParserState.BREADCRUMBS_PARSED  # this text is discarded
+        elif self.state == ParserState.LINK_PARSED:
+            self.current_snippet = data  # first text after the closed link
+            self.state = ParserState.OUTSIDE
+            self.build_result()
+
+    def build_result(self):  # store the assembled result, then start afresh
+        self.results.append(result(
+            link=self.current_link,
+            title=self.current_title,
+            snippet=self.current_snippet,
+        ))
+        self.reset_current_result()
+
+if __name__ == '__main__':  # manual check: parse a saved page and dump the results
+    parser = GoogleHTMLParser()
+    with open('google.html') as fd:  # path is relative to the current working directory
+        parser.feed(fd.read())
+    print(parser.results)
+
diff --git a/plugins/Google/plugin.py b/plugins/Google/plugin.py
index d82a3cdbb..4f74a5d6c 100644
--- a/plugins/Google/plugin.py
+++ b/plugins/Google/plugin.py
@@ -43,6 +43,8 @@ import supybot.callbacks as callbacks
 from supybot.i18n import PluginInternationalization, internationalizeDocstring
 _ = PluginInternationalization('Google')
 
+from .parser import GoogleHTMLParser
+
 class Google(callbacks.PluginRegexp):
     """This is a simple plugin to provide access to the Google services we
     all know and love from our favorite IRC bot."""
@@ -75,16 +77,11 @@ class Google(callbacks.PluginRegexp):
             msg = ircmsgs.privmsg(msg.args[0], s, msg=msg)
         return msg
 
-    _decode_re = re.compile(r'<div class="\w+"><a href="/url\?q=(?P<url>[^"]+)&[^"]+"[^>]*><div class="(\w| )+">(?P<title>.*?)</div><div class="(\w| )+">(?P<breadcrumbs>.*?)</div></a></div>(?P<content><div class="(\w| )+">.*?</div></div>)', re.DOTALL | re.MULTILINE)
     @classmethod
     def decode(cls, text):
-        matches = cls._decode_re.finditer(text)
-        results = []
-        for match in matches:
-            r = match.groupdict()
-            r['url'] = utils.web.urlunquote(utils.web.htmlToText(r['url'].split('&amp;')[0]))
-            results.append(r)
-        return results
+        parser = GoogleHTMLParser()
+        parser.feed(text)
+        return parser.results
 
 
     _gsearchUrl = 'https://www.google.com/search'
@@ -140,9 +137,8 @@ class Google(callbacks.PluginRegexp):
         if max:
             data = data[:max]
         for result in data:
-            title = utils.web.htmlToText(result['title']\
-                                         .encode('utf-8'))
-            url = result['url']
+            title = utils.web.htmlToText(result.title.encode('utf-8'))
+            url = result.link
             if minisix.PY2:
                 url = url.encode('utf-8')
             if title:
@@ -173,9 +169,9 @@ class Google(callbacks.PluginRegexp):
                            {'smallsearch': True})
         data = self.decode(data)
         if data:
-            url = data[0]['url']
+            url = data[0].link
             if 'snippet' in opts:
-                snippet = data[0]['content']
+                snippet = data[0].snippet
                 snippet = " | " + utils.web.htmlToText(snippet, tagReplace='')
             else:
                 snippet = ""
diff --git a/plugins/Google/test.py b/plugins/Google/test.py
index d3b09ebcc..4e2f389d5 100644
--- a/plugins/Google/test.py
+++ b/plugins/Google/test.py
@@ -57,7 +57,7 @@ class GoogleTestCase(ChannelPluginTestCase):
         def testUrlDecode(self):
             self.assertRegexp(
                     'google site:http://www.urbandictionary.com carajo land',
-                    '\x02Urban Dictionary: carajo land\x02: '
+                    '\x02carajo land - Urban Dictionary\x02: '
                     r'https?://www.urbandictionary.com/define.php\?term=carajo%20land')
 
         def testLucky(self):