Google: Update, with a proper HTML parser.

Hopefully it will be more robust than the regexp.
2026-01-19 17:18:00 +01:00 · 2020-05-24 18:57:27 +02:00 · 2020-05-24 18:57:27 +02:00 · 2924845de4
commit 2924845de4
parent ed87de1527
3 changed files with 136 additions and 14 deletions
--- a/plugins/Google/parser.py
+++ b/plugins/Google/parser.py
@ -0,0 +1,126 @@
 ###
 # Copyright (c) 2020, Valentin Lorentz
 # All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions are met:
 #
 #   * Redistributions of source code must retain the above copyright notice,
 #     this list of conditions, and the following disclaimer.
 #   * Redistributions in binary form must reproduce the above copyright notice,
 #     this list of conditions, and the following disclaimer in the
 #     documentation and/or other materials provided with the distribution.
 #   * Neither the name of the author of this software nor the name of
 #     contributors to this software may be used to endorse or promote products
 #     derived from this software without specific prior written consent.
 #
 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 # ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 # POSSIBILITY OF SUCH DAMAGE.
 ###
 import enum
 import collections
 from html.parser import HTMLParser
 import supybot.utils as utils
 # Immutable record for one parsed search hit: target URL, heading text and
 # the text captured right after the result link.
 result = collections.namedtuple('result', ['link', 'title', 'snippet'])
@enum.unique
 class ParserState(enum.Enum):
    """States of the state machine GoogleHTMLParser uses to walk through one
    search result (link, then title, then breadcrumbs, then snippet)."""
    # Not inside a search result; the parser is waiting for a result link.
    OUTSIDE = 0
    # Inside an <a> whose href starts with '/url?q='.
    IN_LINK = 1
    # Inside the role="heading" <div> of the link; the next text node is
    # stored as the title.
    IN_TITLE = 2
    # The title text has been stored; the next text node is treated as
    # breadcrumbs and discarded.
    TITLE_PARSED = 3
    # NOTE(review): value 4 is not assigned to any state.
    # The text node following the title has been consumed.
    BREADCRUMBS_PARSED = 5
    # The result's <a> has been closed with a title recorded; the next text
    # node is stored as the snippet and completes the result.
    LINK_PARSED = 6
@enum.unique
 class DomMark(enum.Enum):
    """A mark on an element in the stack, to know when to change state when
    popping the element from the stack."""
    # NOTE(review): nothing visible in this module reads these marks; the
    # parser's element stack holds bare tag names. Confirm whether the marks
    # are still meant to be attached to stack entries.
    HEADING = 1
    BREADCRUMBS = 2
 # Elements whose opening/closing tags the parser mirrors on its stack.
 STACKED_TAGS = (
     'div',
     'span',
     'a',
 )
 class GoogleHTMLParser(HTMLParser):
     """Pull search results out of a Google results page.

     Feed the page to ``feed()``; every completed hit is appended to
     ``self.results`` as a ``result(link, title, snippet)`` tuple, in
     document order.  The parser is a small state machine driven by
     ``ParserState``: it waits for an ``<a href="/url?q=...">``, stores the
     text of the ``role="heading"`` ``<div>`` inside it as the title, and
     takes the first text node after the closing ``</a>`` as the snippet.

     Unexpected markup (anchors without ``href``, unbalanced tags, a result
     link showing up in the middle of another result) is skipped instead of
     raising, so a single odd element does not abort the whole page.
     """

     # Prefix of the redirect URLs that wrap each result link.
     _RESULT_PREFIX = '/url?q='

     def __init__(self):
         super().__init__()
         # Names of the currently open STACKED_TAGS elements, innermost last.
         self.stack = []
         # Completed ``result`` tuples.
         self.results = []
         self.reset_current_result()

     def reset_current_result(self):
         """Forget the partially parsed result and go back to OUTSIDE."""
         self.state = ParserState.OUTSIDE
         self.current_link = None
         self.current_title = None
         self.current_snippet = None

     def handle_starttag(self, tag, attrs):
         """Track open elements and detect the start of a result link/title.

         ``attrs`` is the list of ``(name, value)`` pairs supplied by
         HTMLParser; ``value`` is None for attributes written without one.
         """
         attrs = dict(attrs)

         if tag in STACKED_TAGS:
             self.stack.append(tag)

         # An anchor may have no href at all, or a valueless one (None).
         href = attrs.get('href') or ''

         if tag == 'a' and href.startswith(self._RESULT_PREFIX):
             if self.state != ParserState.OUTSIDE:
                 # A result link inside a result that is still being parsed:
                 # keep the result in progress rather than aborting the page.
                 return
             self.state = ParserState.IN_LINK
             target = href[len(self._RESULT_PREFIX):]
             # Everything from '&sa' on is Google's tracking parameters.
             self.current_link = utils.web.urlunquote(
                 utils.web.htmlToText(target.split('&sa')[0]))
         elif tag == 'div' and 'a' in self.stack \
                 and attrs.get('role') == 'heading' \
                 and self.state == ParserState.IN_LINK:
             self.state = ParserState.IN_TITLE

     def handle_endtag(self, tag):
         """Unwind the element stack and detect the end of a result link."""
         if tag in STACKED_TAGS and tag in self.stack:
             # Real-world HTML is not always balanced: drop any unclosed
             # elements sitting above the one being closed.  An end tag with
             # no matching open element is simply ignored.
             while self.stack.pop() != tag:
                 pass

         # TITLE_PARSED is included so that a link holding a title but no
         # breadcrumb text still finishes as a result.
         if tag == 'a' and self.state in (
                 ParserState.IN_LINK, ParserState.IN_TITLE,
                 ParserState.TITLE_PARSED, ParserState.BREADCRUMBS_PARSED):
             if self.current_title is None:
                 # That wasn't a result
                 self.state = ParserState.OUTSIDE
             else:
                 self.state = ParserState.LINK_PARSED

     def handle_data(self, data):
         """Consume a text node according to the current state."""
         if self.state == ParserState.IN_TITLE:
             self.current_title = data
             self.state = ParserState.TITLE_PARSED
         elif self.state == ParserState.TITLE_PARSED:
             # Breadcrumb text: acknowledged but not stored.
             self.state = ParserState.BREADCRUMBS_PARSED
         elif self.state == ParserState.LINK_PARSED:
             # First text after the closed link is the snippet; the result
             # is now complete.
             self.current_snippet = data
             self.state = ParserState.OUTSIDE
             self.build_result()

     def build_result(self):
         """Append the current result to ``self.results`` and start afresh."""
         self.results.append(result(
             link=self.current_link,
             title=self.current_title,
             snippet=self.current_snippet,
         ))
         self.reset_current_result()
 if __name__ == '__main__':
     # Manual smoke test: parse a saved results page named 'google.html' in
     # the current directory and print the extracted results.
     parser = GoogleHTMLParser()
     # Decode explicitly instead of relying on the platform's locale
     # encoding, which may not be able to decode the saved page.
     with open('google.html', encoding='utf-8') as fd:
         parser.feed(fd.read())
     # Flush any text HTMLParser is still buffering at end of input.
     parser.close()
     print(parser.results)
--- a/plugins/Google/plugin.py
+++ b/plugins/Google/plugin.py
@ -43,6 +43,8 @@ import supybot.callbacks as callbacks
 from supybot.i18n import PluginInternationalization, internationalizeDocstring
 _ = PluginInternationalization('Google')
 from .parser import GoogleHTMLParser
 class Google(callbacks.PluginRegexp):
    """This is a simple plugin to provide access to the Google services we
    all know and love from our favorite IRC bot."""
@ -75,16 +77,11 @@ class Google(callbacks.PluginRegexp):
            msg = ircmsgs.privmsg(msg.args[0], s, msg=msg)
        return msg
    _decode_re = re.compile(r'<div class="\w+"><a href="/url\?q=(?P<url>[^"]+)&[^"]+"[^>]*><div class="(\w| )+">(?P<title>.*?)</div><div class="(\w| )+">(?P<breadcrumbs>.*?)</div></a></div>(?P<content><div class="(\w| )+">.*?</div></div>)', re.DOTALL | re.MULTILINE)
    @classmethod
    def decode(cls, text):
-        matches = cls._decode_re.finditer(text)
+        parser = GoogleHTMLParser()
-        results = []
+        parser.feed(text)
-        for match in matches:
+        return parser.results
            r = match.groupdict()
            r['url'] = utils.web.urlunquote(utils.web.htmlToText(r['url'].split('&amp;')[0]))
            results.append(r)
        return results
    _gsearchUrl = 'https://www.google.com/search'
@ -140,9 +137,8 @@ class Google(callbacks.PluginRegexp):
        if max:
            data = data[:max]
        for result in data:
-            title = utils.web.htmlToText(result['title']\
+            title = utils.web.htmlToText(result.title.encode('utf-8'))
-                                         .encode('utf-8'))
+            url = result.link
            url = result['url']
            if minisix.PY2:
                url = url.encode('utf-8')
            if title:
@ -173,9 +169,9 @@ class Google(callbacks.PluginRegexp):
                           {'smallsearch': True})
        data = self.decode(data)
        if data:
-            url = data[0]['url']
+            url = data[0].link
            if 'snippet' in opts:
-                snippet = data[0]['content']
+                snippet = data[0].snippet
                snippet = " | " + utils.web.htmlToText(snippet, tagReplace='')
            else:
                snippet = ""
--- a/plugins/Google/test.py
+++ b/plugins/Google/test.py
@ -57,7 +57,7 @@ class GoogleTestCase(ChannelPluginTestCase):
        def testUrlDecode(self):
            self.assertRegexp(
                    'google site:http://www.urbandictionary.com carajo land',
-                    '\x02Urban Dictionary: carajo land\x02: '
+                    '\x02carajo land - Urban Dictionary\x02: '
                    r'https?://www.urbandictionary.com/define.php\?term=carajo%20land')
        def testLucky(self):