From 2924845de417b4b56b5459df5f981a48ffaffc16 Mon Sep 17 00:00:00 2001 From: Valentin Lorentz Date: Sun, 24 May 2020 18:57:27 +0200 Subject: [PATCH] Google: Update, with a proper HTML parser. Hopefully it will be more robust than the regexp. --- plugins/Google/parser.py | 126 +++++++++++++++++++++++++++++++++++++++ plugins/Google/plugin.py | 22 +++---- plugins/Google/test.py | 2 +- 3 files changed, 136 insertions(+), 14 deletions(-) create mode 100644 plugins/Google/parser.py diff --git a/plugins/Google/parser.py b/plugins/Google/parser.py new file mode 100644 index 000000000..45fc35ced --- /dev/null +++ b/plugins/Google/parser.py @@ -0,0 +1,126 @@ +### +# Copyright (c) 2020, Valentin Lorentz +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, +# this list of conditions, and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions, and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of the author of this software nor the name of +# contributors to this software may be used to endorse or promote products +# derived from this software without specific prior written consent. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + +### + +import enum +import collections +from html.parser import HTMLParser + +import supybot.utils as utils + +result = collections.namedtuple('result', 'link title snippet') + +@enum.unique +class ParserState(enum.Enum): + OUTSIDE = 0 + IN_LINK = 1 + IN_TITLE = 2 + TITLE_PARSED = 3 + BREADCRUMBS_PARSED = 5 + LINK_PARSED = 6 + +@enum.unique +class DomMark(enum.Enum): + """A mark on an element in the stack, to know when to change state when + poping the element from the stack.""" + HEADING = 1 + BREADCRUMBS = 2 + +STACKED_TAGS = ('div', 'span', 'a') + +class GoogleHTMLParser(HTMLParser): + def __init__(self): + super().__init__() + self.stack = [] + self.results = [] + + self.reset_current_result() + + def reset_current_result(self): + self.state = ParserState.OUTSIDE + self.current_link = None + self.current_title = None + self.current_snippet = None + + def handle_starttag(self, tag, attrs): + attrs = dict(attrs) + classes = attrs.get('class', '').split() + + if tag in STACKED_TAGS: + self.stack.append(tag) + + if tag == 'a' and attrs['href'].startswith('/url?q='): + assert self.state == ParserState.OUTSIDE, (self.state, self.current_title) + self.state = ParserState.IN_LINK + href = attrs['href'][len('/url?q='):] + self.current_link = utils.web.urlunquote(utils.web.htmlToText(href.split('&sa')[0])) + + elif tag == 'div' and 'a' in self.stack and attrs.get('role') == 'heading' \ + and self.state == ParserState.IN_LINK: + self.state = ParserState.IN_TITLE + mark = DomMark.HEADING + + def handle_endtag(self, tag): + if tag in STACKED_TAGS: + item = self.stack.pop() + assert item == tag, (item, tag) + + if tag == 'a' and self.state in ( + ParserState.IN_LINK, ParserState.IN_TITLE, ParserState.BREADCRUMBS_PARSED): + if self.current_title is None: + # That wasn't a result + self.state = ParserState.OUTSIDE + else: + self.state = ParserState.LINK_PARSED + + def handle_data(self, data): + if self.state == ParserState.IN_TITLE: + self.current_title = data + self.state = ParserState.TITLE_PARSED + elif self.state == ParserState.TITLE_PARSED: + self.state = ParserState.BREADCRUMBS_PARSED + elif self.state == ParserState.LINK_PARSED: + self.current_snippet = data + self.state = ParserState.OUTSIDE + self.build_result() + + def build_result(self): + self.results.append(result( + link=self.current_link, + title=self.current_title, + snippet=self.current_snippet, + )) + self.reset_current_result() + +if __name__ == '__main__': + parser = GoogleHTMLParser() + with open('google.html') as fd: + parser.feed(fd.read()) + print(parser.results) + diff --git a/plugins/Google/plugin.py b/plugins/Google/plugin.py index d82a3cdbb..4f74a5d6c 100644 --- a/plugins/Google/plugin.py +++ b/plugins/Google/plugin.py @@ -43,6 +43,8 @@ import supybot.callbacks as callbacks from supybot.i18n import PluginInternationalization, internationalizeDocstring _ = PluginInternationalization('Google') +from .parser import GoogleHTMLParser + class Google(callbacks.PluginRegexp): """This is a simple plugin to provide access to the Google services we all know and love from our favorite IRC bot.""" @@ -75,16 +77,11 @@ class Google(callbacks.PluginRegexp): msg = ircmsgs.privmsg(msg.args[0], s, msg=msg) return msg - _decode_re = re.compile(r'
]*>
(?P.*?)</div><div class="(\w| )+">(?P<breadcrumbs>.*?)</div></a></div>(?P<content><div class="(\w| )+">.*?</div></div>)', re.DOTALL | re.MULTILINE) @classmethod def decode(cls, text): - matches = cls._decode_re.finditer(text) - results = [] - for match in matches: - r = match.groupdict() - r['url'] = utils.web.urlunquote(utils.web.htmlToText(r['url'].split('&')[0])) - results.append(r) - return results + parser = GoogleHTMLParser() + parser.feed(text) + return parser.results _gsearchUrl = 'https://www.google.com/search' @@ -140,9 +137,8 @@ class Google(callbacks.PluginRegexp): if max: data = data[:max] for result in data: - title = utils.web.htmlToText(result['title']\ - .encode('utf-8')) - url = result['url'] + title = utils.web.htmlToText(result.title.encode('utf-8')) + url = result.link if minisix.PY2: url = url.encode('utf-8') if title: @@ -173,9 +169,9 @@ class Google(callbacks.PluginRegexp): {'smallsearch': True}) data = self.decode(data) if data: - url = data[0]['url'] + url = data[0].link if 'snippet' in opts: - snippet = data[0]['content'] + snippet = data[0].snippet snippet = " | " + utils.web.htmlToText(snippet, tagReplace='') else: snippet = "" diff --git a/plugins/Google/test.py b/plugins/Google/test.py index d3b09ebcc..4e2f389d5 100644 --- a/plugins/Google/test.py +++ b/plugins/Google/test.py @@ -57,7 +57,7 @@ class GoogleTestCase(ChannelPluginTestCase): def testUrlDecode(self): self.assertRegexp( 'google site:http://www.urbandictionary.com carajo land', - '\x02Urban Dictionary: carajo land\x02: ' + '\x02carajo land - Urban Dictionary\x02: ' r'https?://www.urbandictionary.com/define.php\?term=carajo%20land') def testLucky(self):