Google: Update, with a proper HTML parser.
Hopefully it will be more robust than the regexp.
parent ed87de1527
commit 2924845de4

plugins/Google/parser.py (new file, 126 lines)
@@ -0,0 +1,126 @@
+###
+# Copyright (c) 2020, Valentin Lorentz
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+#   * Redistributions of source code must retain the above copyright notice,
+#     this list of conditions, and the following disclaimer.
+#   * Redistributions in binary form must reproduce the above copyright notice,
+#     this list of conditions, and the following disclaimer in the
+#     documentation and/or other materials provided with the distribution.
+#   * Neither the name of the author of this software nor the name of
+#     contributors to this software may be used to endorse or promote products
+#     derived from this software without specific prior written consent.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+
+###
+
+import enum
+import collections
+from html.parser import HTMLParser
+
+import supybot.utils as utils
+
+result = collections.namedtuple('result', 'link title snippet')
+
+@enum.unique
+class ParserState(enum.Enum):
+    OUTSIDE = 0
+    IN_LINK = 1
+    IN_TITLE = 2
+    TITLE_PARSED = 3
+    BREADCRUMBS_PARSED = 5
+    LINK_PARSED = 6
+
+@enum.unique
+class DomMark(enum.Enum):
+    """A mark on an element in the stack, to know when to change state when
+    popping the element from the stack."""
+    HEADING = 1
+    BREADCRUMBS = 2
+
+STACKED_TAGS = ('div', 'span', 'a')
+
+class GoogleHTMLParser(HTMLParser):
+    def __init__(self):
+        super().__init__()
+        self.stack = []
+        self.results = []
+
+        self.reset_current_result()
+
+    def reset_current_result(self):
+        self.state = ParserState.OUTSIDE
+        self.current_link = None
+        self.current_title = None
+        self.current_snippet = None
+
+    def handle_starttag(self, tag, attrs):
+        attrs = dict(attrs)
+        classes = attrs.get('class', '').split()
+
+        if tag in STACKED_TAGS:
+            self.stack.append(tag)
+
+        if tag == 'a' and attrs.get('href', '').startswith('/url?q='):
+            assert self.state == ParserState.OUTSIDE, (self.state, self.current_title)
+            self.state = ParserState.IN_LINK
+            href = attrs['href'][len('/url?q='):]
+            self.current_link = utils.web.urlunquote(utils.web.htmlToText(href.split('&sa')[0]))
+
+        elif tag == 'div' and 'a' in self.stack and attrs.get('role') == 'heading' \
+                and self.state == ParserState.IN_LINK:
+            self.state = ParserState.IN_TITLE
+            mark = DomMark.HEADING
+
+    def handle_endtag(self, tag):
+        if tag in STACKED_TAGS:
+            item = self.stack.pop()
+            assert item == tag, (item, tag)
+
+        if tag == 'a' and self.state in (
+                ParserState.IN_LINK, ParserState.IN_TITLE, ParserState.BREADCRUMBS_PARSED):
+            if self.current_title is None:
+                # That wasn't a result
+                self.state = ParserState.OUTSIDE
+            else:
+                self.state = ParserState.LINK_PARSED
+
+    def handle_data(self, data):
+        if self.state == ParserState.IN_TITLE:
+            self.current_title = data
+            self.state = ParserState.TITLE_PARSED
+        elif self.state == ParserState.TITLE_PARSED:
+            self.state = ParserState.BREADCRUMBS_PARSED
+        elif self.state == ParserState.LINK_PARSED:
+            self.current_snippet = data
+            self.state = ParserState.OUTSIDE
+            self.build_result()
+
+    def build_result(self):
+        self.results.append(result(
+            link=self.current_link,
+            title=self.current_title,
+            snippet=self.current_snippet,
+        ))
+        self.reset_current_result()
+
+if __name__ == '__main__':
+    parser = GoogleHTMLParser()
+    with open('google.html') as fd:
+        parser.feed(fd.read())
+    print(parser.results)
+
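For reference, a minimal smoke test of the parser added above (not part of the commit). The HTML snippet is a contrived stand-in shaped like the markup the state machine expects: an <a href="/url?q=..."> wrapping a div with role="heading", followed by breadcrumb text and a snippet after the closing </a>. The URL and titles are placeholders rather than real Google output, and running it assumes Limnoria's supybot package is installed and that the script sits next to parser.py so the import resolves.

# Hypothetical usage sketch for GoogleHTMLParser; markup and values are invented.
from parser import GoogleHTMLParser  # the new plugins/Google/parser.py

html = (
    '<div>'
    '<a href="/url?q=https://example.org/page&sa=U">'
    '<div role="heading">Example result title</div>'
    '<div>example.org - page</div>'
    '</a>'
    '<div>A short snippet describing the page.</div>'
    '</div>'
)

p = GoogleHTMLParser()
p.feed(html)
for r in p.results:
    # Each entry is the namedtuple defined in parser.py: (link, title, snippet).
    print(r.link, '|', r.title, '|', r.snippet)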
plugins/Google/plugin.py

@@ -43,6 +43,8 @@ import supybot.callbacks as callbacks
 from supybot.i18n import PluginInternationalization, internationalizeDocstring
 _ = PluginInternationalization('Google')
 
+from .parser import GoogleHTMLParser
+
 class Google(callbacks.PluginRegexp):
     """This is a simple plugin to provide access to the Google services we
     all know and love from our favorite IRC bot."""
@@ -75,16 +77,11 @@ class Google(callbacks.PluginRegexp):
         msg = ircmsgs.privmsg(msg.args[0], s, msg=msg)
         return msg
 
-    _decode_re = re.compile(r'<div class="\w+"><a href="/url\?q=(?P<url>[^"]+)&[^"]+"[^>]*><div class="(\w| )+">(?P<title>.*?)</div><div class="(\w| )+">(?P<breadcrumbs>.*?)</div></a></div>(?P<content><div class="(\w| )+">.*?</div></div>)', re.DOTALL | re.MULTILINE)
     @classmethod
     def decode(cls, text):
-        matches = cls._decode_re.finditer(text)
-        results = []
-        for match in matches:
-            r = match.groupdict()
-            r['url'] = utils.web.urlunquote(utils.web.htmlToText(r['url'].split('&')[0]))
-            results.append(r)
-        return results
+        parser = GoogleHTMLParser()
+        parser.feed(text)
+        return parser.results
 
 
     _gsearchUrl = 'https://www.google.com/search'
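The hunks below follow from this change of return type: decode() now returns parser.results, a list of the result namedtuple defined in parser.py, so callers switch from dict subscripts to attribute access. A hypothetical before/after sketch (field values invented):

from collections import namedtuple

# Before: decode() built plain dicts from the regexp's named groups.
old_entry = {'url': 'https://example.org/', 'title': 'Example', 'content': 'A snippet.'}
print(old_entry['title'], old_entry['url'], old_entry['content'])

# After: decode() yields namedtuples ('link title snippet'), hence
# result['title'] -> result.title and result['url'] -> result.link in the hunks below.
result = namedtuple('result', 'link title snippet')
new_entry = result(link='https://example.org/', title='Example', snippet='A snippet.')
print(new_entry.title, new_entry.link, new_entry.snippet)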
@@ -140,9 +137,8 @@ class Google(callbacks.PluginRegexp):
         if max:
             data = data[:max]
         for result in data:
-            title = utils.web.htmlToText(result['title']\
-                                         .encode('utf-8'))
-            url = result['url']
+            title = utils.web.htmlToText(result.title.encode('utf-8'))
+            url = result.link
             if minisix.PY2:
                 url = url.encode('utf-8')
             if title:
@@ -173,9 +169,9 @@ class Google(callbacks.PluginRegexp):
                            {'smallsearch': True})
         data = self.decode(data)
         if data:
-            url = data[0]['url']
+            url = data[0].link
             if 'snippet' in opts:
-                snippet = data[0]['content']
+                snippet = data[0].snippet
                 snippet = " | " + utils.web.htmlToText(snippet, tagReplace='')
             else:
                 snippet = ""
plugins/Google/test.py

@@ -57,7 +57,7 @@ class GoogleTestCase(ChannelPluginTestCase):
     def testUrlDecode(self):
         self.assertRegexp(
             'google site:http://www.urbandictionary.com carajo land',
-            '\x02Urban Dictionary: carajo land\x02: '
+            '\x02carajo land - Urban Dictionary\x02: '
             r'https?://www.urbandictionary.com/define.php\?term=carajo%20land')
 
     def testLucky(self):