DDG: Rewrite using html.parser.HTMLParser instead of BeautifulSoup.

So we don't depend on an external library.
This commit is contained in:
Valentin Lorentz 2020-05-15 21:26:44 +02:00
parent 37c1fa2153
commit 10df0a0dd0
2 changed files with 117 additions and 54 deletions

87
plugins/DDG/parser.py Normal file
View File

@@ -0,0 +1,87 @@
import enum
import collections
from html.parser import HTMLParser
# Immutable record for one parsed search result entry.
result = collections.namedtuple('result', ['link', 'title', 'snippet'])
# Position of the parser within a single search-result entry.  Built with
# the functional Enum API; enum.unique() guards against duplicate values.
ParserState = enum.unique(enum.Enum('ParserState', {
    'OUTSIDE': 0,       # not inside any part of a result
    'IN_TITLE': 1,      # between <a class="result-link"> and </a>
    'TITLE_PARSED': 2,  # title captured, waiting for the snippet cell
    'IN_SNIPPET': 3,    # between <td class="result-snippet"> and </td>
}))

# Structural tags whose nesting is tracked on the parser's tag stack.
STACKED_TAGS = ('table', 'tr', 'td', 'a')
class DDGHTMLParser(HTMLParser):
    """Streaming parser for DuckDuckGo's HTML results page.

    Feed the page source to .feed(); each parsed entry is appended to
    self.results as a `result` namedtuple (link, title, snippet).
    Rows marked <tr class="result-sponsored"> are skipped entirely.
    """

    def __init__(self):
        super().__init__()
        # Stack of (tag, classes) pairs for tags in STACKED_TAGS; used to
        # detect whether we are inside a sponsored <tr> ancestor.
        self.stack = []
        # Completed `result` namedtuples, in document order.
        self.results = []
        self.reset_current_result()

    def reset_current_result(self):
        """Clear the per-result accumulators and return to OUTSIDE."""
        self.state = ParserState.OUTSIDE
        self.current_link = None
        # While collecting, these hold lists of text chunks (joined later);
        # None means "not currently collecting".
        self.current_title = None
        self.current_snippet = None

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        classes = attrs.get('class', '').split()
        # Push before the sponsored check so the check also sees the tag
        # we are entering right now (e.g. the sponsored <tr> itself).
        if tag in STACKED_TAGS:
            self.stack.append((tag, classes))

        # NOTE(review): this matches only when the <tr>'s class list is
        # exactly ['result-sponsored']; a row with extra classes would not
        # be skipped — confirm against DDG's actual markup.
        if ('tr', ['result-sponsored']) in self.stack:
            # Skip sponsored results
            return

        if tag == 'a' and 'result-link' in classes:
            # Title anchor: remember the target and start collecting title text.
            assert self.state == ParserState.OUTSIDE, (self.state, self.current_title)
            self.state = ParserState.IN_TITLE
            self.current_link = attrs['href']
            self.current_title = []
        elif tag == 'td' and 'result-snippet' in classes:
            # Snippet cell follows the title; start collecting snippet text.
            assert self.state == ParserState.TITLE_PARSED, self.state
            self.state = ParserState.IN_SNIPPET
            self.current_snippet = []
        elif tag == 'span' and 'link-text' in classes:
            if self.state == ParserState.TITLE_PARSED:
                # No snippet
                # NOTE(review): build_result() is not called here, so a
                # result without a snippet is dropped (its link/title are
                # never appended) and the accumulators are not reset —
                # confirm whether dropping such entries is intended.
                self.state = ParserState.OUTSIDE
                self.current_snippet = []

    def handle_endtag(self, tag):
        if tag in STACKED_TAGS:
            # Closing tag must match the most recent open structural tag.
            item = self.stack.pop()
            assert item[0] == tag, (item, tag)
        if tag == 'a' and self.state == ParserState.IN_TITLE:
            # End of the title anchor: title text is complete.
            self.state = ParserState.TITLE_PARSED
        elif tag == 'td' and self.state == ParserState.IN_SNIPPET:
            # End of the snippet cell: the whole result is complete.
            self.build_result()
            self.state = ParserState.OUTSIDE

    def handle_data(self, data):
        # Accumulate raw text chunks for whichever field is being collected.
        if self.state == ParserState.IN_TITLE:
            self.current_title.append(data)
        elif self.state == ParserState.IN_SNIPPET:
            self.current_snippet.append(data)

    def build_result(self):
        """Join the accumulated chunks into a `result` and reset state."""
        self.results.append(result(
            link=self.current_link,
            title=''.join(self.current_title),
            snippet=''.join(self.current_snippet),
        ))
        self.reset_current_result()
if __name__ == '__main__':
    # Ad-hoc manual check: parse a saved results page and dump what we found.
    with open('ddg2.html') as fp:
        page = fp.read()
    p = DDGHTMLParser()
    p.feed(page)
    print(p.results)

View File

@@ -28,6 +28,10 @@
###
import functools
from html.parser import HTMLParser
from urllib.parse import urlencode, parse_qs
import supybot.utils as utils
from supybot.commands import *
import supybot.plugins as plugins
@@ -42,18 +46,7 @@ except ImportError:
# without the i18n module
_ = lambda x: x
try: # Python 3
from urllib.parse import urlencode, parse_qs
except ImportError: # Python 2
from urllib import urlencode
from urlparse import parse_qs
try:
from bs4 import BeautifulSoup
except ImportError:
raise ImportError("Beautiful Soup 4 is required for this plugin: get it"
" at http://www.crummy.com/software/BeautifulSoup/bs4"
"/doc/#installing-beautiful-soup")
from .parser import DDGHTMLParser
class DDG(callbacks.Plugin):
@@ -70,12 +63,11 @@ class DDG(callbacks.Plugin):
real_url, data = utils.web.getUrlTargetAndContent(url)
data = data.decode("utf-8")
soup = BeautifulSoup(data)
parser = DDGHTMLParser()
parser.feed(data)
# Remove "sponsored link" results
return (url, real_url, [td for td in soup.find_all('td') if 'result-sponsored' not in
str(td.parent.get('class'))])
return (url, real_url, parser.results)
def search_core(self, text, channel_context=None, max_results=None, show_snippet=None):
"""
@@ -103,47 +95,31 @@
return [('', '', real_url)]
for t in raw_results:
res = ''
# Each valid result has a preceding heading in the format
# '<td valign="top">1.&nbsp;</td>', etc.
if t.text[0].isdigit():
res = t.next_sibling.next_sibling
if not res:
continue
try:
if self.registryValue("showsnippet", channel_context):
snippet = t.snippet.strip()
else:
snippet = ''
# 1) Get a result snippet.
title = t.title.strip()
origlink = link = t.link
if self.registryValue("showsnippet", channel_context):
snippet = res.parent.next_sibling.next_sibling.\
find_all("td")[-1]
snippet = snippet.text.strip()
# 2) Fetch the link title.
title = res.a.text.strip()
# 3) Fetch the result link.
origlink = link = res.a.get('href')
# As of 2017-01-20, some links on DuckDuckGo's site are shown going through
# a redirect service. The links are in the format "/l/?kh=-1&uddg=https%3A%2F%2Fduckduckgo.com%2F"
# instead of simply being "https://duckduckgo.com". So, we decode these links here.
if link.startswith('/l/'):
linkparse = utils.web.urlparse(link)
try:
link = parse_qs(linkparse.query)['uddg'][0]
except KeyError:
# No link was given here, skip.
continue
except IndexError:
self.log.exception("DDG: failed to expand redirected result URL %s", origlink)
continue
else:
self.log.debug("DDG: expanded result URL from %s to %s", origlink, link)
# As of 2017-01-20, some links on DuckDuckGo's site are shown going through
# a redirect service. The links are in the format "/l/?kh=-1&uddg=https%3A%2F%2Fduckduckgo.com%2F"
# instead of simply being "https://duckduckgo.com". So, we decode these links here.
if link.startswith('/l/'):
linkparse = utils.web.urlparse(link)
try:
link = parse_qs(linkparse.query)['uddg'][0]
except KeyError:
# No link was given here, skip.
continue
except IndexError:
self.log.exception("DDG: failed to expand redirected result URL %s", origlink)
continue
else:
self.log.debug("DDG: expanded result URL from %s to %s", origlink, link)
# Return a list of tuples in the form (link title, snippet text, link)
results.append((title, snippet, link))
except AttributeError:
continue
# Return a list of tuples in the form (link title, snippet text, link)
results.append((title, snippet, link))
return results[:maxr]
@wrap(['text'])