mirror of
https://github.com/Mikaela/Limnoria.git
synced 2025-02-04 00:24:11 +01:00
DDG: Rewrite using html.parser.HTMLParser instead of BeautifulSoup.
So we don't depend on an external library.
This commit is contained in:
parent
37c1fa2153
commit
10df0a0dd0
87
plugins/DDG/parser.py
Normal file
87
plugins/DDG/parser.py
Normal file
@ -0,0 +1,87 @@
|
|||||||
|
import enum
|
||||||
|
import collections
|
||||||
|
from html.parser import HTMLParser
|
||||||
|
|
||||||
|
# Container for one parsed search result: the target URL, the anchor text,
# and the snippet shown below the link.
result = collections.namedtuple('result', ['link', 'title', 'snippet'])
|
||||||
|
|
||||||
|
@enum.unique
class ParserState(enum.Enum):
    """States of the result-extraction state machine in DDGHTMLParser.

    Transitions (driven by handle_starttag/handle_endtag):
    OUTSIDE -> IN_TITLE on <a class="result-link">; IN_TITLE ->
    TITLE_PARSED on </a>; TITLE_PARSED -> IN_SNIPPET on
    <td class="result-snippet">; IN_SNIPPET -> OUTSIDE on </td>
    (after the result is recorded).
    """
    OUTSIDE = 0       # not inside any search result
    IN_TITLE = 1      # collecting text of the <a class="result-link"> element
    TITLE_PARSED = 2  # title done, waiting for the snippet cell
    IN_SNIPPET = 3    # collecting text of the <td class="result-snippet"> cell
|
||||||
|
|
||||||
|
# Tags whose open/close nesting DDGHTMLParser tracks on self.stack, so the
# parser can tell when it is inside a sponsored <tr> row.
STACKED_TAGS = ('table', 'tr', 'td', 'a')
|
||||||
|
|
||||||
|
class DDGHTMLParser(HTMLParser):
    """Extract search results from DuckDuckGo's HTML results page.

    Feed the page source with feed(); each completed result is appended
    to ``self.results`` as a ``result`` namedtuple (link, title, snippet).
    Rows marked ``<tr class="result-sponsored">`` are skipped.
    """

    def __init__(self):
        super().__init__()
        # Stack of (tag, classes) pairs for tags listed in STACKED_TAGS,
        # mirroring the current open-tag nesting.
        self.stack = []
        # Completed `result` namedtuples, in page order.
        self.results = []

        self.reset_current_result()

    def reset_current_result(self):
        """Reset the state machine and the fields of the result being built."""
        self.state = ParserState.OUTSIDE
        self.current_link = None
        self.current_title = None
        self.current_snippet = None

    def _in_sponsored_row(self):
        """Return True if any enclosing <tr> carries the result-sponsored class.

        The previous check, ``('tr', ['result-sponsored']) in self.stack``,
        only matched a row whose class list was *exactly* that one class; a
        sponsored row with any additional class slipped through.
        """
        return any(tag == 'tr' and 'result-sponsored' in classes
                   for (tag, classes) in self.stack)

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        classes = attrs.get('class', '').split()

        if tag in STACKED_TAGS:
            self.stack.append((tag, classes))

        if self._in_sponsored_row():
            # Skip sponsored results
            return

        if tag == 'a' and 'result-link' in classes:
            assert self.state == ParserState.OUTSIDE, (self.state, self.current_title)
            self.state = ParserState.IN_TITLE
            # .get(): an anchor without href yields link=None instead of KeyError.
            self.current_link = attrs.get('href')
            self.current_title = []

        elif tag == 'td' and 'result-snippet' in classes:
            assert self.state == ParserState.TITLE_PARSED, self.state
            self.state = ParserState.IN_SNIPPET
            self.current_snippet = []

        elif tag == 'span' and 'link-text' in classes:
            if self.state == ParserState.TITLE_PARSED:
                # No snippet for this result
                self.state = ParserState.OUTSIDE
                self.current_snippet = []

    def handle_endtag(self, tag):
        # Guard on a non-empty stack: a stray close tag in malformed HTML
        # previously raised IndexError from pop().
        if tag in STACKED_TAGS and self.stack:
            item = self.stack.pop()
            assert item[0] == tag, (item, tag)

        if tag == 'a' and self.state == ParserState.IN_TITLE:
            self.state = ParserState.TITLE_PARSED
        elif tag == 'td' and self.state == ParserState.IN_SNIPPET:
            self.build_result()
            self.state = ParserState.OUTSIDE

    def handle_data(self, data):
        # Text content only matters while collecting a title or a snippet.
        if self.state == ParserState.IN_TITLE:
            self.current_title.append(data)
        elif self.state == ParserState.IN_SNIPPET:
            self.current_snippet.append(data)

    def build_result(self):
        """Finalize the result being built and reset for the next one."""
        self.results.append(result(
            link=self.current_link,
            title=''.join(self.current_title),
            snippet=''.join(self.current_snippet),
        ))
        self.reset_current_result()
|
||||||
|
|
||||||
|
# Ad-hoc manual check: parse a saved results page and dump what was found.
if __name__ == '__main__':
    html_parser = DDGHTMLParser()
    with open('ddg2.html') as handle:
        page = handle.read()
    html_parser.feed(page)
    print(html_parser.results)
|
@ -28,6 +28,10 @@
|
|||||||
|
|
||||||
###
|
###
|
||||||
|
|
||||||
|
import functools
|
||||||
|
from html.parser import HTMLParser
|
||||||
|
from urllib.parse import urlencode, parse_qs
|
||||||
|
|
||||||
import supybot.utils as utils
|
import supybot.utils as utils
|
||||||
from supybot.commands import *
|
from supybot.commands import *
|
||||||
import supybot.plugins as plugins
|
import supybot.plugins as plugins
|
||||||
@ -42,18 +46,7 @@ except ImportError:
|
|||||||
# without the i18n module
|
# without the i18n module
|
||||||
_ = lambda x: x
|
_ = lambda x: x
|
||||||
|
|
||||||
|
from .parser import DDGHTMLParser
|
||||||
try: # Python 3
|
|
||||||
from urllib.parse import urlencode, parse_qs
|
|
||||||
except ImportError: # Python 2
|
|
||||||
from urllib import urlencode
|
|
||||||
from urlparse import parse_qs
|
|
||||||
try:
|
|
||||||
from bs4 import BeautifulSoup
|
|
||||||
except ImportError:
|
|
||||||
raise ImportError("Beautiful Soup 4 is required for this plugin: get it"
|
|
||||||
" at http://www.crummy.com/software/BeautifulSoup/bs4"
|
|
||||||
"/doc/#installing-beautiful-soup")
|
|
||||||
|
|
||||||
|
|
||||||
class DDG(callbacks.Plugin):
|
class DDG(callbacks.Plugin):
|
||||||
@ -70,12 +63,11 @@ class DDG(callbacks.Plugin):
|
|||||||
|
|
||||||
real_url, data = utils.web.getUrlTargetAndContent(url)
|
real_url, data = utils.web.getUrlTargetAndContent(url)
|
||||||
data = data.decode("utf-8")
|
data = data.decode("utf-8")
|
||||||
soup = BeautifulSoup(data)
|
parser = DDGHTMLParser()
|
||||||
|
parser.feed(data)
|
||||||
|
|
||||||
# Remove "sponsored link" results
|
# Remove "sponsored link" results
|
||||||
return (url, real_url, [td for td in soup.find_all('td') if 'result-sponsored' not in
|
return (url, real_url, parser.results)
|
||||||
str(td.parent.get('class'))])
|
|
||||||
|
|
||||||
|
|
||||||
def search_core(self, text, channel_context=None, max_results=None, show_snippet=None):
|
def search_core(self, text, channel_context=None, max_results=None, show_snippet=None):
|
||||||
"""
|
"""
|
||||||
@ -103,25 +95,12 @@ class DDG(callbacks.Plugin):
|
|||||||
return [('', '', real_url)]
|
return [('', '', real_url)]
|
||||||
|
|
||||||
for t in raw_results:
|
for t in raw_results:
|
||||||
res = ''
|
|
||||||
# Each valid result has a preceding heading in the format
|
|
||||||
# '<td valign="top">1. </td>', etc.
|
|
||||||
if t.text[0].isdigit():
|
|
||||||
res = t.next_sibling.next_sibling
|
|
||||||
if not res:
|
|
||||||
continue
|
|
||||||
try:
|
|
||||||
snippet = ''
|
|
||||||
# 1) Get a result snippet.
|
|
||||||
|
|
||||||
if self.registryValue("showsnippet", channel_context):
|
if self.registryValue("showsnippet", channel_context):
|
||||||
snippet = res.parent.next_sibling.next_sibling.\
|
snippet = t.snippet.strip()
|
||||||
find_all("td")[-1]
|
else:
|
||||||
snippet = snippet.text.strip()
|
snippet = ''
|
||||||
# 2) Fetch the link title.
|
title = t.title.strip()
|
||||||
title = res.a.text.strip()
|
origlink = link = t.link
|
||||||
# 3) Fetch the result link.
|
|
||||||
origlink = link = res.a.get('href')
|
|
||||||
|
|
||||||
# As of 2017-01-20, some links on DuckDuckGo's site are shown going through
|
# As of 2017-01-20, some links on DuckDuckGo's site are shown going through
|
||||||
# a redirect service. The links are in the format "/l/?kh=-1&uddg=https%3A%2F%2Fduckduckgo.com%2F"
|
# a redirect service. The links are in the format "/l/?kh=-1&uddg=https%3A%2F%2Fduckduckgo.com%2F"
|
||||||
@ -141,9 +120,6 @@ class DDG(callbacks.Plugin):
|
|||||||
|
|
||||||
# Return a list of tuples in the form (link title, snippet text, link)
|
# Return a list of tuples in the form (link title, snippet text, link)
|
||||||
results.append((title, snippet, link))
|
results.append((title, snippet, link))
|
||||||
|
|
||||||
except AttributeError:
|
|
||||||
continue
|
|
||||||
return results[:maxr]
|
return results[:maxr]
|
||||||
|
|
||||||
@wrap(['text'])
|
@wrap(['text'])
|
||||||
|
Loading…
Reference in New Issue
Block a user