DDG: rewrite _ddgurl() to return new and original request URLs

This uses utils.web.getUrlTargetAndContent(), which is specific to Limnoria and requires Limnoria commit ProgVal/Limnoria@57b77a6725 or later.

From: 2db371a9fa
This commit is contained in:
James Lu 2017-01-20 22:11:49 -08:00
parent 53318d142c
commit ba04480405

View File

@ -65,11 +65,17 @@ class DDG(callbacks.Plugin):
# DuckDuckGo has a 'lite' site free of unparseable JavaScript # DuckDuckGo has a 'lite' site free of unparseable JavaScript
# elements, so we'll use that to our advantage! # elements, so we'll use that to our advantage!
url = "https://duckduckgo.com/lite?" + urlencode({"q": text}) url = "https://duckduckgo.com/lite?" + urlencode({"q": text})
log.debug("DDG: Using URL %s for search %s", url, text) log.debug("DDG: Using URL %s for search %s", url, text)
data = utils.web.getUrl(url).decode("utf-8")
real_url, data = utils.web.getUrlTargetAndContent(url)
data = data.decode("utf-8")
soup = BeautifulSoup(data) soup = BeautifulSoup(data)
# Remove "sponsored link" results # Remove "sponsored link" results
return [td for td in soup.find_all('td') if 'result-sponsored' not in str(td.parent.get('class'))] return (url, real_url, [td for td in soup.find_all('td') if 'result-sponsored' not in
str(td.parent.get('class'))])
def search_core(self, text, channel_context=None, max_results=None, show_snippet=None): def search_core(self, text, channel_context=None, max_results=None, show_snippet=None):
""" """
@ -84,12 +90,13 @@ class DDG(callbacks.Plugin):
self.log.debug('DDG: got %s for max results', maxr) self.log.debug('DDG: got %s for max results', maxr)
# In a nutshell, the 'lite' site puts all of its usable content # In a nutshell, the 'lite' site puts all of its usable content
# into tables. This means that headings, result snippets and # into tables. This does mean that headings, result snippets and
# everything else are all using the same tag (<td>), which still makes # everything else are all using the same tag (<td>), so parsing is
# parsing somewhat tricky. # still somewhat tricky.
results = [] results = []
raw_results = self._ddgurl(text) url, real_url, raw_results = self._ddgurl(text)
for t in raw_results: for t in raw_results:
res = '' res = ''
# Each valid result has a preceding heading in the format # Each valid result has a preceding heading in the format
@ -158,7 +165,7 @@ class DDG(callbacks.Plugin):
# Zero-click info: 8 (number) # Zero-click info: 8 (number)
# Zero-click info: 8 # Zero-click info: 8
replies = {} replies = {}
for td in self._ddgurl(text): for td in self._ddgurl(text)[-1]:
if td.text.startswith("Zero-click info:"): if td.text.startswith("Zero-click info:"):
# Make a dictionary of things # Make a dictionary of things
item = td.text.split("Zero-click info:", 1)[1].strip() item = td.text.split("Zero-click info:", 1)[1].strip()