2014-12-11 03:59:11 +01:00
|
|
|
###
|
2017-01-21 03:39:42 +01:00
|
|
|
# Copyright (c) 2014-2017, James Lu <james@overdrivenetworks.com>
|
2014-12-11 03:59:11 +01:00
|
|
|
# All rights reserved.
|
|
|
|
#
|
|
|
|
# Redistribution and use in source and binary forms, with or without
|
|
|
|
# modification, are permitted provided that the following conditions are met:
|
|
|
|
#
|
|
|
|
# * Redistributions of source code must retain the above copyright notice,
|
|
|
|
# this list of conditions, and the following disclaimer.
|
|
|
|
# * Redistributions in binary form must reproduce the above copyright notice,
|
|
|
|
# this list of conditions, and the following disclaimer in the
|
|
|
|
# documentation and/or other materials provided with the distribution.
|
|
|
|
# * Neither the name of the author of this software nor the name of
|
|
|
|
# contributors to this software may be used to endorse or promote products
|
|
|
|
# derived from this software without specific prior written consent.
|
|
|
|
#
|
|
|
|
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
|
|
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
|
|
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
|
|
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
|
|
|
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
|
|
|
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
|
|
|
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
|
|
|
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
|
|
|
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
|
|
|
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
|
|
|
# POSSIBILITY OF SUCH DAMAGE.
|
|
|
|
|
|
|
|
###
|
|
|
|
|
2020-05-15 21:26:44 +02:00
|
|
|
import functools
|
|
|
|
from html.parser import HTMLParser
|
|
|
|
from urllib.parse import urlencode, parse_qs
|
|
|
|
|
2014-12-11 03:59:11 +01:00
|
|
|
import supybot.utils as utils
|
|
|
|
from supybot.commands import *
|
|
|
|
import supybot.plugins as plugins
|
|
|
|
import supybot.ircutils as ircutils
|
|
|
|
import supybot.callbacks as callbacks
|
2017-01-21 03:18:06 +01:00
|
|
|
import supybot.log as log
|
2014-12-11 03:59:11 +01:00
|
|
|
try:
|
|
|
|
from supybot.i18n import PluginInternationalization
|
|
|
|
_ = PluginInternationalization('DDG')
|
|
|
|
except ImportError:
|
|
|
|
# Placeholder that allows running the plugin on a bot
|
|
|
|
# without the i18n module
|
2014-12-27 20:37:16 +01:00
|
|
|
_ = lambda x: x
|
2014-12-11 03:59:11 +01:00
|
|
|
|
2020-05-15 21:26:44 +02:00
|
|
|
from .parser import DDGHTMLParser
|
2014-12-27 20:37:16 +01:00
|
|
|
|
2014-12-11 03:59:11 +01:00
|
|
|
|
|
|
|
class DDG(callbacks.Plugin):
    """Searches for results on DuckDuckGo."""
    threaded = True

    @staticmethod
    def _ddgurl(text):
        """Fetch the DuckDuckGo 'lite' results page for ``text`` and parse it.

        Returns a 3-tuple ``(url, real_url, results)``:

        * ``url`` - the search URL that was requested.
        * ``real_url`` - the target URL reported by
          ``utils.web.getUrlTargetAndContent()``; callers compare it to
          ``url`` to detect redirects.
        * ``results`` - the ``results`` attribute of the ``DDGHTMLParser``
          instance after it has been fed the decoded page.
        """
        # DuckDuckGo has a 'lite' site free of unparseable JavaScript
        # elements, so we'll use that to our advantage!
        url = "https://duckduckgo.com/lite?" + urlencode({"q": text})
        log.debug("DDG: Using URL %s for search %s", url, text)

        real_url, data = utils.web.getUrlTargetAndContent(url)
        data = data.decode("utf-8")
        parser = DDGHTMLParser()
        parser.feed(data)

        # NOTE(review): no "sponsored link" filtering happens in this method;
        # presumably DDGHTMLParser takes care of it - confirm in parser.py.
        return (url, real_url, parser.results)

    def search_core(self, text, channel_context=None, max_results=None, show_snippet=None):
        """
        Core results fetcher for the DDG plugin. Other plugins can call this as well via
        irc.getCallback('DDG').search_core(...)

        Arguments:
            text: the search query.
            channel_context: passed as the second argument to
                registryValue() when falling back to configured values.
            max_results: maximum number of results to return; when falsy,
                the "maxResults" registry value is used instead.
            show_snippet: whether to include result snippets; when None,
                the "showSnippet" registry value is used instead.

        Returns a list of (title, snippet, link) tuples. If the request was
        redirected, a single tuple ('', '', real_url) is returned instead.
        """
        if show_snippet is None:
            # Compare against None explicitly instead of relying on
            # truthiness, so that an explicit show_snippet=False given to the
            # function is not overridden by the registry value.
            show_snippet = self.registryValue("showSnippet", channel_context)
        maxr = max_results or self.registryValue("maxResults", channel_context)
        self.log.debug('DDG: got %s for max results', maxr)

        # In a nutshell, the 'lite' site puts all of its usable content
        # into tables. This does mean that headings, result snippets and
        # everything else are all using the same tag (<td>), so parsing is
        # still somewhat tricky.
        results = []

        url, real_url, raw_results = self._ddgurl(text)

        if real_url != url:
            # We received a redirect, likely from something like a !bang request.
            # Don't bother parsing the target page, as it probably won't work anyways.
            return [('', '', real_url)]

        for t in raw_results:
            # Use the value resolved above rather than re-reading the
            # registry, so the show_snippet argument is actually honoured.
            if show_snippet:
                snippet = t.snippet.strip()
            else:
                snippet = ''
            title = t.title.strip()
            origlink = link = t.link

            # As of 2017-01-20, some links on DuckDuckGo's site are shown going through
            # a redirect service. The links are in the format "/l/?kh=-1&uddg=https%3A%2F%2Fduckduckgo.com%2F"
            # instead of simply being "https://duckduckgo.com". So, we decode these links here.
            if link.startswith('/l/'):
                linkparse = utils.web.urlparse(link)
                try:
                    link = parse_qs(linkparse.query)['uddg'][0]
                except KeyError:
                    # No link was given here, skip.
                    continue
                except IndexError:
                    self.log.exception("DDG: failed to expand redirected result URL %s", origlink)
                    continue
                else:
                    self.log.debug("DDG: expanded result URL from %s to %s", origlink, link)

            # Return a list of tuples in the form (link title, snippet text, link)
            results.append((title, snippet, link))

        return results[:maxr]

    @wrap(['text'])
    def search(self, irc, msg, args, text):
        """<text>

        Searches for <text> on DuckDuckGo's web search."""
        results = self.search_core(text, msg.args[0])
        if not results:
            irc.error("No results found.")
        else:
            strings = []

            for r in results:
                if not r[0]:
                    # This result has no title, so it's likely a redirect from !bang.
                    strings.append(format("See %u", r[2]))
                else:
                    strings.append(format("%s - %s %u", ircutils.bold(r[0]), r[1], r[2]))

            irc.reply(', '.join(strings))
|
2014-12-11 03:59:11 +01:00
|
|
|
|
|
|
|
Class = DDG
|
|
|
|
|
|
|
|
|
|
|
|
# vim:set shiftwidth=4 softtabstop=4 expandtab textwidth=79:
|