### # Copyright (c) 2014-2017, James Lu # All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # # * Redistributions of source code must retain the above copyright notice, # this list of conditions, and the following disclaimer. # * Redistributions in binary form must reproduce the above copyright notice, # this list of conditions, and the following disclaimer in the # documentation and/or other materials provided with the distribution. # * Neither the name of the author of this software nor the name of # contributors to this software may be used to endorse or promote products # derived from this software without specific prior written consent. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE # POSSIBILITY OF SUCH DAMAGE. ### import functools from html.parser import HTMLParser from urllib.parse import urlencode, parse_qs import supybot.utils as utils from supybot.commands import * import supybot.plugins as plugins import supybot.ircutils as ircutils import supybot.callbacks as callbacks import supybot.log as log try: from supybot.i18n import PluginInternationalization _ = PluginInternationalization('DDG') except ImportError: # Placeholder that allows to run the plugin on a bot # without the i18n module _ = lambda x: x from .parser import DDGHTMLParser class DDG(callbacks.Plugin): """Searches for results on DuckDuckGo.""" threaded = True @staticmethod def _ddgurl(text, region=None, safeSearch=None): # DuckDuckGo has a 'lite' site free of unparseable JavaScript # elements, so we'll use that to our advantage! url = "https://lite.duckduckgo.com/lite?" params = {"q": text} if region: params["kl"] = region if safeSearch: if safeSearch == "active": params["kp"] = 1 elif safeSearch == "moderate": params["kp"] = -1 elif safeSearch == "off": params["kp"] = -2 url += urlencode(params) log.debug("DDG: Using URL %s for search %s", url, text) real_url, data = utils.web.getUrlTargetAndContent(url) data = data.decode("utf-8") parser = DDGHTMLParser() parser.feed(data) # Remove "sponsored link" results return (url, real_url, parser.results) def search_core(self, text, channel_context=None, max_results=None, show_snippet=None): """ Core results fetcher for the DDG plugin. Other plugins can call this as well via irc.getCallback('DDG').search_core(...) """ if show_snippet is None: # Note: don't use ternary here, or the registry value will override any False # settings given to the function directly. show_snippet = self.registryValue("showSnippet", channel_context) maxr = max_results or self.registryValue("maxResults", channel_context) self.log.debug('DDG: got %s for max results', maxr) # In a nutshell, the 'lite' site puts all of its usable content # into tables. This does mean that headings, result snippets and # everything else are all using the same tag (), so parsing is # still somewhat tricky. results = [] region = self.registryValue("region", channel_context) safeSearch = self.registryValue("searchFilter", channel_context) url, real_url, raw_results = self._ddgurl(text, region, safeSearch) if real_url != url: # We received a redirect, likely from something like a !bang request. # Don't bother parsing the target page, as it probably won't work anyways. return [('', '', real_url)] for t in raw_results: if self.registryValue("showsnippet", channel_context): snippet = t.snippet.strip() else: snippet = '' title = t.title.strip() origlink = link = t.link # As of 2017-01-20, some links on DuckDuckGo's site are shown going through # a redirect service. The links are in the format "/l/?kh=-1&uddg=https%3A%2F%2Fduckduckgo.com%2F" # instead of simply being "https://duckduckgo.com". So, we decode these links here. if link.startswith('/l/') or link.startswith('//duckduckgo.com/l/'): linkparse = utils.web.urlparse(link) try: link = parse_qs(linkparse.query)['uddg'][0] except KeyError: # No link was given here, skip. continue except IndexError: self.log.exception("DDG: failed to expand redirected result URL %s", origlink) continue else: self.log.debug("DDG: expanded result URL from %s to %s", origlink, link) # Return a list of tuples in the form (link title, snippet text, link) results.append((title, snippet, link)) return results[:maxr] @wrap(['text']) def search(self, irc, msg, args, text): """ Searches for on DuckDuckGo's web search.""" results = self.search_core(text, msg.args[0]) if not results: irc.error("No results found.") else: strings = [] for r in results: if not r[0]: # This result has no title, so it's likely a redirect from !bang. strings.append(format("See %u", r[2])) else: strings.append(format("%s - %s %u", ircutils.bold(r[0]), r[1], r[2])) irc.reply(', '.join(strings)) Class = DDG # vim:set shiftwidth=4 softtabstop=4 expandtab textwidth=79: