Limnoria/plugins/DDG/plugin.py

###
# Copyright (c) 2014-2017, James Lu <james@overdrivenetworks.com>
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
#   * Redistributions of source code must retain the above copyright notice,
#     this list of conditions, and the following disclaimer.
#   * Redistributions in binary form must reproduce the above copyright notice,
#     this list of conditions, and the following disclaimer in the
#     documentation and/or other materials provided with the distribution.
#   * Neither the name of the author of this software nor the name of
#     contributors to this software may be used to endorse or promote products
#     derived from this software without specific prior written consent.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.

###

import functools
from html.parser import HTMLParser
from urllib.parse import urlencode, parse_qs

import supybot.utils as utils
from supybot.commands import *
import supybot.plugins as plugins
import supybot.ircutils as ircutils
import supybot.callbacks as callbacks
import supybot.log as log
try:
    from supybot.i18n import PluginInternationalization
    _ = PluginInternationalization('DDG')
except ImportError:
    # Placeholder that allows the plugin to run on a bot
    # without the i18n module
    _ = lambda x: x

from .parser import DDGHTMLParser


class DDG(callbacks.Plugin):
    """Searches for results on DuckDuckGo."""
    # NOTE(review): the class docstring above and the docstring of search()
    # are kept verbatim on purpose; presumably the bot shows them to users as
    # help text -- confirm before rewording either of them.
    threaded = True

    @staticmethod
    def _ddgurl(text):
        """
        Fetches DuckDuckGo's 'lite' results page for <text> and parses it.

        Returns a (url, real_url, results) tuple, where:
          * url is the search URL that was requested,
          * real_url is the target URL reported by
            utils.web.getUrlTargetAndContent() (search_core() treats a
            mismatch with url as a redirect),
          * results is the 'results' attribute of a DDGHTMLParser that was
            fed the UTF-8 decoded page content.
        """
        # DuckDuckGo has a 'lite' site free of unparseable JavaScript
        # elements, so we'll use that to our advantage!
        url = "https://duckduckgo.com/lite?" + urlencode({"q": text})

        log.debug("DDG: Using URL %s for search %s", url, text)

        real_url, data = utils.web.getUrlTargetAndContent(url)
        data = data.decode("utf-8")
        parser = DDGHTMLParser()
        parser.feed(data)

        # Nothing is filtered here: whatever the parser collected is handed
        # back unchanged.
        return (url, real_url, parser.results)

    def search_core(self, text, channel_context=None, max_results=None, show_snippet=None):
        """
        Core results fetcher for the DDG plugin. Other plugins can call this as well via
        irc.getCallback('DDG').search_core(...)

        Arguments:
          * text: the search query.
          * channel_context: passed as the second argument to registryValue()
            when looking up the "showSnippet" and "maxResults" settings.
          * max_results: maximum number of results to return. Any falsy value
            (None or 0) falls back to the "maxResults" setting.
          * show_snippet: whether to include the result snippet text. None
            falls back to the "showSnippet" setting; an explicit True/False
            is honoured as given.

        Returns a list of (link title, snippet text, link) tuples. If the
        request was redirected (e.g. by a !bang query), a single
        ('', '', real_url) tuple is returned instead.
        """
        if show_snippet is None:
            # Note: don't use ternary here, or the registry value will override any False
            # settings given to the function directly.
            show_snippet = self.registryValue("showSnippet", channel_context)
        maxr = max_results or self.registryValue("maxResults", channel_context)
        self.log.debug('DDG: got %s for max results', maxr)

        # The HTML parsing itself happens in _ddgurl(); from here on we only
        # post-process the entries it collected.
        url, real_url, raw_results = self._ddgurl(text)

        if real_url != url:
            # We received a redirect, likely from something like a !bang request.
            # Don't bother parsing the target page, as it probably won't work anyways.
            return [('', '', real_url)]

        results = []
        for t in raw_results:
            # Use the value resolved above so that an explicit show_snippet
            # argument is respected, instead of re-reading the registry for
            # every result.
            snippet = t.snippet.strip() if show_snippet else ''
            title = t.title.strip()
            origlink = link = t.link

            # As of 2017-01-20, some links on DuckDuckGo's site are shown going through
            # a redirect service. The links are in the format "/l/?kh=-1&uddg=https%3A%2F%2Fduckduckgo.com%2F"
            # instead of simply being "https://duckduckgo.com". So, we decode these links here.
            if link.startswith('/l/'):
                linkparse = utils.web.urlparse(link)
                try:
                    link = parse_qs(linkparse.query)['uddg'][0]
                except KeyError:
                    # No link was given here, skip.
                    continue
                except IndexError:
                    self.log.exception("DDG: failed to expand redirected result URL %s", origlink)
                    continue
                else:
                    self.log.debug("DDG: expanded result URL from %s to %s", origlink, link)

            # Collect tuples in the form (link title, snippet text, link)
            results.append((title, snippet, link))

        # Entries skipped above never reach 'results', so truncating at the
        # end yields up to maxr usable results.
        return results[:maxr]

    @wrap(['text'])
    def search(self, irc, msg, args, text):
        """<text>

        Searches for <text> on DuckDuckGo's web search."""
        # msg.args[0] is handed to search_core() as the channel context for
        # the registry lookups.
        results = self.search_core(text, msg.args[0])
        if not results:
            irc.error("No results found.")
        else:
            strings = []

            for r in results:
                if not r[0]:
                    # This result has no title, so it's likely a redirect from !bang.
                    strings.append(format("See %u", r[2]))
                else:
                    strings.append(format("%s - %s %u", ircutils.bold(r[0]), r[1], r[2]))

            irc.reply(', '.join(strings))

# Module-level alias for the plugin class defined above.
# NOTE(review): presumably this is the name the bot's plugin loader looks up
# when loading the module -- confirm against the Limnoria plugin docs.
Class = DDG


# vim:set shiftwidth=4 softtabstop=4 expandtab textwidth=79:
DDG: First commit From: https://github.com/jlu5/SupyPlugins/commit/f16f666ebd3233d83bfc2f2ba4bed9427b0749f1 2014-12-11 03:59:11 +01:00			`###`
DDG: abstract out a search_core(), as the LastFM plugin will use this later From: https://github.com/jlu5/SupyPlugins/commit/47bb74d7d95b67a8e2fd8a5e5f2210a41f1b11a1 2017-01-21 03:39:42 +01:00			`# Copyright (c) 2014-2017, James Lu <james@overdrivenetworks.com>`
DDG: First commit From: https://github.com/jlu5/SupyPlugins/commit/f16f666ebd3233d83bfc2f2ba4bed9427b0749f1 2014-12-11 03:59:11 +01:00			`# All rights reserved.`
			`#`
			`# Redistribution and use in source and binary forms, with or without`
			`# modification, are permitted provided that the following conditions are met:`
			`#`
			`# * Redistributions of source code must retain the above copyright notice,`
			`# this list of conditions, and the following disclaimer.`
			`# * Redistributions in binary form must reproduce the above copyright notice,`
			`# this list of conditions, and the following disclaimer in the`
			`# documentation and/or other materials provided with the distribution.`
			`# * Neither the name of the author of this software nor the name of`
			`# contributors to this software may be used to endorse or promote products`
			`# derived from this software without specific prior written consent.`
			`#`
			`# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"`
			`# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE`
			`# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE`
			`# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE`
			`# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR`
			`# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF`
			`# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS`
			`# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN`
			`# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)`
			`# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE`
			`# POSSIBILITY OF SUCH DAMAGE.`

			`###`

DDG: Rewrite using html.parser.HTMLParser instead of BeautifulSoup. So we don't depend on an external library. 2020-05-15 21:26:44 +02:00			`import functools`
			`from html.parser import HTMLParser`
			`from urllib.parse import urlencode, parse_qs`

DDG: First commit From: https://github.com/jlu5/SupyPlugins/commit/f16f666ebd3233d83bfc2f2ba4bed9427b0749f1 2014-12-11 03:59:11 +01:00			`import supybot.utils as utils`
			`from supybot.commands import *`
			`import supybot.plugins as plugins`
			`import supybot.ircutils as ircutils`
			`import supybot.callbacks as callbacks`
DDG: make _ddgurl() a static method From: https://github.com/jlu5/SupyPlugins/commit/d332e73e48dfa679f3a08786a9dae9da5149b0cc 2017-01-21 03:18:06 +01:00			`import supybot.log as log`
DDG: First commit From: https://github.com/jlu5/SupyPlugins/commit/f16f666ebd3233d83bfc2f2ba4bed9427b0749f1 2014-12-11 03:59:11 +01:00			`try:`
			`from supybot.i18n import PluginInternationalization`
			`_ = PluginInternationalization('DDG')`
			`except ImportError:`
			`# Placeholder that allows to run the plugin on a bot`
			`# without the i18n module`
DDG/FML/Isup: work towards PEP8 compliancy Skipping the config options in config.py since the indented version looks just as bad (not enough space to write the text without making it use 5 lines). From: https://github.com/jlu5/SupyPlugins/commit/4a62b4ad13c3d6f40f914f0bf8037e097d675c80 2014-12-27 20:37:16 +01:00			`_ = lambda x: x`
DDG: First commit From: https://github.com/jlu5/SupyPlugins/commit/f16f666ebd3233d83bfc2f2ba4bed9427b0749f1 2014-12-11 03:59:11 +01:00
DDG: Rewrite using html.parser.HTMLParser instead of BeautifulSoup. So we don't depend on an external library. 2020-05-15 21:26:44 +02:00			`from .parser import DDGHTMLParser`
DDG/FML/Isup: work towards PEP8 compliancy Skipping the config options in config.py since the indented version looks just as bad (not enough space to write the text without making it use 5 lines). From: https://github.com/jlu5/SupyPlugins/commit/4a62b4ad13c3d6f40f914f0bf8037e097d675c80 2014-12-27 20:37:16 +01:00
DDG: First commit From: https://github.com/jlu5/SupyPlugins/commit/f16f666ebd3233d83bfc2f2ba4bed9427b0749f1 2014-12-11 03:59:11 +01:00
			`class DDG(callbacks.Plugin):`
DDG: remove unused variable From: https://github.com/jlu5/SupyPlugins/commit/3c6126fbc46f73e80b388401ff013eb2f1cdb857 2014-12-11 04:29:53 +01:00			`"""Searches for results on DuckDuckGo."""`
DDG: First commit From: https://github.com/jlu5/SupyPlugins/commit/f16f666ebd3233d83bfc2f2ba4bed9427b0749f1 2014-12-11 03:59:11 +01:00			`threaded = True`
DDG: strip ads better + skip zeroclick info From: https://github.com/jlu5/SupyPlugins/commit/625ef777cdc292ca7b8562e697c44b748a84edf8 2014-12-11 16:47:58 +01:00
DDG: make _ddgurl() a static method From: https://github.com/jlu5/SupyPlugins/commit/d332e73e48dfa679f3a08786a9dae9da5149b0cc 2017-01-21 03:18:06 +01:00			`@staticmethod`
			`def _ddgurl(text):`
DDG: Initial addition of 'zeroclick' command (Closes #24) From: https://github.com/jlu5/SupyPlugins/commit/285948a47d2159250ff26abe945e499115323263 2015-02-09 04:15:46 +01:00			`# DuckDuckGo has a 'lite' site free of unparseable JavaScript`
			`# elements, so we'll use that to our advantage!`
DDG/FML/Isup: work towards PEP8 compliancy Skipping the config options in config.py since the indented version looks just as bad (not enough space to write the text without making it use 5 lines). From: https://github.com/jlu5/SupyPlugins/commit/4a62b4ad13c3d6f40f914f0bf8037e097d675c80 2014-12-27 20:37:16 +01:00			`url = "https://duckduckgo.com/lite?" + urlencode({"q": text})`
DDG: rewrite _ddgurl() to return new and original request URLs This uses utils.web.getUrlTargetAndContent(), which is specific to Limnoria and requires commit ProgVal/Limnoria@57b77a6725d2e6f2f417419d2a0459982898b877 or later From: https://github.com/jlu5/SupyPlugins/commit/2db371a9fad73f15e13c36d2d45d4f8baeaa9938 2017-01-21 07:11:49 +01:00
DDG: make _ddgurl() a static method From: https://github.com/jlu5/SupyPlugins/commit/d332e73e48dfa679f3a08786a9dae9da5149b0cc 2017-01-21 03:18:06 +01:00			`log.debug("DDG: Using URL %s for search %s", url, text)`
DDG: rewrite _ddgurl() to return new and original request URLs This uses utils.web.getUrlTargetAndContent(), which is specific to Limnoria and requires commit ProgVal/Limnoria@57b77a6725d2e6f2f417419d2a0459982898b877 or later From: https://github.com/jlu5/SupyPlugins/commit/2db371a9fad73f15e13c36d2d45d4f8baeaa9938 2017-01-21 07:11:49 +01:00
			`real_url, data = utils.web.getUrlTargetAndContent(url)`
			`data = data.decode("utf-8")`
DDG: Rewrite using html.parser.HTMLParser instead of BeautifulSoup. So we don't depend on an external library. 2020-05-15 21:26:44 +02:00			`parser = DDGHTMLParser()`
			`parser.feed(data)`
DDG: rewrite _ddgurl() to return new and original request URLs This uses utils.web.getUrlTargetAndContent(), which is specific to Limnoria and requires commit ProgVal/Limnoria@57b77a6725d2e6f2f417419d2a0459982898b877 or later From: https://github.com/jlu5/SupyPlugins/commit/2db371a9fad73f15e13c36d2d45d4f8baeaa9938 2017-01-21 07:11:49 +01:00
DDG: strip Sponsored links from search results From: https://github.com/jlu5/SupyPlugins/commit/e9edceb73551c738dc38fc5c1556269c2f8ad83d 2015-08-23 03:32:04 +02:00			`# Remove "sponsored link" results`
DDG: Rewrite using html.parser.HTMLParser instead of BeautifulSoup. So we don't depend on an external library. 2020-05-15 21:26:44 +02:00			`return (url, real_url, parser.results)`
DDG: Initial addition of 'zeroclick' command (Closes #24) From: https://github.com/jlu5/SupyPlugins/commit/285948a47d2159250ff26abe945e499115323263 2015-02-09 04:15:46 +01:00
DDG: abstract out a search_core(), as the LastFM plugin will use this later From: https://github.com/jlu5/SupyPlugins/commit/47bb74d7d95b67a8e2fd8a5e5f2210a41f1b11a1 2017-01-21 03:39:42 +01:00			`def search_core(self, text, channel_context=None, max_results=None, show_snippet=None):`
			`"""`
			`Core results fetcher for the DDG plugin. Other plugins can call this as well via`
			`irc.getCallback('DDG').search_core(...)`
			`"""`
			`if show_snippet is None:`
DDG: fix a typo in comments From: https://github.com/jlu5/SupyPlugins/commit/37290686334d4253f4f1c501cdc97167ccb4c5e1 2017-01-21 03:51:00 +01:00			`# Note: don't use ternary here, or the registry value will override any False`
DDG: abstract out a search_core(), as the LastFM plugin will use this later From: https://github.com/jlu5/SupyPlugins/commit/47bb74d7d95b67a8e2fd8a5e5f2210a41f1b11a1 2017-01-21 03:39:42 +01:00			`# settings given to the function directly.`
			`show_snippet = self.registryValue("showSnippet", channel_context)`
			`maxr = max_results or self.registryValue("maxResults", channel_context)`
DDG: rewrite to fix "max results" not working From: https://github.com/jlu5/SupyPlugins/commit/7a6e2f9f972cf3d08e3a3aefbbfd405aab525d5f 2017-01-21 03:50:16 +01:00			`self.log.debug('DDG: got %s for max results', maxr)`
DDG: Initial addition of 'zeroclick' command (Closes #24) From: https://github.com/jlu5/SupyPlugins/commit/285948a47d2159250ff26abe945e499115323263 2015-02-09 04:15:46 +01:00
			`# In a nutshell, the 'lite' site puts all of its usable content`
DDG: rewrite _ddgurl() to return new and original request URLs This uses utils.web.getUrlTargetAndContent(), which is specific to Limnoria and requires commit ProgVal/Limnoria@57b77a6725d2e6f2f417419d2a0459982898b877 or later From: https://github.com/jlu5/SupyPlugins/commit/2db371a9fad73f15e13c36d2d45d4f8baeaa9938 2017-01-21 07:11:49 +01:00			`# into tables. This does mean that headings, result snippets and`
			`# everything else are all using the same tag (<td>), so parsing is`
			`# still somewhat tricky.`
DDG: abstract out a search_core(), as the LastFM plugin will use this later From: https://github.com/jlu5/SupyPlugins/commit/47bb74d7d95b67a8e2fd8a5e5f2210a41f1b11a1 2017-01-21 03:39:42 +01:00			`results = []`

DDG: rewrite _ddgurl() to return new and original request URLs This uses utils.web.getUrlTargetAndContent(), which is specific to Limnoria and requires commit ProgVal/Limnoria@57b77a6725d2e6f2f417419d2a0459982898b877 or later From: https://github.com/jlu5/SupyPlugins/commit/2db371a9fad73f15e13c36d2d45d4f8baeaa9938 2017-01-21 07:11:49 +01:00			`url, real_url, raw_results = self._ddgurl(text)`

DDG: support !bang links (redirects) From: https://github.com/jlu5/SupyPlugins/commit/1e7bfed72e04de31f88598ec0f333459b4417fdf 2017-01-21 07:16:05 +01:00			`if real_url != url:`
			`# We received a redirect, likely from something like a !bang request.`
			`# Don't bother parsing the target page, as it probably won't work anyways.`
			`return [('', '', real_url)]`

DDG: rewrite to fix "max results" not working From: https://github.com/jlu5/SupyPlugins/commit/7a6e2f9f972cf3d08e3a3aefbbfd405aab525d5f 2017-01-21 03:50:16 +01:00			`for t in raw_results:`
DDG: Rewrite using html.parser.HTMLParser instead of BeautifulSoup. So we don't depend on an external library. 2020-05-15 21:26:44 +02:00			`if self.registryValue("showsnippet", channel_context):`
			`snippet = t.snippet.strip()`
			`else:`
DDG: rewrite to fix "max results" not working From: https://github.com/jlu5/SupyPlugins/commit/7a6e2f9f972cf3d08e3a3aefbbfd405aab525d5f 2017-01-21 03:50:16 +01:00			`snippet = ''`
DDG: Rewrite using html.parser.HTMLParser instead of BeautifulSoup. So we don't depend on an external library. 2020-05-15 21:26:44 +02:00			`title = t.title.strip()`
			`origlink = link = t.link`

			`# As of 2017-01-20, some links on DuckDuckGo's site are shown going through`
			`# a redirect service. The links are in the format "/l/?kh=-1&uddg=https%3A%2F%2Fduckduckgo.com%2F"`
			`# instead of simply being "https://duckduckgo.com". So, we decode these links here.`
			`if link.startswith('/l/'):`
			`linkparse = utils.web.urlparse(link)`
			`try:`
			`link = parse_qs(linkparse.query)['uddg'][0]`
			`except KeyError:`
			`# No link was given here, skip.`
			`continue`
			`except IndexError:`
			`self.log.exception("DDG: failed to expand redirected result URL %s", origlink)`
			`continue`
			`else:`
			`self.log.debug("DDG: expanded result URL from %s to %s", origlink, link)`

			`# Return a list of tuples in the form (link title, snippet text, link)`
			`results.append((title, snippet, link))`
DDG: rewrite to fix "max results" not working From: https://github.com/jlu5/SupyPlugins/commit/7a6e2f9f972cf3d08e3a3aefbbfd405aab525d5f 2017-01-21 03:50:16 +01:00			`return results[:maxr]`
DDG: abstract out a search_core(), as the LastFM plugin will use this later From: https://github.com/jlu5/SupyPlugins/commit/47bb74d7d95b67a8e2fd8a5e5f2210a41f1b11a1 2017-01-21 03:39:42 +01:00
			`@wrap(['text'])`
			`def search(self, irc, msg, args, text):`
			`"""<text>`

			`Searches for <text> on DuckDuckGo's web search."""`
			`results = self.search_core(text, msg.args[0])`
			`if not results:`
			`irc.error("No results found.")`
DDG: fix code handling, remove regex parsing From: https://github.com/jlu5/SupyPlugins/commit/64c29496dfbe4918a76fa71b5efbfd6b67c9bbfd 2014-12-16 02:28:06 +01:00			`else:`
DDG: support !bang links (redirects) From: https://github.com/jlu5/SupyPlugins/commit/1e7bfed72e04de31f88598ec0f333459b4417fdf 2017-01-21 07:16:05 +01:00			`strings = []`

			`for r in results:`
			`if not r[0]:`
			`# This result has no title, so it's likely a redirect from !bang.`
			`strings.append(format("See %u", r[2]))`
			`else:`
			`strings.append(format("%s - %s %u", ircutils.bold(r[0]), r[1], r[2]))`

DDG: actually, return a list of tuples in search_core() This abstraction makes it easier for depending plugins to filter out the data they actually need. From: https://github.com/jlu5/SupyPlugins/commit/4915b7713b062c8a7661159c81b99bfa94280e1e 2017-01-21 03:57:55 +01:00			`irc.reply(', '.join(strings))`
DDG: First commit From: https://github.com/jlu5/SupyPlugins/commit/f16f666ebd3233d83bfc2f2ba4bed9427b0749f1 2014-12-11 03:59:11 +01:00
			`Class = DDG`


			`# vim:set shiftwidth=4 softtabstop=4 expandtab textwidth=79:`