Web: create a configurable URL whitelist

Prevent various forms of abuse made possible by the Web plugin, such as fetching or titling malicious content, or revealing the bot's IP address. Conflicts: plugins/Web/plugin.py plugins/Web/test.py
2026-01-25 20:17:57 +01:00 · 2013-05-11 14:11:57 -04:00 · 2013-05-11 14:11:57 -04:00 · 944f9c3e3f
commit 944f9c3e3f
parent c27070895b
3 changed files with 52 additions and 1 deletions
--- a/plugins/Web/config.py
+++ b/plugins/Web/config.py
@ -57,6 +57,12 @@ conf.registerChannelValue(Web, 'nonSnarfingRegexp',
    will not be snarfed.  Give the empty string if you have no URLs that you'd
    like to exclude from being snarfed.""")))

+conf.registerGlobalValue(Web, 'urlWhitelist',
+    registry.SpaceSeparatedListOfStrings([], _("""If set, bot will only fetch data
+    from urls in the whitelist, i.e. starting with http://domain/optionalpath/. This will
+    apply to all commands that retrieve data from user-supplied URLs,
+    including fetch, headers, title, doctype.""")))
+
 conf.registerGroup(Web, 'fetch')
 conf.registerGlobalValue(Web.fetch, 'maximum',
    registry.NonNegativeInteger(0, _("""Determines the maximum number of
--- a/plugins/Web/plugin.py
+++ b/plugins/Web/plugin.py
@ -119,6 +119,19 @@ class Web(callbacks.PluginRegexp):
    titleSnarfer = urlSnarfer(titleSnarfer)
    titleSnarfer.__doc__ = utils.web._httpUrlRe

+    def _checkURLWhitelist(self, url):
+        """Return True if <url> is allowed by the urlWhitelist setting.
+
+        An empty whitelist allows everything.  Otherwise <url> must equal an
+        entry or start with it, the entry being treated as if it ended in '/'.
+        """
+        whitelist = self.registryValue('urlWhitelist')
+        if not whitelist:
+            return True
+        # Trailing '/' stops 'http://a.org' matching 'http://a.org.evil.com'.
+        prefixes = [wu if wu.endswith('/') else wu + '/' for wu in whitelist]
+        return url in whitelist or url.startswith(tuple(prefixes))
+
    @internationalizeDocstring
    def headers(self, irc, msg, args, url):
        """<url>
@ -126,6 +139,9 @@ class Web(callbacks.PluginRegexp):
        Returns the HTTP headers of <url>.  Only HTTP urls are valid, of
        course.
        """
+        if not self._checkURLWhitelist(url):
+            irc.error("This url is not on the whitelist.")
+            return
        fd = utils.web.getUrlFd(url)
        try:
            s = ', '.join([format(_('%s: %s'), k, v)
@ -143,6 +159,9 @@ class Web(callbacks.PluginRegexp):
        Returns the DOCTYPE string of <url>.  Only HTTP urls are valid, of
        course.
        """
+        if not self._checkURLWhitelist(url):
+            irc.error("This url is not on the whitelist.")
+            return
        size = conf.supybot.protocols.http.peekSize()
        s = utils.web.getUrl(url, size=size) \
                        .decode('utf8')
@ -161,6 +180,9 @@ class Web(callbacks.PluginRegexp):
        Returns the Content-Length header of <url>.  Only HTTP urls are valid,
        of course.
        """
+        if not self._checkURLWhitelist(url):
+            irc.error("This url is not on the whitelist.")
+            return
        fd = utils.web.getUrlFd(url)
        try:
            try:
@ -187,6 +209,9 @@ class Web(callbacks.PluginRegexp):
        If --no-filter is given, the bot won't strip special chars (action,
        DCC, ...).
        """
+        if not self._checkURLWhitelist(url):
+            irc.error("This url is not on the whitelist.")
+            return
        size = conf.supybot.protocols.http.peekSize()
        text = utils.web.getUrl(url, size=size)
        try:
@ -239,6 +264,9 @@ class Web(callbacks.PluginRegexp):
        supybot.plugins.Web.fetch.maximum.  If that configuration variable is
        set to 0, this command will be effectively disabled.
        """
+        if not self._checkURLWhitelist(url):
+            irc.error("This url is not on the whitelist.")
+            return
        max = self.registryValue('fetch.maximum')
        if not max:
            irc.error(_('This command is disabled '
--- a/plugins/Web/test.py
+++ b/plugins/Web/test.py
@ -75,7 +75,7 @@ class WebTestCase(ChannelPluginTestCase):
            try:
                conf.supybot.plugins.Web.titleSnarfer.setValue(True)
                self.assertSnarfRegexp('http://microsoft.com/',
-                                       'Title: Microsoft')
+                                         'Microsoft')
            finally:
                conf.supybot.plugins.Web.titleSnarfer.setValue(False)

@ -102,5 +102,22 @@ class WebTestCase(ChannelPluginTestCase):
        finally:
            conf.supybot.plugins.Web.nonSnarfingRegexp.set('')

+    def testWhitelist(self):
+        """URL-fetching commands must honour the urlWhitelist setting."""
+        # Remember the current settings so they can be restored afterwards.
+        fm = conf.supybot.plugins.Web.fetch.maximum()
+        uw = conf.supybot.plugins.Web.urlWhitelist()
+        try:
+            # fetch is disabled while fetch.maximum is 0, so enable it first.
+            conf.supybot.plugins.Web.fetch.maximum.set(1024)
+            # An empty whitelist places no restriction on URLs.
+            self.assertNotError('web fetch http://fsf.org')
+            conf.supybot.plugins.Web.urlWhitelist.set('http://slashdot.org')
+            self.assertError('web fetch http://fsf.org')
+            self.assertError('web title http://fsf.org')
+            # A whitelisted host must not match as a bare string prefix.
+            self.assertError('web fetch http://slashdot.org.evildomain.com')
+            self.assertNotError('web fetch http://slashdot.org')
+            self.assertNotError('web fetch http://slashdot.org/recent')
+            conf.supybot.plugins.Web.urlWhitelist.set('http://slashdot.org http://fsf.org')
+            self.assertNotError('doctype http://fsf.org')
+        finally:
+            conf.supybot.plugins.Web.urlWhitelist.setValue(uw)
+            conf.supybot.plugins.Web.fetch.maximum.setValue(fm)

 # vim:set shiftwidth=4 softtabstop=4 expandtab textwidth=79: