diff --git a/plugins/Web/plugin.py b/plugins/Web/plugin.py
index 6b88800be..b42dcc39e 100644
--- a/plugins/Web/plugin.py
+++ b/plugins/Web/plugin.py
@@ -210,8 +210,17 @@ class Web(callbacks.PluginRegexp):
             irc.error("This url is not on the whitelist.")
             return
         size = conf.supybot.protocols.http.peekSize()
-        s = utils.web.getUrl(url, size=size) \
-                        .decode('utf8')
+        fd = utils.web.getUrlFd(url)
+        # Compare only the media type: drop parameters such as
+        # "; charset=...", and normalize surrounding whitespace and case.
+        content_type = fd.getheader('Content-type', 'text/html') \
+                         .split(';', 1)[0].strip().lower()
+        if content_type not in ('text/html', 'application/xhtml+xml',
+                                'application/xhtml'):
+            irc.error(_('This is not an HTML page (content type is %r)') %
+                      content_type)
+            return
+        s = fd.read(size).decode('utf8')
         m = self._doctypeRe.search(s)
         if m:
             s = utils.str.normalizeWhitespace(m.group(0))
@@ -264,7 +273,17 @@ class Web(callbacks.PluginRegexp):
             irc.error("This url is not on the whitelist.")
             return
         size = conf.supybot.protocols.http.peekSize()
-        text = utils.web.getUrl(url, size=size)
+        fd = utils.web.getUrlFd(url)
+        # Compare only the media type: drop parameters such as
+        # "; charset=...", and normalize surrounding whitespace and case.
+        content_type = fd.getheader('Content-type', 'text/html') \
+                         .split(';', 1)[0].strip().lower()
+        if content_type not in ('text/html', 'application/xhtml+xml',
+                                'application/xhtml'):
+            irc.error(_('This is not an HTML page (content type is %r)') %
+                      content_type)
+            return
+        text = fd.read(size)
         try:
             text = text.decode(utils.web.getEncoding(text) or 'utf8',
                     'replace')
diff --git a/plugins/Web/test.py b/plugins/Web/test.py
index d60ba9d5f..63f94208a 100644
--- a/plugins/Web/test.py
+++ b/plugins/Web/test.py
@@ -40,6 +40,9 @@ class WebTestCase(ChannelPluginTestCase):
         def testDoctype(self):
             self.assertError('doctype ftp://ftp.cdrom.com/pub/linux')
             self.assertNotError('doctype http://www.slashdot.org/')
+            # A non-HTML resource must be rejected by @doctype.
+            self.assertRegexp('doctype http://www.google.com/favicon.ico',
+                              'Error.*not an HTML page')
             m = self.getMsg('doctype http://moobot.sf.net/')
             self.failUnless(m.args[1].endswith('>'))
 
@@ -51,6 +54,9 @@ class WebTestCase(ChannelPluginTestCase):
         def testTitle(self):
             self.assertRegexp('title http://www.slashdot.org/',
                               'News for nerds, stuff that matters')
+            # A non-HTML resource must be rejected by @title as well.
+            self.assertRegexp('title http://www.google.com/favicon.ico',
+                              'Error.*not an HTML page')
             # Checks for @title not-working correctly
             self.assertResponse('title '
                 'http://www.catb.org/~esr/jargon/html/F/foo.html',