Web: Disable @title and @doctype for non-HTML documents.

This commit is contained in:
Valentin Lorentz 2013-08-09 18:03:02 +02:00
parent 536ec37037
commit 34b0e5faad
2 changed files with 22 additions and 3 deletions

View File

@ -210,8 +210,15 @@ class Web(callbacks.PluginRegexp):
irc.error("This url is not on the whitelist.") irc.error("This url is not on the whitelist.")
return return
size = conf.supybot.protocols.http.peekSize() size = conf.supybot.protocols.http.peekSize()
s = utils.web.getUrl(url, size=size) \ fd = utils.web.getUrlFd(url)
.decode('utf8') content_type = fd.getheader('Content-type', 'text/html') \
.split(';', 1)[0]
if content_type not in ('text/html', 'application/xhtml+xml',
'application/xhtml'):
irc.error(_('This is not an HTML page (content type is %r)') %
content_type)
return
s = fd.read(size).decode('utf8')
m = self._doctypeRe.search(s) m = self._doctypeRe.search(s)
if m: if m:
s = utils.str.normalizeWhitespace(m.group(0)) s = utils.str.normalizeWhitespace(m.group(0))
@ -264,7 +271,15 @@ class Web(callbacks.PluginRegexp):
irc.error("This url is not on the whitelist.") irc.error("This url is not on the whitelist.")
return return
size = conf.supybot.protocols.http.peekSize() size = conf.supybot.protocols.http.peekSize()
text = utils.web.getUrl(url, size=size) fd = utils.web.getUrlFd(url)
content_type = fd.getheader('Content-type', 'text/html') \
.split(';', 1)[0]
if content_type not in ('text/html', 'application/xhtml+xml',
'application/xhtml'):
irc.error(_('This is not an HTML page (content type is %r)') %
content_type)
return
text = fd.read(size)
try: try:
text = text.decode(utils.web.getEncoding(text) or 'utf8', text = text.decode(utils.web.getEncoding(text) or 'utf8',
'replace') 'replace')

View File

@ -40,6 +40,8 @@ class WebTestCase(ChannelPluginTestCase):
def testDoctype(self): def testDoctype(self):
self.assertError('doctype ftp://ftp.cdrom.com/pub/linux') self.assertError('doctype ftp://ftp.cdrom.com/pub/linux')
self.assertNotError('doctype http://www.slashdot.org/') self.assertNotError('doctype http://www.slashdot.org/')
self.assertRegexp('doctype http://www.google.com/favicon.ico',
'Error.*not an HTML page')
m = self.getMsg('doctype http://moobot.sf.net/') m = self.getMsg('doctype http://moobot.sf.net/')
self.failUnless(m.args[1].endswith('>')) self.failUnless(m.args[1].endswith('>'))
@ -51,6 +53,8 @@ class WebTestCase(ChannelPluginTestCase):
def testTitle(self): def testTitle(self):
self.assertRegexp('title http://www.slashdot.org/', self.assertRegexp('title http://www.slashdot.org/',
'News for nerds, stuff that matters') 'News for nerds, stuff that matters')
self.assertRegexp('doctype http://www.google.com/favicon.ico',
'Error.*not an HTML page')
# Checks for @title not-working correctly # Checks for @title not-working correctly
self.assertResponse('title ' self.assertResponse('title '
'http://www.catb.org/~esr/jargon/html/F/foo.html', 'http://www.catb.org/~esr/jargon/html/F/foo.html',