###
# Copyright (c) 2005, Jeremiah Fincher
# Copyright (c) 2009, James McCoy
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice,
# this list of conditions, and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions, and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of the author of this software nor the name of
# contributors to this software may be used to endorse or promote products
# derived from this software without specific prior written consent.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
###
|
|
|
|
|
|
|
|
import re
|
2012-09-22 19:34:33 +02:00
|
|
|
import sys
|
2006-09-13 21:40:51 +02:00
|
|
|
import HTMLParser
|
2005-07-19 15:55:37 +02:00
|
|
|
import htmlentitydefs
|
2005-04-30 14:53:42 +02:00
|
|
|
|
2005-02-01 10:41:54 +01:00
|
|
|
import supybot.conf as conf
|
|
|
|
import supybot.utils as utils
|
|
|
|
from supybot.commands import *
|
|
|
|
import supybot.plugins as plugins
|
|
|
|
import supybot.ircutils as ircutils
|
|
|
|
import supybot.callbacks as callbacks
|
2010-10-20 09:39:44 +02:00
|
|
|
from supybot.i18n import PluginInternationalization, internationalizeDocstring
|
|
|
|
# `_` wraps the user-facing strings in this module so they can be translated
# (presumably looked up in the 'Web' plugin's message catalog -- confirm).
_ = PluginInternationalization('Web')
|
2005-02-01 10:41:54 +01:00
|
|
|
|
2006-09-13 21:40:51 +02:00
|
|
|
class Title(HTMLParser.HTMLParser):
|
2005-07-19 15:55:37 +02:00
|
|
|
entitydefs = htmlentitydefs.entitydefs.copy()
|
|
|
|
entitydefs['nbsp'] = ' '
|
2006-09-13 21:40:51 +02:00
|
|
|
entitydefs['apos'] = '\''
|
2005-07-19 15:55:37 +02:00
|
|
|
def __init__(self):
|
2005-04-30 14:53:42 +02:00
|
|
|
self.inTitle = False
|
2005-07-19 15:55:37 +02:00
|
|
|
self.title = ''
|
2006-09-13 21:40:51 +02:00
|
|
|
HTMLParser.HTMLParser.__init__(self)
|
2005-04-30 14:53:42 +02:00
|
|
|
|
2006-09-13 21:40:51 +02:00
|
|
|
def handle_starttag(self, tag, attrs):
|
|
|
|
if tag == 'title':
|
|
|
|
self.inTitle = True
|
2005-04-30 14:53:42 +02:00
|
|
|
|
2006-09-13 21:40:51 +02:00
|
|
|
def handle_endtag(self, tag):
|
|
|
|
if tag == 'title':
|
|
|
|
self.inTitle = False
|
2005-07-19 15:55:37 +02:00
|
|
|
|
|
|
|
def handle_data(self, data):
|
|
|
|
if self.inTitle:
|
|
|
|
self.title += data
|
2005-04-30 14:53:42 +02:00
|
|
|
|
2006-09-13 21:40:51 +02:00
|
|
|
def handle_entityref(self, name):
|
|
|
|
if self.inTitle:
|
|
|
|
if name in self.entitydefs:
|
|
|
|
self.title += self.entitydefs[name]
|
|
|
|
|
2005-02-09 08:04:04 +01:00
|
|
|
class Web(callbacks.PluginRegexp):
    """Add the help for "@help Web" here."""
    # Plugin offering web helpers: a snarfer that announces the <title> of
    # URLs seen in a channel, plus the commands headers, doctype, size,
    # title, urlquote, urlunquote and fetch.
    #
    # NOTE: the docstrings of the class and of the command methods are left
    # exactly as they were; the command docstrings pass through
    # internationalizeDocstring and are presumably shown as help text, so
    # explanatory notes live in '#' comments instead.
    threaded = True
    regexps = ['titleSnarfer']

    def callCommand(self, command, irc, msg, *args, **kwargs):
        # Run the command normally, but report a utils.web.Error to the user
        # as a plain reply instead of letting it propagate.
        try:
            super(Web, self).callCommand(command, irc, msg, *args, **kwargs)
        except utils.web.Error as e:
            irc.reply(str(e))

    def _decodePage(self, text):
        # Best-effort decoding of a fetched page: use the encoding reported
        # by utils.web.get_encoding (falling back to UTF-8) and replace
        # undecodable bytes.  If decoding is impossible the text is returned
        # unchanged.  Only Exception is caught so that KeyboardInterrupt and
        # SystemExit are no longer swallowed.
        try:
            return text.decode(utils.web.get_encoding(text) or 'utf8',
                               'replace')
        except Exception:
            return text

    def _findTitle(self, text, url):
        # Parse `text` and return whatever <title> text was collected ('' if
        # none).  A parse error is only logged: the title may already have
        # been read before the parser gave up.  `url` is used for the log
        # message only.
        parser = Title()
        try:
            parser.feed(text)
        except HTMLParser.HTMLParseError:
            self.log.debug('Encountered a problem parsing %u. Title may '
                           'already be set, though', url)
        return parser.title

    def titleSnarfer(self, irc, msg, match):
        # Regexp callback: announce the title of a URL mentioned in a
        # channel.  Does nothing for non-channel messages, for messages
        # addressed to the bot, when the 'titleSnarfer' setting is off for
        # the channel, or when the URL matches 'nonSnarfingRegexp'.
        channel = msg.args[0]
        if not irc.isChannel(channel):
            return
        if callbacks.addressed(irc.nick, msg):
            return
        if not self.registryValue('titleSnarfer', channel):
            return
        url = match.group(0)
        skip = self.registryValue('nonSnarfingRegexp', channel)
        if skip and skip.search(url):
            self.log.debug('Not titleSnarfing %q.', url)
            return
        try:
            size = conf.supybot.protocols.http.peekSize()
            text = utils.web.getUrl(url, size=size)
        except utils.web.Error as e:
            self.log.info('Couldn\'t snarf title of %u: %s.', url, e)
            if self.registryValue('snarferReportIOExceptions', channel):
                irc.reply(url+" : "+utils.web.strError(e), prefixNick=False)
            return
        rawTitle = self._findTitle(self._decodePage(text), url)
        if rawTitle:
            domain = utils.web.getDomain(url)
            title = utils.web.htmlToText(rawTitle.strip())
            if sys.version_info[0] < 3:
                # Python 2: hand a UTF-8 byte string to format().  Encoding a
                # byte string that holds non-ASCII data raises
                # UnicodeDecodeError; in that case keep the title as it is.
                try:
                    title = title.encode('utf8', 'replace')
                except UnicodeDecodeError:
                    pass
            s = format(_('Title: %s (at %s)'), title, domain)
            irc.reply(s, prefixNick=False)
    titleSnarfer = urlSnarfer(titleSnarfer)
    # The snarfer's docstring is the regexp it is triggered by.
    titleSnarfer.__doc__ = utils.web._httpUrlRe

    def _checkURLWhitelist(self, url):
        # Return True if `url` may be fetched by the commands below.
        # An empty 'urlWhitelist' setting allows everything.  Otherwise the
        # URL must start with an entry that ends in '/', or, for an entry
        # without a trailing '/', equal it or start with entry + '/'.
        whitelist = self.registryValue('urlWhitelist')
        if not whitelist:
            return True
        for prefix in whitelist:
            if prefix.endswith('/'):
                if url.startswith(prefix):
                    return True
            elif url == prefix or url.startswith(prefix + '/'):
                return True
        return False

    @internationalizeDocstring
    def headers(self, irc, msg, args, url):
        """<url>

        Returns the HTTP headers of <url>. Only HTTP urls are valid, of
        course.
        """
        if not self._checkURLWhitelist(url):
            irc.error("This url is not on the whitelist.")
            return
        fd = utils.web.getUrlFd(url)
        # Always close the connection, even if replying fails.
        try:
            s = ', '.join([format(_('%s: %s'), k, v)
                           for (k, v) in fd.headers.items()])
            irc.reply(s)
        finally:
            fd.close()
    headers = wrap(headers, ['httpUrl'])

    _doctypeRe = re.compile(r'(<!DOCTYPE[^>]+>)', re.M)
    @internationalizeDocstring
    def doctype(self, irc, msg, args, url):
        """<url>

        Returns the DOCTYPE string of <url>. Only HTTP urls are valid, of
        course.
        """
        if not self._checkURLWhitelist(url):
            irc.error("This url is not on the whitelist.")
            return
        size = conf.supybot.protocols.http.peekSize()
        # 'replace' keeps a page that is not valid UTF-8 (or whose last
        # multi-byte sequence was cut by the size limit) from raising
        # UnicodeDecodeError.
        s = utils.web.getUrl(url, size=size).decode('utf8', 'replace')
        m = self._doctypeRe.search(s)
        if m:
            s = utils.str.normalizeWhitespace(m.group(0))
            irc.reply(s)
        else:
            irc.reply(_('That URL has no specified doctype.'))
    doctype = wrap(doctype, ['httpUrl'])

    @internationalizeDocstring
    def size(self, irc, msg, args, url):
        """<url>

        Returns the Content-Length header of <url>. Only HTTP urls are valid,
        of course.
        """
        if not self._checkURLWhitelist(url):
            irc.error("This url is not on the whitelist.")
            return
        fd = utils.web.getUrlFd(url)
        try:
            # The header may be absent (KeyError, or None -> TypeError with
            # Python 3 style message objects) or malformed (ValueError).  In
            # all of those cases fall back to reading up to peekSize bytes.
            try:
                length = int(fd.headers['Content-Length'])
            except (KeyError, TypeError, ValueError):
                length = None
            if length is not None:
                irc.reply(format(_('%u is %S long.'), url, length))
            else:
                size = conf.supybot.protocols.http.peekSize()
                s = fd.read(size)
                if len(s) != size:
                    # Fewer bytes than requested: that was the whole body.
                    irc.reply(format(_('%u is %S long.'), url, len(s)))
                else:
                    irc.reply(format(_('The server didn\'t tell me how long %u '
                                       'is but it\'s longer than %S.'),
                                     url, size))
        finally:
            fd.close()
    size = wrap(size, ['httpUrl'])

    @internationalizeDocstring
    def title(self, irc, msg, args, optlist, url):
        """[--no-filter] <url>

        Returns the HTML <title>...</title> of a URL.
        If --no-filter is given, the bot won't strip special chars (action,
        DCC, ...).
        """
        if not self._checkURLWhitelist(url):
            irc.error("This url is not on the whitelist.")
            return
        size = conf.supybot.protocols.http.peekSize()
        text = self._decodePage(utils.web.getUrl(url, size=size))
        pageTitle = self._findTitle(text, url)
        if pageTitle:
            title = utils.web.htmlToText(pageTitle.strip())
            # A generator expression is used so the loop names do not leak
            # into the method scope on Python 2.
            if not any(name == 'no-filter' for (name, value) in optlist):
                # Strip control characters \x01-\x03 (the "action, DCC, ..."
                # special chars mentioned in the help text).
                for i in range(1, 4):
                    title = title.replace(chr(i), '')
            irc.reply(title)
        elif len(text) < size:
            # We got less than the peek size, so we saw the whole page.
            irc.reply(_('That URL appears to have no HTML title.'))
        else:
            irc.reply(format(_('That URL appears to have no HTML title '
                               'within the first %S.'), size))
    title = wrap(title, [getopts({'no-filter': ''}), 'httpUrl'])

    @internationalizeDocstring
    def urlquote(self, irc, msg, args, text):
        """<text>

        Returns the URL quoted form of the text.
        """
        irc.reply(utils.web.urlquote(text))
    urlquote = wrap(urlquote, ['text'])

    @internationalizeDocstring
    def urlunquote(self, irc, msg, args, text):
        """<text>

        Returns the text un-URL quoted.
        """
        s = utils.web.urlunquote(text)
        irc.reply(s)
    urlunquote = wrap(urlunquote, ['text'])

    @internationalizeDocstring
    def fetch(self, irc, msg, args, url):
        """<url>

        Returns the contents of <url>, or as much as is configured in
        supybot.plugins.Web.fetch.maximum. If that configuration variable is
        set to 0, this command will be effectively disabled.
        """
        if not self._checkURLWhitelist(url):
            irc.error("This url is not on the whitelist.")
            return
        limit = self.registryValue('fetch.maximum')
        if not limit:
            # Raise=True makes irc.error raise, so nothing below runs.
            irc.error(_('This command is disabled '
                        '(supybot.plugins.Web.fetch.maximum is set to 0).'),
                      Raise=True)
        # 'replace' avoids UnicodeDecodeError when the page is not UTF-8 or
        # the size limit cuts a multi-byte sequence in half.
        contents = utils.web.getUrl(url, size=limit).decode('utf8', 'replace')
        irc.reply(contents)
    fetch = wrap(fetch, ['url'])
|
2005-02-01 10:41:54 +01:00
|
|
|
|
|
|
|
# Expose the plugin class under the generic name `Class` (presumably the name
# the plugin loader looks up -- confirm).
Class = Web
|
|
|
|
|
2006-02-11 16:52:51 +01:00
|
|
|
# vim:set shiftwidth=4 softtabstop=4 expandtab textwidth=79:
|