Limnoria/plugins/Web/plugin.py

###
# Copyright (c) 2005, Jeremiah Fincher
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
#   * Redistributions of source code must retain the above copyright notice,
#     this list of conditions, and the following disclaimer.
#   * Redistributions in binary form must reproduce the above copyright notice,
#     this list of conditions, and the following disclaimer in the
#     documentation and/or other materials provided with the distribution.
#   * Neither the name of the author of this software nor the name of
#     contributors to this software may be used to endorse or promote products
#     derived from this software without specific prior written consent.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
###

import re
import HTMLParser
import htmlentitydefs

import supybot.conf as conf
import supybot.utils as utils
from supybot.commands import *
import supybot.plugins as plugins
import supybot.ircutils as ircutils
import supybot.callbacks as callbacks

class Title(HTMLParser.HTMLParser):
    """Collect the text of the first <title> element fed to the parser.

    After feed(), the collected text is available as the `title` attribute
    ('' if no title was seen).  Only the first <title>...</title> pair
    contributes; later <title> elements (for instance inside inline SVG) are
    ignored so they cannot be glued onto the page title.
    """
    entitydefs = htmlentitydefs.entitydefs.copy()
    entitydefs['nbsp'] = ' '
    entitydefs['apos'] = '\''

    def __init__(self):
        self.inTitle = False
        # Set once the first <title> element has been closed.
        self.seenTitle = False
        self.title = ''
        HTMLParser.HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        if tag == 'title' and not self.seenTitle:
            self.inTitle = True

    def handle_endtag(self, tag):
        if tag == 'title' and self.inTitle:
            self.inTitle = False
            self.seenTitle = True

    def handle_data(self, data):
        if self.inTitle:
            self.title += data

    def handle_entityref(self, name):
        if self.inTitle:
            if name in self.entitydefs:
                self.title += self.entitydefs[name]
            else:
                # Unknown name: keep the text instead of dropping it.  The
                # parser also reports a bare ampersand followed by letters
                # (as in "Q&A") this way, so no ';' is appended.
                self.title += '&' + name

    def handle_charref(self, name):
        # `name` is the reference without '&#' and ';', e.g. '39' or 'x41'.
        if not self.inTitle:
            return
        try:
            if name[:1] in ('x', 'X'):
                code = int(name[1:], 16)
            else:
                code = int(name)
        except ValueError:
            code = None
        if code is not None and 32 <= code < 127:
            # Printable ASCII can be appended directly whatever the encoding
            # of the surrounding title text is.
            self.title += chr(code)
        else:
            # Control characters and non-ASCII code points are kept as the
            # original reference rather than decoded, so no line breaks or
            # mixed-encoding text end up in the title.
            self.title += '&#%s;' % name

class Web(callbacks.PluginRegexp):
    """Add the help for "@help Web" here."""
    threaded = True
    regexps = ['titleSnarfer']
    # Delegates to the parent class's callCommand and turns a
    # utils.web.Error raised by any command into a reply carrying the error
    # text, instead of letting the exception propagate.
    def callCommand(self, command, irc, msg, *args, **kwargs):
        try:
            super(Web, self).callCommand(command, irc, msg, *args, **kwargs)
        except utils.web.Error as e:
            # "except X as e" is valid on Python 2.6+ and 3.x; the older
            # "except X, e" spelling is a SyntaxError on Python 3.
            irc.reply(str(e))

    def titleSnarfer(self, irc, msg, match):
        r"https?://[^\])>\s]+"
        # NOTE(review): the raw-string docstring above looks like the URL
        # pattern this snarfer is matched with (the method is listed in
        # `regexps`), so it is left exactly as it was.
        channel = msg.args[0]
        if not irc.isChannel(channel):
            return
        if callbacks.addressed(irc.nick, msg):
            return
        if not self.registryValue('titleSnarfer', channel):
            return
        url = match.group(0)
        r = self.registryValue('nonSnarfingRegexp', channel)
        if r and r.search(url):
            self.log.debug('Not titleSnarfing %q.', url)
            return
        try:
            size = conf.supybot.protocols.http.peekSize()
            text = utils.web.getUrl(url, size=size)
        except utils.web.Error as e:
            self.log.info('Couldn\'t snarf title of %u: %s.', url, e)
            return
        parser = Title()
        try:
            parser.feed(text)
        except HTMLParser.HTMLParseError:
            # Title is an HTMLParser subclass, so this is the parse error to
            # expect (the same one the `title` command catches).  The old
            # handler named sgmllib.SGMLParseError, but sgmllib is not
            # imported here, so any parse failure became a NameError.
            self.log.debug('Encountered a problem parsing %u.  Title may '
                           'already be set, though', url)
        # A parse error does not discard what was collected before it.
        if parser.title:
            domain = utils.web.getDomain(url)
            title = utils.web.htmlToText(parser.title.strip())
            s = format('Title: %s (at %s)', title, domain)
            irc.reply(s, prefixNick=False)
    titleSnarfer = urlSnarfer(titleSnarfer)

    def headers(self, irc, msg, args, url):
        """<url>

        Returns the HTTP headers of <url>.  Only HTTP urls are valid, of
        course.
        """
        # Reply with every response header as "name: value", joined by ", ".
        response = utils.web.getUrlFd(url)
        try:
            rendered = []
            for (name, value) in response.headers.items():
                rendered.append(format('%s: %s', name, value))
            irc.reply(', '.join(rendered))
        finally:
            # Close the response even if building or sending the reply fails.
            response.close()
    headers = wrap(headers, ['httpUrl'])

    # Matched case-insensitively so that the lowercase "<!doctype html>"
    # spelling is found too.  (The former re.M flag had no effect: the
    # pattern contains no ^ or $ anchors.)
    _doctypeRe = re.compile(r'(<!DOCTYPE[^>]+>)', re.I)
    def doctype(self, irc, msg, args, url):
        """<url>

        Returns the DOCTYPE string of <url>.  Only HTTP urls are valid, of
        course.
        """
        # Only the first peekSize bytes of the page are fetched and searched.
        size = conf.supybot.protocols.http.peekSize()
        page = utils.web.getUrl(url, size=size)
        m = self._doctypeRe.search(page)
        if m:
            # A DOCTYPE may span several lines; collapse it for the reply.
            irc.reply(utils.str.normalizeWhitespace(m.group(0)))
        else:
            irc.reply('That URL has no specified doctype.')
    doctype = wrap(doctype, ['httpUrl'])

    def size(self, irc, msg, args, url):
        """<url>

        Returns the Content-Length header of <url>.  Only HTTP urls are valid,
        of course.
        """
        response = utils.web.getUrlFd(url)
        try:
            try:
                declared = response.headers['Content-Length']
                irc.reply(format('%u is %i bytes long.', url, declared))
            except KeyError:
                # No Content-Length header: read at most peekSize bytes and
                # report either the exact length or that the limit was hit.
                limit = conf.supybot.protocols.http.peekSize()
                body = response.read(limit)
                if len(body) == limit:
                    irc.reply(format('The server didn\'t tell me how long %u '
                                     'is but it\'s longer than %i bytes.',
                                     url, limit))
                else:
                    irc.reply(format('%u is %i bytes long.', url, len(body)))
        finally:
            response.close()
    size = wrap(size, ['httpUrl'])

    def title(self, irc, msg, args, url):
        """<url>

        Returns the HTML <title>...</title> of a URL.
        """
        # Only the first peekSize bytes of the page are fetched and parsed.
        peek = conf.supybot.protocols.http.peekSize()
        page = utils.web.getUrl(url, size=peek)
        finder = Title()
        try:
            finder.feed(page)
        except HTMLParser.HTMLParseError:
            # Whatever was collected before the error is still used below.
            self.log.debug('Encountered a problem parsing %u.  Title may '
                           'already be set, though', url)
        if not finder.title:
            irc.reply(format('That URL appears to have no HTML title '
                             'within the first %i bytes.', peek))
        else:
            irc.reply(utils.web.htmlToText(finder.title.strip()))
    title = wrap(title, ['httpUrl'])

    # Captures the markup between the first link in a left-aligned table cell
    # and the next "<a href" on the Netcraft result page.
    _netcraftre = re.compile(r'td align="left">\s+<a[^>]+>(.*?)<a href',
                             re.S | re.I)
    def netcraft(self, irc, msg, args, hostname):
        """<hostname|ip>

        Returns Netcraft.com's determination of what operating system and
        webserver is running on the host given.
        """
        # hostname is free-form text from the caller; quote it so spaces,
        # '&' or '#' cannot break or extend the query string.
        url = ('http://uptime.netcraft.com/up/graph/?host=' +
               utils.web.urlquote(hostname))
        html = utils.web.getUrl(url)
        m = self._netcraftre.search(html)
        if m:
            html = m.group(1)
            s = utils.web.htmlToText(html, tagReplace='').strip()
            # Drop the trailing "-" left at the end of the captured text
            # (original note: snip off "the site").
            s = s.rstrip('-').strip()
            irc.reply(s)
        elif 'We could not get any results' in html:
            irc.reply('No results found for %s.' % hostname)
        else:
            irc.error('The format of the page was odd.')
    netcraft = wrap(netcraft, ['text'])

    def urlquote(self, irc, msg, args, text):
        """<text>

        Returns the URL quoted form of the text.
        """
        # Thin command wrapper around utils.web.urlquote.
        quoted = utils.web.urlquote(text)
        irc.reply(quoted)
    urlquote = wrap(urlquote, ['text'])

    def urlunquote(self, irc, msg, args, text):
        """<text>

        Returns the text un-URL quoted.
        """
        # Thin command wrapper around utils.web.urlunquote.
        irc.reply(utils.web.urlunquote(text))
    urlunquote = wrap(urlunquote, ['text'])

    def fetch(self, irc, msg, args, url):
        """<url>

        Returns the contents of <url>, or as much as is configured in
        supybot.plugins.Web.fetch.maximum.  If that configuration variable is
        set to 0, this command will be effectively disabled.
        """
        # Named `limit` rather than `max` so the builtin is not shadowed.
        limit = self.registryValue('fetch.maximum')
        if not limit:
            irc.error('This command is disabled '
                      '(supybot.plugins.Web.fetch.maximum is set to 0).',
                      Raise=True)
        fd = utils.web.getUrlFd(url)
        try:
            irc.reply(fd.read(limit))
        finally:
            # Close the response like `headers` and `size` do; it used to be
            # left open here.
            fd.close()
    fetch = wrap(fetch, ['url'])

Class = Web

# vim:set shiftwidth=4 softtabstop=4 expandtab textwidth=79:
Added the Web plugin (from pieces of Http, Fun, and URL) in the new plugin format. 2005-02-01 10:41:54 +01:00			`###`
			`# Copyright (c) 2005, Jeremiah Fincher`
			`# All rights reserved.`
			`#`
			`# Redistribution and use in source and binary forms, with or without`
			`# modification, are permitted provided that the following conditions are met:`
			`#`
			`# * Redistributions of source code must retain the above copyright notice,`
			`# this list of conditions, and the following disclaimer.`
			`# * Redistributions in binary form must reproduce the above copyright notice,`
			`# this list of conditions, and the following disclaimer in the`
			`# documentation and/or other materials provided with the distribution.`
			`# * Neither the name of the author of this software nor the name of`
			`# contributors to this software may be used to endorse or promote products`
			`# derived from this software without specific prior written consent.`
			`#`
			`# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"`
			`# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE`
			`# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE`
			`# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE`
			`# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR`
			`# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF`
			`# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS`
			`# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN`
			`# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)`
			`# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE`
			`# POSSIBILITY OF SUCH DAMAGE.`
			`###`

			`import re`
plugins/Web: Switch the title parser back to HTMLParser since sgmllib's parser spins on invalid input. 2006-09-13 21:40:51 +02:00			`import HTMLParser`
plugins/Web: Fixed the title-retrieval parser to actually retrieve the entire title. 2005-07-19 15:55:37 +02:00			`import htmlentitydefs`
Bug #1190350, Don't grab fake title. 2005-04-30 14:53:42 +02:00
Added the Web plugin (from pieces of Http, Fun, and URL) in the new plugin format. 2005-02-01 10:41:54 +01:00			`import supybot.conf as conf`
			`import supybot.utils as utils`
			`from supybot.commands import *`
			`import supybot.plugins as plugins`
			`import supybot.ircutils as ircutils`
			`import supybot.callbacks as callbacks`

plugins/Web: Switch the title parser back to HTMLParser since sgmllib's parser spins on invalid input. 2006-09-13 21:40:51 +02:00			`class Title(HTMLParser.HTMLParser):`
plugins/Web: Fixed the title-retrieval parser to actually retrieve the entire title. 2005-07-19 15:55:37 +02:00			`entitydefs = htmlentitydefs.entitydefs.copy()`
			`entitydefs['nbsp'] = ' '`
plugins/Web: Switch the title parser back to HTMLParser since sgmllib's parser spins on invalid input. 2006-09-13 21:40:51 +02:00			`entitydefs['apos'] = '\''`
plugins/Web: Fixed the title-retrieval parser to actually retrieve the entire title. 2005-07-19 15:55:37 +02:00			`def __init__(self):`
Bug #1190350, Don't grab fake title. 2005-04-30 14:53:42 +02:00			`self.inTitle = False`
plugins/Web: Fixed the title-retrieval parser to actually retrieve the entire title. 2005-07-19 15:55:37 +02:00			`self.title = ''`
plugins/Web: Switch the title parser back to HTMLParser since sgmllib's parser spins on invalid input. 2006-09-13 21:40:51 +02:00			`HTMLParser.HTMLParser.__init__(self)`
Bug #1190350, Don't grab fake title. 2005-04-30 14:53:42 +02:00
plugins/Web: Switch the title parser back to HTMLParser since sgmllib's parser spins on invalid input. 2006-09-13 21:40:51 +02:00			`def handle_starttag(self, tag, attrs):`
			`if tag == 'title':`
			`self.inTitle = True`
Bug #1190350, Don't grab fake title. 2005-04-30 14:53:42 +02:00
plugins/Web: Switch the title parser back to HTMLParser since sgmllib's parser spins on invalid input. 2006-09-13 21:40:51 +02:00			`def handle_endtag(self, tag):`
			`if tag == 'title':`
			`self.inTitle = False`
plugins/Web: Fixed the title-retrieval parser to actually retrieve the entire title. 2005-07-19 15:55:37 +02:00
			`def handle_data(self, data):`
			`if self.inTitle:`
			`self.title += data`
Bug #1190350, Don't grab fake title. 2005-04-30 14:53:42 +02:00
plugins/Web: Switch the title parser back to HTMLParser since sgmllib's parser spins on invalid input. 2006-09-13 21:40:51 +02:00			`def handle_entityref(self, name):`
			`if self.inTitle:`
			`if name in self.entitydefs:`
			`self.title += self.entitydefs[name]`

Changed callbacks.Privmsg to be callbacks.Plugin, and callbacks.PrivmsgCommandAndRegexp to be callbacks.Plugin. 2005-02-09 08:04:04 +01:00			`class Web(callbacks.PluginRegexp):`
Added the Web plugin (from pieces of Http, Fun, and URL) in the new plugin format. 2005-02-01 10:41:54 +01:00			`"""Add the help for "@help Web" here."""`
			`threaded = True`
Added the Web.fetch command. 2005-03-14 03:44:55 +01:00			`regexps = ['titleSnarfer']`
Added a callCommand to the Web plugin to catch utils.web.Error. 2005-03-09 08:26:32 +01:00			`def callCommand(self, command, irc, msg, *args, **kwargs):`
			`try:`
			`super(Web, self).callCommand(command, irc, msg, *args, **kwargs)`
			`except utils.web.Error, e:`
			`irc.reply(str(e))`
Bug #1190350, Don't grab fake title. 2005-04-30 14:53:42 +02:00
Added the Web plugin (from pieces of Http, Fun, and URL) in the new plugin format. 2005-02-01 10:41:54 +01:00			`def titleSnarfer(self, irc, msg, match):`
			`r"https?://[^\])>\s]+"`
			`channel = msg.args[0]`
			`if not irc.isChannel(channel):`
			`return`
			`if callbacks.addressed(irc.nick, msg):`
			`return`
			`if self.registryValue('titleSnarfer', channel):`
			`url = match.group(0)`
			`r = self.registryValue('nonSnarfingRegexp', channel)`
			`if r and r.search(url):`
			`self.log.debug('Not titleSnarfing %q.', url)`
			`return`
			`try:`
			`size = conf.supybot.protocols.http.peekSize()`
			`text = utils.web.getUrl(url, size=size)`
			`except utils.web.Error, e:`
			`self.log.info('Couldn\'t snarf title of %u: %s.', url, e)`
			`return`
Bug #1190350, Don't grab fake title. 2005-04-30 14:53:42 +02:00			`parser = Title()`
Catch HTMLParserErrors when we're trying to grab the <title>. 2005-05-07 05:24:10 +02:00			`try:`
			`parser.feed(text)`
plugins/Web: Update the exception handling for the change in parsers. 2005-09-20 21:06:35 +02:00			`except sgmllib.SGMLParseError:`
plugins/Web: Encountering an HTMLParser exception doesn't mean the title hasn't already been snarfed. Don't bail right away. 2005-06-29 21:05:20 +02:00			`self.log.debug('Encountered a problem parsing %u. Title may '`
			`'already be set, though', url)`
plugins/Web: Fixed the title-retrieval parser to actually retrieve the entire title. 2005-07-19 15:55:37 +02:00			`if parser.title:`
Added the Web plugin (from pieces of Http, Fun, and URL) in the new plugin format. 2005-02-01 10:41:54 +01:00			`domain = utils.web.getDomain(url)`
Bug #1190350, Don't grab fake title. 2005-04-30 14:53:42 +02:00			`title = utils.web.htmlToText(parser.title.strip())`
Added the Web plugin (from pieces of Http, Fun, and URL) in the new plugin format. 2005-02-01 10:41:54 +01:00			`s = format('Title: %s (at %s)', title, domain)`
Changed prefixName to prefixNick, which is more appropriate, and has always bothered me. Better now than later. 2005-06-01 23:08:30 +02:00			`irc.reply(s, prefixNick=False)`
Added the Web plugin (from pieces of Http, Fun, and URL) in the new plugin format. 2005-02-01 10:41:54 +01:00			`titleSnarfer = urlSnarfer(titleSnarfer)`

			`def headers(self, irc, msg, args, url):`
			`"""<url>`

			`Returns the HTTP headers of <url>. Only HTTP urls are valid, of`
			`course.`
			`"""`
			`fd = utils.web.getUrlFd(url)`
			`try:`
			`s = ', '.join([format('%s: %s', k, v)`
			`for (k, v) in fd.headers.items()])`
			`irc.reply(s)`
			`finally:`
			`fd.close()`
			`headers = wrap(headers, ['httpUrl'])`

			`_doctypeRe = re.compile(r'(<!DOCTYPE[^>]+>)', re.M)`
			`def doctype(self, irc, msg, args, url):`
			`"""<url>`

			`Returns the DOCTYPE string of <url>. Only HTTP urls are valid, of`
			`course.`
			`"""`
			`size = conf.supybot.protocols.http.peekSize()`
			`s = utils.web.getUrl(url, size=size)`
			`m = self._doctypeRe.search(s)`
			`if m:`
			`s = utils.str.normalizeWhitespace(m.group(0))`
			`irc.reply(s)`
			`else:`
			`irc.reply('That URL has no specified doctype.')`
			`doctype = wrap(doctype, ['httpUrl'])`

			`def size(self, irc, msg, args, url):`
			`"""<url>`

			`Returns the Content-Length header of <url>. Only HTTP urls are valid,`
			`of course.`
			`"""`
			`fd = utils.web.getUrlFd(url)`
			`try:`
			`try:`
			`size = fd.headers['Content-Length']`
			`irc.reply(format('%u is %i bytes long.', url, size))`
			`except KeyError:`
			`size = conf.supybot.protocols.http.peekSize()`
			`s = fd.read(size)`
			`if len(s) != size:`
			`irc.reply(format('%u is %i bytes long.', url, len(s)))`
			`else:`
			`irc.reply(format('The server didn\'t tell me how long %u '`
			`'is but it\'s longer than %i bytes.',`
			`url, size))`
			`finally:`
			`fd.close()`
			`size = wrap(size, ['httpUrl'])`

			`def title(self, irc, msg, args, url):`
			`"""<url>`

			`Returns the HTML <title>...</title> of a URL.`
			`"""`
			`size = conf.supybot.protocols.http.peekSize()`
			`text = utils.web.getUrl(url, size=size)`
Bug #1190350, Don't grab fake title. 2005-04-30 14:53:42 +02:00			`parser = Title()`
Catch HTMLParserErrors when we're trying to grab the <title>. 2005-05-07 05:24:10 +02:00			`try:`
			`parser.feed(text)`
plugins/Web: Switch the title parser back to HTMLParser since sgmllib's parser spins on invalid input. 2006-09-13 21:40:51 +02:00			`except HTMLParser.HTMLParseError:`
plugins/Web: Encountering an HTMLParser exception doesn't mean the title hasn't already been snarfed. Don't bail right away. 2005-06-29 21:05:20 +02:00			`self.log.debug('Encountered a problem parsing %u. Title may '`
			`'already be set, though', url)`
plugins/Web: Fixed the title-retrieval parser to actually retrieve the entire title. 2005-07-19 15:55:37 +02:00			`if parser.title:`
Bug #1190350, Don't grab fake title. 2005-04-30 14:53:42 +02:00			`irc.reply(utils.web.htmlToText(parser.title.strip()))`
Added the Web plugin (from pieces of Http, Fun, and URL) in the new plugin format. 2005-02-01 10:41:54 +01:00			`else:`
			`irc.reply(format('That URL appears to have no HTML title '`
			`'within the first %i bytes.', size))`
			`title = wrap(title, ['httpUrl'])`

			`_netcraftre = re.compile(r'td align="left">\s+<a[^>]+>(.*?)<a href',`
			`re.S \| re.I)`
			`def netcraft(self, irc, msg, args, hostname):`
			`"""<hostname\|ip>`

			`Returns Netcraft.com's determination of what operating system and`
			`webserver is running on the host given.`
			`"""`
			`url = 'http://uptime.netcraft.com/up/graph/?host=' + hostname`
			`html = utils.web.getUrl(url)`
			`m = self._netcraftre.search(html)`
			`if m:`
			`html = m.group(1)`
			`s = utils.web.htmlToText(html, tagReplace='').strip()`
			`s = s.rstrip('-').strip()`
			`irc.reply(s) # Snip off "the site"`
			`elif 'We could not get any results' in html:`
			`irc.reply('No results found for %s.' % hostname)`
			`else:`
			`irc.error('The format of page the was odd.')`
			`netcraft = wrap(netcraft, ['text'])`

			`def urlquote(self, irc, msg, args, text):`
			`"""<text>`

			`Returns the URL quoted form of the text.`
			`"""`
			`irc.reply(utils.web.urlquote(text))`
			`urlquote = wrap(urlquote, ['text'])`

			`def urlunquote(self, irc, msg, args, text):`
			`"""<text>`

			`Returns the text un-URL quoted.`
			`"""`
			`s = utils.web.urlunquote(text)`
			`irc.reply(s)`
			`urlunquote = wrap(urlunquote, ['text'])`

Added the Web.fetch command. 2005-03-14 03:44:55 +01:00			`def fetch(self, irc, msg, args, url):`
			`"""<url>`
Added the Web plugin (from pieces of Http, Fun, and URL) in the new plugin format. 2005-02-01 10:41:54 +01:00
Added the Web.fetch command. 2005-03-14 03:44:55 +01:00			`Returns the contents of <url>, or as much as is configured in`
			`supybot.plugins.Web.fetch.maximum. If that configuration variable is`
			`set to 0, this command will be effectively disabled.`
			`"""`
			`max = self.registryValue('fetch.maximum')`
			`if not max:`
			`irc.error('This command is disabled '`
			`'(supybot.plugins.Web.fetch.maximum is set to 0).',`
			`Raise=True)`
			`fd = utils.web.getUrlFd(url)`
			`irc.reply(fd.read(max))`
			`fetch = wrap(fetch, ['url'])`
Added the Web plugin (from pieces of Http, Fun, and URL) in the new plugin format. 2005-02-01 10:41:54 +01:00
			`Class = Web`

Change the modeline to use softtabstop instead of tabstop. 2006-02-11 16:52:51 +01:00			`# vim:set shiftwidth=4 softtabstop=4 expandtab textwidth=79:`