Limnoria/src/utils/web.py

276 lines
9.0 KiB
Python
Raw Normal View History

2005-01-19 14:14:38 +01:00
###
2005-01-19 14:33:05 +01:00
# Copyright (c) 2002-2005, Jeremiah Fincher
# Copyright (c) 2009, James McCoy
2005-01-19 14:14:38 +01:00
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice,
# this list of conditions, and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions, and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of the author of this software nor the name of
# contributors to this software may be used to endorse or promote products
# derived from this software without specific prior written consent.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
###
import re
import base64
2005-01-19 14:14:38 +01:00
import socket
sockerrors = (socket.error,)
try:
sockerrors += (socket.sslerror,)
except AttributeError:
pass
from .str import normalizeWhitespace
2015-08-11 16:50:23 +02:00
from . import minisix
2005-01-19 14:14:38 +01:00
if minisix.PY2:
import urllib
import urllib2
from httplib import InvalidURL
from urlparse import urlsplit, urlunsplit, urlparse
from htmlentitydefs import entitydefs, name2codepoint
from HTMLParser import HTMLParser
Request = urllib2.Request
urlquote = urllib.quote
urlquote_plus = urllib.quote_plus
urlunquote = urllib.unquote
urlopen = urllib2.urlopen
def urlencode(*args, **kwargs):
return urllib.urlencode(*args, **kwargs).encode()
from urllib2 import HTTPError, URLError
from urllib import splithost, splituser
else:
from http.client import InvalidURL
from urllib.parse import urlsplit, urlunsplit, urlparse
from html.entities import entitydefs, name2codepoint
from html.parser import HTMLParser
import urllib.request, urllib.parse, urllib.error
Request = urllib.request.Request
urlquote = urllib.parse.quote
urlquote_plus = urllib.parse.quote_plus
urlunquote = urllib.parse.unquote
urlopen = urllib.request.urlopen
def urlencode(*args, **kwargs):
return urllib.parse.urlencode(*args, **kwargs)
from urllib.error import HTTPError, URLError
from urllib.parse import splithost, splituser
2005-01-19 14:14:38 +01:00
class Error(Exception):
    """Base exception for all errors raised by this module."""
    pass
# One decimal IPv4 octet (0-255, no sign, no leading-zero restriction).
_octet = r'(?:2(?:[0-4]\d|5[0-5])|1\d\d|\d{1,2})'
# A full dotted-quad IPv4 address: four octets.
_ipAddr = r'%s(?:\.%s){3}' % (_octet, _octet)
# Base domain regex off RFC 1034 and 1738
_label = r'[0-9a-z][-0-9a-z]*[0-9a-z]?'
_domain = r'%s(?:\.%s)*\.[0-9a-z][-0-9a-z]+' % (_label, _label)
# The host part is shared between both URL regexes: a domain or an IP.
_hostPart = r'(?:%s|%s)' % (_domain, _ipAddr)
# Any-scheme URL: scheme://[user@]host[:port][/path]
_urlRe = r'(\w+://(?:\S+@)?%s(?::\d+)?(?:/[^\])>\s]*)?)' % _hostPart
urlRe = re.compile(_urlRe, re.I)
# Same shape, restricted to http/https schemes.
_httpUrlRe = r'(https?://(?:\S+@)?%s(?::\d+)?(?:/[^\])>\s]*)?)' % _hostPart
httpUrlRe = re.compile(_httpUrlRe, re.I)
2005-01-19 14:14:38 +01:00
# Canonical human-readable messages for common network failures.
REFUSED = 'Connection refused.'
TIMED_OUT = 'Connection timed out.'
UNKNOWN_HOST = 'Unknown host.'
RESET_BY_PEER = 'Connection reset by peer.'
FORBIDDEN = 'Client forbidden from accessing URL.'

def strError(e):
    """Map a network-related exception to a human-readable message.

    Inspects the errno-like first element of the exception's args and
    returns one of the canned messages above; anything unrecognized
    falls back to str(e)."""
    try:
        code = e.args[0]
    except Exception:
        # No args (or args isn't indexable): just stringify.
        return str(e)
    if code == 111:
        return REFUSED
    if code in (110, 10060):
        return TIMED_OUT
    if code == 104:
        return RESET_BY_PEER
    # Various resolver errnos (h_errno / gai errors) meaning "no such host".
    if code in (8, 7, 3, 2, -2, -3):
        return UNKNOWN_HOST
    if code == 403:
        return FORBIDDEN
    return str(e)
# Headers sent with every request unless the caller supplies its own dict.
defaultHeaders = {
    'User-agent': 'Mozilla/5.0 (compatible; utils.web python module)',
}

# Other modules should feel free to replace this with an appropriate
# application-specific function.  Feel free to use a callable here.
proxy = None
def getUrlFd(url, headers=None, data=None, timeout=None):
    """getUrlFd(url, headers=None, data=None, timeout=None)

    Opens the given url and returns a file object.  Headers and data are
    a dict and string, respectively, as per urllib.request.Request's
    arguments.  url may be either a URL string or a prebuilt Request.
    Network failures are normalized into this module's Error exception."""
    if headers is None:
        headers = defaultHeaders
    # urllib on Python 3 requires bytes for the POST body.
    if minisix.PY3 and isinstance(data, str):
        data = data.encode()
    try:
        if not isinstance(url, Request):
            (scheme, loc, path, query, frag) = urlsplit(url)
            (user, host) = splituser(loc)
            # Strip any userinfo from the URL; it is re-sent as a Basic
            # auth header instead, since urllib doesn't handle it inline.
            url = urlunsplit((scheme, host, path, query, ''))
            request = Request(url, headers=headers, data=data)
            if user:
                # b64encode operates on bytes on Python 3 and returns
                # bytes; decode back to str for the header value.
                if minisix.PY3:
                    user = base64.b64encode(user.encode()).decode()
                else:
                    user = base64.b64encode(user)
                request.add_header('Authorization', 'Basic %s' % user)
        else:
            request = url
            # Request.add_data() was removed in Python 3.4; assigning the
            # attribute directly works on both Python 2 and Python 3.
            request.data = data
        httpProxy = force(proxy)
        if httpProxy:
            request.set_proxy(httpProxy, 'http')
        fd = urlopen(request, timeout=timeout)
        return fd
    except socket.timeout as e:
        raise Error(TIMED_OUT)
    except sockerrors as e:
        raise Error(strError(e))
    except InvalidURL as e:
        raise Error('Invalid URL: %s' % e)
    except HTTPError as e:
        raise Error(strError(e))
    except URLError as e:
        raise Error(strError(e.reason))
    # Raised when urllib doesn't recognize the url type
    except ValueError as e:
        raise Error(strError(e))
2005-01-19 14:14:38 +01:00
def getUrl(url, size=None, headers=None, data=None, timeout=None):
    """getUrl(url, size=None, headers=None, data=None, timeout=None)

    Gets a page.  Returns a string that is the page gotten.  Size is an
    integer number of bytes to read from the URL.  Headers and data are
    dicts as per urllib.request.Request's arguments."""
    fd = getUrlFd(url, headers=headers, data=data, timeout=timeout)
    try:
        if size is None:
            text = fd.read()
        else:
            text = fd.read(size)
    except socket.timeout:
        raise Error(TIMED_OUT)
    finally:
        # Close the connection even when the read times out; the original
        # code leaked the file descriptor on that path.
        fd.close()
    return text
def getDomain(url):
    """Return the network-location (host[:port]) component of *url*."""
    return urlparse(url).netloc
2005-01-19 14:14:38 +01:00
# Matches a quoted charset declaration inside an HTML <meta ...> tag;
# the quotes are captured and stripped by getEncoding below.
_charset_re = ('<meta[^a-z<>]+charset='
        """(?P<charset>("[^"]+"|'[^']+'))""")
def getEncoding(s):
    """Guess the character encoding of an HTML page *s* (str or bytes).

    First looks for a quoted <meta ... charset=...> declaration; if none
    is found, falls back to charade-based detection when that package is
    available.  Returns the encoding name, or None if it can't be
    determined."""
    try:
        match = re.search(_charset_re, s, re.MULTILINE)
        if match:
            # Strip the surrounding quotes captured by the regex.
            return match.group('charset')[1:-1]
    except TypeError:
        # s is bytes: a str pattern can't search it, so retry with a
        # bytes pattern and decode the (ASCII) charset name.
        match = re.search(_charset_re.encode(), s, re.MULTILINE)
        if match:
            return match.group('charset').decode()[1:-1]
    try:
        import charade.universaldetector
        u = charade.universaldetector.UniversalDetector()
        u.feed(s)
        u.close()
        return u.result['encoding']
    except Exception:
        # charade is optional; treat any detection failure as "unknown".
        return None
class HtmlToText(HTMLParser, object):
    """Taken from some eff-bot code on c.l.p.

    Feed it HTML; tags are replaced by tagReplace and text/entities are
    accumulated in self.data for retrieval via getText()."""
    entitydefs = entitydefs.copy()
    entitydefs['nbsp'] = ' '
    entitydefs['apos'] = '\''
    def __init__(self, tagReplace=' '):
        # Accumulated text fragments, joined by getText().
        self.data = []
        # String substituted for every start and end tag.
        self.tagReplace = tagReplace
        super(HtmlToText, self).__init__()
    def append(self, data):
        self.data.append(data)
    def handle_starttag(self, tag, attr):
        self.append(self.tagReplace)
    def handle_endtag(self, tag):
        self.append(self.tagReplace)
    def handle_data(self, data):
        self.append(data)
    def handle_entityref(self, data):
        # Translate a named entity (e.g. 'amp') to its character; unknown
        # names are passed through unchanged.
        if minisix.PY3:
            if data in name2codepoint:
                self.append(chr(name2codepoint[data]))
            elif isinstance(data, bytes):
                self.append(data.decode())
            else:
                self.append(data)
        else:
            if data in name2codepoint:
                self.append(unichr(name2codepoint[data]))
            elif isinstance(data, str):
                self.append(data.decode('utf8', errors='replace'))
            else:
                self.append(data)
    def getText(self):
        text = ''.join(self.data).strip()
        return normalizeWhitespace(text)
    def handle_charref(self, name):
        # Numeric character references can be decimal ('&#65;') or
        # hexadecimal ('&#x41;'); HTMLParser hands us the part between
        # '&#' and ';'.  The original int(name) raised ValueError on the
        # hex form.
        if name.startswith(('x', 'X')):
            codepoint = int(name[1:], 16)
        else:
            codepoint = int(name)
        self.append((unichr if minisix.PY2 else chr)(codepoint))
2013-07-09 14:02:25 +02:00
def htmlToText(s, tagReplace=' '):
    """Turns HTML into text. tagReplace is a string to replace HTML tags with.

    Accepts either str or bytes; bytes are decoded using the page's
    declared charset when one is found, else UTF-8 (best effort)."""
    encoding = getEncoding(s)
    if encoding and isinstance(s, bytes):
        # Only bytes need decoding; on Python 3 a str has no .decode(),
        # so the original unconditional decode crashed on str input.
        s = s.decode(encoding)
    else:
        try:
            if minisix.PY2 or isinstance(s, bytes):
                s = s.decode('utf8')
        except Exception:
            # Best effort: feed the raw data if it can't be decoded.
            pass
    x = HtmlToText(tagReplace)
    x.feed(s)
    x.close()
    return x.getText()
def mungeEmail(s):
    """Obfuscate an email address by spelling out '@' and '.'."""
    for token, replacement in (('@', ' AT '), ('.', ' DOT ')):
        s = s.replace(token, replacement)
    return s
2013-07-09 14:02:25 +02:00
# vim:set shiftwidth=4 softtabstop=4 expandtab textwidth=79:
2005-01-19 14:14:38 +01:00