###
# Copyright (c) 2002-2005, Jeremiah Fincher
# Copyright (c) 2009, James McCoy
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
#   * Redistributions of source code must retain the above copyright notice,
#     this list of conditions, and the following disclaimer.
#   * Redistributions in binary form must reproduce the above copyright notice,
#     this list of conditions, and the following disclaimer in the
#     documentation and/or other materials provided with the distribution.
#   * Neither the name of the author of this software nor the name of
#     contributors to this software may be used to endorse or promote products
#     derived from this software without specific prior written consent.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
###

import re
import base64
import socket

# socket.sslerror only exists on some (older) Python versions; include it in
# the tuple of network errors when present.
sockerrors = (socket.error,)
try:
    sockerrors += (socket.sslerror,)
except AttributeError:
    pass

from .str import normalizeWhitespace
from . import minisix

# Compatibility shim: expose one set of names for both Python 2 and 3.
if minisix.PY2:
    import urllib
    import urllib2
    from httplib import InvalidURL
    from urlparse import urlsplit, urlunsplit, urlparse
    from htmlentitydefs import entitydefs, name2codepoint
    from HTMLParser import HTMLParser
    from cgi import escape as html_escape
    Request = urllib2.Request
    urlquote = urllib.quote
    urlquote_plus = urllib.quote_plus
    urlunquote = urllib.unquote
    urlopen = urllib2.urlopen
    def urlencode(*args, **kwargs):
        return urllib.urlencode(*args, **kwargs).encode()
    from urllib2 import HTTPError, URLError
    from urllib import splithost, splituser
else:
    from http.client import InvalidURL
    from urllib.parse import urlsplit, urlunsplit, urlparse
    from html.entities import entitydefs, name2codepoint
    from html.parser import HTMLParser
    from html import escape as html_escape
    import urllib.request, urllib.parse, urllib.error
    Request = urllib.request.Request
    urlquote = urllib.parse.quote
    urlquote_plus = urllib.parse.quote_plus
    urlunquote = urllib.parse.unquote
    urlopen = urllib.request.urlopen
    def urlencode(*args, **kwargs):
        return urllib.parse.urlencode(*args, **kwargs)
    from urllib.error import HTTPError, URLError
    from urllib.parse import splithost, splituser
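
# Illustrative sketch (not part of the original module): the aliases above
# give a single spelling that works on both Python versions, e.g.:
#   >>> urlquote_plus('a b&c')
#   'a+b%26c'
#   >>> urlencode({'q': 'a b'})  # 'q=a+b' (bytes on Python 2, str on Python 3)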


class Error(Exception):
    pass


_octet = r'(?:2(?:[0-4]\d|5[0-5])|1\d\d|\d{1,2})'
_ipAddr = r'%s(?:\.%s){3}' % (_octet, _octet)
# Base domain regex based on RFCs 1034 and 1738
_label = r'[0-9a-z][-0-9a-z]*[0-9a-z]?'
_scheme = r'[a-z][a-z0-9+.-]*'
_domain = r'%s(?:\.%s)*\.[0-9a-z][-0-9a-z]+' % (_label, _label)
_urlRe = r'(%s://(?:\S+@)?(?:%s|%s)(?::\d+)?(?:/[^\])>\s]*)?)' % (
    _scheme, _domain, _ipAddr)
urlRe = re.compile(_urlRe, re.I)
_httpUrlRe = r'(https?://(?:\S+@)?(?:%s|%s)(?::\d+)?(?:/[^\]>\s]*)?)' % \
    (_domain, _ipAddr)
httpUrlRe = re.compile(_httpUrlRe, re.I)
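
# Illustrative sketch (not part of the original module): how these patterns
# behave on free-form text.
#   >>> urlRe.findall('docs at https://example.org/a/b and ftp://10.0.0.1/x')
#   ['https://example.org/a/b', 'ftp://10.0.0.1/x']
#   >>> httpUrlRe.findall('see <http://example.com:8080/path>')
#   ['http://example.com:8080/path']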


REFUSED = 'Connection refused.'
TIMED_OUT = 'Connection timed out.'
UNKNOWN_HOST = 'Unknown host.'
RESET_BY_PEER = 'Connection reset by peer.'
FORBIDDEN = 'Client forbidden from accessing URL.'


def strError(e):
    """Maps an exception (usually a socket error or an HTTP error) to one of
    the friendly error strings above, falling back to str(e)."""
    try:
        n = e.args[0]
    except Exception:
        return str(e)
    if n == 111:                     # ECONNREFUSED (Linux)
        return REFUSED
    elif n in (110, 10060):          # ETIMEDOUT (Linux), WSAETIMEDOUT (Windows)
        return TIMED_OUT
    elif n == 104:                   # ECONNRESET (Linux)
        return RESET_BY_PEER
    elif n in (8, 7, 3, 2, -2, -3):  # name-resolution (h_errno / EAI_*) errors
        return UNKNOWN_HOST
    elif n == 403:                   # HTTP 403 Forbidden
        return FORBIDDEN
    else:
        return str(e)
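
# Illustrative sketch (hypothetical values):
#   >>> strError(socket.error(111, 'Connection refused'))
#   'Connection refused.'
#   >>> strError(ValueError('unknown url type'))
#   'unknown url type'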


# Used to build defaultHeaders.
baseDefaultHeaders = {
    'User-agent': 'Mozilla/5.0 (compatible; utils.web python module)'
}

# Overridable by other modules/plugins.
defaultHeaders = baseDefaultHeaders.copy()

# Other modules should feel free to replace this with an appropriate
# application-specific function; a callable works here.
proxy = None
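
# Illustrative sketch (assuming this module is importable as
# supybot.utils.web): a plugin wanting a custom User-agent for all requests
# made through this module could do:
#   >>> from supybot.utils import web
#   >>> web.defaultHeaders['User-agent'] = 'MyPlugin/1.0 (compatible)'
# baseDefaultHeaders stays untouched, so the stock headers can be restored:
#   >>> web.defaultHeaders = web.baseDefaultHeaders.copy()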


def getUrlFd(url, headers=None, data=None, timeout=None):
    """getUrlFd(url, headers=None, data=None, timeout=None)

    Opens the given url and returns a file object.  Headers and data are
    a dict and string, respectively, as per urllib.request.Request's
    arguments."""
    if headers is None:
        headers = defaultHeaders
    if minisix.PY3 and isinstance(data, str):
        data = data.encode()
    try:
        if not isinstance(url, Request):
            (scheme, loc, path, query, frag) = urlsplit(url)
            (user, host) = splituser(loc)
            url = urlunsplit((scheme, host, path, query, ''))
            request = Request(url, headers=headers, data=data)
            if user:
                # b64encode needs bytes on Python 3, and the header value
                # should be text; hence the encode/decode round-trip.
                request.add_header('Authorization',
                                   'Basic %s' %
                                   base64.b64encode(user.encode()).decode())
        else:
            request = url
            # Request.add_data() was removed in Python 3.4; assigning the
            # attribute works on both Python 2 and 3.
            request.data = data
        fd = urlopen(request, timeout=timeout)
        return fd
    except socket.timeout:
        raise Error(TIMED_OUT)
    except sockerrors as e:
        raise Error(strError(e))
    except InvalidURL as e:
        raise Error('Invalid URL: %s' % e)
    except HTTPError as e:
        raise Error(strError(e))
    except URLError as e:
        raise Error(strError(e.reason))
    # Raised when urllib doesn't recognize the url type
    except ValueError as e:
        raise Error(strError(e))
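
# Illustrative sketch (hypothetical URL): reading the first kilobyte of a
# page through the file-like object getUrlFd returns.
#   >>> fd = getUrlFd('http://example.com/', timeout=5)
#   >>> first_kb = fd.read(1024)
#   >>> final_url = fd.geturl()  # target URL, after any redirects
#   >>> fd.close()
# URLs of the form http://user:password@host/ have the userinfo stripped and
# re-sent as a Basic Authorization header.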


def getUrlTargetAndContent(url, size=None, headers=None, data=None,
                           timeout=None):
    """getUrlTargetAndContent(url, size=None, headers=None, data=None,
                              timeout=None)

    Gets a page.  Returns a (target, content) pair: the target URL (ie.
    after redirections) and the page content.  Size is an integer number of
    bytes to read from the URL.  Headers and data are a dict and string,
    respectively, as per urllib.request.Request's arguments."""
    fd = getUrlFd(url, headers=headers, data=data, timeout=timeout)
    try:
        if size is None:
            text = fd.read()
        else:
            text = fd.read(size)
    except socket.timeout:
        raise Error(TIMED_OUT)
    target = fd.geturl()
    fd.close()
    return (target, text)


def getUrlContent(*args, **kwargs):
    """getUrlContent(url, size=None, headers=None, data=None, timeout=None)

    Gets a page.  Returns a string that is the page gotten.  Size is an
    integer number of bytes to read from the URL.  Headers and data are a
    dict and string, respectively, as per urllib.request.Request's
    arguments."""
    (target, text) = getUrlTargetAndContent(*args, **kwargs)
    return text


def getUrl(*args, **kwargs):
    """Alias for getUrlContent."""
    return getUrlContent(*args, **kwargs)
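
# Illustrative sketch (hypothetical values): fetching a page and the URL it
# redirected to.
#   >>> target, content = getUrlTargetAndContent('http://example.com/',
#   ...                                          size=4096, timeout=10)
# content is bytes on Python 3; pair it with getEncoding() below to decode.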


def getDomain(url):
    # Index 1 of urlparse's result is the network location (host[:port]).
    return urlparse(url)[1]
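
# Illustrative sketch:
#   >>> getDomain('http://example.org/path?q=1')
#   'example.org'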


_charset_re = ('<meta[^a-z<>]+charset='
               """(?P<charset>("[^"]+"|'[^']+'))""")
def getEncoding(s):
    """Returns the charset of a page, read from its <meta> tag if one is
    declared, else guessed by the optional charade detector; returns None
    if neither works."""
    # s may be str or bytes; when the pattern and subject types mismatch,
    # re.search raises TypeError and we retry with a bytes pattern.
    try:
        match = re.search(_charset_re, s, re.MULTILINE)
        if match:
            return match.group('charset')[1:-1]
    except TypeError:
        match = re.search(_charset_re.encode(), s, re.MULTILINE)
        if match:
            return match.group('charset').decode()[1:-1]

    try:
        import charade.universaldetector
        u = charade.universaldetector.UniversalDetector()
        u.feed(s)
        u.close()
        return u.result['encoding']
    except Exception:
        return None
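
# Illustrative sketch (hypothetical page content):
#   >>> getEncoding(b'<html><head><meta charset="utf-8"></head></html>')
#   'utf-8'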


class HtmlToText(HTMLParser, object):
    """Taken from some eff-bot code on c.l.p."""
    entitydefs = entitydefs.copy()
    entitydefs['nbsp'] = ' '
    entitydefs['apos'] = '\''

    def __init__(self, tagReplace=' '):
        self.data = []
        self.tagReplace = tagReplace
        super(HtmlToText, self).__init__()

    def append(self, data):
        self.data.append(data)

    def handle_starttag(self, tag, attr):
        self.append(self.tagReplace)

    def handle_endtag(self, tag):
        self.append(self.tagReplace)

    def handle_data(self, data):
        self.append(data)

    def handle_entityref(self, data):
        # Replace named entities (&amp;, &eacute;, ...) with the character
        # they stand for, coercing everything appended to text.
        if minisix.PY3:
            if data in name2codepoint:
                self.append(chr(name2codepoint[data]))
            elif isinstance(data, bytes):
                self.append(data.decode())
            else:
                self.append(data)
        else:
            if data in name2codepoint:
                self.append(unichr(name2codepoint[data]))
            elif isinstance(data, str):
                self.append(data.decode('utf8', errors='replace'))
            else:
                self.append(data)

    def getText(self):
        text = ''.join(self.data).strip()
        return normalizeWhitespace(text)

    def handle_charref(self, name):
        # Decode numeric references ("&#38;" decimal, "&#x26;" hex) directly;
        # HTMLParser.unescape is deprecated and was removed in Python 3.9.
        try:
            if name.lower().startswith('x'):
                codepoint = int(name[1:], 16)
            else:
                codepoint = int(name)
        except ValueError:
            # Malformed reference; keep it verbatim.
            self.append('&#%s;' % name)
            return
        self.append(chr(codepoint) if minisix.PY3 else unichr(codepoint))


def htmlToText(s, tagReplace=' '):
    """Turns HTML into text.  tagReplace is a string to replace HTML tags
    with."""
    encoding = getEncoding(s)
    if encoding and isinstance(s, bytes):
        s = s.decode(encoding)
    elif minisix.PY2 or isinstance(s, bytes):
        # No declared charset: try UTF-8, keeping s unchanged if that fails.
        try:
            s = s.decode('utf8')
        except UnicodeError:
            pass
    x = HtmlToText(tagReplace)
    x.feed(s)
    x.close()
    return x.getText()
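
# Illustrative sketch (hypothetical markup):
#   >>> htmlToText(b'<p>x &lt; y &amp; z</p>')
#   'x < y & z'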


def mungeEmail(s):
    """Obfuscates an email address so it is harder to scrape."""
    s = s.replace('@', ' AT ')
    s = s.replace('.', ' DOT ')
    return s
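
# Illustrative sketch:
#   >>> mungeEmail('user@example.com')
#   'user AT example DOT com'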


# vim:set shiftwidth=4 softtabstop=4 expandtab textwidth=79: