Limnoria/others/google.py
2003-03-12 06:26:59 +00:00

433 lines
17 KiB
Python

"""Python wrapper for Google web APIs
This module allows you to access Google's web APIs through SOAP,
to do things like search Google and get the results programmatically.
Described here:
http://www.google.com/apis/
You need a Google-provided license key to use these services.
Follow the link above to get one. These functions will look in
several places (in this order) for the license key:
- the "license_key" argument of each function
- the module-level LICENSE_KEY variable (call setLicense once to set it)
- an environment variable called GOOGLE_LICENSE_KEY
- a file called ".googlekey" in the current directory
- a file called "googlekey.txt" in the current directory
- a file called ".googlekey" in your home directory
- a file called "googlekey.txt" in your home directory
- a file called ".googlekey" in the same directory as google.py
- a file called "googlekey.txt" in the same directory as google.py
Sample usage:
>>> import google
>>> google.setLicense('...') # must get your own key!
>>> data = google.doGoogleSearch('python')
>>> data.meta.searchTime
0.043221000000000002
>>> data.results[0].URL
'http://www.python.org/'
>>> data.results[0].title
'<b>Python</b> Language Website'
See documentation of SearchResultsMetaData and SearchResult classes
for other available attributes.
"""
# Module metadata; version() formats __version__, __copyright__, __date__ and
# __credits__ into its banner.
__author__ = "Mark Pilgrim (f8dy@diveintomark.org)"
__version__ = "0.5.2"
# RCS/CVS-style keyword strings.  The slices drop an expanded keyword's
# "$Revision: " / "$Date: " prefix and its trailing " $"; while the keywords
# are unexpanded, as written here, both slices evaluate to the empty string.
__cvsversion__ = "$Revision$"[11:-2]
__date__ = "$Date$"[7:-2]
__copyright__ = "Copyright (c) 2002 Mark Pilgrim"
__license__ = "Python"
__credits__ = """David Ascher, for the install script
Erik Max Francis, for the command line interface
Michael Twomey, for HTTP proxy support"""
import SOAP
import os, sys, getopt
# Module-wide defaults, changed through setLicense() and setProxy().
LICENSE_KEY = None
HTTP_PROXY = None
# don't touch the rest of these constants
# Raised by getLicense() when none of the search locations yields a key.
class NoLicenseKey(Exception): pass
# SOAP endpoint and namespace handed to SOAP.SOAPProxy by the do* functions.
_url = 'http://api.google.com/search/beta2'
_namespace = 'urn:GoogleSearch'
# SOAP boolean values returned by _marshalBoolean().
_false = SOAP.booleanType(0)
_true = SOAP.booleanType(1)
# Names of the license key files probed by _licenseLocations.
_googlefile1 = ".googlekey"
_googlefile2 = "googlekey.txt"
# Ordered license key search path used by getLicense().  Each entry is a
# (getter, description) pair: the getter is called with the caller-supplied
# license_key (only the first getter uses it) and returns a key or a false
# value.  usage() prints the descriptions from the third entry onward.
_licenseLocations = (
(lambda key: key, 'passed to the function in license_key variable'),
(lambda key: LICENSE_KEY, 'module-level LICENSE_KEY variable (call setLicense to set it)'),
(lambda key: os.environ.get('GOOGLE_LICENSE_KEY', None), 'an environment variable called GOOGLE_LICENSE_KEY'),
(lambda key: _contentsOf(os.getcwd(), _googlefile1), '%s in the current directory' % _googlefile1),
(lambda key: _contentsOf(os.getcwd(), _googlefile2), '%s in the current directory' % _googlefile2),
(lambda key: _contentsOf(os.environ.get('HOME', ''), _googlefile1), '%s in your home directory' % _googlefile1),
(lambda key: _contentsOf(os.environ.get('HOME', ''), _googlefile2), '%s in your home directory' % _googlefile2),
(lambda key: _contentsOf(_getScriptDir(), _googlefile1), '%s in the google.py directory' % _googlefile1),
(lambda key: _contentsOf(_getScriptDir(), _googlefile2), '%s in the google.py directory' % _googlefile2)
)
## administrative functions
def version():
    """print the version, copyright, release date and credits banner"""
    # the placeholders are filled from the module-level metadata variables
    banner = """PyGoogle %(__version__)s
%(__copyright__)s
released %(__date__)s
Thanks to:
%(__credits__)s"""
    print(banner % globals())
def usage():
    """print command-line help followed by the license key search locations"""
    helpText = """Usage: %(program)s [options] [querytype] query
options:
-k, --key= <license key> Google license key (see important note below)
-1, -l, --lucky show only first hit
-m, --meta show meta information
-r, --reverse show results in reverse order
-x, --proxy= <url> use HTTP proxy
-h, --help print this help
-v, --version print version and copyright information
-t, --test run test queries
querytype:
-s, --search= <query> search (default)
-c, --cache= <url> retrieve cached page
-p, --spelling= <word> check spelling
IMPORTANT NOTE: all Google functions require a valid license key;
visit http://www.google.com/apis/ to get one. %(program)s will look in
these places (in order) and use the first license key it finds:
* the key specified on the command line"""
    print(helpText % {"program": os.path.basename(sys.argv[0])})
    # the first two search locations (function argument and module variable)
    # are skipped; the last line of the help text covers the command line
    for entry in _licenseLocations[2:]:
        print(" * %s" % entry[1])
## utility functions
def setLicense(license_key):
    """store license_key as the module-level default license key"""
    globals()["LICENSE_KEY"] = license_key
def getLicense(license_key = None):
    """get license key

    license key can come from any number of locations;
    see module docs for search order

    Parameters:
    license_key (optional) - key supplied by the caller; tried first
    Returns: the first non-empty key found, surrounding whitespace removed
    Raises: NoLicenseKey, after printing the usage text, if no location
    yields a key
    """
    for get, location in _licenseLocations:
        rc = get(license_key)
        # key files normally end with a newline, and a blank key file must
        # not shadow a real key further down the search order
        if rc and hasattr(rc, "strip"):
            rc = rc.strip()
        if rc:
            return rc
    usage()
    raise NoLicenseKey('get a license key at http://www.google.com/apis/')
def setProxy(http_proxy):
    """store http_proxy as the module-level default HTTP proxy"""
    globals()["HTTP_PROXY"] = http_proxy
def getProxy(http_proxy = None):
    """return http_proxy when it is set, otherwise the module-level HTTP_PROXY"""
    if http_proxy:
        return http_proxy
    return HTTP_PROXY
def _contentsOf(dirname, filename):
filename = os.path.join(dirname, filename)
if not os.path.exists(filename): return None
fsock = open(filename)
contents = fsock.read()
fsock.close()
return contents
def _getScriptDir():
if __name__ == '__main__':
return os.path.abspath(os.path.dirname(sys.argv[0]))
else:
return os.path.abspath(os.path.dirname(sys.modules[__name__].__file__))
def _marshalBoolean(value):
if value:
return _true
else:
return _false
## output formatters
def makeFormatter(outputFormat):
    """instantiate the module-level <Format>OutputFormatter class

    outputFormat is capitalized first, so "text" selects TextOutputFormatter.
    """
    formatterClass = globals()[outputFormat.capitalize() + "OutputFormatter"]
    return formatterClass()
def output(results, params):
    """print results using the formatter method named by params["func"]

    params["outputFormat"] selects the formatter and defaults to "text".
    """
    formatName = params.get("outputFormat", "text")
    handler = getattr(makeFormatter(formatName), params["func"])
    handler(results, params)
class OutputFormatter:
    """base class for output formatters"""

    def boil(self, data):
        """return data in a form that is ready for printing

        Unicode strings, including instances of unicode subclasses, are
        encoded to ISO-8859-1 with unencodable characters replaced by "?".
        Any other value is returned unchanged.
        """
        # isinstance rather than an exact type comparison, so that subclasses
        # of the unicode type are encoded as well
        if isinstance(data, type(u"")):
            return data.encode("ISO-8859-1", "replace")
        return data
class TextOutputFormatter(OutputFormatter):
    """formatter that prints results as plain "name: value" lines"""

    def common(self, data, params):
        """print the search metadata when params["showMeta"] is set

        Results without a meta attribute (the plain values passed to
        doGetCachedPage and doSpellingSuggestion) are skipped instead of
        raising AttributeError.
        """
        if not params.get("showMeta", 0):
            return
        meta = getattr(data, "meta", None)
        if meta is None:
            return
        for category in meta.directoryCategories:
            print("directoryCategory: %s" % (self.boil(category["fullViewableName"]),))
        for attr in dir(meta):
            # directoryCategories was printed above; skip it and any
            # double-underscore names
            if attr != "directoryCategories" and attr[:2] != '__':
                print("%s: %s" % (attr, self.boil(getattr(meta, attr))))

    def doGoogleSearch(self, data, params):
        """print every attribute of each search result, then the metadata

        params["feelingLucky"] limits the output to the first result and
        params["reverseOrder"] prints the results last to first.
        """
        # work on a copy so that reversing never reorders data.results itself
        results = list(data.results)
        if params.get("feelingLucky", 0):
            results = results[:1]
        if params.get("reverseOrder", 0):
            results.reverse()
        for result in results:
            for attr in dir(result):
                if attr == "directoryCategory":
                    # a dictionary: show only its human-readable name
                    print("directoryCategory: %s" % (self.boil(result.directoryCategory["fullViewableName"]),))
                elif attr[:2] != '__':
                    print("%s: %s" % (attr, self.boil(getattr(result, attr))))
            # blank line between results
            print("")
        self.common(data, params)

    def doGetCachedPage(self, data, params):
        """print data as it is, then the metadata if data carries any"""
        print(data)
        self.common(data, params)

    # spelling suggestions are printed the same way as cached pages
    doSpellingSuggestion = doGetCachedPage
## search results classes
class _SearchBase:
    """base class for search result objects

    Copies every item of params onto the instance as an attribute.  A SOAP
    struct value is replaced by its dictionary, and a sequence whose first
    item is a SOAP struct is replaced by a list of dictionaries.
    """

    def __init__(self, params):
        for k, v in params.items():
            if isinstance(v, SOAP.structType):
                v = v._asdict
            try:
                if isinstance(v[0], SOAP.structType):
                    v = [node._asdict for node in v]
            except (TypeError, IndexError, KeyError, AttributeError):
                # v is not a non-empty sequence of structs (a number, an
                # empty list, a dictionary, ...): keep it unchanged.  Only
                # these lookup failures are ignored, so KeyboardInterrupt
                # and unrelated errors still propagate.
                pass
            self.__dict__[str(k)] = v
class SearchResultsMetaData(_SearchBase):
    """metadata describing the results of one search query

    Available attributes:
    documentFiltering - flag, whether duplicate page filtering was performed
        in this search
    searchComments - human-readable informational message (example: "'the'
        is a very common word and was not included in your search")
    estimatedTotalResultsCount - estimated total number of results for this
        query
    estimateIsExact - flag, whether estimatedTotalResultsCount is an exact
        value
    searchQuery - search string that initiated this search
    startIndex - index of first result returned (zero-based)
    endIndex - index of last result returned (zero-based)
    searchTips - human-readable informational message on how to use Google
        better
    directoryCategories - list of dictionaries like this:
        {'fullViewableName': Open Directory category,
         'specialEncoding': encoding scheme of this directory category}
    searchTime - total search time, in seconds
    """
class SearchResult(_SearchBase):
    """a single search result

    Available attributes:
    URL - URL
    title - title (HTML)
    snippet - snippet showing query context (HTML)
    cachedSize - size of cached version of this result (KB)
    relatedInformationPresent - flag, whether the "related:" keyword is
        supported for this URL
    hostName - when filtering occurs, a maximum of two results from any
        given host is returned; the second resultElement that comes from
        that host then contains the host name in this parameter
    directoryCategory - dictionary like this:
        {'fullViewableName': Open Directory category,
         'specialEncoding': encoding scheme of this directory category}
    directoryTitle - Open Directory title of this result (or blank)
    summary - Open Directory summary for this result (or blank)
    """
class SearchReturnValue:
    """everything returned for a single search query

    Available attributes:
    meta - SearchResultsMetaData
    results - list of SearchResult
    """

    def __init__(self, metadata, results):
        self.meta, self.results = metadata, results
## main functions
def doGoogleSearch(q, start=0, maxResults=10, filter=1, restrict='',
                   safeSearch=0, language='', inputencoding='', outputencoding='',
                   license_key = None, http_proxy = None):
    """search Google

    You need a license key to call this function; see
    http://www.google.com/apis/ to get one. Then you can either pass it to
    this function every time, or set it globally; see the module docs for
    details.

    Parameters:
    q - search string. Anything you could type at google.com, you can pass
        here. See http://www.google.com/help/features.html for examples of
        advanced features.
    start (optional) - zero-based index of first desired result (for paging
        through multiple pages of results)
    maxResults (optional) - maximum number of results, currently capped at 10
    filter (optional) - set to 1 to filter out similar results, set to 0 to
        see everything
    restrict (optional) - restrict results by country or topic. Examples:
        Ukraine - search only sites located in Ukraine
        linux - search Linux sites only
        mac - search Mac sites only
        bsd - search FreeBSD sites only
        See the APIs_reference.html file in the SDK
        (http://www.google.com/apis/download.html) for more advanced
        examples and a full list of country codes and topics.
    safeSearch (optional) - set to 1 to filter results with SafeSearch
        (no adult material)
    language (optional) - restricts search to documents in one or more
        languages. Example:
        lang_en - only return pages in English
        lang_fr - only return pages in French
        See the APIs_reference.html file in the SDK
        (http://www.google.com/apis/download.html) for more advanced
        examples and a full list of language codes.
    inputencoding (optional) - sets the character encoding of q parameter
    outputencoding (optional) - sets the character encoding of the returned
        results. See the APIs_reference.html file in the SDK
        (http://www.google.com/apis/download.html) for a full list of
        encodings.
    license_key (optional) - Google license key
    http_proxy (optional) - address of HTTP proxy to use for sending and
        receiving SOAP messages

    Returns: SearchReturnValue
        .meta - SearchResultsMetaData
        .results - list of SearchResult
    See documentation of these individual classes for list of available
    attributes
    """
    proxy = getProxy(http_proxy)
    server = SOAP.SOAPProxy(_url, namespace=_namespace, http_proxy=proxy)
    key = getLicense(license_key)
    response = server.doGoogleSearch(
        key, q, start, maxResults, _marshalBoolean(filter), restrict,
        _marshalBoolean(safeSearch), language, inputencoding, outputencoding)
    # every field of the response except the result list is metadata
    metaFields = response._asdict
    del metaFields["resultElements"]
    meta = SearchResultsMetaData(metaFields)
    hits = []
    for node in response.resultElements:
        hits.append(SearchResult(node._asdict))
    return SearchReturnValue(meta, hits)
def doGetCachedPage(url, license_key = None, http_proxy = None):
    """get page from Google cache

    You need a license key to call this function; see
    http://www.google.com/apis/ to get one. Then you can either pass it to
    this function every time, or set it globally; see the module docs for
    details.

    Parameters:
    url - address of page to get
    license_key (optional) - Google license key
    http_proxy (optional) - address of HTTP proxy to use for sending and
        receiving SOAP messages

    Returns: string, text of cached page
    """
    server = SOAP.SOAPProxy(_url, namespace=_namespace,
                            http_proxy=getProxy(http_proxy))
    key = getLicense(license_key)
    return server.doGetCachedPage(key, url)
def doSpellingSuggestion(phrase, license_key = None, http_proxy = None):
    """get spelling suggestions from Google

    You need a license key to call this function; see
    http://www.google.com/apis/ to get one. Then you can either pass it to
    this function every time, or set it globally; see the module docs for
    details.

    Parameters:
    phrase - word or phrase to spell-check
    license_key (optional) - Google license key
    http_proxy (optional) - address of HTTP proxy to use for sending and
        receiving SOAP messages

    Returns: text of suggested replacement, or None
    """
    server = SOAP.SOAPProxy(_url, namespace=_namespace,
                            http_proxy=getProxy(http_proxy))
    key = getLicense(license_key)
    return server.doSpellingSuggestion(key, phrase)
## functional test suite (see googletest.py for unit test suite)
def test():
    """run a few sample queries and print their results

    Returns early when no license key can be found; getLicense has already
    printed the usage text in that case.
    """
    try:
        getLicense(None)
    except NoLicenseKey:
        return
    print("Searching for Python at google.com...")
    output(doGoogleSearch("Python"), {"func": "doGoogleSearch"})
    print("\nSearching for 5 _French_ pages about Python, encoded in ISO-8859-1...")
    frenchHits = doGoogleSearch("Python", language='lang_fr',
                                outputencoding='ISO-8859-1', maxResults=5)
    output(frenchHits, {"func": "doGoogleSearch"})
    phrase = "Pyhton programming languager"
    print("\nTesting spelling suggetions for '%s'..." % phrase)
    output(doSpellingSuggestion(phrase), {"func": "doSpellingSuggestion"})
## main driver for command-line use
def main(argv):
    """command-line driver

    Parameters:
    argv - command-line arguments without the program name (sys.argv[1:])

    Prints the usage text and returns when argv is empty; prints the usage
    text and exits with status 2 when the options cannot be parsed.
    """
    if not argv:
        usage()
        return
    # NOTE: the locals below double as the params dictionary handed to
    # output() through locals() at the end of this function, so the names
    # func, outputFormat, feelingLucky, showMeta and reverseOrder must stay
    q = None
    func = None
    http_proxy = None
    license_key = None
    feelingLucky = 0
    showMeta = 0
    reverseOrder = 0
    runTest = 0
    outputFormat = "text"
    try:
        opts, args = getopt.getopt(argv, "s:c:p:k:lmrx:hvt1",
            ["search=", "cache=", "spelling=", "key=", "lucky", "meta", "reverse", "proxy=", "help", "version", "test"])
    except getopt.GetoptError:
        usage()
        sys.exit(2)
    for opt, arg in opts:
        if opt in ("-s", "--search"):
            q = arg
            func = "doGoogleSearch"
        elif opt in ("-c", "--cache"):
            q = arg
            func = "doGetCachedPage"
        elif opt in ("-p", "--spelling"):
            q = arg
            func = "doSpellingSuggestion"
        elif opt in ("-k", "--key"):
            license_key = arg
        elif opt in ("-l", "-1", "--lucky"):
            feelingLucky = 1
        elif opt in ("-m", "--meta"):
            showMeta = 1
        elif opt in ("-r", "--reverse"):
            reverseOrder = 1
        elif opt in ("-x", "--proxy"):
            http_proxy = arg
        elif opt in ("-h", "--help"):
            usage()
        elif opt in ("-v", "--version"):
            version()
        elif opt in ("-t", "--test"):
            runTest = 1
    if runTest:
        # test() takes no arguments, so command-line values reach it through
        # the module-level defaults.  Only override a default with a value
        # that was actually given: passing None would wipe a key or proxy
        # that is already configured.
        if license_key:
            setLicense(license_key)
        if http_proxy:
            setProxy(http_proxy)
        test()
    # a bare argument without an explicit query type is a plain search
    if args and not q:
        q = args[0]
        func = "doGoogleSearch"
    if func:
        results = globals()[func](q, http_proxy=http_proxy, license_key=license_key)
        output(results, locals())
# run the command-line driver when this file is executed as a script
if __name__ == '__main__':
    main(sys.argv[1:])