"""
Python wrapper for Google web APIs
This module allows you to access Google's web APIs through SOAP,
to do things like search Google and get the results programmatically.
Described U{here }
You need a Google-provided license key to use these services.
Follow the link above to get one. These functions will look in
several places (in this order) for the license key:
- the "license_key" argument of each function
- the module-level LICENSE_KEY variable (call setLicense once to set it)
- an environment variable called GOOGLE_LICENSE_KEY
- a file called ".googlekey" in the current directory
- a file called "googlekey.txt" in the current directory
- a file called ".googlekey" in your home directory
- a file called "googlekey.txt" in your home directory
- a file called ".googlekey" in the same directory as google.py
- a file called "googlekey.txt" in the same directory as google.py
Sample usage::
>>> import google
>>> google.setLicense('...') # must get your own key!
>>> data = google.doGoogleSearch('python')
>>> data.meta.searchTime
0.043221000000000002
>>> data.results[0].URL
'http://www.python.org/'
>>> data.results[0].title
'Python Language Website'
@newfield contrib: Contributors
@author: Mark Pilgrim
@author: Brian Landers
@license: Python
@version: 0.6
@contrib: David Ascher, for the install script
@contrib: Erik Max Francis, for the command line interface
@contrib: Michael Twomey, for HTTP proxy support
@contrib: Mark Recht, for patches to support SOAPpy
"""
__author__ = "Mark Pilgrim (f8dy@diveintomark.org)"
__version__ = "0.6"
__cvsversion__ = "$Revision: 1.6 $"[11:-2]
__date__ = "$Date: 2004/09/30 08:09:09 $"[7:-2]
__copyright__ = "Copyright (c) 2002 Mark Pilgrim"
__license__ = "Python"
__credits__ = """David Ascher, for the install script
Erik Max Francis, for the command line interface
Michael Twomey, for HTTP proxy support"""
import os, sys, getopt
import GoogleSOAPFacade
LICENSE_KEY = None
HTTP_PROXY = None
#
# Constants
#
_url = 'http://api.google.com/search/beta2'
_namespace = 'urn:GoogleSearch'
_googlefile1 = ".googlekey"
_googlefile2 = "googlekey.txt"
_false = GoogleSOAPFacade.false
_true = GoogleSOAPFacade.true
_licenseLocations = (
( lambda key: key,
'passed to the function in license_key variable' ),
( lambda key: LICENSE_KEY,
'module-level LICENSE_KEY variable (call setLicense to set it)' ),
( lambda key: os.environ.get( 'GOOGLE_LICENSE_KEY', None ),
'an environment variable called GOOGLE_LICENSE_KEY' ),
( lambda key: _contentsOf( os.getcwd(), _googlefile1 ),
'%s in the current directory' % _googlefile1),
( lambda key: _contentsOf( os.getcwd(), _googlefile2 ),
'%s in the current directory' % _googlefile2),
( lambda key: _contentsOf( os.environ.get( 'HOME', '' ), _googlefile1 ),
'%s in your home directory' % _googlefile1),
( lambda key: _contentsOf( os.environ.get( 'HOME', '' ), _googlefile2 ),
'%s in your home directory' % _googlefile2 ),
( lambda key: _contentsOf( _getScriptDir(), _googlefile1 ),
'%s in the google.py directory' % _googlefile1 ),
( lambda key: _contentsOf( _getScriptDir(), _googlefile2 ),
'%s in the google.py directory' % _googlefile2 )
)
## ----------------------------------------------------------------------
## Exceptions
## ----------------------------------------------------------------------
class NoLicenseKey(Exception):
"""
Thrown when the API is unable to find a valid license key.
"""
pass
## ----------------------------------------------------------------------
## administrative functions (non-API)
## ----------------------------------------------------------------------
def _version():
"""
Display a formatted version string for the module
"""
print """PyGoogle %(__version__)s
%(__copyright__)s
released %(__date__)s
Thanks to:
%(__credits__)s""" % globals()
def _usage():
"""
Display usage information for the command-line interface
"""
program = os.path.basename(sys.argv[0])
print """Usage: %(program)s [options] [querytype] query
options:
-k, --key= Google license key (see important note below)
-1, -l, --lucky show only first hit
-m, --meta show meta information
-r, --reverse show results in reverse order
-x, --proxy= use HTTP proxy
-h, --help print this help
-v, --version print version and copyright information
-t, --test run test queries
querytype:
-s, --search= search (default)
-c, --cache= retrieve cached page
-p, --spelling= check spelling
IMPORTANT NOTE: all Google functions require a valid license key;
visit http://code.google.com/apis/soapsearch/ to get one. %(program)s will
look in these places (in order) and use the first license key it finds:
* the key specified on the command line""" % vars()
for get, location in _licenseLocations[2:]:
print " *", location
## ----------------------------------------------------------------------
## utility functions (API)
## ----------------------------------------------------------------------
def setLicense(license_key):
"""
Set the U{Google APIs } license
key
@param license_key: The new key to use
@type license_key: String
@todo: validate the key?
"""
global LICENSE_KEY
LICENSE_KEY = license_key
def getLicense(license_key = None):
"""
Get the U{Google APIs } license
key
The key can be read from any number of locations. See the module-leve
documentation for the search order.
@return: the license key
@rtype: String
@raise NoLicenseKey: if no valid key could be found
"""
for get, location in _licenseLocations:
rc = get(license_key)
if rc: return rc
_usage()
raise NoLicenseKey, 'get a license key at '\
'http://code.google.com/apis/soapsearch/'
def setProxy(http_proxy):
"""
Set the HTTP proxy to be used when accessing Google
@param http_proxy: the proxy to use
@type http_proxy: String
@todo: validiate the input?
"""
global HTTP_PROXY
HTTP_PROXY = http_proxy
def getProxy(http_proxy = None):
"""
Get the HTTP proxy we use for accessing Google
@return: the proxy
@rtype: String
"""
return http_proxy or HTTP_PROXY
def _contentsOf(dirname, filename):
filename = os.path.join(dirname, filename)
if not os.path.exists(filename): return None
fsock = open(filename)
contents = fsock.read()
fsock.close()
return contents
def _getScriptDir():
if __name__ == '__main__':
return os.path.abspath(os.path.dirname(sys.argv[0]))
else:
return os.path.abspath(os.path.dirname(sys.modules[__name__].__file__))
def _marshalBoolean(value):
if value:
return _true
else:
return _false
def _getRemoteServer( http_proxy ):
return GoogleSOAPFacade.getProxy( _url, _namespace, http_proxy )
## ----------------------------------------------------------------------
## search results classes
## ----------------------------------------------------------------------
class _SearchBase:
def __init__(self, params):
for k, v in params.items():
if isinstance(v, GoogleSOAPFacade.structType):
v = GoogleSOAPFacade.toDict( v )
try:
if isinstance(v[0], GoogleSOAPFacade.structType):
v = [ SOAPProxy.toDict( node ) for node in v ]
except:
pass
self.__dict__[str(k)] = v
## ----------------------------------------------------------------------
class SearchResultsMetaData(_SearchBase):
"""
Container class for metadata about a given search query's results.
@ivar documentFiltering: is duplicate page filtering active?
@ivar searchComments: human-readable informational message
example::
"'the' is a very common word and was not included in your search"
@ivar estimatedTotalResultsCount: estimated total number of results
for this query.
@ivar estimateIsExact: is estimatedTotalResultsCount an exact value?
@ivar searchQuery: search string that initiated this search
@ivar startIndex: index of the first result returned (zero-based)
@ivar endIndex: index of the last result returned (zero-based)
@ivar searchTips: human-readable informational message on how to better
use Google.
@ivar directoryCategories: list of categories for the search results
This field is a list of dictionaries, like so::
{ 'fullViewableName': 'the Open Directory category',
'specialEncoding': 'encoding scheme of this directory category'
}
@ivar searchTime: total search time, in seconds
"""
pass
## ----------------------------------------------------------------------
class SearchResult(_SearchBase):
"""
Encapsulates the results from a search.
@ivar URL: URL
@ivar title: title (HTML)
@ivar snippet: snippet showing query context (HTML
@ivar cachedSize: size of cached version of this result, (KB)
@ivar relatedInformationPresent: is the "related:" keyword supported?
Flag indicates that the "related:" keyword is supported for this URL
@ivar hostName: used when filtering occurs
When filtering occurs, a maximum of two results from any given
host is returned. When this occurs, the second resultElement
that comes from that host contains the host name in this parameter.
@ivar directoryCategory: Open Directory category information
This field is a dictionary with the following values::
{ 'fullViewableName': 'the Open Directory category',
'specialEncoding' : 'encoding scheme of this directory category'
}
@ivar directoryTitle: Open Directory title of this result (or blank)
@ivar summary: Open Directory summary for this result (or blank)
"""
pass
## ----------------------------------------------------------------------
class SearchReturnValue:
"""
complete search results for a single query
@ivar meta: L{SearchResultsMetaData} instance for this query
@ivar results: list of L{SearchResult} objects for this query
"""
def __init__( self, metadata, results ):
self.meta = metadata
self.results = results
## ----------------------------------------------------------------------
## main functions
## ----------------------------------------------------------------------
def doGoogleSearch( q, start = 0, maxResults = 10, filter = 1,
restrict='', safeSearch = 0, language = '',
inputencoding = '', outputencoding = '',\
license_key = None, http_proxy = None ):
"""
Search Google using the SOAP API and return the results.
You need a license key to call this function; see the U{Google APIs
} site to get one. Then you can
either pass it to this function every time, or set it globally; see the
L{google} module-level docs for details.
See U{http://www.google.com/help/features.html}
for examples of advanced features. Anything that works at the
Google web site will work as a query string in this method.
You can use the C{start} and C{maxResults} parameters to page
through multiple pages of results. Note that 'maxResults' is
currently limited by Google to 10.
See the API reference for more advanced examples and a full list of
country codes and topics for use in the C{restrict} parameter, along
with legal values for the C{language}, C{inputencoding}, and
C{outputencoding} parameters.
You can download the API documentation
U{http://code.google.com/apis/soapsearch/download.html }.
@param q: search string.
@type q: String
@param start: (optional) zero-based index of first desired result.
@type start: int
@param maxResults: (optional) maximum number of results to return.
@type maxResults: int
@param filter: (optional) flag to request filtering of similar results
@type filter: int
@param restrict: (optional) restrict results by country or topic.
@type restrict: String
@param safeSearch: (optional)
@type safeSearch: int
@param language: (optional)
@type language: String
@param inputencoding: (optional)
@type inputencoding: String
@param outputencoding: (optional)
@type outputencoding: String
@param license_key: (optional) the Google API license key to use
@type license_key: String
@param http_proxy: (optional) the HTTP proxy to use for talking to Google
@type http_proxy: String
@return: the search results encapsulated in an object
@rtype: L{SearchReturnValue}
"""
license_key = getLicense( license_key )
http_proxy = getProxy( http_proxy )
remoteserver = _getRemoteServer( http_proxy )
filter = _marshalBoolean( filter )
safeSearch = _marshalBoolean( safeSearch )
data = remoteserver.doGoogleSearch( license_key, q, start, maxResults,
filter, restrict, safeSearch,
language, inputencoding,
outputencoding )
metadata = GoogleSOAPFacade.toDict( data )
del metadata["resultElements"]
metadata = SearchResultsMetaData( metadata )
results = [ SearchResult( GoogleSOAPFacade.toDict( node ) ) \
for node in data.resultElements ]
return SearchReturnValue( metadata, results )
## ----------------------------------------------------------------------
def doGetCachedPage( url, license_key = None, http_proxy = None ):
"""
Retrieve a page from the Google cache.
You need a license key to call this function; see the
U{Google APIs } site to get one.
Then you can either pass it to this function every time, or
set it globally; see the L{google} module-level docs for details.
@param url: full URL to the page to retrieve
@type url: String
@param license_key: (optional) the Google API key to use
@type license_key: String
@param http_proxy: (optional) the HTTP proxy server to use
@type http_proxy: String
@return: full text of the cached page
@rtype: String
"""
license_key = getLicense( license_key )
http_proxy = getProxy( http_proxy )
remoteserver = _getRemoteServer( http_proxy )
return remoteserver.doGetCachedPage( license_key, url )
## ----------------------------------------------------------------------
def doSpellingSuggestion( phrase, license_key = None, http_proxy = None ):
"""
Get spelling suggestions from Google
You need a license key to call this function; see the
U{Google APIs } site to get one.
Then you can either pass it to this function every time, or
set it globally; see the L{google} module-level docs for details.
@param phrase: word or phrase to spell-check
@type phrase: String
@param license_key: (optional) the Google API key to use
@type license_key: String
@param http_proxy: (optional) the HTTP proxy to use
@type http_proxy: String
@return: text of any suggested replacement, or None
"""
license_key = getLicense( license_key )
http_proxy = getProxy( http_proxy)
remoteserver = _getRemoteServer( http_proxy )
return remoteserver.doSpellingSuggestion( license_key, phrase )
## ----------------------------------------------------------------------
## functional test suite (see googletest.py for unit test suite)
## ----------------------------------------------------------------------
def _test():
"""
Run functional test suite.
"""
try:
getLicense(None)
except NoLicenseKey:
return
print "Searching for Python at google.com..."
data = doGoogleSearch( "Python" )
_output( data, { "func": "doGoogleSearch"} )
print "\nSearching for 5 _French_ pages about Python, "
print "encoded in ISO-8859-1..."
data = doGoogleSearch( "Python", language = 'lang_fr',
outputencoding = 'ISO-8859-1',
maxResults = 5 )
_output( data, { "func": "doGoogleSearch" } )
phrase = "Pyhton programming languager"
print "\nTesting spelling suggestions for '%s'..." % phrase
data = doSpellingSuggestion( phrase )
_output( data, { "func": "doSpellingSuggestion" } )
## ----------------------------------------------------------------------
## Command-line interface
## ----------------------------------------------------------------------
class _OutputFormatter:
def boil(self, data):
if type(data) == type(u""):
return data.encode("ISO-8859-1", "replace")
else:
return data
class _TextOutputFormatter(_OutputFormatter):
def common(self, data, params):
if params.get("showMeta", 0):
meta = data.meta
for category in meta.directoryCategories:
print "directoryCategory: %s" % \
self.boil(category["fullViewableName"])
for attr in [node for node in dir(meta) if \
node <> "directoryCategories" and node[:2] <> '__']:
print "%s:" % attr, self.boil(getattr(meta, attr))
def doGoogleSearch(self, data, params):
results = data.results
if params.get("feelingLucky", 0):
results = results[:1]
if params.get("reverseOrder", 0):
results.reverse()
for result in results:
for attr in dir(result):
if attr == "directoryCategory":
print "directoryCategory:", \
self.boil(result.directoryCategory["fullViewableName"])
elif attr[:2] <> '__':
print "%s:" % attr, self.boil(getattr(result, attr))
print
self.common(data, params)
def doGetCachedPage(self, data, params):
print data
self.common(data, params)
doSpellingSuggestion = doGetCachedPage
def _makeFormatter(outputFormat):
classname = "_%sOutputFormatter" % outputFormat.capitalize()
return globals()[classname]()
def _output(results, params):
formatter = _makeFormatter(params.get("outputFormat", "text"))
outputmethod = getattr(formatter, params["func"])
outputmethod(results, params)
def main(argv):
"""
Command-line interface.
"""
if not argv:
_usage()
return
q = None
func = None
http_proxy = None
license_key = None
feelingLucky = 0
showMeta = 0
reverseOrder = 0
runTest = 0
outputFormat = "text"
try:
opts, args = getopt.getopt(argv, "s:c:p:k:lmrx:hvt1",
["search=", "cache=", "spelling=", "key=", "lucky", "meta",
"reverse", "proxy=", "help", "version", "test"])
except getopt.GetoptError:
_usage()
sys.exit(2)
for opt, arg in opts:
if opt in ("-s", "--search"):
q = arg
func = "doGoogleSearch"
elif opt in ("-c", "--cache"):
q = arg
func = "doGetCachedPage"
elif opt in ("-p", "--spelling"):
q = arg
func = "doSpellingSuggestion"
elif opt in ("-k", "--key"):
license_key = arg
elif opt in ("-l", "-1", "--lucky"):
feelingLucky = 1
elif opt in ("-m", "--meta"):
showMeta = 1
elif opt in ("-r", "--reverse"):
reverseOrder = 1
elif opt in ("-x", "--proxy"):
http_proxy = arg
elif opt in ("-h", "--help"):
_usage()
elif opt in ("-v", "--version"):
_version()
elif opt in ("-t", "--test"):
runTest = 1
if runTest:
setLicense(license_key)
setProxy(http_proxy)
_test()
if args and not q:
q = args[0]
func = "doGoogleSearch"
if func:
results = globals()[func]( q, http_proxy=http_proxy,
license_key=license_key )
_output(results, locals())
if __name__ == '__main__':
main(sys.argv[1:])