"""Python wrapper for Google web APIs

This module allows you to access Google's web APIs through SOAP,
to do things like search Google and get the results programmatically.
Described here:
  http://www.google.com/apis/

You need a Google-provided license key to use these services.
Follow the link above to get one.  These functions will look in
several places (in this order) for the license key:
- the "license_key" argument of each function
- the module-level LICENSE_KEY variable (call setLicense once to set it)
- an environment variable called GOOGLE_LICENSE_KEY
- a file called ".googlekey" in the current directory
- a file called "googlekey.txt" in the current directory
- a file called ".googlekey" in your home directory
- a file called "googlekey.txt" in your home directory
- a file called ".googlekey" in the same directory as google.py
- a file called "googlekey.txt" in the same directory as google.py

Sample usage:
>>> import google
>>> google.setLicense('...') # must get your own key!
>>> data = google.doGoogleSearch('python')
>>> data.meta.searchTime
0.043221000000000002
>>> data.results[0].URL
'http://www.python.org/'
>>> data.results[0].title
'<b>Python</b> Language Website'

See documentation of SearchResultsMetaData and SearchResult classes
for other available attributes.
"""

__author__ = "Mark Pilgrim (f8dy@diveintomark.org)"
__version__ = "0.5.2"
__cvsversion__ = "$Revision$"[11:-2]
__date__ = "$Date$"[7:-2]
__copyright__ = "Copyright (c) 2002 Mark Pilgrim"
__license__ = "Python"
__credits__ = """David Ascher, for the install script
Erik Max Francis, for the command line interface
Michael Twomey, for HTTP proxy support"""

import getopt
import os
import sys

import SOAP

# Module-level defaults; change them through setLicense() / setProxy().
LICENSE_KEY = None
HTTP_PROXY = None


# don't touch the rest of these constants
class NoLicenseKey(Exception):
    """Raised by getLicense() when no license key is found anywhere."""
    pass


_url = 'http://api.google.com/search/beta2'
_namespace = 'urn:GoogleSearch'
_false = SOAP.booleanType(0)
_true = SOAP.booleanType(1)
_googlefile1 = ".googlekey"
_googlefile2 = "googlekey.txt"

# Ordered (getter, description) pairs.  Each getter receives the
# license_key argument given to getLicense() and returns a key or a false
# value; getLicense() uses the first true result.  The descriptions are
# printed by usage().
_licenseLocations = (
    (lambda key: key,
     'passed to the function in license_key variable'),
    (lambda key: LICENSE_KEY,
     'module-level LICENSE_KEY variable (call setLicense to set it)'),
    (lambda key: os.environ.get('GOOGLE_LICENSE_KEY', None),
     'an environment variable called GOOGLE_LICENSE_KEY'),
    (lambda key: _contentsOf(os.getcwd(), _googlefile1),
     '%s in the current directory' % _googlefile1),
    (lambda key: _contentsOf(os.getcwd(), _googlefile2),
     '%s in the current directory' % _googlefile2),
    (lambda key: _contentsOf(os.environ.get('HOME', ''), _googlefile1),
     '%s in your home directory' % _googlefile1),
    (lambda key: _contentsOf(os.environ.get('HOME', ''), _googlefile2),
     '%s in your home directory' % _googlefile2),
    (lambda key: _contentsOf(_getScriptDir(), _googlefile1),
     '%s in the google.py directory' % _googlefile1),
    (lambda key: _contentsOf(_getScriptDir(), _googlefile2),
     '%s in the google.py directory' % _googlefile2)
)


## administrative functions
def version():
    """Print the program version, copyright, release date and credits."""
    print("""PyGoogle %(__version__)s
%(__copyright__)s
released %(__date__)s

Thanks to:
%(__credits__)s""" % globals())


def usage():
    """Print command-line help, including the license key search order."""
    program = os.path.basename(sys.argv[0])
    print("""Usage: %(program)s [options] [querytype] query

options:
  -k, --key= <license key> Google license key (see important note below)
  -1, -l, --lucky          show only first hit
  -m, --meta               show meta information
  -r, --reverse            show results in reverse order
  -x, --proxy= <url>       use HTTP proxy
  -h, --help               print this help
  -v, --version            print version and copyright information
  -t, --test               run test queries

querytype:
  -s, --search= <query>    search (default)
  -c, --cache= <url>       retrieve cached page
  -p, --spelling= <word>   check spelling

IMPORTANT NOTE: all Google functions require a valid license key;
visit http://www.google.com/apis/ to get one.  %(program)s will look in
these places (in order) and use the first license key it finds:
 * the key specified on the command line""" % vars())
    # The first two locations (function argument, module variable) are
    # covered by the "command line" bullet above, so skip them here.
    for get, location in _licenseLocations[2:]:
        print(" * %s" % location)


## utility functions
def setLicense(license_key):
    """set license key"""
    global LICENSE_KEY
    LICENSE_KEY = license_key


def getLicense(license_key = None):
    """get license key

    license key can come from any number of locations;
    see module docs for search order

    Parameters:
    license_key (optional) - explicit key; when true it wins over every
      other location

    Returns: the first true value produced by _licenseLocations

    Raises: NoLicenseKey if no location yields a key
    """
    for get, location in _licenseLocations:
        rc = get(license_key)
        if rc:
            return rc
    raise NoLicenseKey('get a license key at http://www.google.com/apis/')


def setProxy(http_proxy):
    """set HTTP proxy"""
    global HTTP_PROXY
    HTTP_PROXY = http_proxy


def getProxy(http_proxy = None):
    """get HTTP proxy

    Returns http_proxy if it is true, otherwise the module-level
    HTTP_PROXY (which may be None).
    """
    return http_proxy or HTTP_PROXY


def _contentsOf(dirname, filename):
    """Return the stripped contents of dirname/filename, or None.

    None is returned when the path is not an existing regular file.
    Surrounding whitespace is removed so that a trailing newline in a key
    file does not become part of the license key, and so that a blank
    file is treated as "no key here" by getLicense().
    """
    filename = os.path.join(dirname, filename)
    if not os.path.isfile(filename):
        return None
    fsock = open(filename)
    try:
        contents = fsock.read()
    finally:
        # close the handle even if the read fails
        fsock.close()
    return contents.strip()


def _getScriptDir():
    """Return the absolute directory that contains this script/module."""
    if __name__ == '__main__':
        return os.path.abspath(os.path.dirname(sys.argv[0]))
    else:
        return os.path.abspath(os.path.dirname(sys.modules[__name__].__file__))


def _marshalBoolean(value):
    """Map a Python truth value onto the module's SOAP boolean constants."""
    if value:
        return _true
    else:
        return _false


## output formatters
def makeFormatter(outputFormat):
    """Instantiate the module-level class named <Format>OutputFormatter.

    outputFormat is capitalized to build the class name (e.g. "text" ->
    TextOutputFormatter); an unknown format raises KeyError.
    """
    classname = "%sOutputFormatter" % outputFormat.capitalize()
    return globals()[classname]()


def output(results, params):
    """Print results with the formatter method named by params["func"].

    Parameters:
    results - return value of one of the do* functions
    params - dictionary; "func" (required) names the formatter method,
      "outputFormat" (optional, default "text") picks the formatter class;
      the remaining keys are passed through to the formatter
    """
    formatter = makeFormatter(params.get("outputFormat", "text"))
    outputmethod = getattr(formatter, params["func"])
    outputmethod(results, params)


class OutputFormatter:
    """Base class for output formatters."""

    def boil(self, data):
        """Return unicode data encoded as UTF-8; anything else unchanged."""
        if isinstance(data, type(u"")):
            return data.encode("utf-8", "replace")
        else:
            return data


class TextOutputFormatter(OutputFormatter):
    """Formatter that prints results as plain "name: value" lines."""

    def common(self, data, params):
        """Print search metadata when params["showMeta"] is true.

        Results that carry no "meta" attribute (cached pages and spelling
        suggestions are plain values) are skipped instead of raising
        AttributeError.
        """
        if not params.get("showMeta", 0):
            return
        meta = getattr(data, "meta", None)
        if meta is None:
            return
        for category in meta.directoryCategories:
            print("directoryCategory: %s" % self.boil(category["fullViewableName"]))
        for attr in dir(meta):
            if attr != "directoryCategories" and attr[:2] != '__':
                print("%s: %s" % (attr, self.boil(getattr(meta, attr))))

    def doGoogleSearch(self, data, params):
        """Print every attribute of each search result, then the metadata.

        Honors params["feelingLucky"] (first hit only) and
        params["reverseOrder"].
        """
        # Work on a copy so that reversing never reorders data.results
        # for the caller.
        results = list(data.results)
        if params.get("feelingLucky", 0):
            results = results[:1]
        if params.get("reverseOrder", 0):
            results.reverse()
        for result in results:
            for attr in dir(result):
                if attr == "directoryCategory":
                    print("directoryCategory: %s" % self.boil(result.directoryCategory["fullViewableName"]))
                elif attr[:2] != '__':
                    print("%s: %s" % (attr, self.boil(getattr(result, attr))))
            # blank line between results
            print("")
        self.common(data, params)

    def doGetCachedPage(self, data, params):
        """Print the returned value as is, then any metadata it carries."""
        print(data)
        self.common(data, params)

    # spelling suggestions are printed exactly like cached pages
    doSpellingSuggestion = doGetCachedPage


## search results classes
class _SearchBase:
    """Expose the items of a result dictionary as instance attributes."""

    def __init__(self, params):
        """Copy params into attributes, unwrapping SOAP structs.

        A SOAP.structType value is replaced by its _asdict; a sequence
        whose first element is a SOAP.structType is replaced by a list of
        the elements' _asdict values.
        """
        for k, v in params.items():
            if isinstance(v, SOAP.structType):
                v = v._asdict
            try:
                if isinstance(v[0], SOAP.structType):
                    v = [node._asdict for node in v]
            except (TypeError, IndexError, KeyError, AttributeError):
                # v is not a non-empty sequence of structs; keep it as is
                pass
            self.__dict__[str(k)] = v


class SearchResultsMetaData(_SearchBase):
    """metadata of search query results

    Available attributes:
    documentFiltering - flag indicates whether duplicate page filtering
      was performed in this search
    searchComments - human-readable informational message (example:
      "'the' is a very common word and was not included in your search")
    estimatedTotalResultsCount - estimated total number of results
      for this query
    estimateIsExact - flag indicates whether estimatedTotalResultsCount
      is an exact value
    searchQuery - search string that initiated this search
    startIndex - index of first result returned (zero-based)
    endIndex - index of last result returned (zero-based)
    searchTips - human-readable informational message on how to use
      Google better
    directoryCategories - list of dictionaries like this:
      {'fullViewableName': Open Directory category,
       'specialEncoding': encoding scheme of this directory category}
    searchTime - total search time, in seconds
    """
    pass


class SearchResult(_SearchBase):
    """search result

    Available attributes:
    URL - URL
    title - title (HTML)
    snippet - snippet showing query context (HTML)
    cachedSize - size of cached version of this result, (KB)
    relatedInformationPresent - flag indicates that the "related:"
      keyword is supported for this URL
    hostName: When filtering occurs, a maximum of two results from any
      given host is returned.  When this occurs, the second resultElement
      that comes from that host contains the host name in this parameter.
    directoryCategory: dictionary like this:
      {'fullViewableName': Open Directory category,
       'specialEncoding': encoding scheme of this directory category}
    directoryTitle: Open Directory title of this result (or blank)
    summary - Open Directory summary for this result (or blank)
    """
    pass


class SearchReturnValue:
    """complete search results for a single query

    Available attributes:
    meta - SearchResultsMetaData
    results - list of SearchResult
    """

    def __init__(self, metadata, results):
        self.meta = metadata
        self.results = results


## main functions
def doGoogleSearch(q, start=0, maxResults=10, filter=1, restrict='',
                   safeSearch=0, language='', inputencoding='',
                   outputencoding='', license_key = None, http_proxy = None):
    """search Google

    You need a license key to call this function; see
    http://www.google.com/apis/ to get one.  Then you can either pass it to
    this function every time, or set it globally; see the module docs for
    details.

    Parameters:
    q - search string.  Anything you could type at google.com, you can
      pass here.  See http://www.google.com/help/features.html for
      examples of advanced features.
    start (optional) - zero-based index of first desired result (for
      paging through multiple pages of results)
    maxResults (optional) - maximum number of results, currently capped
      at 10
    filter (optional) - set to 1 to filter out similar results, set to 0
      to see everything
    restrict (optional) - restrict results by country or topic.  Examples:
        Ukraine - search only sites located in Ukraine
        linux - search Linux sites only
        mac - search Mac sites only
        bsd - search FreeBSD sites only
      See the APIs_reference.html file in the SDK
      (http://www.google.com/apis/download.html) for more advanced
      examples and a full list of country codes and topics.
    safeSearch (optional) - set to 1 to filter results with SafeSearch
      (no adult material)
    language (optional) - restricts search to documents in one or more
      languages.  Example:
        lang_en - only return pages in English
        lang_fr - only return pages in French
      See the APIs_reference.html file in the SDK
      (http://www.google.com/apis/download.html) for more advanced
      examples and a full list of language codes.
    inputencoding (optional) - sets the character encoding of q parameter
    outputencoding (optional) - sets the character encoding of the
      returned results
      See the APIs_reference.html file in the SDK
      (http://www.google.com/apis/download.html) for a full list of
      encodings.
    license_key (optional) - Google license key
    http_proxy (optional) - address of HTTP proxy to use for sending and
      receiving SOAP messages

    Returns: SearchReturnValue
    .meta - SearchResultsMetaData
    .results - list of SearchResult
    See documentation of these individual classes for list of available
    attributes

    Raises: NoLicenseKey if no license key can be found
    """
    http_proxy = getProxy(http_proxy)
    remoteserver = SOAP.SOAPProxy(_url, namespace=_namespace, http_proxy=http_proxy)
    license_key = getLicense(license_key)
    filter = _marshalBoolean(filter)
    safeSearch = _marshalBoolean(safeSearch)
    data = remoteserver.doGoogleSearch(license_key, q, start, maxResults,
                                       filter, restrict, safeSearch,
                                       language, inputencoding,
                                       outputencoding)
    # Everything except the result elements is metadata about the query.
    metadata = data._asdict
    del metadata["resultElements"]
    metadata = SearchResultsMetaData(metadata)
    results = [SearchResult(node._asdict) for node in data.resultElements]
    return SearchReturnValue(metadata, results)


def doGetCachedPage(url, license_key = None, http_proxy = None):
    """get page from Google cache

    You need a license key to call this function; see
    http://www.google.com/apis/ to get one.  Then you can either pass it to
    this function every time, or set it globally; see the module docs for
    details.

    Parameters:
    url - address of page to get
    license_key (optional) - Google license key
    http_proxy (optional) - address of HTTP proxy to use for sending and
      receiving SOAP messages

    Returns: string, text of cached page

    Raises: NoLicenseKey if no license key can be found
    """
    http_proxy = getProxy(http_proxy)
    remoteserver = SOAP.SOAPProxy(_url, namespace=_namespace, http_proxy=http_proxy)
    license_key = getLicense(license_key)
    return remoteserver.doGetCachedPage(license_key, url)


def doSpellingSuggestion(phrase, license_key = None, http_proxy = None):
    """get spelling suggestions from Google

    You need a license key to call this function; see
    http://www.google.com/apis/ to get one.  Then you can either pass it to
    this function every time, or set it globally; see the module docs for
    details.

    Parameters:
    phrase - word or phrase to spell-check
    license_key (optional) - Google license key
    http_proxy (optional) - address of HTTP proxy to use for sending and
      receiving SOAP messages

    Returns: text of suggested replacement, or None

    Raises: NoLicenseKey if no license key can be found
    """
    http_proxy = getProxy(http_proxy)
    remoteserver = SOAP.SOAPProxy(_url, namespace=_namespace, http_proxy=http_proxy)
    license_key = getLicense(license_key)
    return remoteserver.doSpellingSuggestion(license_key, phrase)


## functional test suite (see googletest.py for unit test suite)
def test():
    """Run a few live queries and print their results.

    Returns silently, without querying, when no license key is available.
    """
    try:
        getLicense(None)
    except NoLicenseKey:
        return
    print("Searching for Python at google.com...")
    data = doGoogleSearch("Python")
    output(data, {"func": "doGoogleSearch"})

    print("\nSearching for 5 _French_ pages about Python, encoded in ISO-8859-1...")
    data = doGoogleSearch("Python", language='lang_fr',
                          outputencoding='ISO-8859-1', maxResults=5)
    output(data, {"func": "doGoogleSearch"})

    phrase = "Pyhton programming languager"
    print("\nTesting spelling suggetions for '%s'..." % phrase)
    data = doSpellingSuggestion(phrase)
    output(data, {"func": "doSpellingSuggestion"})


## main driver for command-line use
def main(argv):
    """Parse command-line arguments and run the requested query.

    Parameters:
    argv - list of arguments without the program name

    Prints usage and returns when argv is empty; prints usage and exits
    with status 2 on an option getopt does not accept.
    """
    if not argv:
        usage()
        return
    q = None
    func = None
    http_proxy = None
    license_key = None
    feelingLucky = 0
    showMeta = 0
    reverseOrder = 0
    runTest = 0
    outputFormat = "text"
    try:
        opts, args = getopt.getopt(
            argv, "s:c:p:k:lmrx:hvt1",
            ["search=", "cache=", "spelling=", "key=", "lucky", "meta",
             "reverse", "proxy=", "help", "version", "test"])
    except getopt.GetoptError:
        usage()
        sys.exit(2)
    for opt, arg in opts:
        if opt in ("-s", "--search"):
            q = arg
            func = "doGoogleSearch"
        elif opt in ("-c", "--cache"):
            q = arg
            func = "doGetCachedPage"
        elif opt in ("-p", "--spelling"):
            q = arg
            func = "doSpellingSuggestion"
        elif opt in ("-k", "--key"):
            license_key = arg
        elif opt in ("-l", "-1", "--lucky"):
            feelingLucky = 1
        elif opt in ("-m", "--meta"):
            showMeta = 1
        elif opt in ("-r", "--reverse"):
            reverseOrder = 1
        elif opt in ("-x", "--proxy"):
            http_proxy = arg
        elif opt in ("-h", "--help"):
            usage()
        elif opt in ("-v", "--version"):
            version()
        elif opt in ("-t", "--test"):
            runTest = 1
    if runTest:
        setLicense(license_key)
        setProxy(http_proxy)
        test()
    # A bare positional argument is treated as a search query.
    if args and not q:
        q = args[0]
        func = "doGoogleSearch"
    if func:
        results = globals()[func](q, http_proxy=http_proxy, license_key=license_key)
        # The formatter reads its options (func, showMeta, feelingLucky,
        # reverseOrder, outputFormat) from these local names.
        output(results, locals())


if __name__ == '__main__':
    main(sys.argv[1:])

# vim:set shiftwidth=4 tabstop=8 expandtab textwidth=78: