# babelizer.py - API for simple access to babelfish.altavista.com.
# Requires python 2.0 or better.
#
# See it in use at http://babel.MrFeinberg.com/

"""API for simple access to babelfish.altavista.com.

Summary:

    import babelizer

    print ' '.join(babelizer.available_languages)

    print babelizer.translate( 'How much is that doggie in the window?',
                               'English', 'French' )

    def babel_callback(phrase):
        print phrase
        sys.stdout.flush()

    babelizer.babelize( 'I love a reigning knight.',
                        'English', 'German',
                        callback = babel_callback )

available_languages
    A list of languages available for use with babelfish.

translate( phrase, from_lang, to_lang )
    Uses babelfish to translate phrase from from_lang to to_lang.

babelize(phrase, from_lang, through_lang, limit = 12, callback = None)
    Uses babelfish to translate back and forth between from_lang and
    through_lang until either no more changes occur in translation or
    limit iterations have been reached, whichever comes first.  Takes
    an optional callback function which should receive a single
    parameter, being the next translation.  Without the callback
    returns a list of successive translations.

It's only guaranteed to work if 'english' is one of the two languages
given to either of the translation methods.

Both translation methods throw exceptions which are all subclasses of
BabelizerError.  They include

LanguageNotAvailableError
    Thrown on an attempt to use an unknown language.

BabelfishChangedError
    Thrown when babelfish.altavista.com changes some detail of their
    layout, and babelizer can no longer parse the results or submit
    the correct form (a not infrequent occurrence).

BabelizerIOError
    Thrown for various networking and IO errors.

Version: $Id$
Author: Jonathan Feinberg <jdf@pobox.com>
"""

import re
import string
import sys
import urllib

# Various patterns I have encountered in looking for the babelfish result.
# We try each of them in turn, based on the relative number of times I've
# seen each of these patterns.  $1.00 to anyone who can provide a heuristic
# for knowing which one to use.  This includes AltaVista employees.
#
# Each pattern captures the translated text in group 1; translate() uses the
# first pattern that matches, so the order of this list matters.
__where = [
    re.compile(r'lang=..>([^<]*)</div'),
    re.compile(r'name=\"q\" value=\"([^\"]*)\">'),
    re.compile(r'div style=padding:10px;>([^<]+)</div'),
]

# Maps each supported language name (lower case) to the two-letter code that
# translate() joins into the "lp" request parameter.
__languages = {
    'english': 'en',
    'chinese': 'zh',
    'french': 'fr',
    'german': 'de',
    'italian': 'it',
    'japanese': 'ja',
    'korean': 'ko',
    'spanish': 'es',
    'portuguese': 'pt',
}

# All of the available language names, title-cased.  Sorted so that the
# public list has a stable order instead of inheriting the dict's ordering.
available_languages = [name.title() for name in __languages]
available_languages.sort()

# Calling translate() or babelize() can raise a BabelizerError.
class BabelizerError(Exception):
    """Base class for the errors this module raises on purpose.

    Catch this type to handle any of the more specific subclasses at once.
    """

class LanguageNotAvailableError(BabelizerError):
    """Raised when a requested language name is not a known language."""

class BabelfishChangedError(BabelizerError):
    """Raised when the translated text cannot be located in the response."""

class BabelizerIOError(BabelizerError):
    """Raised when talking to the translation server fails with an IOError."""


def clean(text):
    """Return ``text`` with its whitespace normalised.

    Leading and trailing whitespace is dropped and every internal run of
    whitespace (spaces, tabs, newlines) is collapsed to a single space.

    A no-argument ``split()`` already ignores surrounding whitespace and
    treats newlines as separators, so no separate strip/replace pass (and
    no use of the deprecated ``string.replace`` function) is needed.
    """
    return ' '.join(text.split())


def translate(phrase, from_lang, to_lang):
    """Translate ``phrase`` from ``from_lang`` to ``to_lang`` using babelfish.

    Parameters:
        phrase    -- text to translate; it is whitespace-normalised with
                     clean() before being sent.
        from_lang -- name of the source language (case-insensitive).
        to_lang   -- name of the target language (case-insensitive).

    Returns the translated text, whitespace-normalised with clean().

    Raises:
        LanguageNotAvailableError -- either language name is unknown.
        BabelizerIOError          -- the request or the read of the response
                                     failed with an IOError.
        BabelfishChangedError     -- none of the known patterns matched the
                                     returned page.

    NOTE: relies on ``urllib.urlencode`` / ``urllib.urlopen``, i.e. the
    Python 2 urllib API that the rest of this module is written against.
    """
    phrase = clean(phrase)
    try:
        from_code = __languages[from_lang.lower()]
        to_code = __languages[to_lang.lower()]
    except KeyError:
        # sys.exc_info() fetches the active exception without relying on
        # any version-specific "except ... name" binding syntax.
        raise LanguageNotAvailableError(sys.exc_info()[1])

    params = urllib.urlencode({
        'BabelFishFrontPage': 'yes',
        'doit': 'done',
        'tt': 'urltext',
        'intl': '1',
        'urltext': phrase,
        'lp': from_code + '_' + to_code,
    })

    # Opening and reading are both guarded so a failure at either step is
    # reported as a BabelizerIOError.  Anything that is not an IOError
    # propagates unchanged: the previous catch-all handler only printed a
    # message and then crashed on the unbound ``response``.
    try:
        response = urllib.urlopen(
            'http://babelfish.altavista.com/babelfish/tr', params)
        try:
            html = response.read()
        finally:
            response.close()
    except IOError:
        raise BabelizerIOError(
            "Couldn't talk to server: %s" % (sys.exc_info()[1],))

    # When both marker comments are present, search only the text between
    # them; otherwise fall back to searching the whole page.
    begin = html.find('<!-- Target text (content) -->')
    end = html.find('<!-- end: Target text (content) -->')
    if begin != -1 and end != -1:
        html = html[begin:end]

    # The first pattern that matches wins.
    for regex in __where:
        match = regex.search(html)
        if match:
            return clean(match.group(1))
    raise BabelfishChangedError("Can't recognize translated string.")


def babelize(phrase, from_language, through_language, limit = 12, callback = None):
    """Bounce ``phrase`` back and forth between two languages.

    The whitespace-normalised phrase is reported first.  It is then
    translated alternately from ``from_language`` to ``through_language``
    and back, reporting each new result, until a translation repeats one
    already seen or ``limit`` translations have been made.

    Parameters:
        phrase           -- the starting text.
        from_language    -- language the starting text is written in.
        through_language -- language to bounce the text through.
        limit            -- maximum number of loop iterations (default 12).
        callback         -- optional callable taking one argument; when
                            given, it receives every reported phrase.

    Returns the list of reported phrases when no callback is supplied,
    otherwise None.

    Any exception raised by translate() propagates to the caller.
    """
    phrase = clean(phrase)
    seen = {phrase: 1}          # dict used as a set of phrases already met
    results = []
    if callback:
        report = callback
    else:
        report = results.append
    report(phrase)

    # Maps each of the two languages to the other one.
    flip = {from_language: through_language, through_language: from_language}
    current = from_language
    for _ in range(limit):
        phrase = translate(phrase, current, flip[current])
        if phrase in seen:
            break
        seen[phrase] = 1
        report(phrase)
        current = flip[current]

    # ``current`` is the language of the last reported entry; this should be
    # the same as the language we are translating to, so translate once more
    # if it is not.
    # NOTE(review): after a ``break`` above, ``phrase`` already holds the
    # repeated translation (text in ``flip[current]``) while ``current``
    # was not flipped, so this call is given text that is not in
    # ``current`` -- confirm that this is the intended behaviour.
    if current != through_language:
        phrase = translate(phrase, current, flip[current])
        report(phrase)

    if callback:
        return None
    return results


if __name__ == '__main__':
    # Demo: print each step of an English <-> French round trip as it arrives.
    import sys

    def printer(x):
        """Print one translation step and flush so it shows up immediately."""
        # Single-argument print(x) behaves the same as the statement form.
        print(x)
        sys.stdout.flush()

    babelize("I won't take that sort of treatment from you, or from your doggie!",
             'english', 'french', callback = printer)