#!/usr/bin/env kross
"""Parley translation script that scrapes the Google dictionary web page.

Parley calls ``translateWord`` to look a word up and ``getLanguagePairs`` to
learn which language pairs are offered.  Translations are not returned to the
caller; they are reported one by one through ``Parley.addTranslation`` while
the downloaded HTML is being parsed.
"""
import re
import socket
import urllib
import urllib2
from sgmllib import SGMLParser

import Parley

# Timeout of the search in seconds (important for slow connections, so that
# Parley does not freeze while waiting for a result).
timeout = 1.0
socket.setdefaulttimeout(timeout)

# Patterns used by myParser.clearWord, compiled once instead of on every call.
_PLACEHOLDER_RE = re.compile(r'(jmdn\.|etw\.)')
# Non-greedy, so that text *between* two bracketed groups is preserved.
_PARENTHESIZED_RE = re.compile(r'\(.*?\)')
_BRACKETED_RE = re.compile(r'\[.*?\]')
_NON_WORD_RE = re.compile(r'\W', re.UNICODE)
_WHITESPACE_RUN_RE = re.compile(r'\s+')


def _utf8(text):
    """Return *text* UTF-8 encoded when it is not already a ``str``.

    On Python 2 ``urllib.urlencode`` applies ``str()`` to every value, which
    raises ``UnicodeEncodeError`` for unicode objects holding non-ASCII
    characters; encoding first avoids that.
    """
    if isinstance(text, str):
        return text
    try:
        return text.encode("utf-8")
    except AttributeError:
        # Not a string-like object; let urlencode stringify it as before.
        return text


def fetchData(word, from_lang, to_lang):
    """Fetch the html document for the given word and language pair.

    Returns the raw response body, or "" when anything goes wrong, so that
    incomplete results are never handed to the parser.
    """
    url = "http://translate.google.com/translate_dict"
    query = [
        ("q", _utf8(word)),
        ("langpair", _utf8(from_lang + "|" + to_lang)),
    ]
    request_url = url + "?" + urllib.urlencode(query)
    try:
        response = urllib2.urlopen(request_url)
        try:
            return response.read()
        finally:
            response.close()
    except Exception:
        # Deliberately best-effort: network errors and timeouts simply yield
        # no data.  KeyboardInterrupt/SystemExit are no longer swallowed.
        return ""


def parseData(data, word=None, from_lang=None, to_lang=None):
    """Parse *data* and return the parser object.

    The returned parser holds the translations (``words``) and language pairs
    (``langpairs``) found.  When *word* is given, every translation found
    before the "Web definitions" text is also reported to Parley for
    (*word*, *from_lang*, *to_lang*).  When *word* is omitted nothing is
    reported, which is what the language pair lookup needs.
    """
    p = myParser()
    p.word = word
    p.from_lang = from_lang
    p.to_lang = to_lang
    p.feed(data)
    p.close()
    return p


# Parley locale name -> name used in the dictionary request.
_LOCALE_NAMES = {
    "en_US": "en",
    "zh_TW": "zh-TW",
    "zh_HK": "zh-HK",
    "zh_CN": "zh-CN",
}


def locale(lang):
    """Correct the difference between Parley and google dictionary locale names.

    Names without a special mapping are returned unchanged.
    """
    return _LOCALE_NAMES.get(lang, lang)


def translateWord(word, from_lang, to_lang):
    """Called by Parley to translate the word.

    Nothing is returned: the translations are delivered through
    ``Parley.addTranslation`` while the page is parsed.
    """
    print("google_dictionary.py - Translating %s %s %s"
          % (word, from_lang, to_lang))
    data = fetchData(word, locale(from_lang), locale(to_lang))
    parseData(data, word, from_lang, to_lang)


def getLanguagePairs():
    """Called by Parley to retrieve the language pairs provided by this script.

    Returns e.g. [("en","fr"),("en","de")] for translation from english to
    french and english to german.  The pairs are read from the <option>
    values of a dummy query; values without a "|" separator are skipped
    because they cannot be language pairs.
    """
    data = fetchData("ignorethis", "en", "fr")
    parser = parseData(data)
    return [split_langpair(value)
            for value in parser.langpairs
            if "|" in value]


def split_langpair(s):
    """Split a language pair string such as "en|fr" into a tuple.

    Raises ValueError when *s* contains no "|" separator.
    """
    f, t = s.split("|", 1)
    return (f, t)


# ------------ HTML Parser ----------- #

class myParser(SGMLParser):
    """Collects translations and language pairs from the dictionary page.

    For every start_tagname function you add you have to make sure the tag is
    added to self.tags_stack.
    """

    # Pushed on tags_stack instead of the tag name for <span class="definition">
    # so that handle_data recognises the text it has to collect.
    _DEFINITION_MARK = ""

    def reset(self):
        """Reset the SGML parser state and the collected results."""
        SGMLParser.reset(self)
        self.words = []       # translated words found in html
        self.langpairs = []   # language pairs found in html file
        self.tags_stack = []  # currently open tags, innermost last
        self.stop = False     # set once the "Web definitions" text was seen
        # Query being parsed; filled in by parseData.  While word is None
        # nothing is reported to Parley.
        self.word = None
        self.from_lang = None
        self.to_lang = None

    def unknown_starttag(self, tag, attrs):
        """Record any tag that has no dedicated start_ handler."""
        self.tags_stack.append(tag)

    def start_span(self, attrs):
        """Record a <span>, marking it when it carries class="definition"."""
        if ("class", "definition") in attrs:
            self.tags_stack.append(self._DEFINITION_MARK)
        else:
            self.tags_stack.append("span")

    def start_option(self, attrs):
        """Record an <option> and remember each of its value attributes."""
        for name, value in attrs:
            if name == "value":
                self.langpairs.append(value)
        self.tags_stack.append("option")

    def handle_data(self, data):
        """Collect text that sits directly inside a marked definition span."""
        if data == "Web definitions":
            # Makes it stop reporting after the web definitions.
            self.stop = True
        if not self.tags_stack or self.tags_stack[-1] != self._DEFINITION_MARK:
            return
        self.words.append(data.strip())
        if not self.stop and self.word is not None:
            w = self.clearWord(data)
            Parley.addTranslation(self.word, self.from_lang, self.to_lang, w)

    def unknown_endtag(self, tag):
        """Unwind the stack down to *tag* and remove *tag* itself."""
        self.remove_not_closed_tags(tag)
        if self.tags_stack and self.tags_stack[-1] == tag:
            self.tags_stack.pop()

    # removes all the tags from the stack that have no closed tags
    # (don't modify)
    def remove_not_closed_tags(self, tag):
        """Pop entries until the top of the stack is *tag* or it is empty."""
        while self.tags_stack and self.tags_stack[-1] != tag:
            self.tags_stack.pop()

    def clearWord(self, word):
        """Clean up the given word from parentheses etc.

        Removes the "jmdn." / "etw." placeholders and every (...) and [...]
        group, turns each remaining non-word character into a space and
        collapses whitespace runs, e.g.
        "b[lue] socks (and) red shoes" -> "b socks red shoes".
        """
        word = _PLACEHOLDER_RE.sub('', word)
        word = _PARENTHESIZED_RE.sub('', word)
        word = _BRACKETED_RE.sub('', word)
        word = _NON_WORD_RE.sub(' ', word)
        # Collapse the runs of spaces produced by the previous substitutions.
        word = _WHITESPACE_RUN_RE.sub(' ', word)
        return word.strip()