Skip to content

Commit 8f89d5a

Browse files
authored
Add files via upload
1 parent 5acb83f commit 8f89d5a

12 files changed

+289
-121
lines changed

pygoogletranslation/__init__.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
"""Free Google Translate Web API for Python. Translates totally free of charge."""
22
__all__ = 'Translator',
3-
__version__ = '1.0.0'
3+
__version__ = '2.0.4'
44

55

66
from pygoogletranslation.translate import Translator
7-
from pygoogletranslation.constants import LANGCODES, LANGUAGES # noqa
7+
from pygoogletranslation.constants import LANGCODES, LANGUAGES
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.

pygoogletranslation/constants.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
RPCIDS = 'MkEWBc'
12

23
LANGUAGES = {
34
'af': 'afrikaans',

pygoogletranslation/models.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,9 +21,9 @@ def __str__(self): # pragma: nocover
2121
def __unicode__(self): # pragma: nocover
2222
return (
2323
u'Translated(src={src}, dest={dest}, text={text}, pronunciation={pronunciation}, '
24-
u'extra_data={extra_data})'.format(
24+
u'original_text={origin} ,extra_data={extra_data})'.format(
2525
src=self.src, dest=self.dest, text=self.text,
26-
pronunciation=self.pronunciation,
26+
pronunciation=self.pronunciation, origin=self.origin,
2727
extra_data='"' + repr(self.extra_data)[:10] + '..."'
2828
)
2929
)

pygoogletranslation/translate.py

Lines changed: 170 additions & 80 deletions
Original file line numberDiff line numberDiff line change
@@ -4,22 +4,28 @@
44
55
You can translate text using this module.
66
"""
7+
import os
78
import json
89
import requests
10+
import unidecode
11+
import docx2txt
12+
import PyPDF2
13+
import time
914
from pygoogletranslation import utils, urls
1015
from pygoogletranslation.constants import (
11-
LANGCODES, LANGUAGES
16+
LANGCODES, LANGUAGES, RPCIDS
1217
)
1318
from pygoogletranslation import gauthtoken
1419
from pygoogletranslation.models import Translated, Detected
1520

16-
EXCLUDES = ('en', 'ca', 'fr')
1721

1822
class Translator:
1923

20-
def __init__(self, host=urls.TRANSLATE, proxies=None, timeout=None):
24+
def __init__(self, host=urls.TRANSLATE, proxies=None, timeout=None,
25+
retry=3, sleep=5, retry_messgae=False):
2126
self.host = host if 'http' in host else 'https://' + host
22-
27+
self.rpcids = RPCIDS
28+
self.transurl = urls.TRANSLATEURL
2329
if proxies is not None:
2430
self.proxies = proxies
2531
else:
@@ -28,55 +34,81 @@ def __init__(self, host=urls.TRANSLATE, proxies=None, timeout=None):
2834
if timeout is not None:
2935
self.timeout = timeout
3036

31-
def translate(self, text, src='auto', dest='en'):
37+
self.retry = retry
38+
self.retry_messgae = retry_messgae
39+
self.sleep = sleep
3240

41+
def translate(self, text, src='auto', dest='en'):
    """Translate a string, or every string in a list, from ``src`` to ``dest``.

    Args:
        text: A string or a list of strings. Straight and curly quote
            characters (", ', \u201c, \u201d) are removed before the request
            is built. A list passed by the caller is left untouched; a
            cleaned copy is used instead.
        src: ``'auto'``, a key of ``LANGUAGES``, or a key of ``LANGCODES``.
            Matching is case-insensitive. A ``LANGCODES`` key is replaced
            by the value it maps to.
        dest: ``'en'``, a key of ``LANGUAGES``, or a key of ``LANGCODES``,
            handled the same way as ``src``.

    Returns:
        Whatever ``extract_translation`` returns for the response: a single
        result when exactly one item was translated, otherwise a list.

    Raises:
        ValueError: If ``src`` or ``dest`` is in neither lookup table.
    """
    # One C-level pass that deletes all four quote characters.
    strip_quotes = str.maketrans('', '', '"\'\u201c\u201d')
    if isinstance(text, list):
        # Build a new list so the caller's list is not modified.
        text = [item.translate(strip_quotes) for item in text]
    else:
        text = text.translate(strip_quotes)

    if src != 'auto':
        src_key = src.lower()
        if src_key in LANGCODES:
            # Index with the same lowered key that was tested.
            src = LANGCODES[src_key]
        elif src_key not in LANGUAGES:
            raise ValueError('invalid source language')

    if dest != 'en':
        dest_key = dest.lower()
        if dest_key in LANGCODES:
            # The destination must be resolved from ``dest``, not ``src``.
            dest = LANGCODES[dest_key]
        elif dest_key not in LANGUAGES:
            raise ValueError('invalid destination language')

    data = self._translate(text, src=src, dest=dest)
    # Forward the requested languages so they are the fallback when the
    # response does not carry them.
    return self.extract_translation(data, text, src=src, dest=dest)
4875

49-
# this code will be updated when the format is changed.
50-
translated = ''.join([d[0] if d[0] else '' for d in data[0]])
51-
52-
extra_data = self._parse_extra_data(data)
53-
54-
# actual source language that will be recognized by Google Translator when the
55-
# src passed is equal to auto.
56-
try:
57-
src = data[2]
58-
except Exception: # pragma: nocover
59-
pass
60-
61-
pron = text
62-
try:
63-
pron = data[0][1][-2]
64-
except Exception: # pragma: nocover
65-
pass
66-
67-
if pron is None:
76+
77+
def extract_translation(self, _data, text, src='auto', dest='en'):
    """Build ``Translated`` objects from the parsed responses in ``_data``.

    Args:
        _data: Iterable of parsed responses, one per original text, as
            produced by ``_translate``.
        text: The original string, or list of strings, in the same order
            as ``_data``.
        src: Source language reported for an item whose response does not
            contain one.
        dest: Destination language reported for an item whose response
            does not contain one.

    Returns:
        A single ``Translated`` when exactly one result was built,
        otherwise a list of ``Translated`` (empty when ``_data`` is empty).
    """
    if not isinstance(text, list):
        text = [text]

    # Plain subscripting of the nested response can only fail in these ways.
    lookup_errors = (IndexError, KeyError, TypeError)

    result_list = []
    for data, origin in zip(_data, text):
        try:
            translated = data[0][2][1][0][0][5][0][0]
        except lookup_errors:
            translated = ""

        # Start every item from the caller-supplied defaults so a language
        # found for an earlier item never leaks into a later one.
        item_src = src
        item_dest = dest
        try:
            item_src = data[0][2][3][5][0][0][3]
        except lookup_errors:  # pragma: nocover
            pass
        try:
            item_dest = data[0][2][3][5][0][0][2]
        except lookup_errors:  # pragma: nocover
            pass

        # Pronunciation is best-effort: it stays None when the field is
        # missing or cannot be transliterated.
        pron = None
        try:
            pron = unidecode.unidecode(data[0][2][1][0][0][1])
        except Exception:  # pragma: nocover
            pass

        result_list.append(
            Translated(src=item_src, dest=item_dest, origin=origin,
                       text=translated, pronunciation=pron, extra_data={})
        )

    if len(result_list) == 1:
        return result_list[0]
    return result_list
80112

81113
def detect(self, text, **kwargs):
82114
"""Detect language of the input text
@@ -116,44 +148,21 @@ def detect(self, text, **kwargs):
116148
result.append(lang)
117149
return result
118150

119-
data = self._translate(text, 'en', 'auto', kwargs)
151+
data = self._translate(text, 'auto', 'en')
120152

121153
# actual source language that will be recognized by Google Translator when the
122154
# src passed is equal to auto.
123155
src = ''
124156
confidence = 0.0
125157
try:
126-
src = ''.join(data[8][0])
127-
confidence = data[8][-2][0]
158+
src = data[0][0][2][3][5][0][0][3]
159+
# confidence = data[8][-2][0]
128160
except Exception: # pragma: nocover
129161
pass
130162
result = Detected(lang=src, confidence=confidence)
131163

132164
return result
133-
134-
135-
def _parse_extra_data(self, data):
136-
response_parts_name_mapping = {
137-
0: 'translation',
138-
1: 'all-translations',
139-
2: 'original-language',
140-
5: 'possible-translations',
141-
6: 'confidence',
142-
7: 'possible-mistakes',
143-
8: 'language',
144-
11: 'synonyms',
145-
12: 'definitions',
146-
13: 'examples',
147-
14: 'see-also',
148-
}
149-
150-
extra = {}
151-
152-
for index, category in response_parts_name_mapping.items():
153-
extra[category] = data[index] if (index < len(data) and data[index]) else None
154-
155-
return extra
156-
165+
157166
def _translate(self, text, src, dest):
158167
""" Generate Token for each Translation and post requset to
159168
google web api translation and return an response
@@ -162,21 +171,105 @@ def _translate(self, text, src, dest):
162171
else other status code are consider as translation failure.
163172
164173
"""
165-
gtoken = gauthtoken.TokenAcquirer(proxies=self.proxies)
166-
token = gtoken.acquire(text)
167-
querystring = utils.format_querystring(token, text, src=src, dest=dest)
168-
response = requests.post(url=self.host + 't', params=querystring, proxies=self.proxies)
169-
if response.status_code == 200:
170-
translated_text = utils.format_json(response.content)
171-
return translated_text
172-
else:
173-
raise Exception('Unexpected status code {} from {}'.format(response.status_code, self.host))
174-
return False
174+
if type(text) != list:
175+
text = [text]
176+
translated_list = []
177+
url = self.transurl
178+
params = utils.format_param(self.rpcids)
179+
for _text in text:
180+
trans_list = []
181+
tokenized_text = utils.tokenize_sentence(_text)
182+
for _tokenized_text in tokenized_text:
183+
data = utils.format_data(self.rpcids, _tokenized_text, src, dest)
184+
response = requests.request("POST", url, data=data, params=params, proxies=self.proxies)
185+
if response.status_code == 200:
186+
_format_data = utils.format_response(str(response.text))
187+
trans_list.append(_format_data)
188+
elif response.status_code == 429:
189+
_format_data = self.retry_request(data, params)
190+
trans_list.append(_format_data)
191+
else:
192+
raise Exception('Unexpected status code {} from {}'.format(response.status_code, self.transurl))
193+
return False
194+
translated_list.append(utils.format_translation(trans_list))
195+
return translated_list
196+
197+
def retry_request(self, data, params):
    """Re-send a translation request that was rejected with HTTP 429.

    Bulk translation can be refused for making too many requests. This
    method waits, posts the same payload to ``self.transurl`` again, and
    repeats up to ``self.retry`` times. The wait grows linearly:
    ``self.sleep`` seconds before the first retry, twice that before the
    second, and so on.

    Args:
        data: Request body, passed unchanged to ``requests.request``.
        params: Query parameters, passed unchanged to ``requests.request``.

    Returns:
        ``utils.format_response`` applied to the text of the first
        response that comes back with status 200.

    Raises:
        Exception: On any status other than 200 or 429, or when every
            retry is still answered with 429.
    """
    # The caller only gets here after a 429, so that is the status to
    # report if no retry is attempted at all (``self.retry == 0``).
    status_code = 429
    for attempt in range(1, self.retry + 1):
        delay = self.sleep * attempt
        if self.retry_messgae:
            print('retrying translation after {}s'.format(delay))
        time.sleep(delay)

        # The request must be re-sent on every attempt; sleeping alone
        # never changes the outcome.
        response = requests.request("POST", url=self.transurl, data=data,
                                    params=params, proxies=self.proxies)
        status_code = response.status_code
        if status_code == 200:
            return utils.format_response(str(response.text))
        if status_code != 429:
            raise Exception('Unexpected status code {} from {}'.format(status_code, self.transurl))

    raise Exception('Unexpected status code {} from {} after retried {} loop with {}s delay'.format(status_code, self.transurl, self.retry, self.sleep))
219+
220+
def bulktranslate(self, file, src='auto', dest='en'):
    """Translate the text of a document (.doc, .docx, .pdf, .txt).

    Example:
        >>> from pygoogletranslation import Translator
        >>> translator = Translator()
        >>> translator.bulktranslate('test.txt', dest="ta")
        # <bulk translated text>

    Args:
        file: Path of the document. The extension, compared
            case-insensitively, selects how the text is read.
        src: ``'auto'``, a key of ``LANGUAGES``, or a key of ``LANGCODES``.
            Matching is case-insensitive. A ``LANGCODES`` key is replaced
            by the value it maps to.
        dest: ``'en'``, a key of ``LANGUAGES``, or a key of ``LANGCODES``,
            handled the same way as ``src``.

    Returns:
        Whatever ``extract_translation`` returns for the document text.

    Raises:
        ValueError: If ``src`` or ``dest`` is in neither lookup table.
        FileNotFoundError: If ``file`` does not exist or its extension is
            not one of the supported formats.
    """
    if src != 'auto':
        src_key = src.lower()
        if src_key in LANGCODES:
            src = LANGCODES[src_key]
        elif src_key not in LANGUAGES:
            raise ValueError('invalid source language')

    if dest != 'en':
        dest_key = dest.lower()
        if dest_key in LANGCODES:
            # The destination must be resolved from ``dest``, not ``src``.
            dest = LANGCODES[dest_key]
        elif dest_key not in LANGUAGES:
            raise ValueError('invalid destination language')

    if not os.path.exists(file):
        raise FileNotFoundError('file {} does not exists !'.format(file))

    # Read document file, pdf file, text file
    extension = os.path.splitext(file)[1].lower()
    if extension in ('.doc', '.docx'):
        text = docx2txt.process(file)
    elif extension == '.txt':
        with open(file, 'r') as text_file:
            text = text_file.read()
    elif extension == '.pdf':
        with open(file, 'rb') as pdf_file:
            reader = PyPDF2.PdfFileReader(pdf_file)
            # Visit every page by its own index, not page 0 repeatedly.
            text = ''.join(
                reader.getPage(page_number).extractText()
                for page_number in range(reader.numPages)
            )
    else:
        # Kept as FileNotFoundError, the exception type this branch was
        # written to raise.
        raise FileNotFoundError(
            'unsupported file format .{}.'.format(extension.lstrip('.'))
        )

    # Remove straight and curly quotes in one pass.
    text = text.translate(str.maketrans('', '', '"\'\u201c\u201d'))
    data = self._translate(text, src=src, dest=dest)
    # Forward the requested languages so they are the fallback when the
    # response does not carry them.
    return self.extract_translation(data, text, src=src, dest=dest)
270+
177271
def glanguage(self):
178272
""" Get request from google and return language and their lang codes.
179-
180273
Example:
181274
>>> translate = Translator()
182275
>>> translate.glanguage()
@@ -195,14 +288,11 @@ def glanguage(self):
195288
"al": {}
196289
}
197290
"""
198-
199291
querystring = utils.format_querystringlang()
200292
response = requests.get(url=self.host + 'l', params=querystring, proxies=self.proxies)
201293
if response.status_code == 200:
202294
glang = json.loads(response.content)
203295
return glang
204296
else:
205297
raise Exception('Unexpected status code {} from {}'.format(response.status_code, self.host))
206-
return False
207-
208-
298+
return False

pygoogletranslation/urls.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,3 +5,4 @@
55
BASE = 'https://translate.google.com'
66
TOKEN = 'https://translate.google.com/translate_a/element.js'
77
TRANSLATE = 'https://translate.googleapis.com/translate_a/'
8+
TRANSLATEURL = 'https://translate.google.com/_/TranslateWebserverUi/data/batchexecute'

0 commit comments

Comments
 (0)