4
4
5
5
You can translate text using this module.
6
6
"""
7
+ import os
7
8
import json
8
9
import requests
10
+ import unidecode
11
+ import docx2txt
12
+ import PyPDF2
13
+ import time
9
14
from pygoogletranslation import utils , urls
10
15
from pygoogletranslation .constants import (
11
- LANGCODES , LANGUAGES
16
+ LANGCODES , LANGUAGES , RPCIDS
12
17
)
13
18
from pygoogletranslation import gauthtoken
14
19
from pygoogletranslation .models import Translated , Detected
15
20
16
- EXCLUDES = ('en' , 'ca' , 'fr' )
17
21
18
22
class Translator :
19
23
20
- def __init__ (self , host = urls .TRANSLATE , proxies = None , timeout = None ):
24
+ def __init__ (self , host = urls .TRANSLATE , proxies = None , timeout = None ,
25
+ retry = 3 , sleep = 5 , retry_messgae = False ):
21
26
self .host = host if 'http' in host else 'https://' + host
22
-
27
+ self .rpcids = RPCIDS
28
+ self .transurl = urls .TRANSLATEURL
23
29
if proxies is not None :
24
30
self .proxies = proxies
25
31
else :
@@ -28,55 +34,81 @@ def __init__(self, host=urls.TRANSLATE, proxies=None, timeout=None):
28
34
if timeout is not None :
29
35
self .timeout = timeout
30
36
31
- def translate (self , text , src = 'auto' , dest = 'en' ):
37
+ self .retry = retry
38
+ self .retry_messgae = retry_messgae
39
+ self .sleep = sleep
32
40
41
def translate(self, text, src='auto', dest='en'):
    """Translate a string, or every string in a list, from ``src`` to ``dest``.

    Straight single quotes and straight/curly double quotes are removed
    from the input before it is sent for translation.

    text: a string or a list of strings. A list passed in is left
        unmodified; a cleaned copy is translated instead.
    src: 'auto' (passed through unchanged), or a key of LANGCODES or
        LANGUAGES, matched case-insensitively.
    dest: 'en' (passed through unchanged), or a key of LANGCODES or
        LANGUAGES, matched case-insensitively.

    Returns whatever ``extract_translation`` builds from the response: a
    single result when exactly one text was translated, otherwise a list.

    Raises ValueError when ``src`` or ``dest`` is not a known language.
    """
    def resolve(lang, passthrough, label):
        # The default value has never been validated; keep it that way.
        if lang == passthrough:
            return lang
        key = lang.lower()
        if key in LANGCODES:
            # Index with the lowered key: the membership test was done on
            # it, so the raw value is not guaranteed to be a key.
            return LANGCODES[key]
        if key in LANGUAGES:
            return lang
        raise ValueError('invalid {} language'.format(label))

    # One pass that deletes ", ', and the curly double quotes.
    quotes = str.maketrans('', '', '"\'\u201c\u201d')
    if isinstance(text, list):
        # Build a new list so the caller's list is not rewritten.
        text = [item.translate(quotes) for item in text]
    else:
        text = text.translate(quotes)

    src = resolve(src, 'auto', 'source')
    # The destination is resolved from ``dest`` itself, not from ``src``.
    dest = resolve(dest, 'en', 'destination')

    data = self._translate(text, src=src, dest=dest)
    # Forward the resolved languages so they serve as fallbacks when the
    # response does not report them.
    return self.extract_translation(data, text, src=src, dest=dest)
48
75
49
- # this code will be updated when the format is changed.
50
- translated = '' .join ([d [0 ] if d [0 ] else '' for d in data [0 ]])
51
-
52
- extra_data = self ._parse_extra_data (data )
53
-
54
- # actual source language that will be recognized by Google Translator when the
55
- # src passed is equal to auto.
56
- try :
57
- src = data [2 ]
58
- except Exception : # pragma: nocover
59
- pass
60
-
61
- pron = text
62
- try :
63
- pron = data [0 ][1 ][- 2 ]
64
- except Exception : # pragma: nocover
65
- pass
66
-
67
- if pron is None :
76
+
77
def extract_translation(self, _data, text, src='auto', dest='en'):
    """Build ``Translated`` results from the parsed responses in ``_data``.

    _data: iterable holding one parsed response per input text, in the
        same order as ``text``.
    text: the original string, or the list of original strings.
    src, dest: fallback language values, used for any item whose response
        does not contain them at the expected position.

    Returns a single ``Translated`` when exactly one result was built,
    otherwise the list of results (an empty list when there is no data).
    Items are paired with ``zip``, so surplus entries on either side are
    ignored.
    """
    def dig(node, path, default):
        # Walk a chain of subscripts; any missing level yields ``default``.
        try:
            for key in path:
                node = node[key]
        except (IndexError, KeyError, TypeError):
            return default
        return node

    if not isinstance(text, list):
        text = [text]

    results = []
    for origin, data in zip(text, _data):
        translated = dig(data, (0, 2, 1, 0, 0, 5, 0, 0), "")
        # Per-item values: a language found for one item must not become
        # the fallback for the items that follow it.
        item_src = dig(data, (0, 2, 3, 5, 0, 0, 3), src)
        item_dest = dig(data, (0, 2, 3, 5, 0, 0, 2), dest)

        pron = None
        raw_pron = dig(data, (0, 2, 1, 0, 0, 1), None)
        if raw_pron is not None:
            try:
                pron = unidecode.unidecode(raw_pron)
            except Exception:  # pragma: nocover
                # Best effort: an unconvertible value means no pronunciation.
                pron = None

        # put final values into a new Translated object
        results.append(Translated(src=item_src, dest=item_dest, origin=origin,
                                  text=translated, pronunciation=pron,
                                  extra_data={}))

    if len(results) == 1:
        return results[0]
    return results
80
112
81
113
def detect (self , text , ** kwargs ):
82
114
"""Detect language of the input text
@@ -116,44 +148,21 @@ def detect(self, text, **kwargs):
116
148
result .append (lang )
117
149
return result
118
150
119
- data = self ._translate (text , 'en ' , 'auto' , kwargs )
151
+ data = self ._translate (text , 'auto ' , 'en' )
120
152
121
153
# actual source language that will be recognized by Google Translator when the
122
154
# src passed is equal to auto.
123
155
src = ''
124
156
confidence = 0.0
125
157
try :
126
- src = '' . join ( data [8 ][0 ])
127
- confidence = data [8 ][- 2 ][0 ]
158
+ src = data [0 ][0 ][ 2 ][ 3 ][ 5 ][ 0 ][ 0 ][ 3 ]
159
+ # confidence = data[8][-2][0]
128
160
except Exception : # pragma: nocover
129
161
pass
130
162
result = Detected (lang = src , confidence = confidence )
131
163
132
164
return result
133
-
134
-
135
- def _parse_extra_data (self , data ):
136
- response_parts_name_mapping = {
137
- 0 : 'translation' ,
138
- 1 : 'all-translations' ,
139
- 2 : 'original-language' ,
140
- 5 : 'possible-translations' ,
141
- 6 : 'confidence' ,
142
- 7 : 'possible-mistakes' ,
143
- 8 : 'language' ,
144
- 11 : 'synonyms' ,
145
- 12 : 'definitions' ,
146
- 13 : 'examples' ,
147
- 14 : 'see-also' ,
148
- }
149
-
150
- extra = {}
151
-
152
- for index , category in response_parts_name_mapping .items ():
153
- extra [category ] = data [index ] if (index < len (data ) and data [index ]) else None
154
-
155
- return extra
156
-
165
+
157
166
def _translate(self, text, src, dest):
    """POST ``text`` to the translation endpoint and return the parsed replies.

    text: a string or a list of strings. Each string is split with
        ``utils.tokenize_sentence`` and every piece is sent as its own
        request.
    src, dest: handed to ``utils.format_data`` unchanged.

    Returns a list with one entry per input string, each produced by
    ``utils.format_translation`` from that string's parsed responses.

    A 200 response is parsed directly. A 429 response is handed to
    ``self.retry_request``. Any other status code raises ``Exception``.
    """
    texts = text if isinstance(text, list) else [text]
    params = utils.format_param(self.rpcids)
    translated_list = []
    for _text in texts:
        parts = []
        for piece in utils.tokenize_sentence(_text):
            data = utils.format_data(self.rpcids, piece, src, dest)
            response = requests.post(self.transurl, data=data, params=params,
                                     proxies=self.proxies)
            status = response.status_code
            if status == 200:
                parts.append(utils.format_response(response.text))
            elif status == 429:
                # Rate limited: let retry_request wait and try again.
                parts.append(self.retry_request(data, params))
            else:
                raise Exception('Unexpected status code {} from {}'.format(status, self.transurl))
        translated_list.append(utils.format_translation(parts))
    return translated_list
196
+
197
def retry_request(self, data, params):
    """Re-send a translation request that was rejected with HTTP 429.

    Bulk translation can be rejected because of too many attempts. This
    method waits, sends the request again, and repeats up to ``self.retry``
    times. The wait grows linearly: ``self.sleep`` seconds before the
    first retry, twice that before the second, and so on.

    data, params: the request body and query parameters of the rejected
        request, re-sent unchanged.

    Returns the response parsed by ``utils.format_response`` as soon as a
    retry answers with status 200.

    Raises Exception when a retry answers with a status other than 200 or
    429, or when every retry was answered with 429.
    """
    # This method is invoked for requests that were just answered with 429;
    # that is also the status reported if no retry is configured.
    status = 429
    for attempt in range(1, self.retry + 1):
        delay = self.sleep * attempt
        if self.retry_messgae:
            print('retrying translation after {}s'.format(delay))
        time.sleep(delay)
        # Send the request again on every pass; checking one stale response
        # repeatedly could never succeed.
        response = requests.request("POST", url=self.transurl, data=data,
                                    params=params, proxies=self.proxies)
        status = response.status_code
        if status == 200:
            return utils.format_response(response.text)
        if status != 429:
            raise Exception('Unexpected status code {} from {}'.format(status, self.transurl))
    raise Exception('Unexpected status code {} from {} after retried {} loop with {}s delay'.format(status, self.transurl, self.retry, self.sleep))
219
+
220
def bulktranslate(self, file, src='auto', dest='en'):
    """Translation from document (.doc, .docx, .pdf, .txt):
    ---------------------------------------------
    >>> from pygoogletranslation import Translator
    >>> translator = Translator()
    >>> translator.bulktranslate('test.txt', dest="ta")
    # <bulk translated text>

    file: path of the document to read. The extension is matched
        case-insensitively.
    src: 'auto' (passed through unchanged), or a key of LANGCODES or
        LANGUAGES, matched case-insensitively.
    dest: 'en' (passed through unchanged), or a key of LANGCODES or
        LANGUAGES, matched case-insensitively.

    Returns the result built by ``extract_translation`` for the whole
    document text.

    Raises ValueError for an unknown language, and FileNotFoundError when
    the file does not exist or has an unsupported extension.
    """
    def resolve(lang, passthrough, label):
        # The default value has never been validated; keep it that way.
        if lang == passthrough:
            return lang
        key = lang.lower()
        if key in LANGCODES:
            return LANGCODES[key]
        if key in LANGUAGES:
            return lang
        raise ValueError('invalid {} language'.format(label))

    src = resolve(src, 'auto', 'source')
    # The destination is resolved from ``dest`` itself, not from ``src``.
    dest = resolve(dest, 'en', 'destination')

    if not os.path.exists(file):
        raise FileNotFoundError('file {} does not exists !'.format(file))

    # Read document file, pdf file, text file
    name = file.lower()
    if name.endswith(('.doc', '.docx')):
        text = docx2txt.process(file)
    elif name.endswith('.txt'):
        with open(file, 'r') as handle:
            text = handle.read()
    elif name.endswith('.pdf'):
        with open(file, 'rb') as handle:
            reader = PyPDF2.PdfFileReader(handle)
            # Read every page in turn, not the first page repeatedly.
            text = ''.join(reader.getPage(index).extractText()
                           for index in range(reader.numPages))
    else:
        # Report the text after the last dot as the offending format.
        raise FileNotFoundError('unsupported file format .{}.'.format(file.rsplit('.', 1)[-1]))

    # One pass that deletes ", ', and the curly double quotes.
    text = text.translate(str.maketrans('', '', '"\'\u201c\u201d'))
    data = self._translate(text, src=src, dest=dest)
    # Forward the resolved languages so they serve as fallbacks when the
    # response does not report them.
    return self.extract_translation(data, text, src=src, dest=dest)
270
+
177
271
def glanguage (self ):
178
272
""" Get request from google and return language and their lang codes.
179
-
180
273
Example:
181
274
>>> translate = Translator()
182
275
>>> translate.glanguage()
@@ -195,14 +288,11 @@ def glanguage(self):
195
288
"al": {}
196
289
}
197
290
"""
198
-
199
291
querystring = utils .format_querystringlang ()
200
292
response = requests .get (url = self .host + 'l' , params = querystring , proxies = self .proxies )
201
293
if response .status_code == 200 :
202
294
glang = json .loads (response .content )
203
295
return glang
204
296
else :
205
297
raise Exception ('Unexpected status code {} from {}' .format (response .status_code , self .host ))
206
- return False
207
-
208
-
298
+ return False
0 commit comments