1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21 """Manage the Wordfast Translation Memory format
22
23 Wordfast TM format is the Translation Memory format used by the
24 U{Wordfast<http://www.wordfast.net/>} computer aided translation tool.
25
26 It is a bilingual base class derived format with L{WordfastTMFile}
27 and L{WordfastUnit} providing file and unit level access.
28
29 Wordfast tools
30 ==============
31 Wordfast is a computer aided translation tool. It is an application
32 built on top of Microsoft Word and is implemented as a rather
33 sophisticated set of macros. Understanding that helps us understand
34 many of the seemingly strange choices around this format including:
35 encoding, escaping and file naming.
36
37 Implementation
38 ==============
39 The implementation covers the full requirements of a Wordfast TM file.
40 The files are simple Tab Separated Value (TSV) files that can be read
41 by Microsoft Excel and other spreadsheet programs. They use the .txt
42 extension which does make it more difficult to automatically identify
43 such files.
44
45 The dialect of the TSV files is specified by L{WordfastDialect}.
46
47 Encoding
48 --------
49 The files are UTF-16 or ISO-8859-1 (Latin1) encoded. These choices
50 are most likely because Microsoft Word is the base editing tool for
51 Wordfast.
52
53 The format is tab separated so we are able to detect UTF-16 vs Latin-1
54 by searching for the occurrence of a UTF-16 tab character and then
55 continuing with the parsing.
56
57 Timestamps
58 ----------
59 L{WordfastTime} allows for the correct management of the Wordfast
60 YYYYMMDD~HHMMSS timestamps. However, timestamps on individual units are
61 not updated when edited.
62
63 Header
64 ------
65 L{WordfastHeader} provides header management support. The header
66 functionality is fully implemented through observing the behaviour of the
67 files in real use cases, input from the Wordfast programmers and
68 public documentation.
69
70 Escaping
71 --------
72 Wordfast TM implements a form of escaping that covers two aspects:
73 1. Placeable: bold, formatting, etc. These are left as is and ignored.
74 It is up to the editor and future placeable implementation to manage
75 these.
76 2. Escapes: items that may confuse Excel or translators are
77 escaped as &'XX;. These are fully implemented and are converted to
78 and from Unicode. By observing behaviour and reading documentation
79 we were able to observe all possible escapes. Unfortunately the
80 escaping differs slightly between Windows and Mac versions. This
81 might cause errors in future.
82 Functions allow for L{conversion to Unicode<_wf_to_char>} and L{back to
83 Wordfast escapes<_char_to_wf>}.
84
85 Extended Attributes
86 -------------------
87 The last 4 columns allow users to define and manage extended attributes.
88 These are left as is and are not directly managed by our implementation.
89 """
90
91 import csv
92 import sys
93 import time
94
95 from translate.storage import base
96
# strftime/strptime pattern for Wordfast timestamps: date digits, a literal
# "~", then time digits.
97 WF_TIMEFORMAT = "%Y%m%d~%H%M%S"
98 """Time format used by Wordfast"""
99
# Column names of the first (header) row of a TM file, in file order.
# Used as ``fieldnames`` when that row is read and when it is written.
100 WF_FIELDNAMES_HEADER = ["date", "userlist", "tucount", "src-lang", "version",
101 "target-lang", "license", "attr1list", "attr2list",
102 "attr3list", "attr4list", "attr5list"]
103 """Field names for the Wordfast header"""
104
# Column names of every translation-unit row, in file order.
105 WF_FIELDNAMES = ["date", "user", "reuse", "src-lang", "source", "target-lang",
106 "target", "attr1", "attr2", "attr3", "attr4"]
107 """Field names for a Wordfast TU"""
108
# Every non-empty default value below begins with "%".
# NOTE(review): "attr5list" is listed in WF_FIELDNAMES_HEADER but has no
# entry here -- confirm whether the omission is intentional.
109 WF_FIELDNAMES_HEADER_DEFAULTS = {
110 "date": "%19000101~121212",
111 "userlist": "%User ID,TT,TT Translate-Toolkit",
112 "tucount": "%TU=00000001",
113 "src-lang": "%EN-US",
114 "version": "%Wordfast TM v.5.51w9/00",
115 "target-lang": "",
116 "license": "%---00000001",
117 "attr1list": "",
118 "attr2list": "",
119 "attr3list": "",
120 "attr4list": ""}
121 """Default or minimum header entries for a Wordfast file"""
122
123
124
125
126
# Ordered (Wordfast escape, Unicode character) pairs. The escaping code walks
# this tuple front to back, so order is significant:
#  - "&" comes first; every escape code itself contains "&", so it has to be
#    replaced before any code is inserted into the text.
#  - Several characters appear twice with different codes (e.g. U+2026 as
#    both &'85; and &'C9;). When escaping, only the first code listed for a
#    character can ever be emitted; the later entry only matters for decoding.
# NOTE(review): the module docstring says escapes differ between the Windows
# and Mac versions; the last group below is presumably the Mac set -- confirm.
127 WF_ESCAPE_MAP = (
128 ("&'26;", u"\u0026"),
129
130 ("&'82;", u"\u201A"),
131 ("&'85;", u"\u2026"),
132 ("&'91;", u"\u2018"),
133 ("&'92;", u"\u2019"),
134 ("&'93;", u"\u201C"),
135 ("&'94;", u"\u201D"),
136 ("&'96;", u"\u2013"),
137 ("&'97;", u"\u2014"),
138 ("&'99;", u"\u2122"),
139
140 ("&'A0;", u"\u00A0"),
141 ("&'A9;", u"\u00A9"),
142 ("&'AE;", u"\u00AE"),
143 ("&'BC;", u"\u00BC"),
144 ("&'BD;", u"\u00BD"),
145 ("&'BE;", u"\u00BE"),
146
147 ("&'A8;", u"\u00AE"),
148 ("&'AA;", u"\u2122"),
149 ("&'C7;", u"\u00AB"),
150 ("&'C8;", u"\u00BB"),
151 ("&'C9;", u"\u2026"),
152 ("&'CA;", u"\u00A0"),
153 ("&'D0;", u"\u2013"),
154 ("&'D1;", u"\u2014"),
155 ("&'D2;", u"\u201C"),
156 ("&'D3;", u"\u201D"),
157 ("&'D4;", u"\u2018"),
158 ("&'D5;", u"\u2019"),
159 ("&'E2;", u"\u201A"),
160 ("&'E3;", u"\u201E"),
161
162
163
164
165 )
166 """Mapping of Wordfast &'XX; escapes to correct Unicode characters"""
167
# Two-byte marker (NUL followed by TAB) searched for in the first line of the
# raw input to decide between UTF-16 and ISO-8859-1.
168 TAB_UTF16 = "\x00\x09"
169 """The tab \\t character as it would appear in UTF-16 encoding"""
170
171
# Body of the character -> Wordfast escape helper; presumably the function
# called as ``_char_to_wf(...)`` by the unit field setter.
# NOTE(review): the ``def`` line and indentation are not present in this view.
173 """Char -> Wordfast &'XX; escapes
174
175 Full roundtripping is not possible because of the escaping of
176 NEWLINE \\n and TAB \\t"""
177
178
# Only non-empty input is processed.
179 if string:
# Characters are matched in their UTF-8 encoded form, so ``string`` is expected
# to be UTF-8 bytes; the field setter encodes unicode values before calling.
180 for code, char in WF_ESCAPE_MAP:
181 string = string.replace(char.encode('utf-8'), code)
# Real newline and tab characters become the two-character sequences \n and \t.
182 string = string.replace("\n", "\\n").replace("\t", "\\t")
183 return string
184
185
193
194
# Register the dialect under the name "wordfast"; the csv readers and writers
# in this file select it with dialect="wordfast".
# NOTE(review): the WordfastDialect definition is not present in this view.
209 csv.register_dialect("wordfast", WordfastDialect)
210
211
# Remnants of the timestamp wrapper (presumably WordfastTime, per the module
# docstring). NOTE(review): the class and method ``def`` lines and all
# indentation are not present in this view.
213 """Manages time stamps in the Wordfast format of YYYYMMDD~hhmmss"""
214
# Constructor body: start with no time, then dispatch on the argument type.
# A falsy value clears the time, a string goes through the ``timestring``
# setter, a struct_time through the ``time`` setter. No branch handles any
# other truthy type, so ``_time`` stays None in that case.
# NOTE: ``basestring`` exists only on Python 2.
216 self._time = None
217 if not newtime:
218 self.time = None
219 elif isinstance(newtime, basestring):
220 self.timestring = newtime
221 elif isinstance(newtime, time.struct_time):
222 self.time = newtime
223
225 """Get the time in the Wordfast time format"""
# Returns None, not an empty string, when no time is set.
226 if not self._time:
227 return None
228 else:
229 return time.strftime(WF_TIMEFORMAT, self._time)
230
232 """Set the time.struct_time object using a Wordfast time formatted string
233
234 @param timestring: A Wordfast time string (YYYYMMDD~hhmmss)
235 @type timestring: String
236 """
# time.strptime raises ValueError when the string does not match the format.
237 self._time = time.strptime(timestring, WF_TIMEFORMAT)
238 timestring = property(get_timestring, set_timestring)
239
241 """Get the time.struct_time object"""
242 return self._time
243
245 """Set the time.struct_time object
246
247 @param newtime: a new time object
248 @type newtime: time.struct_time
249 """
# Anything that is not a (truthy) struct_time resets the stored time to None.
250 if newtime and isinstance(newtime, time.struct_time):
251 self._time = newtime
252 else:
253 self._time = None
# Assuming this is a class-body assignment, it rebinds ``time`` only in the
# class namespace; the method bodies above still resolve ``time`` to the
# imported module.
254 time = property(get_time, set_time)
255
261
262
# Remnants of the header wrapper around a dict keyed by the header column
# names. NOTE(review): the class and method ``def`` lines, indentation and the
# lines between the docstring and the first getter are not present in this view.
264 """A wordfast translation memory header"""
265
272
279
281 """Get the header dictionary"""
282 return self._header_dict
283
# Setter: stores the given dictionary as-is (no copy or validation visible).
285 self._header_dict = newheader
286 header = property(getheader, setheader)
287
# Stores the language prefixed with a literal "%" ('%%' renders as one '%').
# The property has no getter, so it is write-only.
289 self._header_dict['target-lang'] = '%%%s' % newlang
290 targetlang = property(None, settargetlang)
291
# Stores the unit count as "%TU=" followed by the count zero-padded to eight
# digits. Also write-only.
293 self._header_dict['tucount'] = '%%TU=%08d' % count
294 tucount = property(None, settucount)
295
296
# Remnants of the translation-unit class: a wrapper around one row dict
# (``self._dict``). NOTE(review): the class and method ``def`` lines,
# indentation and several whole method bodies are not present in this view.
298 """A Wordfast translation memory unit"""
299
305
309
311 """Get the dictionary of values for a Wordfast line"""
312 return self._dict
313
315 """Set the dictionary of values for a Wordfast line
316
317 @param newdict: a new dictionary with Wordfast line elements
318 @type newdict: Dict
319 """
320
321 self._dict = newdict
322 dict = property(getdict, setdict)
323
# Field getter: None when the key is missing or holds None; otherwise the
# stored value is unescaped with _wf_to_char and decoded from UTF-8. A stored
# empty value is returned as "".
325 if self._dict.get(key, None) is None:
326 return None
327 elif self._dict[key]:
328 return _wf_to_char(self._dict[key]).decode('utf-8')
329 else:
330 return ""
331
# Field setter: unicode input is encoded to UTF-8, escaped with _char_to_wf,
# and stored only when the key is new or the value differs.
# NOTE(review): no return is visible after the None branch, so execution
# presumably continues into the lines below -- confirm.
# NOTE(review): the module docstring says unit timestamps are not updated on
# edit, yet _update_timestamp() is called here -- confirm which is accurate.
333 if newvalue is None:
334 self._dict[key] = None
335 if isinstance(newvalue, unicode):
336 newvalue = newvalue.encode('utf-8')
337 newvalue = _char_to_wf(newvalue)
338 if not key in self._dict or newvalue != self._dict[key]:
339 self._dict[key] = newvalue
340 self._update_timestamp()
341
344
348 source = property(getsource, setsource)
349
352
356 target = property(gettarget, settarget)
357
# Unlike the header's target-lang value, the language is stored here without
# a "%" prefix. The property has no getter, so it is write-only.
359 self._dict['target-lang'] = newlang
360 targetlang = property(None, settargetlang)
361
363 return str(self._dict)
364
# A unit counts as translated only when both 'source' and 'target' hold
# non-empty values.
366 if not self._dict.get('source', None):
367 return False
368 return bool(self._dict.get('target', None))
369
370
# Class attributes and constructor body of the TM file store.
# NOTE(review): the ``class`` and ``def`` lines are not present in this view;
# ``inputfile`` and ``unitclass`` presumably come from the missing signature.
372 """A Wordfast translation memory file"""
# NOTE(review): ``_`` is not imported in the visible code; presumably a
# gettext-style function installed elsewhere -- confirm.
373 Name = _("Wordfast Translation Memory")
374 Mimetypes = ["application/x-wordfast"]
375 Extensions = ["txt"]
376
378 """construct a Wordfast TM, optionally reading in from inputfile."""
379 self.UnitClass = unitclass
380 base.TranslationStore.__init__(self, unitclass=unitclass)
381 self.filename = ''
382 self.header = WordfastHeader()
# Default encoding until a parsed file says otherwise.
383 self._encoding = 'iso-8859-1'
384 if inputfile is not None:
385 self.parse(inputfile)
386
# Body of the parser (invoked as ``self.parse(inputfile)`` by the constructor).
# NOTE(review): the ``def`` line and indentation are not present in this view.
# ``input`` shadows the builtin of the same name.
388 """parses the given file or file source string"""
# Remember the file name when the input has one; otherwise make sure the
# attribute exists.
389 if hasattr(input, 'name'):
390 self.filename = input.name
391 elif not getattr(self, 'filename', ''):
392 self.filename = ''
# File-like input is read completely and closed; from here on ``input`` is
# the raw content.
393 if hasattr(input, "read"):
394 tmsrc = input.read()
395 input.close()
396 input = tmsrc
# Encoding detection: a UTF-16 tab marker in the first line means UTF-16,
# anything else is treated as ISO-8859-1.
397 if TAB_UTF16 in input.split("\n")[0]:
398 self._encoding = 'utf-16'
399 else:
400 self._encoding = 'iso-8859-1'
# The content is re-encoded to UTF-8 before being handed to the csv reader.
# NOTE(review): the bare ``except`` also traps KeyboardInterrupt/SystemExit
# and hides the original error; catching UnicodeError would be narrower.
401 try:
402 input = input.decode(self._encoding).encode('utf-8')
403 except:
404 raise ValueError("Wordfast files are either UTF-16 (UCS2) or ISO-8859-1 encoded")
# The first line is the header row, read with the header column names.
405 for header in csv.DictReader(input.split("\n")[:1],
406 fieldnames=WF_FIELDNAMES_HEADER,
407 dialect="wordfast"):
408 self.header = WordfastHeader(header)
# Every remaining line is one translation unit; the row dict is assigned
# directly, so values stay in their escaped form.
# NOTE(review): units are always created as WordfastUnit, not via
# self.UnitClass -- confirm this is intended.
409 lines = csv.DictReader(input.split("\n")[1:],
410 fieldnames=WF_FIELDNAMES,
411 dialect="wordfast")
412 for line in lines:
413 newunit = WordfastUnit()
414 newunit.dict = line
415 self.addunit(newunit)
416
# Body of the serialiser: writes the header row followed by all translated
# units and returns the encoded text.
# NOTE(review): the ``def`` line and indentation are not present in this view;
# the block name used for this edit is a guess.
# NOTE(review): StringIO is reached through the csv module rather than being
# imported directly; this presumably relies on Python 2 csv internals -- confirm.
418 output = csv.StringIO()
419 header_output = csv.StringIO()
420 writer = csv.DictWriter(output, fieldnames=WF_FIELDNAMES,
421 dialect="wordfast")
# Only units with both a source and a target are written and counted.
422 unit_count = 0
423 for unit in self.units:
424 if unit.istranslated():
425 unit_count += 1
426 writer.writerow(unit.dict)
# With nothing to write, the result is an empty string (no header row either).
427 if unit_count == 0:
428 return ""
# reset() presumably rewinds the buffer so readlines() starts at the
# beginning -- confirm against the StringIO implementation in use.
429 output.reset()
# The header is written after the units so its count matches what was written.
430 self.header.tucount = unit_count
431 outheader = csv.DictWriter(header_output,
432 fieldnames=WF_FIELDNAMES_HEADER,
433 dialect="wordfast")
434 outheader.writerow(self.header.header)
435 header_output.reset()
# Header first, then units; the buffers hold UTF-8, so decode before
# re-encoding for output.
436 decoded = "".join(header_output.readlines() + output.readlines()).decode('utf-8')
# Prefer the encoding detected at parse time (or the constructor default);
# fall back to UTF-16 when the text cannot be represented in it.
437 try:
438 return decoded.encode(self._encoding)
439 except UnicodeEncodeError:
440 return decoded.encode('utf-16')
441