Package translate :: Package storage :: Module csvl10n
[hide private]
[frames] | [no frames]

Source Code for Module translate.storage.csvl10n

  1  #!/usr/bin/env python 
  2  # -*- coding: utf-8 -*- 
  3  # 
  4  # Copyright 2002-2006 Zuza Software Foundation 
  5  # 
  6  # This file is part of translate. 
  7  # 
  8  # translate is free software; you can redistribute it and/or modify 
  9  # it under the terms of the GNU General Public License as published by 
 10  # the Free Software Foundation; either version 2 of the License, or 
 11  # (at your option) any later version. 
 12  # 
 13  # translate is distributed in the hope that it will be useful, 
 14  # but WITHOUT ANY WARRANTY; without even the implied warranty of 
 15  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 16  # GNU General Public License for more details. 
 17  # 
 18  # You should have received a copy of the GNU General Public License 
 19  # along with translate; if not, write to the Free Software 
 20  # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA 
 21   
 22  """classes that hold units of comma-separated values (.csv) files (csvunit) 
 23  or entire files (csvfile) for use with localisation 
 24  """ 
 25   
 26  import csv 
 27  import codecs 
 28  try: 
 29      import cStringIO as StringIO 
 30  except: 
 31      import StringIO 
 32   
 33  from translate.misc import sparse 
 34  from translate.storage import base 
 35   
 36   
class SimpleDictReader:
    """Light-weight CSV row reader built on the ``sparse`` tokenizer.

    An alternative to csv.DictReader: iterating yields one dict per row,
    mapping each name in *fieldnames* to the corresponding field value.
    Missing trailing fields map to "".
    """

    def __init__(self, fileobj, fieldnames):
        """Read all of *fileobj* and tokenize it for row-by-row iteration."""
        self.fieldnames = fieldnames
        self.contents = fileobj.read()
        # "," and "\n" are structural tokens; "\r" is ignored whitespace
        self.parser = sparse.SimpleParser(defaulttokenlist=[",", "\n"], whitespacechars="\r")
        self.parser.stringescaping = 0
        self.parser.quotechars = '"'
        self.tokens = self.parser.tokenize(self.contents)
        self.tokenpos = 0

    def __iter__(self):
        return self

    def getvalue(self, value):
        """returns a value, evaluating strings as necessary"""
        if (value.startswith("'") and value.endswith("'")) or (value.startswith('"') and value.endswith('"')):
            return sparse.stringeval(value)
        else:
            return value

    def next(self):
        """Return the next row as a dict keyed by self.fieldnames."""
        lentokens = len(self.tokens)
        # skip blank lines before the row
        while self.tokenpos < lentokens and self.tokens[self.tokenpos] == "\n":
            self.tokenpos += 1
        if self.tokenpos >= lentokens:
            raise StopIteration()
        # collect the tokens that make up this row
        thistokens = []
        while self.tokenpos < lentokens and self.tokens[self.tokenpos] != "\n":
            thistokens.append(self.tokens[self.tokenpos])
            self.tokenpos += 1
        # skip the newline(s) terminating the row
        while self.tokenpos < lentokens and self.tokens[self.tokenpos] == "\n":
            self.tokenpos += 1
        fields = []
        # patch together fields since we can have quotes inside a field
        currentfield = ''
        fieldparts = 0
        for token in thistokens:
            if token == ',':
                # a field is only quoted if the whole thing is quoted
                if fieldparts == 1:
                    currentfield = self.getvalue(currentfield)
                fields.append(currentfield)
                currentfield = ''
                fieldparts = 0
            else:
                currentfield += token
                fieldparts += 1
        # things after the last comma...
        if fieldparts:
            if fieldparts == 1:
                currentfield = self.getvalue(currentfield)
            fields.append(currentfield)
        values = {}
        for fieldnum in range(len(self.fieldnames)):
            if fieldnum >= len(fields):
                values[self.fieldnames[fieldnum]] = ""
            else:
                values[self.fieldnames[fieldnum]] = fields[fieldnum]
        return values

    # Python 3 iterator protocol alias (harmless on Python 2)
    __next__ = next
97
class DefaultDialect(csv.excel):
    """Our default CSV dialect: Excel-compatible, but with backslash
    escaping, non-numeric quoting, and whitespace after the delimiter
    skipped."""
    escapechar = '\\'
    quoting = csv.QUOTE_NONNUMERIC
    skipinitialspace = True


csv.register_dialect('default', DefaultDialect)
def from_unicode(text, encoding='utf-8'):
    """Encode *text* to a byte string if it is unicode; bytes pass through.

    An *encoding* of 'auto' is treated as 'utf-8'.
    """
    if encoding == 'auto':
        encoding = 'utf-8'
    if not isinstance(text, unicode):
        return text
    return text.encode(encoding)
111
def to_unicode(text, encoding='utf-8'):
    """Decode *text* to unicode if it is a byte string; unicode passes through.

    An *encoding* of 'auto' is treated as 'utf-8'.
    """
    if encoding == 'auto':
        encoding = 'utf-8'
    if not isinstance(text, unicode):
        return text.decode(encoding)
    return text
118
class csvunit(base.TranslationUnit):
    """One row of a localisation CSV file: location, source, target,
    id, fuzzy flag, context and comment columns."""

    # (unescaped, escaped) pairs for leading characters that spreadsheets
    # would otherwise interpret as formulas
    spreadsheetescapes = [("+", "\\+"), ("-", "\\-"), ("=", "\\="), ("'", "\\'")]

    def __init__(self, source=None):
        super(csvunit, self).__init__(source)
        self.location = ""
        self.source = source or ""
        self.target = ""
        self.id = ""
        # fuzzy is kept as a string so it round-trips through the CSV file
        self.fuzzy = 'False'
        self.developer_comments = ""
        self.translator_comments = ""
        self.context = ""

    def getid(self):
        """Return the explicit id, or context+source joined with the EOT
        separator (\\x04) as gettext does, or just the source."""
        if self.id:
            return self.id

        result = self.source
        context = self.context
        if context:
            result = u"%s\04%s" % (context, result)

        return result

    def setid(self, value):
        self.id = value

    def getlocations(self):
        #FIXME: do we need to support more than one location
        return [self.location]

    def addlocation(self, location):
        # only a single location is stored; a new one replaces the old
        self.location = location

    def getcontext(self):
        return self.context

    def setcontext(self, value):
        self.context = value

    def getnotes(self, origin=None):
        """Return translator and/or developer comments.

        With origin None both are returned, translator comments first,
        separated by a newline.  Raises ValueError for unknown origins.
        """
        if origin is None:
            result = self.translator_comments
            if self.developer_comments:
                if result:
                    result += '\n' + self.developer_comments
                else:
                    result = self.developer_comments
            return result
        elif origin == "translator":
            return self.translator_comments
        elif origin in ('programmer', 'developer', 'source code'):
            return self.developer_comments
        else:
            raise ValueError("Comment type not valid")

    def addnote(self, text, origin=None, position="append"):
        """Add *text* to the developer or translator comments.

        position may be 'append', 'prepend', or anything else to replace.
        """
        if origin in ('programmer', 'developer', 'source code'):
            if position == 'append' and self.developer_comments:
                self.developer_comments += '\n' + text
            elif position == 'prepend' and self.developer_comments:
                self.developer_comments = text + '\n' + self.developer_comments
            else:
                self.developer_comments = text
        else:
            if position == 'append' and self.translator_comments:
                self.translator_comments += '\n' + text
            elif position == 'prepend' and self.translator_comments:
                # bugfix: this used to append (old + '\n' + text) even though
                # the caller asked for prepend; mirror the developer branch
                self.translator_comments = text + '\n' + self.translator_comments
            else:
                self.translator_comments = text

    def removenotes(self):
        # NOTE: only translator comments are cleared; developer comments
        # are deliberately preserved
        self.translator_comments = u''

    def isfuzzy(self):
        """Interpret the stored fuzzy string as a boolean."""
        if self.fuzzy.lower() in ('1', 'x', 'true', 'yes', 'fuzzy'):
            return True
        return False

    def markfuzzy(self, value=True):
        if value:
            self.fuzzy = 'True'
        else:
            self.fuzzy = 'False'

    def match_header(self):
        """see if unit might be a header"""
        some_value = False
        for key, value in self.todict().iteritems():
            if value:
                some_value = True
            # a header row has each cell equal to its column name
            # (fuzzy is exempt since 'fuzzy' == str(False) never holds)
            if key.lower() != 'fuzzy' and value and key.lower() != value.lower():
                return False
        return some_value

    def add_spreadsheet_escapes(self, source, target):
        """add common spreadsheet escapes to two strings"""
        for unescaped, escaped in self.spreadsheetescapes:
            if source.startswith(unescaped):
                source = source.replace(unescaped, escaped, 1)
            if target.startswith(unescaped):
                target = target.replace(unescaped, escaped, 1)
        return source, target

    def remove_spreadsheet_escapes(self, source, target):
        """remove common spreadsheet escapes from two strings"""
        for unescaped, escaped in self.spreadsheetescapes:
            if source.startswith(escaped):
                source = source.replace(escaped, unescaped, 1)
            if target.startswith(escaped):
                target = target.replace(escaped, unescaped, 1)
        return source, target

    def fromdict(self, cedict, encoding='utf-8'):
        """Populate this unit from a CSV row dict, mapping alternative
        column names through fieldname_map and ignoring the restkey."""
        for key, value in cedict.iteritems():
            rkey = fieldname_map.get(key, key)
            if value is None or key is None or key == EXTRA_KEY:
                continue
            value = to_unicode(value, encoding)
            if rkey == "id":
                self.id = value
            elif rkey == "source":
                self.source = value
            elif rkey == "target":
                self.target = value
            elif rkey == "location":
                self.location = value
            elif rkey == "fuzzy":
                self.fuzzy = value
            elif rkey == "context":
                self.context = value
            elif rkey == "translator_comments":
                self.translator_comments = value
            elif rkey == "developer_comments":
                self.developer_comments = value

        #self.source, self.target = self.remove_spreadsheet_escapes(self.source, self.target)

    def todict(self, encoding='utf-8'):
        """Return this unit as a dict of encoded byte strings, keyed by
        the canonical field names."""
        #FIXME: use apis?
        #source, target = self.add_spreadsheet_escapes(self.source, self.target)
        source = self.source
        target = self.target
        output = {
            'location': from_unicode(self.location, encoding),
            'source': from_unicode(source, encoding),
            'target': from_unicode(target, encoding),
            'id': from_unicode(self.id, encoding),
            'fuzzy': str(self.fuzzy),
            'context': from_unicode(self.context, encoding),
            'translator_comments': from_unicode(self.translator_comments, encoding),
            'developer_comments': from_unicode(self.developer_comments, encoding),
        }

        return output

    def __str__(self):
        return str(self.todict())
# Column names that are already in canonical form
canonical_field_names = ('location', 'source', 'target', 'id', 'fuzzy', 'context', 'translator_comments', 'developer_comments')

# Map alternative CSV header spellings onto the canonical field names
fieldname_map = {
    'original': 'source',
    'untranslated': 'source',
    'translated': 'target',
    'translation': 'target',
    'identified': 'id',
    'key': 'id',
    'label': 'id',
    # bugfix: the correctly spelled header was missing from the map
    'translator comments': 'translator_comments',
    # keep the historical misspelling so files written with it still parse
    'transaltor comments': 'translator_comments',
    'notes': 'translator_comments',
    'developer comments': 'developer_comments',
    'state': 'fuzzy',
}

# restkey used with csv.DictReader to collect unexpected extra columns
EXTRA_KEY = '__CSVL10N__EXTRA__'
def try_dialects(inputfile, fieldnames, dialect):
    """Build a csv.DictReader over *inputfile*, trying *dialect* first,
    then the registered 'default' dialect, and finally plain 'excel'.

    Errors from the last attempt propagate to the caller.
    """
    #FIXME: does it verify at all if we don't actually step through the file?
    for candidate in (dialect, 'default'):
        try:
            inputfile.seek(0)
            return csv.DictReader(inputfile, fieldnames=fieldnames, dialect=candidate, restkey=EXTRA_KEY)
        except csv.Error:
            pass
    # final fallback: let any csv.Error escape, as before
    inputfile.seek(0)
    return csv.DictReader(inputfile, fieldnames=fieldnames, dialect='excel', restkey=EXTRA_KEY)
309
def valid_fieldnames(fieldnames):
    """check if fieldnames are valid

    A header is considered valid as soon as one of its columns names the
    'source' field, either canonically or via fieldname_map.
    """
    return any(name == 'source' or fieldname_map.get(name) == 'source'
               for name in fieldnames)
318
def detect_header(sample, dialect, fieldnames):
    """Test if file has a header or not, also returns number of columns in first row"""
    sample_file = StringIO.StringIO(sample)
    # fall back through dialects the same way try_dialects does
    try:
        rows = csv.reader(sample_file, dialect)
    except csv.Error:
        try:
            sample_file.seek(0)
            rows = csv.reader(sample_file, 'default')
        except csv.Error:
            sample_file.seek(0)
            rows = csv.reader(sample_file, 'excel')

    first_row = rows.next()
    # assume at least three columns (location, source, target)
    columncount = max(len(first_row), 3)
    if valid_fieldnames(first_row):
        return first_row
    return fieldnames[:columncount]
337
class csvfile(base.TranslationStore):
    """This class represents a .csv file with various lines.
    The default format contains three columns: location, source, target"""
    UnitClass = csvunit
    Name = _("Comma Separated Value")
    Mimetypes = ['text/comma-separated-values', 'text/csv']
    Extensions = ["csv"]

    def __init__(self, inputfile=None, fieldnames=None, encoding="auto"):
        """Create the store and, if *inputfile* (a file object) is given,
        read, close, and parse it.

        fieldnames may be a list or a comma-separated string of column names.
        """
        base.TranslationStore.__init__(self, unitclass=self.UnitClass)
        self.units = []
        self.encoding = encoding or 'utf-8'
        if not fieldnames:
            self.fieldnames = ['location', 'source', 'target', 'id', 'fuzzy', 'context', 'translator_comments', 'developer_comments']
        else:
            if isinstance(fieldnames, basestring):
                fieldnames = [fieldname.strip() for fieldname in fieldnames.split(",")]
            self.fieldnames = fieldnames
        self.filename = getattr(inputfile, 'name', '')
        self.dialect = 'default'
        if inputfile is not None:
            csvsrc = inputfile.read()
            inputfile.close()
            self.parse(csvsrc)

    def parse(self, csvsrc):
        """Detect encoding, sniff the CSV dialect and header, then read
        each row into a csvunit (skipping a recognised header row)."""
        text, encoding = self.detect_encoding(csvsrc, default_encodings=['utf-8', 'utf-16'])
        #FIXME: raise parse error if encoding detection fails?
        if encoding and encoding.lower() != 'utf-8':
            csvsrc = text.encode('utf-8')
            # bugfix: lstrip(BOM) strips any leading bytes from the BOM's
            # byte set, not the exact prefix; remove only a real BOM
            if csvsrc.startswith(codecs.BOM_UTF8):
                csvsrc = csvsrc[len(codecs.BOM_UTF8):]
        self.encoding = encoding or 'utf-8'

        sniffer = csv.Sniffer()
        # FIXME: maybe we should sniff a smaller sample
        sample = csvsrc[:1024]
        if isinstance(sample, unicode):
            sample = sample.encode('utf-8')

        try:
            self.dialect = sniffer.sniff(sample)
            if not self.dialect.escapechar:
                self.dialect.escapechar = '\\'
            if self.dialect.quoting == csv.QUOTE_MINIMAL:
                #HACKISH: most probably a default, not real detection
                self.dialect.quoting = csv.QUOTE_ALL
                self.dialect.doublequote = True
        except csv.Error:
            self.dialect = 'default'

        try:
            self.fieldnames = detect_header(sample, self.dialect, self.fieldnames)
        except csv.Error:
            pass

        # bugfix: was csv.StringIO, but the csv module has no StringIO
        # attribute -- use the module-level StringIO import
        inputfile = StringIO.StringIO(csvsrc)
        reader = try_dialects(inputfile, self.fieldnames, self.dialect)

        #reader = SimpleDictReader(csvfile, fieldnames=fieldnames, dialect=dialect)
        first_row = True
        for row in reader:
            newce = self.UnitClass()
            newce.fromdict(row)
            # drop the first row if it looks like a header
            if not first_row or not newce.match_header():
                self.addunit(newce)
            first_row = False

    def __str__(self):
        """convert to a string. double check that unicode is handled somehow here"""
        source = self.getoutput()
        if not isinstance(source, unicode):
            source = source.decode('utf-8')
        if not self.encoding or self.encoding == 'auto':
            encoding = 'utf-8'
        else:
            encoding = self.encoding
        return source.encode(encoding)

    def getoutput(self):
        """Serialise all units, preceded by a header row, as CSV text."""
        outputfile = StringIO.StringIO()
        writer = csv.DictWriter(outputfile, self.fieldnames, extrasaction='ignore', dialect=self.dialect)
        # header row: each field name mapped to itself
        writer.writerow(dict((name, name) for name in self.fieldnames))
        for ce in self.units:
            writer.writerow(ce.todict())
        return outputfile.getvalue()
427