Package translate :: Package storage :: Module oo
[hide private]
[frames] | no frames]

Source Code for Module translate.storage.oo

  1  #!/usr/bin/env python 
  2  # -*- coding: utf-8 -*- 
  3  # 
  4  # Copyright 2002-2008 Zuza Software Foundation 
  5  # 
  6  # This file is part of translate. 
  7  # 
  8  # translate is free software; you can redistribute it and/or modify 
  9  # it under the terms of the GNU General Public License as published by 
 10  # the Free Software Foundation; either version 2 of the License, or 
 11  # (at your option) any later version. 
 12  # 
 13  # translate is distributed in the hope that it will be useful, 
 14  # but WITHOUT ANY WARRANTY; without even the implied warranty of 
 15  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 16  # GNU General Public License for more details. 
 17  # 
 18  # You should have received a copy of the GNU General Public License 
 19  # along with translate; if not, write to the Free Software 
 20  # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA 
 21   
 22  """ 
 23  Classes that hold units of .oo files (oounit) or entire files (oofile). 
 24   
 25  These are specific .oo files for localisation exported by OpenOffice.org - SDF 
 26  format (previously known as GSI files). For an overview of the format, see 
 27  U{http://l10n.openoffice.org/L10N_Framework/Intermediate_file_format.html} 
 28   
 29  The behaviour in terms of escaping is explained in detail in the programming 
 30  comments. 
 31  """ 
 32  # FIXME: add simple test which reads in a file and writes it out again 
 33   
 34  import os 
 35  import re 
 36  import warnings 
 37   
 38  from translate.misc import quote 
 39  from translate.misc import wStringIO 
 40   
 41  # File normalisation 
 42   
 43  normalfilenamechars = "/#.0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ" 
 44  normalizetable = "" 
 45  for i in map(chr, range(256)): 
 46      if i in normalfilenamechars: 
 47          normalizetable += i 
 48      else: 
 49          normalizetable += "_" 
 50   
 51   
class unormalizechar(dict):
    """Translation mapping for unicode strings.

    Looking up the ordinal of a character from C{normalchars} gives that
    character back; looking up any other key gives an underscore.
    """

    def __init__(self, normalchars):
        """Build the ordinal -> character table from C{normalchars}."""
        # The table lives in an attribute; the dict itself is left empty.
        self.normalchars = dict((ord(char), char) for char in normalchars)

    def __getitem__(self, key):
        """Return the character stored for C{key}, or u"_" if there is none."""
        try:
            return self.normalchars[key]
        except KeyError:
            return u"_"
# Translation table that normalizefilename applies to filenames that are not
# plain str objects; built from the ASCII-decoded normalfilenamechars.
unormalizetable = unormalizechar(normalfilenamechars.decode("ascii"))
def normalizefilename(filename):
    """Replace every character that is not in normalfilenamechars with "_".

    C{str} values are translated with normalizetable, anything else with
    unormalizetable.
    """
    table = normalizetable if isinstance(filename, str) else unormalizetable
    return filename.translate(table)
71 72
def makekey(ookey, long_keys):
    """Build a normalised identifier string from an oo key tuple.

    The result has the form C{<source>#<groupid>.<localid>.<resourcetype>}
    (empty components and their separators are left out) and is passed
    through normalizefilename before being returned.

    @param ookey: an oo key (project, sourcefile, resourcetype, groupid,
        localid, platform)
    @type ookey: tuple
    @param long_keys: if true the source part is the project joined with the
        whole source path, otherwise only the last path component is used
    @type long_keys: Boolean
    @rtype: str
    @return: the normalised identifier
    """
    project, sourcefile, resourcetype, groupid, localid, platform = ookey
    # Source paths may use backslashes; work with forward slashes only.
    sourcefile = sourcefile.replace('\\', '/')
    if long_keys:
        sourcebase = os.path.join(project, sourcefile)
    else:
        sourcebase = sourcefile.split('/')[-1]
    # Only put a dot between the two ids when both of them are non-empty.
    separator = "." if groupid and localid else ""
    fullid = groupid + separator + localid
    if resourcetype:
        fullid = fullid + "." + resourcetype
    return normalizefilename("%s#%s" % (sourcebase, fullid))
98 99 # These are functions that deal with escaping and unescaping of the text fields 100 # of the SDF file. These should only be applied to the text column. 101 # The fields quickhelptext and title are assumed to carry no escaping. 102 # 103 # The escaping of all strings except those coming from .xhp (helpcontent2) 104 # sourcefiles work as follows: 105 # (newline) -> \n 106 # (carriage return) -> \r 107 # (tab) -> \t 108 # Backslash characters (\) and single quotes (') are not consistently escaped, 109 # and are therefore left as they are. 110 # 111 # For strings coming from .xhp (helpcontent2) sourcefiles the following 112 # characters are escaped inside XML tags only: 113 # < -> \< when used with lowercase tagnames (with some exceptions) 114 # > -> \> when used with lowercase tagnames (with some exceptions) 115 # " -> \" around XML properties 116 # The following is consistently escaped in .xhp strings (not only in XML tags): 117 # \ -> \\ 118 119
def escape_text(text):
    """Escape newline, tab and carriage return characters in C{text}.

    Each of these characters is replaced by its two-character backslash
    form (C{\\n}, C{\\t}, C{\\r}); everything else, including backslashes
    and quotes, is left untouched.
    """
    for raw, escaped in (("\n", "\\n"), ("\t", "\\t"), ("\r", "\\r")):
        text = text.replace(raw, escaped)
    return text
123 124
def unescape_text(text):
    """Unescape SDF text to be suitable for unit consumption.

    The two-character sequences C{\\n}, C{\\t} and C{\\r} become a real
    newline, tab and carriage return.  A doubled backslash is kept as it is
    and shields a following C{n}, C{t} or C{r} from being unescaped.

    The string is processed in a single left-to-right pass, so no
    placeholder character is needed and a literal BEL character in the text
    is preserved.
    """
    unescaped = {"\\\\": "\\\\", "\\n": "\n", "\\t": "\t", "\\r": "\r"}
    # A backslash followed by a backslash, n, t or r; matches never overlap,
    # so a doubled backslash is consumed before the character after it.
    return re.sub(r"\\[\\ntr]", lambda match: unescaped[match.group(0)], text)
# Matches a single tag with a lowercase name (letters, "_" and "-"), an
# optional leading or trailing slash and at most one lowercase attribute with
# a double-quoted value.  Written as a raw string so that the backslash in
# the character class reaches the regex engine without relying on an invalid
# string escape sequence.
helptagre = re.compile(r'''<[/]??[a-z_\-]+?(?:| +[a-z]+?=".*?") *[/]??>''')
def escape_help_text(text):
    """Escape help text the way it is stored in an SDF file.

    Every backslash is doubled first.  Then, for each tag found by
    helptagre, the surrounding C{<} and C{>} and any double quotes inside it
    are backslash-escaped, but only if the tag starts with one of the listed
    tag names, is the closing tag of one of them, or is C{<br/>} or
    C{<help-id-missing/>}.  All other tags are left as they are.
    """
    escape_tags = ("ahelp", "link", "item", "emph", "defaultinline",
                   "switchinline", "caseinline", "variable", "bookmark_value",
                   "image", "embedvar", "alt")
    always_escaped = ("<br/>", "<help-id-missing/>")
    text = text.replace("\\", "\\\\")
    for tag in helptagre.findall(text):
        needs_escape = tag in always_escaped or any(
            tag.startswith("<" + name) or tag == "</" + name + ">"
            for name in escape_tags)
        if not needs_escape:
            continue
        bracketed = "\\<" + tag[1:-1] + "\\>"
        # Replaces every occurrence of this exact tag text in one go.
        text = text.replace(tag, bracketed.replace('"', '\\"'))
    return text
152 153
def unescape_help_text(text):
    """Remove the backslash escaping from help text read from an SDF file.

    The replacements are applied one after the other, in this order:
    C{\\<} to C{<}, C{\\>} to C{>}, C{\\"} to C{"} and finally a doubled
    backslash to a single backslash.
    """
    replacements = ((r"\<", "<"), (r"\>", ">"), (r'\"', '"'), (r"\\", "\\"))
    for escaped, plain in replacements:
        text = text.replace(escaped, plain)
    return text
157 158
def encode_if_needed_utf8(text):
    """Return C{text} encoded as UTF-8 if it is a unicode string.

    Anything that is not a unicode object is returned unchanged.
    """
    if not isinstance(text, unicode):
        return text
    return text.encode('UTF-8')
164 165
class ooline(object):
    """One line of an .oo file, i.e. one translation of one resource.

    A line consists of 15 tab-delimited columns.  The text column is stored
    escaped in C{_text}; the C{text} property escapes and unescapes it
    depending on whether the source file is an .xhp file.
    """

    def __init__(self, parts=None):
        """Construct an ooline from its parts, or an empty one without parts."""
        if parts is not None:
            self.setparts(parts)
            return
        # Same order as the columns; "text" is assigned through the property
        # and therefore has to come after "sourcefile".
        for field in ("project", "sourcefile", "dummy", "resourcetype",
                      "groupid", "localid", "helpid", "platform", "width",
                      "languageid", "text", "helptext", "quickhelptext",
                      "title", "timestamp"):
            setattr(self, field, "")

    def setparts(self, parts):
        """Fill the line from its tab-delimited parts.

        If there are not exactly 15 parts a warning is issued and the parts
        are padded with empty strings or truncated to 15.
        """
        count = len(parts)
        if count != 15:
            warnings.warn("oo line contains %d parts, it should contain 15: %r" %
                          (count, parts))
            parts = tuple((list(parts) + [""] * 15)[:15])
        (self.project, self.sourcefile, self.dummy, self.resourcetype,
         self.groupid, self.localid, self.helpid, self.platform,
         self.width, self.languageid, self._text, self.helptext,
         self.quickhelptext, self.title, self.timestamp) = parts

    def getparts(self):
        """Return a tuple of the 15 columns, with the text column still escaped."""
        return (
            self.project, self.sourcefile, self.dummy, self.resourcetype,
            self.groupid, self.localid, self.helpid, self.platform,
            self.width, self.languageid, self._text, self.helptext,
            self.quickhelptext, self.title, self.timestamp,
        )

    def gettext(self):
        """Return the unescaped text column."""
        if self.sourcefile.endswith(".xhp"):
            unescape = unescape_help_text
        else:
            unescape = unescape_text
        return unescape(self._text)

    def settext(self, text):
        """Escape C{text} and store it as the text column."""
        if self.sourcefile.endswith(".xhp"):
            escape = escape_help_text
        else:
            escape = escape_text
        self._text = escape(text)

    text = property(gettext, settext)

    def __str__(self):
        """Return the tab-delimited line, UTF-8 encoded if it is unicode."""
        return encode_if_needed_utf8(self.getoutput())

    def getoutput(self):
        """Return the line in tab-delimited form."""
        return "\t".join(self.getparts())

    def getkey(self):
        """Return the tuple that identifies the resource of this line."""
        return (self.project, self.sourcefile, self.resourcetype,
                self.groupid, self.localid, self.platform)
230 231
class oounit:
    """The lines (translations) that belong to one resource."""

    def __init__(self):
        """Create a unit without any lines."""
        # languages maps a language id to the last line added for it;
        # lines keeps every added line in insertion order.
        self.languages, self.lines = {}, []

    def addline(self, line):
        """Add a line to the unit and register it under its language id."""
        self.languages[line.languageid] = line
        self.lines.append(line)

    def __str__(self):
        """Return the unit's output, UTF-8 encoded if it is unicode."""
        return encode_if_needed_utf8(self.getoutput())

    def getoutput(self):
        """Return all lines as strings, joined with CRLF."""
        return "\r\n".join(map(str, self.lines))
252 253
class oofile:
    """An entire .oo file: its lines, grouped into units by resource key."""
    UnitClass = oounit

    def __init__(self, input=None):
        """Create the oofile and parse C{input} if it is given."""
        self.oolines = []
        self.units = []
        self.ookeys = {}
        self.filename = ""
        self.languages = []
        if input is not None:
            self.parse(input)

    def addline(self, thisline):
        """Add a parsed line, creating a new unit the first time its key is seen."""
        key = thisline.getkey()
        unit = self.ookeys.get(key)
        if unit is None:
            unit = self.UnitClass()
            self.units.append(unit)
            self.ookeys[key] = unit
        unit.addline(thisline)
        self.oolines.append(thisline)
        # Keep the languages in order of first appearance.
        if thisline.languageid not in self.languages:
            self.languages.append(thisline.languageid)

    def parse(self, input):
        """Parse C{input} and add all its non-empty lines to the file.

        C{input} is either an object with a C{read} method, which is read
        completely and then closed, or the source text itself.
        """
        if not self.filename:
            self.filename = getattr(input, 'name', '')
        if hasattr(input, "read"):
            src = input.read()
            input.close()
        else:
            src = input
        for rawline in src.split("\n"):
            stripped = quote.rstripeol(rawline)
            if stripped:
                self.addline(ooline(stripped.split("\t")))

    def __str__(self):
        """Return the file's output, UTF-8 encoded if it is unicode."""
        return encode_if_needed_utf8(self.getoutput())

    def getoutput(self):
        """Return all units in tab-delimited form, each followed by CRLF.

        Two warnings are issued for every unit that holds more than two lines.
        """
        chunks = []
        for unit in self.units:
            count = len(unit.lines)
            if count > 2:
                warnings.warn("contains %d lines (should be 2 at most): languages %r" % (count, unit.languages))
                unitkeys = [line.getkey() for line in unit.lines]
                warnings.warn("contains %d lines (should be 2 at most): keys %r" % (count, unitkeys))
            chunks.append(str(unit) + "\r\n")
        return "".join(chunks)
313 314
class oomultifile:
    """Presents one large GSI/SDF file as a collection of smaller subfiles."""

    def __init__(self, filename, mode=None, multifilestyle="single"):
        """Open C{filename} and, in read mode, index its lines by subfile.

        When C{mode} is not given the file is opened for reading if it
        exists and for writing otherwise.
        """
        self.filename = filename
        if mode is None:
            mode = 'r' if os.path.exists(filename) else 'w'
        self.mode = mode
        self.multifilestyle = multifilestyle
        self.multifilename = os.path.splitext(filename)[0]
        self.multifile = open(filename, mode)
        # Maps a subfile name to the numbers of the lines that belong to it.
        self.subfilelines = {}
        if mode == "r":
            self.createsubfileindex()

    def createsubfileindex(self):
        """Read all lines and record which line numbers belong to which subfile."""
        for linenum, line in enumerate(self.multifile):
            subfile = self.getsubfilename(line)
            self.subfilelines.setdefault(subfile, []).append(linenum)

    def getsubfilename(self, line):
        """Return the subfile name for a tab-delimited line.

        Depending on C{multifilestyle} this is based on the multifile's own
        name ("onefile"), on the first column ("toplevel"), or on the first
        column joined with the directory part of the second column.

        Raises ValueError if the line contains fewer than two tabs.
        """
        if line.count("\t") < 2:
            raise ValueError("invalid tab-delimited line: %r" % line)
        module, filename = line.split("\t", 2)[:2]
        style = self.multifilestyle
        if style == "onefile":
            ooname = self.multifilename
        elif style == "toplevel":
            ooname = module
        else:
            # Drop the last path component and keep only the directories.
            directories = filename.replace("\\", "/").split("/")[:-1]
            ooname = os.path.join(module, *directories)
        return ooname + os.extsep + "oo"

    def listsubfiles(self):
        """Return the names of the subfiles in the file."""
        return self.subfilelines.keys()

    def __iter__(self):
        """Iterate through the subfile names."""
        for name in self.listsubfiles():
            yield name

    def __contains__(self, pathname):
        """Check whether C{pathname} is one of the indexed subfiles."""
        return pathname in self.subfilelines

    def getsubfilesrc(self, subfile):
        """Return the concatenated lines that belong to C{subfile}."""
        wanted = set(self.subfilelines[subfile])
        # Re-read the multifile from the start and pick out the indexed lines.
        self.multifile.seek(0)
        return "".join([line for linenum, line in enumerate(self.multifile)
                        if linenum in wanted])

    def openinputfile(self, subfile):
        """Return a pseudo-file object holding the source of C{subfile}."""
        inputfile = wStringIO.StringIO(self.getsubfilesrc(subfile))
        inputfile.filename = subfile
        return inputfile

    def openoutputfile(self, subfile):
        """Return a pseudo-file object whose contents go to the multifile.

        The callback handed to the pseudo-file writes the contents to the
        multifile and flushes it.
        """

        def onclose(contents):
            self.multifile.write(contents)
            self.multifile.flush()

        outputfile = wStringIO.CatchStringOutput(onclose)
        outputfile.filename = subfile
        return outputfile

    def getoofile(self, subfile):
        """Return an oofile built from the lines of C{subfile}."""
        oosubfile = oofile()
        oosubfile.filename = subfile
        oosubfile.parse(self.getsubfilesrc(subfile))
        return oosubfile
409