Package translate :: Package storage :: Module lisa
[hide private]
[frames] | no frames]

Source Code for Module translate.storage.lisa

  1  #!/usr/bin/env python 
  2  # -*- coding: utf-8 -*- 
  3  # 
  4  # Copyright 2006-2009 Zuza Software Foundation 
  5  # 
  6  # This file is part of the Translate Toolkit. 
  7  # 
  8  # This program is free software; you can redistribute it and/or modify 
  9  # it under the terms of the GNU General Public License as published by 
 10  # the Free Software Foundation; either version 2 of the License, or 
 11  # (at your option) any later version. 
 12  # 
 13  # This program is distributed in the hope that it will be useful, 
 14  # but WITHOUT ANY WARRANTY; without even the implied warranty of 
 15  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 16  # GNU General Public License for more details. 
 17  # 
 18  # You should have received a copy of the GNU General Public License 
 19  # along with this program; if not, see <http://www.gnu.org/licenses/>. 
 20   
 21  """Parent class for LISA standards (TMX, TBX, XLIFF)""" 
 22   
 23  import re 
 24   
 25  from translate.storage import base 
 26  from translate.lang import data 
 27  try: 
 28      from lxml import etree 
 29      from translate.misc.xml_helpers import getText, getXMLlang, setXMLlang, getXMLspace, setXMLspace, namespaced 
 30  except ImportError, e: 
 31      raise ImportError("lxml is not installed. It might be possible to continue without support for XML formats.") 
 32   
 33   
def _findAllMatches(text, re_obj):
    """Generate match objects for all L{re_obj} matches in L{text}.

    Matches are yielded from left to right.  Each search resumes where the
    previous match ended, so the yielded matches never overlap.  Searching
    stops at the first position from which nothing more matches, or once
    the end of L{text} has been reached.

    @param text: the string to search
    @param re_obj: a compiled regular expression object
    @return: a generator of match objects
    """
    start = 0
    length = len(text)
    while start < length:
        m = re_obj.search(text, start)
        if not m:
            break
        yield m
        start = m.end()
        if m.start() == m.end():
            # A zero-width match does not consume any text.  Step past it,
            # otherwise the same position would be searched (and the same
            # match yielded) forever.
            start += 1
# TODO: we can now do better with our proper placeables support
# Regular expressions describing the inline sequences that are treated as
# placeholders.  They are written as raw strings so that the backslashes reach
# the regex engine untouched; the string values are the same as before.
placeholders = [
    r'(%[diouxXeEfFgGcrs])',  # a "%" followed by one conversion letter
    r'(\\+.?)',  # one or more backslashes plus at most one more character
    # NOTE(review): the "$" below is not escaped, so it is an end-of-string
    # anchor and this pattern can never match.  r'(%[0-9]\$lx)' was probably
    # intended, but it is left as is so that matching behaviour is unchanged.
    r'(%[0-9]$lx)',
    r'(%[0-9]\$[a-z])',  # "%", a digit, a literal "$" and a lowercase letter
    r'(<.+?>)',  # shortest run of characters enclosed in angle brackets
]
# Compiled once at import time, in the same order as ``placeholders``.
re_placeholders = [re.compile(ph) for ph in placeholders]
def _getPhMatches(text):
    """Return a list of regexp match objects for all placeholders in L{text}.

    Every pattern in C{re_placeholders} is applied to the text on its own,
    and the matches of all patterns are then merged into a single list.

    @param text: the string to search
    @return: list of match objects ordered by their start position
    """
    matches = []
    for re_ph in re_placeholders:
        matches.extend(_findAllMatches(text, re_ph))

    # Sort them so they come sequentially.  The sort is stable, so matches
    # starting at the same offset stay in pattern order.
    matches.sort(key=lambda m: m.start())
    return matches
57 58
class LISAunit(base.TranslationUnit):
    """
    A single unit in the file.  Provisional work is done to make several
    languages possible.
    """

    # The name of the root element of this unit type:(termEntry, tu, trans-unit)
    rootNode = ""
    # The name of the per language element of this unit type:(termEntry, tu, trans-unit)
    languageNode = ""
    # The name of the innermost element of this unit type:(term, seg)
    textNode = ""

    namespace = None
    _default_xml_space = "preserve"
    """The default handling of spacing in the absence of an xml:space attribute.

    This is mostly for correcting XLIFF behaviour."""

    def __init__(self, source, empty=False, **kwargs):
        """Constructs a unit containing the given source string.

        @param source: the source text, passed on to the base class
            constructor
        @param empty: if true, only the rich source/target caches are
            initialised: no XML element is created and the base class
            constructor is not called (L{createfromxmlElement} uses this and
            attaches an existing element afterwards)
        @param kwargs: accepted but not used by this constructor
        """
        self._rich_source = None
        self._rich_target = None
        if empty:
            return
        self.xmlelement = etree.Element(self.namespaced(self.rootNode))
        #add descrip, note, etc.
        super(LISAunit, self).__init__(source)

    def __eq__(self, other):
        """Compares two units.

        Two LISA units are equal when they have the same number of language
        nodes and the text of each pair of corresponding nodes is equal.
        Comparison with anything else is delegated to the base class.
        """
        if not isinstance(other, LISAunit):
            return super(LISAunit, self).__eq__(other)
        languageNodes = self.getlanguageNodes()
        otherlanguageNodes = other.getlanguageNodes()
        if len(languageNodes) != len(otherlanguageNodes):
            return False
        # The lengths are equal here, so zip() pairs up every node.
        for mynode, othernode in zip(languageNodes, otherlanguageNodes):
            # NOTE(review): the text of *both* units is read with the
            # xml:space handling of self.xmlelement; confirm whether the
            # other unit's own element should be used for its text.
            xml_space = getXMLspace(self.xmlelement, self._default_xml_space)
            mytext = self.getNodeText(mynode, xml_space)
            othertext = other.getNodeText(othernode, xml_space)
            if mytext != othertext:
                #TODO:^ maybe we want to take children and notes into account
                return False
        return True

    def namespaced(self, name):
        """Returns name in Clark notation.

        For example namespaced("source") in an XLIFF document might return::
            {urn:oasis:names:tc:xliff:document:1.1}source
        This is needed throughout lxml.
        """
        return namespaced(self.namespace, name)

    def set_source_dom(self, dom_node):
        """Make L{dom_node} the first language node of this unit.

        An existing first language node is replaced; if the unit has no
        language nodes yet, the node is appended to the unit's element.
        """
        languageNodes = self.getlanguageNodes()
        if len(languageNodes) > 0:
            self.xmlelement.replace(languageNodes[0], dom_node)
        else:
            self.xmlelement.append(dom_node)

    def get_source_dom(self):
        """Return the first language node, or None if there is none."""
        return self.getlanguageNode(lang=None, index=0)
    source_dom = property(get_source_dom, set_source_dom)

    def setsource(self, text, sourcelang='en'):
        """Set the source text by installing a newly created language node.

        @param text: the new source text (passed through
            C{data.forceunicode})
        @param sourcelang: language handed to L{createlanguageNode}
        """
        # Drop the cached rich source.
        self._rich_source = None
        text = data.forceunicode(text)
        self.source_dom = self.createlanguageNode(sourcelang, text, "source")

    def getsource(self):
        """Return the text of the first language node (None if missing)."""
        return self.getNodeText(self.source_dom, getXMLspace(self.xmlelement, self._default_xml_space))
    source = property(getsource, setsource)

    def set_target_dom(self, dom_node, append=False):
        """Install L{dom_node} as a target language node.

        @param dom_node: the node to add, or None to add nothing (with
            C{append} false this just removes the current second language
            node, if any)
        @param append: if true the node is appended after the existing
            children and nothing is removed; otherwise it is inserted at
            child position 1 and the previous second language node is
            removed
        """
        languageNodes = self.getlanguageNodes()
        assert len(languageNodes) > 0
        if dom_node is not None:
            if append or len(languageNodes) == 0:
                self.xmlelement.append(dom_node)
            else:
                self.xmlelement.insert(1, dom_node)
        # languageNodes was collected before the insertion above, so index 1
        # still refers to the old second language node.
        if not append and len(languageNodes) > 1:
            self.xmlelement.remove(languageNodes[1])

    def get_target_dom(self, lang=None):
        """Return the language node for L{lang}, or the second language node
        when no language is given.  None if there is no such node."""
        if lang:
            return self.getlanguageNode(lang=lang)
        else:
            return self.getlanguageNode(lang=None, index=1)
    target_dom = property(get_target_dom)

    def settarget(self, text, lang='xx', append=False):
        """Sets the "target" string (second language), or alternatively appends to the list

        @param text: the new target text; None removes the current target
            node
        @param lang: language for a newly created target node
        @param append: passed to L{set_target_dom} when a new node is created
        """
        #XXX: we really need the language - can't really be optional, and we
        # need to propagate it
        # Drop the cached rich target.
        self._rich_target = None
        text = data.forceunicode(text)
        #Firstly deal with reinitialising to None or setting to identical string
        if self.gettarget() == text:
            return
        languageNode = self.get_target_dom(None)
        if text is None:
            self.set_target_dom(None, False)
            return
        if languageNode is None:
            languageNode = self.createlanguageNode(lang, text, "target")
            self.set_target_dom(languageNode, append)
            return
        if self.textNode:
            # Write into the first text node found; if there is none, the
            # text goes on the language node itself.
            for term in languageNode.iter(self.namespaced(self.textNode)):
                languageNode = term
                break
        languageNode.text = text

    def gettarget(self, lang=None):
        """retrieves the "target" text (second entry), or the entry in the
        specified language, if it exists"""
        return self.getNodeText(self.get_target_dom(lang), getXMLspace(self.xmlelement, self._default_xml_space))
    target = property(gettarget, settarget)

    def createlanguageNode(self, lang, text, purpose=None):
        """Returns a xml Element setup with given parameters to represent a
        single language entry. Has to be overridden."""
        return None

    def createPHnodes(self, parent, text):
        """Create the text node in parent containing all the ph tags

        If L{text} contains no placeholders it simply becomes the text of
        L{parent}.  Otherwise one C{ph} subelement is created per placeholder
        match, numbered from 1 in its C{id} attribute and holding the matched
        text; the text around the matches is stored as the text of L{parent}
        and the tails of the C{ph} elements.
        """
        matches = _getPhMatches(text)
        if not matches:
            parent.text = text
            return

        # Now we know there will definitely be some ph tags
        start = matches[0].start()
        pretext = text[:start]
        if pretext:
            parent.text = pretext
        lasttag = parent
        for i, m in enumerate(matches):
            #pretext
            pretext = text[start:m.start()]
            # this will never happen with the first ph tag
            if pretext:
                lasttag.tail = pretext
            #ph node
            phnode = etree.SubElement(parent, self.namespaced("ph"))
            phnode.set("id", str(i + 1))
            phnode.text = m.group()
            lasttag = phnode
            start = m.end()
        #post text
        posttext = text[start:]
        if posttext:
            lasttag.tail = posttext

    def getlanguageNodes(self):
        """Returns a list of all nodes that contain per language information."""
        return list(self.xmlelement.iterchildren(self.namespaced(self.languageNode)))

    def getlanguageNode(self, lang=None, index=None):
        """Retrieves a languageNode either by language or by index

        @param lang: if given, the first language node whose language (as
            reported by C{getXMLlang}) equals it is returned
        @param index: position in the list of language nodes; only used when
            no language is given
        @return: the node, or None if nothing matches or the index is beyond
            the last node
        @raise KeyError: if both L{lang} and L{index} are None
        """
        if lang is None and index is None:
            raise KeyError("No criterea for languageNode given")
        languageNodes = self.getlanguageNodes()
        if lang:
            for node in languageNodes:
                if getXMLlang(node) == lang:
                    return node
        else:#have to use index
            if index >= len(languageNodes):
                return None
            else:
                return languageNodes[index]
        return None

    def getNodeText(self, languageNode, xml_space="preserve"):
        """Retrieves the term from the given languageNode

        @param languageNode: the node to read from; None yields None
        @param xml_space: whitespace handling passed on to C{getText}
        @return: the text of the first L{textNode} descendant, or of the
            language node itself when this unit type has no L{textNode};
            None if the expected text node is missing
        """
        if languageNode is None:
            return None
        if self.textNode:
            # Only the first matching descendant is of interest.  Looping
            # (rather than calling the iterator's next()) means a node
            # without the expected structure yields None instead of letting
            # StopIteration escape.
            for term in languageNode.iterdescendants(self.namespaced(self.textNode)):
                return getText(term, xml_space)
            return None
        else:
            return getText(languageNode, xml_space)

    def __str__(self):
        """Serialise this unit's element as pretty-printed UTF-8 XML."""
        return etree.tostring(self.xmlelement, pretty_print=True, encoding='utf-8')

    def _set_property(self, name, value):
        """Set attribute L{name} of the unit's element to L{value}."""
        self.xmlelement.attrib[name] = value

    # The namespaced "xid" and "rid" attributes of the unit's element.  Reading
    # looks the attribute up directly in the element's attrib mapping.
    xid = property(lambda self: self.xmlelement.attrib[self.namespaced('xid')],
                   lambda self, value: self._set_property(self.namespaced('xid'), value))

    rid = property(lambda self: self.xmlelement.attrib[self.namespaced('rid')],
                   lambda self, value: self._set_property(self.namespaced('rid'), value))

    @classmethod
    def createfromxmlElement(cls, element):
        """Alternate constructor: wrap an existing XML element in a unit.

        The unit is created empty (no new element, no source text) and
        L{element} becomes its C{xmlelement}.
        """
        term = cls(None, empty=True)
        term.xmlelement = element
        return term
268
class LISAfile(base.TranslationStore):
    """A class representing a file store for one of the LISA file formats."""
    UnitClass = LISAunit
    #The root node of the XML document:
    rootNode = ""
    #The root node of the content section:
    bodyNode = ""
    #The XML skeleton to use for empty construction:
    XMLskeleton = ""

    namespace = None

    def __init__(self, inputfile=None, sourcelanguage='en', targetlanguage=None, unitclass=None):
        """Create a store from L{inputfile}, or an empty one from the skeleton.

        @param inputfile: XML string or file-like object to parse; if None,
            L{XMLskeleton} is parsed instead, the languages are set and
            L{addheader} is called
        @param sourcelanguage: source language for a newly created store
        @param targetlanguage: target language for a newly created store
        @param unitclass: passed on to the base class constructor
        """
        super(LISAfile, self).__init__(unitclass=unitclass)
        if inputfile is not None:
            self.parse(inputfile)
            assert self.document.getroot().tag == self.namespaced(self.rootNode)
        else:
            # We strip out newlines to ensure that spaces in the skeleton doesn't
            # interfere with the the pretty printing of lxml
            self.parse(self.XMLskeleton.replace("\n", ""))
            self.setsourcelanguage(sourcelanguage)
            self.settargetlanguage(targetlanguage)
            self.addheader()
        # NOTE(review): assumed to apply to both branches (it overrides the
        # encoding recorded by parse()); confirm against the original layout.
        self._encoding = "UTF-8"

    def addheader(self):
        """Method to be overridden to initialise headers, etc."""
        pass

    def namespaced(self, name):
        """Returns name in Clark notation.

        For example namespaced("source") in an XLIFF document might return::
            {urn:oasis:names:tc:xliff:document:1.1}source
        This is needed throughout lxml.
        """
        return namespaced(self.namespace, name)

    def initbody(self):
        """Initialises self.body so it never needs to be retrieved from the XML again."""
        # The default (prefix-less) namespace of the root element, if any.
        self.namespace = self.document.getroot().nsmap.get(None, None)
        # lxml rewrites a tree-level path starting with "/" to "./..." and
        # emits a FutureWarning; spelling it ".//" performs the same search
        # without the warning.
        self.body = self.document.find('.//%s' % self.namespaced(self.bodyNode))

    def addsourceunit(self, source):
        """Adds and returns a new unit with the given string as first entry."""
        #TODO: maybe this should rather be called addsourcestring or something?
        newunit = self.UnitClass(source)
        self.addunit(newunit)
        return newunit

    def addunit(self, unit, new=True):
        """Add L{unit} to the store.

        @param unit: the unit to add; it is given this store's namespace
        @param new: if true, the unit's XML element is also appended to the
            body of the document (false for units whose element is already
            part of the document)
        """
        unit.namespace = self.namespace
        super(LISAfile, self).addunit(unit)
        if new:
            self.body.append(unit.xmlelement)

    def __str__(self):
        """Converts to a string containing the file's XML"""
        return etree.tostring(self.document, pretty_print=True, xml_declaration=True, encoding='utf-8')

    def parse(self, xml):
        """Populates this object from the given xml string

        @param xml: the XML as a string, or an object with a C{read} method
            (it is rewound with C{seek(0)} and read completely)
        """
        if not hasattr(self, 'filename'):
            self.filename = getattr(xml, 'name', '')
        if hasattr(xml, "read"):
            xml.seek(0)
            xml = xml.read()
        if etree.LXML_VERSION >= (2, 1, 0):
            #Since version 2.1.0 we can pass the strip_cdata parameter to
            #indicate that we don't want cdata to be converted to raw XML
            parser = etree.XMLParser(strip_cdata=False)
        else:
            parser = etree.XMLParser()
        self.document = etree.fromstring(xml, parser).getroottree()
        self._encoding = self.document.docinfo.encoding
        self.initbody()
        root = self.document.getroot()
        assert root.tag == self.namespaced(self.rootNode)
        # Wrap every unit element found in the document; the elements are
        # already in the tree, hence new=False.
        for entry in root.iterdescendants(self.namespaced(self.UnitClass.rootNode)):
            term = self.UnitClass.createfromxmlElement(entry)
            self.addunit(term, new=False)
351