1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22 """classes that hold units of .dtd files (dtdunit) or entire files (dtdfile)
23 these are specific .dtd files for localisation used by mozilla
24
25 Specifications
26 ==============
27 The following information is provided by Mozilla::
28
29 * U{Specification<http://www.w3.org/TR/REC-xml/#sec-entexpand>}
30
31 There is a grammar for entity definitions, which isn't really precise,
32 as the spec says. There's no formal specification for DTD files, it's
33 just "whatever makes this work" basically. The whole piece is clearly not
34 the strongest point of the xml spec
35
36 XML elements are allowed in entity values. A number of things that are
37 allowed will just break the resulting document, Mozilla forbids these
38 in their DTD parser.
39 """
40
41 from translate.storage import base
42 from translate.misc import quote
43
44 import re
45 import warnings
46 try:
47 from lxml import etree
48 import StringIO
49 except ImportError:
50 etree = None
51
52 labelsuffixes = (".label", ".title")
53 """Label suffixes: entries with this suffix can be combined with accesskeys
54 found in entries ending with L{accesskeysuffixes}"""
55 accesskeysuffixes = (".accesskey", ".accessKey", ".akey")
56 """Accesskey Suffixes: entries with this suffix may be combined with labels
57 ending in L{labelsuffixes} into accelerator notation"""
58
59
68
69
82
83
85 """Find and remove ampersands that are not part of an entity definition.
86
87 A stray & in a DTD file can break an application's ability to parse the file. In Mozilla
88 localisation this is very important and these can break the parsing of files used in XUL
89 and thus break interface rendering. Tracking down the problem is very difficult,
90 thus by removing potential broken & and warning the users we can ensure that the output
91 DTD will always be parsable.
92
93 @type name: String
94 @param name: Entity name
95 @type value: String
96 @param value: Entity text value
97 @rtype: String
98 @return: Entity value without bad ampersands
99 """
100
def is_valid_entity_name(name):
    """Check that supplied L{name} is a valid entity name.

    A name is accepted when it is alphanumeric once all dots are removed,
    or when it is a '#' followed by alphanumeric characters.

    @type name: String
    @param name: Text found between an '&' and the following ';'
    @rtype: Boolean
    @return: True if the name is acceptable, False otherwise
    """
    if name.replace('.', '').isalnum():
        return True
    # startswith() rather than name[0]: an empty name (from the text "&;")
    # must be reported as invalid instead of raising IndexError.
    return name.startswith('#') and name[1:].isalnum()
108
109 amppos = 0
110 invalid_amps = []
111 while amppos >= 0:
112 amppos = value.find("&", amppos)
113 if amppos != -1:
114 amppos += 1
115 semipos = value.find(";", amppos)
116 if semipos != -1:
117 if is_valid_entity_name(value[amppos:semipos]):
118 continue
119 invalid_amps.append(amppos-1)
120 if len(invalid_amps) > 0:
121 warnings.warn("invalid ampersands in dtd entity %s" % (name))
122 adjustment = 0
123 for amppos in invalid_amps:
124 value = value[:amppos-adjustment] + value[amppos-adjustment+1:]
125 adjustment += 1
126 return value
127
128
129 -class dtdunit(base.TranslationUnit):
130 """this class represents an entity definition from a dtd file (and possibly associated comments)"""
131
133 """construct the dtdunit, prepare it for parsing"""
134 super(dtdunit, self).__init__(source)
135 self.comments = []
136 self.unparsedlines = []
137 self.incomment = False
138 self.inentity = False
139 self.entity = "FakeEntityOnlyForInitialisationAndTesting"
140 self.source = source
141 self.space_pre_entity = ' '
142 self.space_pre_definition = ' '
143
144
146 """Sets the definition to the quoted value of source"""
147 self.definition = quotefordtd(source)
148 self._rich_source = None
149
151 """gets the unquoted source string"""
152 return unquotefromdtd(self.definition)
153 source = property(getsource, setsource)
154
156 """Sets the definition to the quoted value of target"""
157 if target is None:
158 target = ""
159 self.definition = quotefordtd(target)
160 self._rich_target = None
161
163 """gets the unquoted target string"""
164 return unquotefromdtd(self.definition)
165 target = property(gettarget, settarget)
166
168 """returns whether this dtdunit doesn't actually have an entity definition"""
169
170
171 return self.entity is None
172
def parse(self, dtdsrc):
    """Read the first dtd element from C{dtdsrc} into this object.

    The source is processed line by line. Comment text is stored in
    C{self.comments} as C{(commenttype, text)} tuples, lines that belong
    to neither a comment nor an entity are kept in C{self.unparsedlines},
    and the first C{<!ENTITY} declaration fills in C{self.entity} and
    C{self.definition}. Processing stops as soon as the quoted value of
    that entity has been closed.

    @type dtdsrc: String
    @param dtdsrc: DTD source text
    @rtype: Integer
    @return: Number of lines processed (0 if C{dtdsrc} is empty)
    @raise ValueError: If the entity value does not start with a single
        or a double quote
    """
    self.comments = []
    # Every specialised note list is an alias of self.comments, so all
    # comment kinds end up in one list in the order they were read.
    self.locfilenotes = self.comments
    self.locgroupstarts = self.comments
    self.locgroupends = self.comments
    self.locnotes = self.comments
    commentlists = {
        "locfile": self.locfilenotes,
        "locgroupstart": self.locgroupstarts,
        "locgroupend": self.locgroupends,
        "locnote": self.locnotes,
        "comment": self.comments,
    }

    self.entity = None
    self.definition = ''
    if not dtdsrc:
        return 0

    lines = dtdsrc.split("\n")
    linesprocessed = 0
    comment = ""
    # Cursor into the current line. Continuation lines are scanned from
    # column 0, so start there instead of leaving the name unbound.
    pos = 0
    for line in lines:
        line += "\n"
        linesprocessed += 1

        if not self.incomment:
            if '<!--' in line:
                self.incomment = True
                self.continuecomment = False
                # Work out the type of comment now; the comment text
                # itself is consumed in the "if self.incomment" step below.
                (comment, dummy) = quote.extract(line, "<!--", "-->", None, 0)
                if 'LOCALIZATION NOTE' in comment:
                    notepos = quote.findend(comment, 'LOCALIZATION NOTE')
                    # Skip spaces after the marker without running off
                    # the end of the comment text.
                    while notepos < len(comment) and comment[notepos] == ' ':
                        notepos += 1
                    if comment.startswith('FILE', notepos):
                        self.commenttype = "locfile"
                    elif comment.startswith('BEGIN', notepos):
                        self.commenttype = "locgroupstart"
                    elif comment.startswith('END', notepos):
                        self.commenttype = "locgroupend"
                    else:
                        self.commenttype = "locnote"
                else:
                    self.commenttype = "comment"
            elif not self.inentity and re.search("%.*;", line):
                # A line with '%' followed by ';' outside an entity
                # (presumably a parameter entity reference such as
                # "%name;") is kept verbatim as a plain comment.
                self.comments.append(("comment", line))
                continue

        if self.incomment:
            (comment, self.incomment) = quote.extract(line, "<!--", "-->", None, self.continuecomment)
            self.continuecomment = self.incomment
            # Strip the comment out of what is still to be parsed.
            line = line.replace(comment, "", 1)
            if not self.incomment:
                # The comment ended on this line: terminate it with the
                # remaining whitespace, or with a newline if other
                # content follows on the same line.
                if line.isspace():
                    comment += line
                    line = ''
                else:
                    comment += '\n'
            # Record the comment together with the type worked out above.
            commentpair = (self.commenttype, comment)
            commentlist = commentlists.get(self.commenttype)
            if commentlist is not None:
                commentlist.append(commentpair)

        if not self.inentity and not self.incomment:
            entitypos = line.find('<!ENTITY')
            if entitypos != -1:
                self.inentity = True
                # Text before the declaration that starts with '#' is
                # remembered so it can be written back in front of it.
                beforeentity = line[:entitypos].strip()
                if beforeentity.startswith("#"):
                    self.hashprefix = beforeentity
                self.entitypart = "start"
            else:
                self.unparsedlines.append(line)

        if self.inentity:
            if self.entitypart == "start":
                # Drop everything up to and including '<!ENTITY'.
                pos = quote.findend(line, '<!ENTITY')
                line = line[pos:]
                self.entitypart = "name"
                self.entitytype = "internal"

            if self.entitypart == "name":
                start = 0
                pos = 0
                while pos < len(line) and line[pos].isspace():
                    pos += 1
                self.space_pre_entity = ' ' * (pos - start)
                start = pos
                self.entity = ''
                if pos < len(line) and line[pos] == '%':
                    # '%' before the name marks the entity as external.
                    self.entitytype = "external"
                    self.entityparameter = ""
                    pos += 1
                    while pos < len(line) and line[pos].isspace():
                        pos += 1
                while pos < len(line) and not line[pos].isspace():
                    self.entity += line[pos]
                    pos += 1
                start = pos
                while pos < len(line) and line[pos].isspace():
                    pos += 1
                self.space_pre_definition = ' ' * (pos - start)
                if self.entity:
                    if self.entitytype == "external":
                        self.entitypart = "parameter"
                    else:
                        self.entitypart = "definition"
                    if pos == len(line):
                        # Nothing more on this line; the rest of the
                        # declaration is looked for on the next line.
                        self.entityhelp = None
                        pos = 0
                        continue
                    elif self.entitypart == "definition":
                        # Remember the start position and quote character.
                        self.entityhelp = (pos, line[pos])
                        self.instring = False

            if self.entitypart == "parameter":
                while pos < len(line) and line[pos].isspace():
                    pos += 1
                paramstart = pos
                while pos < len(line) and line[pos].isalnum():
                    pos += 1
                self.entityparameter += line[paramstart:pos]
                while pos < len(line) and line[pos].isspace():
                    pos += 1
                line = line[pos:]
                pos = 0
                if not line:
                    continue
                if line[0] in ('"', "'"):
                    self.entitypart = "definition"
                    self.entityhelp = (pos, line[pos])
                    self.instring = False

            if self.entitypart == "definition":
                if self.entityhelp is None:
                    # The value starts on a continuation line: find its
                    # first non-space character.
                    pos = 0
                    while pos < len(line) and line[pos].isspace():
                        pos += 1
                    if pos == len(line):
                        continue
                    self.entityhelp = (pos, line[pos])
                    self.instring = False

                pos = self.entityhelp[0]
                quotechar = self.entityhelp[1]
                if quotechar not in ("'", '"'):
                    raise ValueError("Unexpected quote character... %r" % (quotechar))
                (defpart, self.instring) = quote.extract(line[pos:], quotechar, quotechar, startinstring=self.instring, allowreentry=False)
                # Any following line is read from its beginning, using
                # the same quote character.
                self.entityhelp = (0, quotechar)
                self.definition += defpart
                if not self.instring:
                    # The quoted value is closed: this unit is complete.
                    self.inentity = False
                    break

    return linesprocessed
359
366
368 """convert the dtd entity back to string form"""
369 lines = []
370 lines.extend([comment for commenttype, comment in self.comments])
371 lines.extend(self.unparsedlines)
372 if self.isnull():
373 result = "".join(lines)
374 return result.rstrip() + "\n"
375
376
377
378
379 if len(self.entity) > 0:
380 if getattr(self, 'entitytype', None) == 'external':
381 entityline = '<!ENTITY % ' + self.entity + ' ' + self.entityparameter + ' ' + self.definition+'>'
382 else:
383 entityline = '<!ENTITY' + self.space_pre_entity + self.entity + self.space_pre_definition + self.definition + '>'
384 if getattr(self, 'hashprefix', None):
385 entityline = self.hashprefix + " " + entityline
386 if isinstance(entityline, unicode):
387 entityline = entityline.encode('UTF-8')
388 lines.append(entityline + '\n')
389 return "".join(lines)
390
391
392 -class dtdfile(base.TranslationStore):
393 """this class represents a .dtd file, made up of dtdunits"""
394 UnitClass = dtdunit
395
397 """construct a dtdfile, optionally reading in from inputfile"""
398 base.TranslationStore.__init__(self, unitclass=self.UnitClass)
399 self.filename = getattr(inputfile, 'name', '')
400 if inputfile is not None:
401 dtdsrc = inputfile.read()
402 self.parse(dtdsrc)
403 self.makeindex()
404
def parse(self, dtdsrc):
    """Read the source code of a dtd file and add its contents to C{self.units}.

    The source is cut into windows of lines. Each window is extended
    until a line containing C{<!ENTITY} has been seen and a line starting
    with a quote character followed by C{>} is reached, or until the
    source runs out. Each window is then handed repeatedly to a fresh
    L{dtdunit} until that unit reports that it processed no lines.

    Errors raised while parsing a unit are reported through
    C{warnings.warn} and do not abort the parse.

    @type dtdsrc: String
    @param dtdsrc: Complete text of the dtd file
    """
    start = 0
    end = 0
    lines = dtdsrc.split("\n")
    while end < len(lines):
        if start == end:
            end += 1
        foundentity = False
        while end < len(lines):
            if '<!ENTITY' in lines[end]:
                foundentity = True
            # A line that begins with the closing quote and '>' ends a
            # multi-line entity; include it in the window and stop.
            if foundentity and re.match(r"""["']\s*>""", lines[end]):
                end += 1
                break
            end += 1

        linesprocessed = 1
        while linesprocessed >= 1:
            newdtd = dtdunit()
            try:
                linesprocessed = newdtd.parse("\n".join(lines[start:end]))
                if linesprocessed >= 1 and (not newdtd.isnull() or newdtd.unparsedlines):
                    self.units.append(newdtd)
            except Exception as e:
                # NOTE(review): after a failure linesprocessed keeps the
                # value from the previous iteration, so start advances by
                # that amount - confirm this is the intended recovery.
                warnings.warn("%s\nError occured between lines %d and %d:\n%s" % (e, start+1, end, "\n".join(lines[start:end])))
            start += linesprocessed
435
437 """convert to a string. double check that unicode is handled somehow here"""
438 source = self.getoutput()
439 if not self._valid_store():
440 warnings.warn("DTD file '%s' does not validate" % self.filename)
441 return None
442 if isinstance(source, unicode):
443 return source.encode(getattr(self, "encoding", "UTF-8"))
444 return source
445
447 """convert the units back to source"""
448 sources = [str(dtd) for dtd in self.units]
449 return "".join(sources)
450
452 """makes self.index dictionary keyed on entities"""
453 self.index = {}
454 for dtd in self.units:
455 if not dtd.isnull():
456 self.index[dtd.entity] = dtd
457
459 """Validate the store to determine if it is valid
460
461 This uses ElementTree to parse the DTD
462
463 @return: If the store passes validation
464 @rtype: Boolean
465 """
466 if etree is not None:
467 try:
468
469 dtd = etree.DTD(StringIO.StringIO(re.sub("#expand", "", self.getoutput())))
470 except etree.DTDParseError:
471 return False
472 return True
473