1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22 """String processing utilities for extracting strings with various kinds
23 of delimiters"""
24
25 import logging
26 import htmlentitydefs
27
28 from translate.misc.typecheck import accepts, returns
def find_all(searchin, substr):
    """Return the start indexes of every occurrence of substr in searchin.

    Matches are collected from left to right and are not allowed to
    overlap: after a match, the search resumes just past its end.

    An empty substr matches at every index from 0 to len(searchin)
    inclusive. The scan always moves forward by at least one character, so
    it terminates for every input.
    """
    # Stepping by len(substr) alone would be a step of zero for an empty
    # substr, and find() would then report the same position forever.
    step = max(len(substr), 1)
    locations = []
    location = searchin.find(substr)
    while location != -1:
        locations.append(location)
        location = searchin.find(substr, location + step)
    return locations
42
46 """Extracts a string delimited by startdelim and enddelim from a string,
47 allowing for escaping of the delimiters. Returns a tuple of (extracted
48 string including its delimiters, whether still in a string at the end).
49 """
50
51 instring = startinstring
52 enteredonce = False
53 lenstart = len(startdelim)
54 lenend = len(enddelim)
55 startdelim_places = find_all(source, startdelim)
56 if startdelim == enddelim:
57 enddelim_places = startdelim_places[:]
58 else:
59 enddelim_places = find_all(source, enddelim)
60 if escape is not None:
61 lenescape = len(escape)
62 escape_places = find_all(source, escape)
63
64 true_escape = False
65 true_escape_places = []
66 for escape_pos in escape_places:
67 if escape_pos - lenescape in escape_places:
68 true_escape = not true_escape
69 else:
70 true_escape = True
71 if true_escape:
72 true_escape_places.append(escape_pos)
73 startdelim_places = [pos for pos in startdelim_places if pos - lenescape not in true_escape_places]
74 enddelim_places = [pos + lenend for pos in enddelim_places if pos - lenescape not in true_escape_places]
75 else:
76 enddelim_places = [pos + lenend for pos in enddelim_places]
77
78 significant_places = [0] + startdelim_places + enddelim_places + [len(source)-1]
79 significant_places.sort()
80 extracted = ""
81 lastpos = None
82 for pos in significant_places:
83 if instring and pos in enddelim_places:
84
85
86 if lastpos == pos - lenstart and lastpos in startdelim_places:
87 continue
88 extracted += source[lastpos:pos]
89 instring = False
90 lastpos = pos
91 if (not instring) and pos in startdelim_places and not (enteredonce and not allowreentry):
92 instring = True
93 enteredonce = True
94 lastpos = pos
95 if instring:
96 extracted += source[lastpos:]
97 return (extracted, instring)
98
103 """Extracts a string delimited by startdelim and enddelim from a string,
104 without its delimiters, allowing for escaping. includeescapes can also be
105 a function that takes the whole escaped string and returns the replaced
106 version.
107 """
107 instring = startinstring
108 enteredonce = False
109 lenstart = len(startdelim)
110 lenend = len(enddelim)
111 startdelim_places = find_all(source, startdelim)
112 if startdelim == enddelim:
113 enddelim_places = startdelim_places[:]
114 else:
115 enddelim_places = find_all(source, enddelim)
116
117 if escape is not None:
118 lenescape = len(escape)
119 escape_places = find_all(source, escape)
120
121 true_escape = False
122 true_escape_places = []
123 for escape_pos in escape_places:
124 if escape_pos - lenescape in escape_places:
125 true_escape = not true_escape
126 else:
127 true_escape = True
128 if true_escape:
129 true_escape_places.append(escape_pos)
130 startdelim_places = [pos for pos in startdelim_places if pos - lenescape not in true_escape_places]
131 enddelim_places = [pos + lenend for pos in enddelim_places if pos - lenescape not in true_escape_places]
132 else:
133 enddelim_places = [pos + lenend for pos in enddelim_places]
134
135 significant_places = [0] + startdelim_places + enddelim_places + [len(source)-1]
136 significant_places.sort()
137 extracted = ""
138 lastpos = 0
139 callable_includeescapes = callable(includeescapes)
140 checkescapes = callable_includeescapes or not includeescapes
141 for pos in significant_places:
142 if instring and pos in enddelim_places and lastpos != pos - lenstart:
143 section_start, section_end = lastpos + len(startdelim), pos - len(enddelim)
144 section = source[section_start:section_end]
145 if escape is not None and checkescapes:
146 escape_list = [epos - section_start for epos in true_escape_places if section_start <= epos <= section_end]
147 new_section = ""
148 last_epos = 0
149 for epos in escape_list:
150 new_section += section[last_epos:epos]
151 if callable_includeescapes:
152 replace_escape = includeescapes(section[epos:epos + lenescape + 1])
153
154
155 if not isinstance(replace_escape, basestring):
156 if replace_escape:
157 replace_escape = section[epos:epos + lenescape + 1]
158 else:
159 replace_escape = section[epos + lenescape:epos + lenescape + 1]
160 new_section += replace_escape
161 last_epos = epos + lenescape + 1
162 else:
163 last_epos = epos + lenescape
164 section = new_section + section[last_epos:]
165 extracted += section
166 instring = False
167 lastpos = pos
168 if (not instring) and pos in startdelim_places and not (enteredonce and not allowreentry):
169 instring = True
170 enteredonce = True
171 lastpos = pos
172 if instring:
173 section_start = lastpos + len(startdelim)
174 section = source[section_start:]
175 if escape is not None and not includeescapes:
176 escape_list = [epos - section_start for epos in true_escape_places if section_start <= epos]
177 new_section = ""
178 last_epos = 0
179 for epos in escape_list:
180 new_section += section[last_epos:epos]
181 if callable_includeescapes and includeescapes(section[epos:epos + lenescape + 1]):
182 last_epos = epos
183 else:
184 last_epos = epos + lenescape
185 section = new_section + section[last_epos:]
186 extracted += section
187 return (extracted, instring)
188
# NOTE(review): the original signature line is missing from this listing.
# The name and argument order come from the call in quotestr; the default of
# 0 for escapeescapes mirrors quotestr's own default -- confirm.
def escapequotes(source, escapeescapes=0):
    """Return source with every double quote escaped by a backslash.

    If escapeescapes is true, existing backslashes are doubled first, so
    they cannot be confused with the backslashes added in front of the
    quotes.
    """
    if escapeescapes:
        source = source.replace('\\', '\\\\')
    return source.replace('"', '\\"')
196
def escapesinglequotes(source):
    """Return source with every single quote doubled (' becomes '')."""
    return source.replace("'", "''")
201
206 """encodes source using HTML entities e.g. © -> &copy;"""
207 output = u""
208 for char in source:
209 charnum = ord(char)
210 if charnum in htmlentitydefs.codepoint2name:
211 output += u"&%s;" % htmlentitydefs.codepoint2name[charnum]
212 else:
213 output += str(char)
214 return output
215
220 """decodes source using HTML entities e.g. &copy; -> ©"""
221 output = u""
222 inentity = False
223 for char in source:
224 if char == "&":
225 inentity = True
226 possibleentity = ""
227 continue
228 if inentity:
229 if char == ";":
230 if len(possibleentity) > 0 and possibleentity in htmlentitydefs.name2codepoint:
231 output += unichr(htmlentitydefs.name2codepoint[possibleentity])
232 inentity = False
233 else:
234 output += "&" + possibleentity + ";"
235 inentity = False
236 elif char == " ":
237 output += "&" + possibleentity + char
238 inentity = False
239 else:
240 possibleentity += char
241 else:
242 output += char
243 return output
244
249 """Encodes source in the escaped-unicode encoding used by Java
250 .properties files
251 """
252 output = u""
253 for char in source:
254 charnum = ord(char)
255 if char in controlchars:
256 output += controlchars[char]
257 elif 0 <= charnum < 128:
258 output += str(char)
259 else:
260 output += u"\\u%04X" % charnum
261 return output
262
267 """Encodes source in the escaped-unicode encoding used by Mozilla
268 .properties files.
269 """
270 output = u""
271 for char in source:
272 if char in controlchars:
273 output += controlchars[char]
274 else:
275 output += char
276 return output
277
# Characters that may follow a backslash when decoding, mapped to the
# character that the two-character escape sequence stands for.
propertyescapes = {
    # Escapes that yield the escaped character itself.
    "\\": "\\",
    "'": "'",
    '"': '"',
    # Escapes that yield a control character.
    "f": "\f",
    "n": "\n",
    "r": "\r",
    "t": "\t",
}
284
# Characters that the encoders write in escaped form, mapped to the
# backslash sequence that replaces each of them in the output.
controlchars = {
    # The backslash itself must be doubled so it is not read as an escape.
    "\\": "\\\\",
    "\f": "\\f",
    "\n": "\\n",
    "\r": "\\r",
    "\t": "\\t",
}
297
302 """Decodes source from the escaped-unicode encoding used by .properties
303 files.
304
305 Java uses Latin1 by default, and Mozilla uses UTF-8 by default.
306
307 Since the .decode("unicode-escape") routine decodes everything, and we
308 don't want to we reimplemented the algorithm from Python Objects/unicode.c
309 in Python and modify it to retain escaped control characters.
310 """
311 output = u""
312 s = 0
313
314 def unichr2(i):
315 """Returns a Unicode string of one character with ordinal 32 <= i,
316 otherwise an escaped control character.
317 """
318 if 32 <= i:
319 return unichr(i)
320 elif unichr(i) in controlchars:
321
322
323 return unichr(i)
324 else:
325 return "\\u%04x" % i
326
327 while s < len(source):
328 c = source[s]
329 if c != '\\':
330 output += c
331 s += 1
332 continue
333 s += 1
334 if s >= len(source):
335
336
337 output += c
338 continue
339 c = source[s]
340 s += 1
341 if c == '\n':
342 pass
343
344 elif c in propertyescapes:
345 output += propertyescapes[c]
346
347
348 elif c in "uU":
349 digits = 4
350 x = 0
351 for digit in range(digits):
352 x <<= 4
353 if s + digit >= len(source):
354 digits = digit
355 break
356 c = source[s + digit].lower()
357 if c.isdigit():
358 x += ord(c) - ord('0')
359 elif c in "abcdef":
360 x += ord(c) - ord('a') + 10
361 else:
362 break
363 s += digits
364 output += unichr2(x)
365 elif c == "N":
366 if source[s] != "{":
367 logging.warn("Invalid named unicode escape: no { after \\N")
368 output += "\\" + c
369 continue
370 s += 1
371 e = source.find("}", s)
372 if e == -1:
373 logging.warn("Invalid named unicode escape: no } after \\N{")
374 output += "\\" + c
375 continue
376 import unicodedata
377 name = source[s:e]
378 output += unicodedata.lookup(name)
379 s = e + 1
380 else:
381 output += c
382 return output
383
384
def quotestr(source, escapeescapes=0):
    """Return source as a doublequote-delimited quoted string.

    Double quotes inside the text are escaped with a backslash by
    escapequotes(); escapeescapes is passed through to it unchanged.

    If source is a list, each item is quoted on its own and the quoted
    items are joined with newlines. An empty list gives an empty string.
    """
    if isinstance(source, list):
        # join() copes with an empty list, whereas a first-line/remaining
        # loop would finish without ever having built a result to return.
        return "\n".join(
            '"' + escapequotes(line, escapeescapes) + '"' for line in source)
    return '"' + escapequotes(source, escapeescapes) + '"'
401
404 """Returns a singlequote-delimited quoted string, escaping single quotes
405 by doubling them.
406 """
407 return "'" + escapesinglequotes(source) + "'"
408
409
def findend(string, substring):
    """Return the index just past the first occurrence of substring.

    Returns -1 if substring does not occur in string.
    """
    start = string.find(substring)
    if start == -1:
        return -1
    return start + len(substring)
415
418 return string.rstrip("\r\n")
419
429
433