1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22 """String processing utilities for extracting strings with various kinds
23 of delimiters"""
24
25 import logging
26 import htmlentitydefs
27
28 from translate.misc.typecheck import accepts
32 """Returns a list of locations where substr occurs in searchin
33 locations are not allowed to overlap"""
34 location = 0
35 locations = []
36 while location != -1:
37 location = searchin.find(substr, location)
38 if location != -1:
39 locations.append(location)
40 location += len(substr)
41 return locations
42
46 """Extracts a doublequote-delimited string from a string, allowing for
47 backslash-escaping returns tuple of (quoted string with quotes, still in
48 string at end).
49 """
50
51 instring = startinstring
52 enteredonce = False
53 lenstart = len(startdelim)
54 lenend = len(enddelim)
55 startdelim_places = find_all(source, startdelim)
56 if startdelim == enddelim:
57 enddelim_places = startdelim_places[:]
58 else:
59 enddelim_places = find_all(source, enddelim)
60 if escape is not None:
61 lenescape = len(escape)
62 escape_places = find_all(source, escape)
63
64 true_escape = False
65 true_escape_places = []
66 for escape_pos in escape_places:
67 if escape_pos - lenescape in escape_places:
68 true_escape = not true_escape
69 else:
70 true_escape = True
71 if true_escape:
72 true_escape_places.append(escape_pos)
73 startdelim_places = [pos for pos in startdelim_places if pos - lenescape not in true_escape_places]
74 enddelim_places = [pos + lenend for pos in enddelim_places if pos - lenescape not in true_escape_places]
75 else:
76 enddelim_places = [pos + lenend for pos in enddelim_places]
77
78 significant_places = [0] + startdelim_places + enddelim_places + [len(source)-1]
79 significant_places.sort()
80 extracted = ""
81 lastpos = None
82 for pos in significant_places:
83 if instring and pos in enddelim_places:
84
85
86 if lastpos == pos - lenstart and lastpos in startdelim_places:
87 continue
88 extracted += source[lastpos:pos]
89 instring = False
90 lastpos = pos
91 if (not instring) and pos in startdelim_places and not (enteredonce and not allowreentry):
92 instring = True
93 enteredonce = True
94 lastpos = pos
95 if instring:
96 extracted += source[lastpos:]
97 return (extracted, instring)
98
103 """Extracts a doublequote-delimited string from a string, allowing for
104 backslash-escaping includeescapes can also be a function that takes the
105 whole escaped string and returns the replaced version.
106 """
107 instring = startinstring
108 enteredonce = False
109 lenstart = len(startdelim)
110 lenend = len(enddelim)
111 startdelim_places = find_all(source, startdelim)
112 if startdelim == enddelim:
113 enddelim_places = startdelim_places[:]
114 else:
115 enddelim_places = find_all(source, enddelim)
116
117 if escape is not None:
118 lenescape = len(escape)
119 escape_places = find_all(source, escape)
120
121 true_escape = False
122 true_escape_places = []
123 for escape_pos in escape_places:
124 if escape_pos - lenescape in escape_places:
125 true_escape = not true_escape
126 else:
127 true_escape = True
128 if true_escape:
129 true_escape_places.append(escape_pos)
130 startdelim_places = [pos for pos in startdelim_places if pos - lenescape not in true_escape_places]
131 enddelim_places = [pos + lenend for pos in enddelim_places if pos - lenescape not in true_escape_places]
132 else:
133 enddelim_places = [pos + lenend for pos in enddelim_places]
134
135 significant_places = [0] + startdelim_places + enddelim_places + [len(source)-1]
136 significant_places.sort()
137 extracted = ""
138 lastpos = 0
139 callable_includeescapes = callable(includeescapes)
140 checkescapes = callable_includeescapes or not includeescapes
141 for pos in significant_places:
142 if instring and pos in enddelim_places and lastpos != pos - lenstart:
143 section_start, section_end = lastpos + len(startdelim), pos - len(enddelim)
144 section = source[section_start:section_end]
145 if escape is not None and checkescapes:
146 escape_list = [epos - section_start for epos in true_escape_places if section_start <= epos <= section_end]
147 new_section = ""
148 last_epos = 0
149 for epos in escape_list:
150 new_section += section[last_epos:epos]
151 if callable_includeescapes:
152 replace_escape = includeescapes(section[epos:epos + lenescape + 1])
153
154
155 if not isinstance(replace_escape, basestring):
156 if replace_escape:
157 replace_escape = section[epos:epos + lenescape + 1]
158 else:
159 replace_escape = section[epos + lenescape:epos + lenescape + 1]
160 new_section += replace_escape
161 last_epos = epos + lenescape + 1
162 else:
163 last_epos = epos + lenescape
164 section = new_section + section[last_epos:]
165 extracted += section
166 instring = False
167 lastpos = pos
168 if (not instring) and pos in startdelim_places and not (enteredonce and not allowreentry):
169 instring = True
170 enteredonce = True
171 lastpos = pos
172 if instring:
173 section_start = lastpos + len(startdelim)
174 section = source[section_start:]
175 if escape is not None and not includeescapes:
176 escape_list = [epos - section_start for epos in true_escape_places if section_start <= epos]
177 new_section = ""
178 last_epos = 0
179 for epos in escape_list:
180 new_section += section[last_epos:epos]
181 if callable_includeescapes and includeescapes(section[epos:epos + lenescape + 1]):
182 last_epos = epos
183 else:
184 last_epos = epos + lenescape
185 section = new_section + section[last_epos:]
186 extracted += section
187 return (extracted, instring)
188
191 "Returns the same string, with double quotes escaped with backslash"
192 if escapeescapes:
193 return source.replace('\\', '\\\\').replace('"', '\\"')
194 else:
195 return source.replace('"', '\\"')
196
199 "Returns the same string, with single quotes doubled"
200 return source.replace("'", "''")
201
204 """encodes source using HTML entities e.g. © -> &copy;"""
205 output = ""
206 for char in source:
207 charnum = ord(char)
208 if charnum in htmlentitydefs.codepoint2name:
209 output += "&%s;" % htmlentitydefs.codepoint2name[charnum]
210 else:
211 output += str(char)
212 return output
213
216 """decodes source using HTML entities e.g. &copy; -> ©"""
217 output = u""
218 inentity = False
219 for char in source:
220 if char == "&":
221 inentity = True
222 possibleentity = ""
223 continue
224 if inentity:
225 if char == ";":
226 if len(possibleentity) > 0 and possibleentity in htmlentitydefs.name2codepoint:
227 output += unichr(htmlentitydefs.name2codepoint[possibleentity])
228 inentity = False
229 else:
230 output += "&" + possibleentity + ";"
231 inentity = False
232 elif char == " ":
233 output += "&" + possibleentity + char
234 inentity = False
235 else:
236 possibleentity += char
237 else:
238 output += char
239 return output
240
243 """Encodes source in the escaped-unicode encoding used by Java
244 .properties files
245 """
246 output = u""
247 for char in source:
248 charnum = ord(char)
249 if char in controlchars:
250 output += controlchars[char]
251 elif 0 <= charnum < 128:
252 output += str(char)
253 else:
254 output += u"\\u%04X" % charnum
255 return output
256
259 """Encodes source in the escaped-unicode encoding used by Mozilla
260 .properties files.
261 """
262 output = u""
263 for char in source:
264 if char in controlchars:
265 output += controlchars[char]
266 else:
267 output += char
268 return output
269
# Maps the character that follows a backslash in escaped .properties text
# to the character that the two-character escape sequence stands for.
propertyescapes = {
    # Characters that stand for themselves when escaped.
    "\\": "\\",
    "'": "'",
    '"': '"',
    # Single-letter escapes for control characters.
    "f": "\f",
    "n": "\n",
    "r": "\r",
    "t": "\t",
}
276
# Maps a raw character to the backslash escape sequence written in its
# place when text is encoded for a .properties file.
controlchars = {
    # The escape character itself is doubled.
    "\\": "\\\\",
    # Control characters become their single-letter escapes.
    "\f": "\\f",
    "\n": "\\n",
    "\r": "\\r",
    "\t": "\\t",
}
289
293 """Decodes source from the escaped-unicode encoding used by .properties
294 files.
295
296 Java uses Latin1 by default, and Mozilla uses UTF-8 by default.
297
298 Since the .decode("unicode-escape") routine decodes everything, and we
299 don't want to we reimplemented the algorithm from Python Objects/unicode.c
300 in Python and modify it to retain escaped control characters.
301 """
302 output = u""
303 s = 0
304 if isinstance(source, str):
305 source = source.decode(encoding)
306
307 def unichr2(i):
308 """Returns a Unicode string of one character with ordinal 32 <= i,
309 otherwise an escaped control character.
310 """
311 if 32 <= i:
312 return unichr(i)
313 elif unichr(i) in controlchars:
314
315
316 return unichr(i)
317 else:
318 return "\\u%04x" % i
319
320 while s < len(source):
321 c = source[s]
322 if c != '\\':
323 output += c
324 s += 1
325 continue
326 s += 1
327 if s >= len(source):
328
329
330 output += c
331 continue
332 c = source[s]
333 s += 1
334 if c == '\n':
335 pass
336
337 elif c in propertyescapes:
338 output += propertyescapes[c]
339
340
341 elif c in "uU":
342 digits = 4
343 x = 0
344 for digit in range(digits):
345 x <<= 4
346 if s + digit >= len(source):
347 digits = digit
348 break
349 c = source[s + digit].lower()
350 if c.isdigit():
351 x += ord(c) - ord('0')
352 elif c in "abcdef":
353 x += ord(c) - ord('a') + 10
354 else:
355 break
356 s += digits
357 output += unichr2(x)
358 elif c == "N":
359 if source[s] != "{":
360 logging.warn("Invalid named unicode escape: no { after \\N")
361 output += "\\" + c
362 continue
363 s += 1
364 e = source.find("}", s)
365 if e == -1:
366 logging.warn("Invalid named unicode escape: no } after \\N{")
367 output += "\\" + c
368 continue
369 import unicodedata
370 name = source[s:e]
371 output += unicodedata.lookup(name)
372 s = e + 1
373 else:
374 output += c
375 return output
376
377
def quotestr(source, escapeescapes=0):
    """Returns a doublequote-delimited quoted string, escaping double
    quotes with backslash.

    If source is a list, each item is quoted separately and the quoted
    items are joined with newlines; an empty list gives an empty string.
    escapeescapes is passed through unchanged to escapequotes.
    """
    if isinstance(source, list):
        # join() replaces the manual first-line bookkeeping, which left the
        # result variable unbound (UnboundLocalError) for an empty list.
        return '\n'.join(['"' + escapequotes(line, escapeescapes) + '"'
                          for line in source])
    return '"' + escapequotes(source, escapeescapes) + '"'
394
397 """Returns a singlequote-delimited quoted string, escaping single quotes
398 with themselves.
399 """
400 return "'" + escapesinglequotes(source) + "'"
401
402
def findend(string, substring):
    """Returns the index just past the first occurrence of substring in
    string, or -1 if substring does not occur.
    """
    start = string.find(substring)
    if start == -1:
        return -1
    return start + len(substring)
408
411 return string.rstrip("\r\n")
412
422
426