Package translate :: Package misc :: Module quote
[hide private]
[frames | no frames]

Source Code for Module translate.misc.quote

  1  #!/usr/bin/env python 
  2  # -*- coding: utf-8 -*- 
  3  # 
  4  # Copyright 2002-2006 Zuza Software Foundation 
  5  # 
  6  # This file is part of translate. 
  7  # 
  8  # translate is free software; you can redistribute it and/or modify 
  9  # it under the terms of the GNU General Public License as published by 
 10  # the Free Software Foundation; either version 2 of the License, or 
 11  # (at your option) any later version. 
 12  # 
 13  # translate is distributed in the hope that it will be useful, 
 14  # but WITHOUT ANY WARRANTY; without even the implied warranty of 
 15  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 16  # GNU General Public License for more details. 
 17  # 
 18  # You should have received a copy of the GNU General Public License 
 19  # along with translate; if not, write to the Free Software 
 20  # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA 
 21   
 22  """String processing utilities for extracting strings with various kinds 
 23  of delimiters""" 
 24   
 25  import logging 
 26  import htmlentitydefs 
 27   
 28  from translate.misc.typecheck import accepts, returns 
def find_all(searchin, substr):
    """Returns a list of locations where substr occurs in searchin.

    The search runs left to right and matches are not allowed to overlap:
    after a match the search resumes just past the end of that match.

    An empty substr matches at every index from 0 to len(searchin)
    inclusive (the same positions that str.count counts).

    @param searchin: the string to search in
    @param substr: the string to search for
    @return: list of start indexes, in increasing order
    """
    # Always move forward by at least one character; with an empty substr
    # a step of len(substr) == 0 would find the same spot forever.
    step = len(substr) or 1
    locations = []
    location = searchin.find(substr)
    while location != -1:
        locations.append(location)
        location = searchin.find(substr, location + step)
    return locations
42
def extract(source, startdelim, enddelim,
            escape=None, startinstring=False, allowreentry=True):
    """Extracts the delimited sections of source, delimiters included.

    All sections found are concatenated in order of appearance.

    @param source: the string to extract from
    @param startdelim: the string that opens a section
    @param enddelim: the string that closes a section (may equal startdelim)
    @param escape: optional escape string; a delimiter directly preceded by
        an unescaped escape is ignored (an escape preceded by an escape is
        itself escaped)
    @param startinstring: whether source starts already inside a section
    @param allowreentry: if False, a section can be entered only once
    @return: tuple of (extracted text with its delimiters, whether we are
        still inside a section at the end of source)
    """
    # Note that this returns the quote characters as well... even internally
    instring = startinstring
    enteredonce = False
    lenstart = len(startdelim)
    lenend = len(enddelim)
    startdelim_places = find_all(source, startdelim)
    if startdelim == enddelim:
        enddelim_places = startdelim_places[:]
    else:
        enddelim_places = find_all(source, enddelim)
    if escape is not None:
        lenescape = len(escape)
        escape_places = find_all(source, escape)
        # Sets keep the many membership tests below constant-time
        escape_lookup = set(escape_places)
        # Filter escaped escapes
        true_escape = False
        true_escape_lookup = set()
        for escape_pos in escape_places:
            if escape_pos - lenescape in escape_lookup:
                true_escape = not true_escape
            else:
                true_escape = True
            if true_escape:
                true_escape_lookup.add(escape_pos)
        startdelim_places = [pos for pos in startdelim_places
                             if pos - lenescape not in true_escape_lookup]
        # End positions are stored as the index just past the delimiter
        enddelim_places = [pos + lenend for pos in enddelim_places
                           if pos - lenescape not in true_escape_lookup]
    else:
        enddelim_places = [pos + lenend for pos in enddelim_places]
    # Get a sorted list of the significant places in the string
    significant_places = ([0] + startdelim_places + enddelim_places +
                          [len(source) - 1])
    significant_places.sort()
    start_lookup = set(startdelim_places)
    end_lookup = set(enddelim_places)
    extracted = ""
    lastpos = None
    for pos in significant_places:
        if instring and pos in end_lookup:
            # Make sure that if startdelim == enddelim we don't get confused
            # and count the same string as start and end.
            if lastpos == pos - lenstart and lastpos in start_lookup:
                continue
            extracted += source[lastpos:pos]
            instring = False
            lastpos = pos
        if ((not instring) and pos in start_lookup and
                not (enteredonce and not allowreentry)):
            instring = True
            enteredonce = True
            lastpos = pos
    if instring:
        # Unterminated section: take everything up to the end of source
        extracted += source[lastpos:]
    return (extracted, instring)
98
def extractwithoutquotes(source, startdelim, enddelim, escape=None,
                         startinstring=False, includeescapes=True,
                         allowreentry=True):
    """Extracts the delimited sections of source, without the delimiters.

    All sections found are concatenated in order of appearance.

    @param source: the string to extract from
    @param startdelim: the string that opens a section
    @param enddelim: the string that closes a section (may equal startdelim)
    @param escape: optional escape string; a delimiter directly preceded by
        an unescaped escape is ignored
    @param startinstring: whether source starts already inside a section
    @param includeescapes: if true, escapes are left in the result; if
        false, each escape string is removed (the escaped character stays).
        It can also be a function that takes the whole escaped string
        (escape plus the following character) and returns the replaced
        version; a non-string return value is treated as the old-style
        boolean "keep the escape" answer.
    @param allowreentry: if False, a section can be entered only once
    @return: tuple of (extracted text, whether we are still inside a
        section at the end of source)
    """
    # NOTE(review): lastpos starts at 0, so with startinstring=True the
    # first section still skips len(startdelim) characters from the start
    # of source -- confirm that callers rely on this.
    instring = startinstring
    enteredonce = False
    lenstart = len(startdelim)
    lenend = len(enddelim)
    startdelim_places = find_all(source, startdelim)
    if startdelim == enddelim:
        enddelim_places = startdelim_places[:]
    else:
        enddelim_places = find_all(source, enddelim)
    if escape is not None:
        lenescape = len(escape)
        escape_places = find_all(source, escape)
        # Sets keep the many membership tests below constant-time; this
        # function is called very often.
        escape_lookup = set(escape_places)
        # filter escaped escapes
        true_escape = False
        true_escape_places = []
        for escape_pos in escape_places:
            if escape_pos - lenescape in escape_lookup:
                true_escape = not true_escape
            else:
                true_escape = True
            if true_escape:
                true_escape_places.append(escape_pos)
        true_escape_lookup = set(true_escape_places)
        startdelim_places = [pos for pos in startdelim_places
                             if pos - lenescape not in true_escape_lookup]
        # End positions are stored as the index just past the delimiter
        enddelim_places = [pos + lenend for pos in enddelim_places
                           if pos - lenescape not in true_escape_lookup]
    else:
        enddelim_places = [pos + lenend for pos in enddelim_places]
    # get a sorted list of the significant places in the string
    significant_places = ([0] + startdelim_places + enddelim_places +
                          [len(source) - 1])
    significant_places.sort()
    start_lookup = set(startdelim_places)
    end_lookup = set(enddelim_places)
    extracted = ""
    lastpos = 0
    callable_includeescapes = callable(includeescapes)
    checkescapes = callable_includeescapes or not includeescapes
    for pos in significant_places:
        # The lastpos test stops a delimiter that is both start and end
        # from closing the section it has just opened.
        if instring and pos in end_lookup and lastpos != pos - lenstart:
            section_start, section_end = lastpos + lenstart, pos - lenend
            section = source[section_start:section_end]
            if escape is not None and checkescapes:
                # Escape positions relative to the start of this section
                escape_list = [epos - section_start
                               for epos in true_escape_places
                               if section_start <= epos <= section_end]
                new_section = ""
                last_epos = 0
                for epos in escape_list:
                    new_section += section[last_epos:epos]
                    if callable_includeescapes:
                        escaped = section[epos:epos + lenescape + 1]
                        replace_escape = includeescapes(escaped)
                        # TODO: deprecate old method of returning boolean from
                        # includeescape, by removing this if block
                        if not isinstance(replace_escape, basestring):
                            if replace_escape:
                                replace_escape = escaped
                            else:
                                replace_escape = section[epos + lenescape:epos + lenescape + 1]
                        new_section += replace_escape
                        last_epos = epos + lenescape + 1
                    else:
                        # Drop the escape itself, keep what follows it
                        last_epos = epos + lenescape
                section = new_section + section[last_epos:]
            extracted += section
            instring = False
            lastpos = pos
        if ((not instring) and pos in start_lookup and
                not (enteredonce and not allowreentry)):
            instring = True
            enteredonce = True
            lastpos = pos
    if instring:
        # Unterminated section: take everything up to the end of source
        section_start = lastpos + lenstart
        section = source[section_start:]
        # NOTE(review): unlike the loop above, this tail is only processed
        # when includeescapes is falsy, so a callable includeescapes is
        # normally not applied here -- confirm this is intended.
        if escape is not None and not includeescapes:
            escape_list = [epos - section_start
                           for epos in true_escape_places
                           if section_start <= epos]
            new_section = ""
            last_epos = 0
            for epos in escape_list:
                new_section += section[last_epos:epos]
                if callable_includeescapes and includeescapes(section[epos:epos + lenescape + 1]):
                    last_epos = epos
                else:
                    last_epos = epos + lenescape
            section = new_section + section[last_epos:]
        extracted += section
    return (extracted, instring)
188
def escapequotes(source, escapeescapes=0):
    """Returns source with every double quote preceded by a backslash.

    If escapeescapes is true, existing backslashes are doubled first so
    that they survive a later unescaping step.
    """
    if escapeescapes:
        # Backslashes must be handled before the quotes, otherwise the
        # backslashes added for the quotes would be doubled as well.
        source = source.replace('\\', '\\\\')
    return source.replace('"', '\\"')
196
def escapesinglequotes(source):
    """Returns source with each single quote written twice."""
    # Cutting at every quote and gluing the pieces back together with two
    # quotes doubles each one.
    pieces = source.split("'")
    return "''".join(pieces)
201
@accepts(unicode)
@returns(unicode)
def htmlentityencode(source):
    """Encodes source using HTML entities e.g. (c) sign -> &copy;

    Every character whose code point has a name in
    htmlentitydefs.codepoint2name is replaced by the matching &name;
    entity; all other characters are copied unchanged.

    @param source: the unicode string to encode
    @return: the encoded unicode string
    """
    codepoint2name = htmlentitydefs.codepoint2name
    pieces = []
    for char in source:
        name = codepoint2name.get(ord(char))
        if name is None:
            # Keep the character as it is. Converting it with str() would
            # raise UnicodeEncodeError for non-ASCII characters that have
            # no named entity.
            pieces.append(char)
        else:
            pieces.append(u"&%s;" % name)
    return u"".join(pieces)
215
@accepts(unicode)
@returns(unicode)
def htmlentitydecode(source):
    """Decodes source using HTML entities e.g. &copy; -> (c) sign

    Only names listed in htmlentitydefs.name2codepoint are decoded.
    Anything that looks like the start of an entity but is not one
    (unknown name, interrupted by a space or another ampersand, or never
    closed before the end of source) is copied to the output unchanged.

    @param source: the unicode string to decode
    @return: the decoded unicode string
    """
    name2codepoint = htmlentitydefs.name2codepoint
    pieces = []
    inentity = False
    possibleentity = u""
    for char in source:
        if char == "&":
            if inentity:
                # The previous ampersand never got its semicolon, so it
                # was literal text: keep it instead of dropping it.
                pieces.append("&" + possibleentity)
            inentity = True
            possibleentity = u""
            continue
        if not inentity:
            pieces.append(char)
        elif char == ";":
            # An empty name is never in the table, so no length check
            if possibleentity in name2codepoint:
                pieces.append(unichr(name2codepoint[possibleentity]))
            else:
                pieces.append("&" + possibleentity + ";")
            inentity = False
        elif char == " ":
            # Entity names cannot contain spaces: this was literal text
            pieces.append("&" + possibleentity + char)
            inentity = False
        else:
            possibleentity += char
    if inentity:
        # Source ended inside a candidate entity (e.g. "AT&T"): keep it
        pieces.append("&" + possibleentity)
    return u"".join(pieces)
244
@accepts(unicode)
@returns(unicode)
def javapropertiesencode(source):
    """Encodes source in the escaped-unicode encoding used by Java
    .properties files.

    Characters listed in controlchars get their backslash escape, other
    characters below 128 are copied, and everything else is written as a
    \\uXXXX escape with upper-case hex digits.
    """
    encoded = []
    for char in source:
        if char in controlchars:
            encoded.append(controlchars[char])
            continue
        codepoint = ord(char)
        if 0 <= codepoint < 128:
            encoded.append(str(char))
        else:
            encoded.append(u"\\u%04X" % codepoint)
    return u"".join(encoded)
262
@accepts(unicode)
@returns(unicode)
def mozillapropertiesencode(source):
    """Encodes source in the escaped-unicode encoding used by Mozilla
    .properties files.

    Only the characters listed in controlchars are escaped; every other
    character is copied unchanged.
    """
    return u"".join([controlchars.get(char, char) for char in source])
# Maps the character that follows a backslash in a .properties value to the
# character it stands for.
propertyescapes = {
    # characters that simply stand for themselves
    "\\": "\\",
    "'": "'",
    '"': '"',
    # letters that name a control character we keep
    "f": "\f",
    "n": "\n",
    "r": "\r",
    "t": "\t",
}

# Maps a raw character to its backslash-escaped spelling (the reverse
# direction of the table above, without the quote characters).
controlchars = {
    "\\": "\\\\",
    "\f": "\\f",
    "\n": "\\n",
    "\r": "\\r",
    "\t": "\\t",
}
def escapecontrols(source):
    """Escapes the control characters (and backslashes) in the given string.

    Each character that is a key of controlchars is replaced by its
    escaped spelling; all other characters are copied unchanged.

    @param source: the string to escape
    @return: the escaped string, of the same string type as source
    """
    # One pass over the input instead of one replace() per table entry:
    # with successive replaces the result depends on the iteration order of
    # the dictionary, because the backslashes added for e.g. a newline get
    # doubled if the backslash entry happens to be processed afterwards.
    # source[:0] is an empty string of the same type as source, so the
    # joined result keeps that type even when source is empty.
    return source[:0].join([controlchars.get(char, char) for char in source])
297
@accepts(unicode)
@returns(unicode)
def propertiesdecode(source):
    """Decodes source from the escaped-unicode encoding used by .properties
    files.

    Java uses Latin1 by default, and Mozilla uses UTF-8 by default.

    Since the .decode("unicode-escape") routine decodes everything, and we
    don't want to we reimplemented the algorithm from Python Objects/unicode.c
    in Python and modify it to retain escaped control characters.

    Handled after a backslash: a newline (dropped together with the
    backslash), the characters in propertyescapes, \\uXXXX / \\UXXXX with up
    to four hex digits, and \\N{name}. For any other character the
    backslash is dropped and the character kept. A backslash at the very
    end of source is kept so that the parser can see the continuation.

    @param source: the unicode string to decode
    @return: the decoded unicode string
    @raise KeyError: if a \\N{name} escape uses an unknown character name
    """
    import unicodedata

    hexdigits = "0123456789abcdef"
    output = u""
    s = 0

    def unichr2(i):
        """Returns a Unicode string of one character with ordinal 32 <= i,
        otherwise an escaped control character.
        """
        # Characters in controlchars are returned unescaped as well; if
        # people want to escape them they can use escapecontrols.
        if 32 <= i or unichr(i) in controlchars:
            return unichr(i)
        return "\\u%04x" % i

    while s < len(source):
        c = source[s]
        if c != '\\':
            output += c
            s += 1
            continue
        s += 1
        if s >= len(source):
            # this is an escape at the end of the line, which implies
            # a continuation..., return the escape to inform the parser
            output += c
            continue
        c = source[s]
        s += 1
        if c == '\n':
            pass
        # propertyescapes lookups
        elif c in propertyescapes:
            output += propertyescapes[c]
        # \uXXXX escapes
        # \UXXXX escapes
        elif c in "uU":
            x = 0
            digits = 0
            # Consume at most four hex digits. The value is only shifted
            # once a digit is known to be valid, and only the digits that
            # were really read are skipped, so a short or malformed escape
            # neither corrupts the value nor swallows the text after it.
            while digits < 4 and s + digits < len(source):
                value = hexdigits.find(source[s + digits].lower())
                if value == -1:
                    break
                x = (x << 4) + value
                digits += 1
            s += digits
            output += unichr2(x)
        elif c == "N":
            # Check the bounds first: "\N" may be the last thing in source
            if s >= len(source) or source[s] != "{":
                logging.warning("Invalid named unicode escape: no { after \\N")
                output += "\\" + c
                continue
            e = source.find("}", s + 1)
            if e == -1:
                logging.warning("Invalid named unicode escape: no } after \\N{")
                # s still points at the "{", so it is copied to the output
                # by the next iteration instead of being lost
                output += "\\" + c
                continue
            name = source[s + 1:e]
            output += unicodedata.lookup(name)
            s = e + 1
        else:
            output += c  # Drop any \ that we don't specifically handle
    return output
def quotestr(source, escapeescapes=0):
    """Returns a doublequote-delimited quoted string, escaping double
    quotes with backslash.

    @param source: a string, or a list of strings; each item of a list is
        quoted on its own and the results are joined with newlines (an
        empty list gives an empty string)
    @param escapeescapes: passed on to escapequotes; if true, backslashes
        are escaped as well
    @return: the quoted string
    """
    if isinstance(source, list):
        # join() also copes with an empty list, which used to fail because
        # the result variable was only assigned inside the loop
        return "\n".join(['"' + escapequotes(line, escapeescapes) + '"'
                          for line in source])
    return '"' + escapequotes(source, escapeescapes) + '"'
401
def singlequotestr(source):
    """Returns a singlequote-delimited quoted string, escaping single quotes
    by doubling them.
    """
    quote = "'"
    escaped = escapesinglequotes(source)
    return quote + escaped + quote
408
def findend(string, substring):
    """Returns the index just past the first occurrence of substring in
    string, or -1 if substring does not occur.
    """
    start = string.find(substring)
    if start == -1:
        return start
    return start + len(substring)
415
def rstripeol(string):
    """Returns string without any trailing carriage returns or newlines."""
    end = len(string)
    # Walk back over the end-of-line characters at the tail
    while end > 0 and string[end - 1] in "\r\n":
        end -= 1
    return string[:end]
419
def stripcomment(comment, startstring="<!--", endstring="-->"):
    """Returns the text of comment without its comment markers.

    The text between the first startstring and the next endstring is
    returned with surrounding whitespace removed. If startstring is
    missing the text is taken from the beginning of comment; if endstring
    is missing it is taken up to the end of comment.

    @param comment: the string containing the comment
    @param startstring: the marker that opens the comment
    @param endstring: the marker that closes the comment
    @return: the stripped comment text
    """
    cstart = comment.find(startstring)
    if cstart == -1:
        cstart = 0
    else:
        cstart += len(startstring)
    cend = comment.find(endstring, cstart)
    if cend == -1:
        # No closing marker: using -1 as the slice end would silently cut
        # off the last character, so run to the end instead
        cend = len(comment)
    return comment[cstart:cend].strip()
429
def unstripcomment(comment, startstring="<!-- ", endstring=" -->\n"):
    """Returns comment, with surrounding whitespace removed, wrapped in
    startstring and endstring.
    """
    body = comment.strip()
    return startstring + body + endstring
433