Package translate :: Package tools :: Module pogrep
[hide private]
[frames | no frames]

Source Code for Module translate.tools.pogrep

  1  #!/usr/bin/env python 
  2  # -*- coding: utf-8 -*- 
  3  # 
  4  # Copyright 2002-2008 Zuza Software Foundation 
  5  # 
  6  # This file is part of translate. 
  7  # 
  8  # translate is free software; you can redistribute it and/or modify 
  9  # it under the terms of the GNU General Public License as published by 
 10  # the Free Software Foundation; either version 2 of the License, or 
 11  # (at your option) any later version. 
 12  # 
 13  # translate is distributed in the hope that it will be useful, 
 14  # but WITHOUT ANY WARRANTY; without even the implied warranty of 
 15  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 16  # GNU General Public License for more details. 
 17  # 
 18  # You should have received a copy of the GNU General Public License 
 19  # along with translate; if not, write to the Free Software 
 20  # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA 
 21   
 22  """Grep XLIFF, Gettext PO and TMX localization files 
 23   
 24  Matches are output to snippet files of the same type which can then be reviewed 
 25  and later merged using pomerge 
 26   
 27  See: http://translate.sourceforge.net/wiki/toolkit/pogrep for examples and 
 28  usage instructions 
 29  """ 
 30   
 31  import re 
 32  import locale 
 33   
 34  from translate.storage import factory 
 35  from translate.storage.poheader import poheader 
 36  from translate.misc import optrecurse 
 37  from translate.misc.multistring import multistring 
 38  from translate.lang import data 
 39   
 40   
class GrepMatch(object):
    """Just a small data structure that represents a search match.

    Attributes:
        unit: the unit in which the match was found.
        part: name of the matched part of the unit: 'target', 'source',
            'notes' or 'locations'.
        part_n: index into the matched part (used for plural strings, notes
            and locations).
        start: start offset of the match in the matched string.
        end: end offset of the match in the matched string.
    """

    # INITIALIZERS #
    def __init__(self, unit, part='target', part_n=0, start=0, end=0):
        self.unit = unit
        self.part = part
        self.part_n = part_n
        self.start = start
        self.end = end

    # ACCESSORS #
    def get_getter(self):
        """Return a zero-argument callable that fetches the matched string.

        The callable reads from the unit each time it is called, so it always
        reflects the unit's current content.  Returns None if self.part is
        not one of 'target', 'source', 'notes' or 'locations'.
        """
        if self.part == 'target':
            if self.unit.hasplural():
                getter = lambda: self.unit.target.strings[self.part_n]
            else:
                getter = lambda: self.unit.target
            return getter
        elif self.part == 'source':
            if self.unit.hasplural():
                getter = lambda: self.unit.source.strings[self.part_n]
            else:
                getter = lambda: self.unit.source
            return getter
        elif self.part == 'notes':

            def getter():
                return self.unit.getnotes()[self.part_n]
            return getter
        elif self.part == 'locations':

            def getter():
                return self.unit.getlocations()[self.part_n]
            return getter
        return None

    def get_setter(self):
        """Return a one-argument callable that replaces the matched string.

        Only the 'target' part can be written; for every other part None is
        returned.
        """
        if self.part != 'target':
            return None

        if self.unit.hasplural():

            def setter(value):
                # Replace only the plural form this match refers to, then
                # assign the whole list back to the unit.
                strings = self.unit.target.strings
                strings[self.part_n] = value
                self.unit.target = strings
        else:

            def setter(value):
                self.unit.target = value
        return setter

    # SPECIAL METHODS #
    def __str__(self):
        """Describe the match, quoting the matched text with up to two
        characters of context on each side."""
        text = self.get_getter()()
        # Only the lower bound needs clamping: a negative slice start would
        # count from the end of the string, whereas a slice end beyond the
        # string is truncated by Python automatically.
        matchpart = text[max(self.start - 2, 0):self.end + 2]
        return '<GrepMatch "%s" part=%s[%d] start=%d end=%d>' % (matchpart, self.part, self.part_n, self.start, self.end)

    def __repr__(self):
        return str(self)
103 104
def real_index(string, nfc_index):
    """Calculate the real index in the unnormalized string that corresponds to
    the index nfc_index in the normalized string.

    Starting from nfc_index, the prefix of string is grown one character at a
    time until its normalized form is longer than nfc_index characters; the
    index just before that point is returned.  If the end of string is reached
    first, len(string) is returned.
    """
    max_length = len(string)
    length = nfc_index
    while len(data.normalize(string[:length])) <= nfc_index:
        # Use ">=" rather than "==": if nfc_index is already beyond the end
        # of the unnormalized string (possible if normalization lengthened
        # the text), "==" would never be true and the loop would never end.
        if length >= max_length:
            return max_length
        length += 1
    return length - 1
115 116
def find_matches(unit, part, strings, re_search):
    """Return the GrepMatch objects where re_search matches in strings.

    Each non-empty entry of strings is normalized before searching, and the
    offsets of every hit are mapped back onto the unnormalized entry with
    real_index().  The position of the entry in strings becomes the part_n of
    the resulting GrepMatch.
    """
    found = []
    for position, text in enumerate(strings):
        if text:
            nfc_text = data.normalize(text)
            found.extend(
                GrepMatch(unit, part=part, part_n=position,
                          start=real_index(text, hit.start()),
                          end=real_index(text, hit.end()))
                for hit in re_search.finditer(nfc_text))
    return found
129 130
class GrepFilter:
    """Searches units for a string or regular expression.

    filterunit() and filterfile() answer whether a unit matches, while
    getmatches() returns the position of every individual match.
    """

    def __init__(self, searchstring, searchparts, ignorecase=False, useregexp=False,
                 invertmatch=False, accelchar=None, encoding='utf-8',
                 max_matches=0):
        """Build a filter that searches for searchstring.

        searchstring -- the text or regular expression to look for; anything
            that is not already unicode is decoded using encoding
        searchparts -- the parts of a unit to search ('source', 'target',
            'notes', 'locations' or the older names 'msgid', 'msgstr',
            'comment'); if empty, source and target are searched
        ignorecase -- ignore case distinctions
        useregexp -- treat searchstring as a regular expression
        invertmatch -- make matches() report strings that do NOT match
        accelchar -- accelerator character to strip from text before matching
        encoding -- encoding used to decode a non-unicode searchstring
        max_matches -- if non-zero, getmatches() raises an Exception once more
            than this many matches have been collected
        """
        if isinstance(searchstring, unicode):
            self.searchstring = searchstring
        else:
            self.searchstring = searchstring.decode(encoding)
        self.searchstring = data.normalize(self.searchstring)
        if searchparts:
            # For now we still support the old terminology, except for the old 'source'
            # which has a new meaning now.
            self.search_source = ('source' in searchparts) or ('msgid' in searchparts)
            self.search_target = ('target' in searchparts) or ('msgstr' in searchparts)
            self.search_notes = ('notes' in searchparts) or ('comment' in searchparts)
            self.search_locations = 'locations' in searchparts
        else:
            self.search_source = True
            self.search_target = True
            self.search_notes = False
            self.search_locations = False
        self.ignorecase = ignorecase
        self.useregexp = useregexp
        if self.useregexp:
            # Never lowercase a regular expression: that would turn escapes
            # such as \S, \W, \D and \B into their opposites.  Case is
            # ignored through the IGNORECASE flag instead.  UNICODE matches
            # the flags used in getmatches() and keeps case-insensitive
            # matching working for non-ASCII text.
            flags = re.UNICODE
            if self.ignorecase:
                flags |= re.IGNORECASE
            self.searchpattern = re.compile(self.searchstring, flags)
        elif self.ignorecase:
            self.searchstring = self.searchstring.lower()
        self.invertmatch = invertmatch
        self.accelchar = accelchar
        self.max_matches = max_matches

    def matches(self, teststr):
        """Return True if teststr matches the search (False if it does not).

        None never matches.  With invertmatch set, the result for any other
        string is negated.
        """
        if teststr is None:
            return False
        teststr = data.normalize(teststr)
        if self.ignorecase:
            teststr = teststr.lower()
        if self.accelchar:
            # Literal replacement: the accelerator is a plain character, not
            # a regular expression.  A doubled accelerator becomes "#",
            # any remaining single accelerator is removed.
            teststr = teststr.replace(self.accelchar + self.accelchar, "#")
            teststr = teststr.replace(self.accelchar, "")
        if self.useregexp:
            found = self.searchpattern.search(teststr) is not None
        else:
            found = self.searchstring in teststr
        if self.invertmatch:
            found = not found
        return found

    def _matches_any(self, value):
        """Return True if value, or any of its plural forms, matches."""
        if isinstance(value, multistring):
            strings = value.strings
        else:
            strings = [value]
        for string in strings:
            if self.matches(string):
                return True
        return False

    def filterunit(self, unit):
        """runs filters on an element

        Returns True if any of the searched parts of unit matches, otherwise
        False.  Header units never match.
        """
        if unit.isheader():
            return False

        if self.search_source and self._matches_any(unit.source):
            return True
        if self.search_target and self._matches_any(unit.target):
            return True
        if self.search_notes and self.matches(unit.getnotes()):
            return True
        if self.search_locations and self.matches(u" ".join(unit.getlocations())):
            return True
        return False

    def filterfile(self, thefile):
        """runs filters on a translation file object

        Returns a new store of the same type as thefile holding only the
        units accepted by filterunit().
        """
        thenewfile = type(thefile)()
        thenewfile.setsourcelanguage(thefile.sourcelanguage)
        thenewfile.settargetlanguage(thefile.targetlanguage)
        for unit in thefile.units:
            if self.filterunit(unit):
                thenewfile.addunit(unit)

        if isinstance(thenewfile, poheader):
            # Carry the header of the original file over to the new one.
            thenewfile.updateheader(add=True, **thefile.parseheader())
        return thenewfile

    def getmatches(self, units):
        """Find every individual match in units.

        Returns a tuple (matches, indexes): matches is a list of GrepMatch
        objects and indexes lists the positions in units of the units that
        contain at least one match.  Both are empty if the search string is
        empty.  Raises Exception when max_matches is set and exceeded.
        """
        if not self.searchstring:
            return [], []

        searchstring = self.searchstring
        # The pattern and the searched text are unicode, so UNICODE is the
        # flag that applies; LOCALE is meant for byte strings and is
        # therefore not combined with it.
        flags = re.MULTILINE | re.UNICODE

        if self.ignorecase:
            flags |= re.IGNORECASE
        if not self.useregexp:
            searchstring = re.escape(searchstring)
        self.re_search = re.compile(u'(%s)' % (searchstring), flags)

        matches = []
        indexes = []

        for index, unit in enumerate(units):
            old_length = len(matches)

            if self.search_target:
                if unit.hasplural():
                    targets = unit.target.strings
                else:
                    targets = [unit.target]
                matches.extend(find_matches(unit, 'target', targets, self.re_search))
            if self.search_source:
                if unit.hasplural():
                    sources = unit.source.strings
                else:
                    sources = [unit.source]
                matches.extend(find_matches(unit, 'source', sources, self.re_search))
            if self.search_notes:
                # NOTE(review): filterunit() treats getnotes() as one string;
                # if that is what it returns, find_matches() iterates it
                # character by character here -- confirm this is intended.
                matches.extend(find_matches(unit, 'notes', unit.getnotes(), self.re_search))

            if self.search_locations:
                matches.extend(find_matches(unit, 'locations', unit.getlocations(), self.re_search))

            # A search for a single letter or an all-inclusive regular
            # expression could give enough results to cause performance
            # problems. The answer is probably not very useful at this scale.
            if self.max_matches and len(matches) > self.max_matches:
                raise Exception("Too many matches found")

            if len(matches) > old_length:
                indexes.append(index)

        return matches, indexes
273 274
class GrepOptionParser(optrecurse.RecursiveOptionParser):
    """Option parser for the grep tool: the first freestanding argument is
    the search string."""

    def parse_args(self, args=None, values=None):
        """Parse the command line, filling in the search string and any
        input/output given as freestanding arguments.

        Returns the tuple (options, args), where args holds whatever
        freestanding arguments were left over.
        """
        options, remaining = optrecurse.optparse.OptionParser.parse_args(self, args, values)

        # The search string always comes first.
        if not remaining:
            self.error("At least one argument must be given for the search string")
        else:
            options.searchstring = remaining[0]
            remaining = remaining[1:]

        # Without --input, the freestanding args are inputs; when --output
        # is missing as well, the last one is kept back to become the output.
        if remaining and not options.input:
            if options.output:
                options.input, remaining = remaining, []
            else:
                options.input, remaining = remaining[:-1], remaining[-1:]

        if remaining and not options.output:
            options.output, remaining = remaining[-1], remaining[:-1]

        if remaining:
            self.error("You have used an invalid combination of --input, --output and freestanding args")

        # A single input is passed on as a plain value rather than a list.
        if isinstance(options.input, list) and len(options.input) == 1:
            options.input = options.input[0]
        return (options, remaining)

    def set_usage(self, usage=None):
        """Set the usage string; when usage is None it is assembled from
        getusagestring() of every option, after the search string."""
        if usage is not None:
            super(GrepOptionParser, self).set_usage(usage)
            return
        option_usages = [self.getusagestring(option) for option in self.option_list]
        self.usage = "%prog searchstring " + " ".join(option_usages)

    def run(self):
        """Parse the arguments, attach a GrepFilter built from them to the
        options and hand over to recursiveprocess."""
        options, _leftover = self.parse_args()
        options.inputformats = self.inputformats
        options.outputoptions = self.outputoptions
        encoding = locale.getpreferredencoding()
        options.checkfilter = GrepFilter(
            options.searchstring,
            options.searchparts,
            options.ignorecase,
            options.useregexp,
            options.invertmatch,
            options.accelchar,
            encoding)
        self.usepsyco(options)
        self.recursiveprocess(options)
318 319
def rungrep(inputfile, outputfile, templatefile, checkfilter):
    """Filter inputfile with checkfilter and write the result to outputfile.

    Nothing is written when the filtered store is empty.  Returns True if
    output was written, False otherwise.  templatefile is not used.
    """
    filtered = checkfilter.filterfile(factory.getobject(inputfile))
    has_content = not filtered.isempty()
    if has_content:
        outputfile.write(str(filtered))
    return has_content
328 329
def cmdlineparser():
    """Build and return the GrepOptionParser with all of pogrep's formats
    and command line options registered."""
    # Every supported extension is written out in its own format; input
    # without a known extension (None) is written as po.
    extensions = ("po", "pot", "mo", "gmo", "tmx", "xliff", "xlf", "xlff")
    formats = dict([(extension, (extension, rungrep)) for extension in extensions])
    formats[None] = ("po", rungrep)

    searchable_parts = ["source", "target", "notes", "locations", "msgid", "msgstr", "comment"]
    accelerators = ["&", "_", "~"]

    parser = GrepOptionParser(formats)
    parser.add_option(
        "", "--search", dest="searchparts",
        action="append", type="choice", choices=searchable_parts,
        metavar="SEARCHPARTS",
        help="searches the given parts (source, target, notes and locations)")
    parser.add_option(
        "-I", "--ignore-case", dest="ignorecase",
        action="store_true", default=False,
        help="ignore case distinctions")
    parser.add_option(
        "-e", "--regexp", dest="useregexp",
        action="store_true", default=False,
        help="use regular expression matching")
    parser.add_option(
        "-v", "--invert-match", dest="invertmatch",
        action="store_true", default=False,
        help="select non-matching lines")
    parser.add_option(
        "", "--accelerator", dest="accelchar",
        action="store", type="choice", choices=accelerators,
        metavar="ACCELERATOR",
        help="ignores the given accelerator when matching")
    parser.set_usage()
    parser.passthrough.append('checkfilter')
    parser.description = __doc__
    return parser
353 354
def main():
    """Build the command line parser and run it."""
    cmdlineparser().run()


if __name__ == '__main__':
    main()