Package translate :: Package lang :: Module common
[hide private]
[frames | no frames]

Source Code for Module translate.lang.common

  1  #!/usr/bin/env python 
  2  # -*- coding: utf-8 -*- 
  3  # 
  4  # Copyright 2007-2008 Zuza Software Foundation 
  5  # 
  6  # This file is part of translate. 
  7  # 
  8  # translate is free software; you can redistribute it and/or modify 
  9  # it under the terms of the GNU General Public License as published by 
 10  # the Free Software Foundation; either version 2 of the License, or 
 11  # (at your option) any later version. 
 12  # 
 13  # translate is distributed in the hope that it will be useful, 
 14  # but WITHOUT ANY WARRANTY; without even the implied warranty of 
 15  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 16  # GNU General Public License for more details. 
 17  # 
 18  # You should have received a copy of the GNU General Public License 
 19  # along with translate; if not, write to the Free Software 
 20  # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA 
 21   
 22  """This module contains all the common features for languages. 
 23   
 24     Supported features 
 25     ================== 
 26       - language code (km, af) 
 27       - language name (Khmer, Afrikaans) 
 28       - Plurals 
 29         - Number of plurals (nplurals) 
 30         - Plural equation 
 31       - pofilter tests to ignore 
 32   
 33     Segmentation 
 34     ------------ 
 35       - characters 
 36       - words 
 37       - sentences 
 38   
 39     TODOs and Ideas for possible features 
 40     ===================================== 
 41       - Language-Team information 
 42       - Segmentation 
 43         - phrases 
 44   
 45     Punctuation 
 46     ----------- 
 47       - End of sentence 
 48       - Start of sentence 
 49       - Middle of sentence 
 50       - Quotes 
 51         - single 
 52         - double 
 53   
 54       - Valid characters 
 55       - Accelerator characters 
 56       - Special characters 
 57       - Direction (rtl or ltr) 
 58  """ 
 59   
 60  import re 
 61   
 62  from translate.lang import data 
 63   
 64   
class Common(object):
    """This class is the common parent class for all language classes."""

    code = ""
    """The ISO 639 language code, possibly with a country specifier or other
    modifier.

    Examples::
        km
        pt_BR
        sr_YU@Latn
    """

    fullname = ""
    """The full (English) name of this language.

    Dialect codes should have the form of
      - Khmer
      - Portuguese (Brazil)
      - TODO: sr_YU@Latn?
    """

    nplurals = 0
    """The number of plural forms of this language.

    0 is not a valid value - it must be overridden.
    Any positive integer is valid (it should probably be between 1 and 6)
    @see: L{data}
    """

    pluralequation = "0"
    """The plural equation for selection of plural forms.

    This is used for PO files to fill into the header.
    @see: U{Gettext manual<http://www.gnu.org/software/gettext/manual/html_node/gettext_150.html#Plural-forms>}
    @see: L{data}
    """
    # Don't change these defaults of nplurals or pluralequation willy-nilly:
    # some code probably depends on these for unrecognised languages

    listseperator = u", "
    """This string is used to separate lists of textual elements. Most
    languages probably can stick with the default comma, but Arabic and some
    Asian languages might want to override this."""

    commonpunc = u".,;:!?-@#$%^*_()[]{}/\\'`\"<>"
    """These punctuation marks are common in English and most languages that
    use latin script."""

    quotes = u"‘’‛“”„‟′″‴‵‶‷‹›«»"
    """These are different quotation marks used by various languages."""

    invertedpunc = u"¿¡"
    """Inverted punctuation sometimes used at the beginning of sentences in
    Spanish, Asturian, Galician, and Catalan."""

    rtlpunc = u"،؟؛÷"
    """These punctuation marks are used by Arabic and Persian, for example."""

    CJKpunc = u"。、,;!?「」『』【】"
    """These punctuation marks are used in certain circumstances with CJK
    languages."""

    indicpunc = u"।॥॰"
    """These punctuation marks are used by several Indic languages."""

    ethiopicpunc = u"።፤፣"
    """These punctuation marks are used by several Ethiopic languages."""

    miscpunc = u"…±°¹²³·©®×£¥€"
    """The middle dot (·) is used by Greek and Georgian."""

    punctuation = u"".join([commonpunc, quotes, invertedpunc, rtlpunc, CJKpunc,
                            indicpunc, ethiopicpunc, miscpunc])
    """We include many types of punctuation here, simply since this is only
    meant to determine if something is punctuation. Hopefully we catch some
    languages which might not be represented with modules. Most languages won't
    need to override this."""

    sentenceend = u".!?…։؟।。!?።"
    """These marks can indicate a sentence end. Once again we try to account
    for many languages. Most languages won't need to override this."""

    # The following tries to account for a lot of things. For the best idea of
    # what works, see test_common.py. We try to ignore abbreviations, for
    # example, by checking that the following sentence doesn't start with lower
    # case or numbers.
    sentencere = re.compile(r"""(?s)    #make . also match newlines
                            .*?         #anything, but match non-greedy
                            [%s]        #the puntuation for sentence ending
                            \s+         #the spacing after the puntuation
                            (?=[^a-z\d])#lookahead that next part starts with caps
                            """ % sentenceend, re.VERBOSE)

    puncdict = {}
    """A dictionary of punctuation transformation rules that can be used by
    punctranslate()."""

    ignoretests = []
    """List of pofilter tests for this language that must be ignored."""

    checker = None
    """A language specific checker (see filters.checks).

    This doesn't need to be supplied, but will be used if it exists."""

    # Cache of already constructed language objects, keyed by language code.
    # It is filled and consulted by __new__.
    _languages = {}

    validaccel = None
    """Characters that can be used as accelerators (access keys) i.e. Alt+X
    where X is the accelerator.  These can include combining diacritics as
    long as they are accessible from the users keyboard in a single keystroke,
    but normally they would be at least precomposed characters. All characters,
    lower and upper, are included in the list."""

    validdoublewords = []
    """Some languages allow double words in certain cases.  This is a list
    of such words."""

    def __new__(cls, code):
        """This returns the language class for the given code, following a
        singleton like approach (only one object per language).

        @param code: The language code; None is treated like the empty string.
        @return: The cached object for this code, or a newly built one.
        """
        code = code or ""
        # First see if a language object for this code already exists
        if code in cls._languages:
            return cls._languages[code]
        # No existing language. Let's build a new one and keep a copy
        language = cls._languages[code] = object.__new__(cls)

        language.code = code
        # Try the full code first and then progressively simpler ones, as
        # produced by data.simplercode(), until language data is found.
        while code:
            langdata = data.languages.get(code, None)
            if langdata:
                language.fullname, language.nplurals, language.pluralequation = langdata
                break
            code = data.simplercode(code)
        # If nothing was found the object silently keeps the class defaults.
        return language

    def __deepcopy__(self, memo=None):
        """Language objects are shared per code, so a deep copy is the object
        itself.

        @param memo: The memo dictionary supplied by copy.deepcopy(), if any.
        """
        # A shared mutable default would collect an entry for every object
        # ever copied, so the memo is only touched when the caller gives one.
        if memo is not None:
            memo[id(self)] = self
        return self

    def __repr__(self):
        """Give a simple string representation without address information to
        be able to store it in text for comparison later."""
        detail = ""
        if self.code:
            detail = "(%s)" % self.code
        return "<class 'translate.lang.common.Common%s'>" % detail

    @classmethod
    def punctranslate(cls, text):
        """Converts the punctuation in a string according to the rules of the
        language.

        @param text: The string to convert; falsy values are returned as is.
        @return: The text with the replacements from puncdict applied.
        """
        # TODO: look at po::escapeforpo() for performance idea
        if not text:
            return text
        # A trailing ellipsis is set aside so that the single-character rules
        # (for example for ".") are not applied to it.
        ellipses_end = text.endswith(u"...")
        if ellipses_end:
            text = text[:-3]
        # items() rather than iteritems() so that this runs on Python 2 and 3
        for source, target in cls.puncdict.items():
            text = text.replace(source, target)
        if ellipses_end:
            if u"..." in cls.puncdict:
                text += cls.puncdict[u"..."]
            else:
                text += u"..."
        # Let's account for cases where a punctuation symbol plus a space is
        # replaced, but the space won't exist at the end of the source message.
        # As a simple improvement for messages ending in ellipses (...), we
        # test that the last character is different from the second last
        # This is only relevant if the string has two characters or more
        # The replacements can leave the text empty, so guard the indexing.
        if (text and (text[-1] + u" " in cls.puncdict)
                and (len(text) < 2 or text[-2] != text[-1])):
            text = text[:-1] + cls.puncdict[text[-1] + u" "].rstrip()
        return text

    @classmethod
    def length_difference(cls, length):
        """Returns an estimate to a likely change in length relative to an
        English string of length length.

        @param length: The length of the English string.
        @return: The estimated number of characters of difference.
        """
        # This is just a rudimentary heuristic guessing that most translations
        # will be somewhat longer than the source language
        expansion_factor = 0
        code = cls.code
        while code:
            # Look up the current (possibly simplified) code so that a
            # dialect can fall back to the factor of its base language.
            expansion_factor = data.expansion_factors.get(code, 0)
            if expansion_factor:
                break
            code = data.simplercode(code)
        else:
            # Only reached when no code yielded a factor
            expansion_factor = 0.1  # default
        constant = max(5, int(40 * expansion_factor))
        # The default: return 5 + length/10
        return constant + int(expansion_factor * length)

    @classmethod
    def alter_length(cls, text):
        """Converts the given string by adding or removing characters as an
        estimation of translation length (with English assumed as source
        language).

        Each part separated by a blank line is altered on its own, and parts
        of nine characters or fewer are left untouched.
        """

        def alter_it(text):
            length = len(text)
            if length > 9:
                extra = cls.length_difference(length)
                if extra > 0:
                    # Lengthen by repeating the start (without newlines)
                    text = text[:extra].replace(u'\n', u'') + text
                else:
                    text = text[-extra:]
            return text

        expanded = [alter_it(subtext) for subtext in text.split(u"\n\n")]
        return u"\n\n".join(expanded)

    @classmethod
    def character_iter(cls, text):
        """Returns an iterator over the characters in text.

        Characters listed in punctuation are skipped.
        """
        # We don't return more than one consecutive whitespace character
        prev = 'A'
        for c in text:
            if c.isspace() and prev.isspace():
                continue
            prev = c
            if c not in cls.punctuation:
                yield c

    @classmethod
    def characters(cls, text):
        """Returns a list of characters in text."""
        return list(cls.character_iter(text))

    @classmethod
    def word_iter(cls, text):
        """Returns an iterator over the words in text.

        Words are split on whitespace and stripped of surrounding punctuation.
        """
        # TODO: Consider replacing punctuation with space before split()
        for w in text.split():
            word = w.strip(cls.punctuation)
            if word:
                yield word

    @classmethod
    def words(cls, text):
        """Returns a list of words in text."""
        return list(cls.word_iter(text))

    @classmethod
    def sentence_iter(cls, text, strip=True):
        """Returns an iterator over the sentences in text.

        @param text: The text to split; None is treated as empty.
        @param strip: Whether to strip surrounding whitespace from each
        sentence and drop sentences that are empty afterwards.
        """
        lastmatch = 0
        text = text or ""
        for item in cls.sentencere.finditer(text):
            lastmatch = item.end()
            sentence = item.group()
            if strip:
                sentence = sentence.strip()
            if sentence:
                yield sentence
        # Whatever follows the last recognised sentence end is a sentence too
        remainder = text[lastmatch:]
        if strip:
            remainder = remainder.strip()
        if remainder:
            yield remainder

    @classmethod
    def sentences(cls, text, strip=True):
        """Returns a list of sentences in text."""
        return list(cls.sentence_iter(text, strip=strip))

    @classmethod
    def capsstart(cls, text):
        """Determines whether the text starts with a capital letter.

        Leading whitespace and punctuation are ignored.

        @return: True or False; False when nothing remains after stripping.
        """
        stripped = text.lstrip().lstrip(cls.punctuation)
        return bool(stripped) and stripped[0].isupper()