Package translate :: Package lang :: Module identify
[hide private]
[frames | no frames]

Source Code for Module translate.lang.identify

  1  #!/usr/bin/env python 
  2  # -*- coding: utf-8 -*- 
  3  # 
  4  # Copyright 2009 Zuza Software Foundation 
  5  # 
  6  # This file is part of translate. 
  7  # 
  8  # This program is free software; you can redistribute it and/or modify 
  9  # it under the terms of the GNU General Public License as published by 
 10  # the Free Software Foundation; either version 2 of the License, or 
 11  # (at your option) any later version. 
 12  # 
 13  # This program is distributed in the hope that it will be useful, 
 14  # but WITHOUT ANY WARRANTY; without even the implied warranty of 
 15  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 16  # GNU General Public License for more details. 
 17  # 
 18  # You should have received a copy of the GNU General Public License 
 19  # along with this program; if not, see <http://www.gnu.org/licenses/>. 
 20   
 21  """ 
 22  This module contains functions for identifying languages based on language 
 23  models. 
 24  """ 
 25   
 26  from os import extsep, path 
 27   
 28  from translate.misc.file_discovery import get_abs_data_filename 
 29  from translate.storage.base import TranslationStore 
 30  from translate.lang.ngram import NGram 
 31   
 32   
class LanguageIdentifier(object):
    """Identify the language of text with the help of an C{NGram} classifier.

    The name returned by the classifier is translated to a language code
    using the name-code pairs read from the configuration file.
    """

    MODEL_DIR = get_abs_data_filename('langmodels')
    """The directory containing the ngram language model files."""
    CONF_FILE = 'fpdb.conf'
    """
    The name of the file that contains language name-code pairs
    (relative to C{MODEL_DIR}).
    """
    SOURCE_SAMPLE_UNITS = 50
    """The number of leading units sampled by C{identify_source_lang()}."""
    TARGET_SAMPLE_UNITS = 200
    """The number of leading units sampled by C{identify_target_lang()}."""

    def __init__(self, model_dir=None, conf_file=None):
        """Load the name-code mapping and create the C{NGram} classifier.

        @param model_dir: The directory handed to C{NGram}. Defaults to
            C{MODEL_DIR}.
        @param conf_file: The configuration file, joined onto C{model_dir}.
            Defaults to C{CONF_FILE}.
        @raises ValueError: If C{model_dir} is not an existing directory or
            the configuration file is not an existing file.
        """
        if model_dir is None:
            model_dir = self.MODEL_DIR
        if not path.isdir(model_dir):
            # One-element tuple so that an odd argument (e.g. a tuple) is
            # still reported instead of breaking the string formatting.
            raise ValueError('Directory does not exist: %s' % (model_dir,))

        if conf_file is None:
            conf_file = self.CONF_FILE
        conf_file = path.abspath(path.join(model_dir, conf_file))
        if not path.isfile(conf_file):
            raise ValueError('File does not exist: %s' % (conf_file,))

        self._lang_codes = {}
        self._load_config(conf_file)
        self.ngram = NGram(model_dir)

    def _load_config(self, conf_file):
        """Load the mapping of language names to language codes as given in
        the configuration file.

        Every line is split on whitespace; the first field is the language
        (model file) name and the second the language code. Lines that are
        empty, whose first field starts with C{#}, or that have fewer than
        two fields are skipped.

        The name is stored without leading directories and without its last
        extension. A trailing C{-utf8}, and after that a single trailing
        C{-} or C{_}, is removed from the code.

        @param conf_file: The path of the configuration file to read.
        """
        # try/finally (rather than "with") closes the file on every path
        # while staying valid on older Python 2 interpreters.
        conf = open(conf_file)
        try:
            lines = conf.read().splitlines()
        finally:
            conf.close()

        for line in lines:
            parts = line.split()
            # Testing the first field (not the raw line) also catches
            # comments that are preceded by whitespace.
            if len(parts) < 2 or parts[0].startswith('#'):
                continue
            lname, lcode = parts[0], parts[1]

            # Make sure lname is not prefixed by directory names
            lname = path.basename(lname)
            # Remove the extension, if there is one
            lname = lname.rsplit(extsep, 1)[0]

            # Remove trailing '[_-]-utf8' from code
            if lcode.endswith('-utf8'):
                lcode = lcode[:-len('-utf8')]
            if lcode.endswith('-') or lcode.endswith('_'):
                lcode = lcode[:-1]

            self._lang_codes[lname] = lcode

    def identify_lang(self, text):
        """Identify the language of the text in the given string.

        @param text: The text to classify.
        @returns: The language code registered for the classifier's result,
            the classifier's result itself if no code is registered for it,
            or C{None} if C{text} is empty.
        """
        if not text:
            return None
        result = self.ngram.classify(text)
        return self._lang_codes.get(result, result)

    def _identify_unit_text(self, instore, attrname, limit):
        """Identify the language of the C{attrname} text of the first
        C{limit} units of C{instore}.

        Only units that are translatable and have a non-empty C{attrname}
        attribute contribute text.

        @returns: The result of C{identify_lang()} on the collected text, or
            C{None} if C{instore} has an unsupported type or no text could
            be collected.
        """
        if not isinstance(instore, (TranslationStore, list, tuple)):
            return None

        # NOTE(review): slicing relies on C{instore} supporting slice
        # indexing - confirm that this holds for C{TranslationStore}.
        text = u' '.join(
            getattr(unit, attrname) for unit in instore[:limit]
            if unit.istranslatable() and getattr(unit, attrname)
        )
        if not text:
            return None
        return self.identify_lang(text)

    def identify_source_lang(self, instore):
        """Identify the source language of the given translation store or
        units.

        @type instore: C{TranslationStore} or list or tuple of
            C{TranslationUnit}s.
        @param instore: The translation store to extract source text from.
        @returns: The identified language's code or C{None} if the language
            could not be identified."""
        return self._identify_unit_text(instore, 'source', self.SOURCE_SAMPLE_UNITS)

    def identify_target_lang(self, instore):
        """Identify the target language of the given translation store or
        units.

        @type instore: C{TranslationStore} or list or tuple of
            C{TranslationUnit}s.
        @param instore: The translation store to extract target text from.
        @returns: The identified language's code or C{None} if the language
            could not be identified."""
        return self._identify_unit_text(instore, 'target', self.TARGET_SAMPLE_UNITS)
if __name__ == "__main__":
    # Command-line use: identify the language of the text passed as the first
    # argument, with the models looked up relative to this script's location.
    import locale
    import sys

    if len(sys.argv) < 2:
        # Without this check a missing argument ends in an IndexError.
        sys.stderr.write('Usage: %s TEXT\n' % sys.argv[0])
        sys.exit(2)

    script_dir = path.abspath(path.dirname(sys.argv[0]))
    identifier = LanguageIdentifier(path.join(script_dir, '..', 'share', 'langmodels'))

    text = sys.argv[1]
    # Only decode when the argument offers decode(), i.e. when it is a byte
    # string; the locale's preferred encoding is used for that.
    if hasattr(text, 'decode'):
        text = text.decode(locale.getpreferredencoding())
    print("Language detected: %s" % identifier.identify_lang(text))