
Source Code for Module translate.lang.ngram

#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (c) 2006 Thomas Mangin
# Copyright (c) 2009-2010 Zuza Software Foundation
#
# This program is distributed under the GNU General Public License
# (cf. the file COPYING in the distribution). Alternatively, you can use
# the program under the conditions of the Artistic License (as Perl).
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
# Original file from http://thomas.mangin.me.uk/data/source/ngram.py

import sys
import re
from os import path
import glob


nb_ngrams = 400
white_space_re = re.compile('\s+')

class _NGram:
    # A ranked profile of the character n-grams (up to 4 characters long)
    # found in a piece of text, used as a fingerprint for language guessing.

    def __init__(self, arg=None):
        if isinstance(arg, basestring):
            self.addText(arg)
            self.normalise()
        elif isinstance(arg, dict):
            # This must already be normalised!
            self.ngrams = arg
        else:
            self.ngrams = dict()

    def addText(self, text):
        # Count the short character sequences (up to 4 characters) of each
        # word, padding words with '_' to mark their boundaries.
        if isinstance(text, str):
            text = text.decode('utf-8')

        ngrams = dict()

        for word in white_space_re.split(text):
            word = '_%s_' % word
            size = len(word)
            for i in xrange(size - 1):
                for s in (1, 2, 3, 4):
                    end = i + s
                    if end >= size:
                        break
                    sub = word[i:end]

                    if sub not in ngrams:
                        ngrams[sub] = 0
                    ngrams[sub] += 1

        self.ngrams = ngrams
        return self

    def sorted_by_score(self):
        # (count, ngram) pairs, most frequent first, capped at nb_ngrams.
        sorted = [(self.ngrams[k], k) for k in self.ngrams]
        sorted.sort()
        sorted.reverse()
        sorted = sorted[:nb_ngrams]
        return sorted

    def normalise(self):
        # Replace raw counts by each n-gram's rank in the frequency ordering.
        ngrams = {}
        for count, (v, k) in enumerate(self.sorted_by_score()):
            ngrams[k] = count

        self.ngrams = ngrams
        return self

    def addValues(self, key, value):
        self.ngrams[key] = value
        return self

    def compare(self, ngram):
        # "Out of place" distance between two profiles: the sum of rank
        # differences, with a penalty of nb_ngrams for missing n-grams.
        d = 0
        ngrams = ngram.ngrams
        for k in self.ngrams:
            if k in ngrams:
                d += abs(ngrams[k] - self.ngrams[k])
            else:
                d += nb_ngrams
        return d


class NGram:
    # A language guesser: loads the pre-built .lm language models from a
    # folder and classifies text by the nearest n-gram profile.

    def __init__(self, folder, ext='.lm'):
        self.ngrams = dict()
        folder = path.join(folder, '*' + ext)
        size = len(ext)

        for fname in glob.glob(path.normcase(folder)):
            lang = path.split(fname)[-1][:-size]
            ngrams = {}
            try:
                f = open(fname, 'r')
                lines = f.read().decode('utf-8').splitlines()
                try:
                    for i, line in enumerate(lines):
                        ngram, _t, _f = line.partition(u'\t')
                        ngrams[ngram] = i
                except AttributeError, e:
                    # Python 2.4 doesn't have unicode.partition()
                    for i, line in enumerate(lines):
                        ngram = line.split(u'\t')[0]
                        ngrams[ngram] = i
            except UnicodeDecodeError, e:
                continue

            if ngrams:
                self.ngrams[lang] = _NGram(ngrams)

        if not self.ngrams:
            raise ValueError("no language files found")

    def classify(self, text):
        # Return the language code with the smallest profile distance, or ''
        # when even the best match is too far off to be a credible guess.
        ngram = _NGram(text)
        r = 'guess'

        min = sys.maxint

        for lang in self.ngrams:
            d = self.ngrams[lang].compare(ngram)
            if d < min:
                min = d
                r = lang

        if min > 0.8 * (nb_ngrams ** 2):
            r = ''
        return r


class Generate:
    # Builds .lm language models from a folder of plain-text corpus files,
    # one file per language.

    def __init__(self, folder, ext='.txt'):
        self.ngrams = dict()
        folder = path.join(folder, '*' + ext)
        size = len(ext)

        for fname in glob.glob(path.normcase(folder)):
            lang = path.split(fname)[-1][:-size]
            n = _NGram()

            file = open(fname, 'r')
            # addText() replaces any previously stored counts, so feed the
            # whole corpus file to it in one call rather than line by line.
            n.addText(file.read())
            file.close()

            n.normalise()
            self.ngrams[lang] = n

    def save(self, folder, ext='.lm'):
        # Write one "<ngram>\t <count>" line per n-gram, most frequent first.
        for lang in self.ngrams.keys():
            fname = path.join(folder, lang + ext)
            file = open(fname, 'w')
            for v, k in self.ngrams[lang].sorted_by_score():
                file.write("%s\t %d\n" % (k, v))
            file.close()

if __name__ == '__main__':
    import sys

    # Should you want to generate your own .lm files
    #conf = Generate('/tmp')
    #conf.save('/tmp')

    text = sys.stdin.readline()
    from translate.misc.file_discovery import get_abs_data_filename
    l = NGram(get_abs_data_filename('langmodels'))
    print l.classify(text)
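
The __main__ block above shows the classification side: it reads a line from stdin and guesses its language against the models shipped in the toolkit's langmodels data folder. As a minimal sketch of the other half of the workflow, building your own models with Generate and then classifying with them, something like the following should work. The folder names /tmp/corpus and /tmp/models and the per-language file names (fr.txt, de.txt, ...) are hypothetical placeholders, not paths used by the module itself.

# Sketch only: assumes /tmp/corpus contains one plain-text file per language
# (fr.txt, de.txt, ...) and that /tmp/models already exists.
from translate.lang.ngram import Generate, NGram

gen = Generate('/tmp/corpus', ext='.txt')  # one n-gram profile per corpus file
gen.save('/tmp/models', ext='.lm')         # writes fr.lm, de.lm, ...

guesser = NGram('/tmp/models')
print guesser.classify(u'le chat est sur la chaise')  # 'fr' if the French model is the closest match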