1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
"""
interface for the PyLucene (v2.x) indexing engine

take a look at PyLuceneIndexer1.py for the PyLucene v1.x interface
"""

__revision__ = "$Id: PyLuceneIndexer.py 15717 2010-09-03 14:50:28Z alaaosh $"

import re
import os
import time
import logging

try:
    # the gcj-compiled binding exposes the top-level "PyLucene" module
    import PyLucene
    _COMPILER = 'gcj'
except ImportError:
    # JCC-based PyLucene (v2.x): the Java VM must be started explicitly
    import lucene
    PyLucene = lucene
    PyLucene.initVM(PyLucene.CLASSPATH)
    _COMPILER = 'jcc'

import CommonIndexer

# field name used for terms that are not bound to a specific field
UNNAMED_FIELD_NAME = "FieldWithoutAName"
# maximum number of tokens Lucene may index for a single field
MAX_FIELD_SIZE = 1048576
54
55
class PyLuceneDatabase(CommonIndexer.CommonDatabase):
    """manage and use a pylucene indexing database"""

    # base class of all queries produced by this backend
    QUERY_TYPE = PyLucene.Query
    # subdirectory (below "basedir") that holds the lucene index
    INDEX_DIRECTORY_NAME = "lucene"
66 - def __init__(self, basedir, analyzer=None, create_allowed=True):
67 """initialize or open an indexing database
68
69 Any derived class must override __init__.
70
71 @raise ValueError: the given location exists, but the database type
72 is incompatible (e.g. created by a different indexing engine)
73 @raise OSError: the database failed to initialize
74
75 @param basedir: the parent directory of the database
76 @type basedir: str
77 @param analyzer: bitwise combination of possible analyzer flags
78 to be used as the default analyzer for this database. Leave it empty
79 to use the system default analyzer (self.ANALYZER_DEFAULT).
80 see self.ANALYZER_TOKENIZE, self.ANALYZER_PARTIAL, ...
81 @type analyzer: int
82 @param create_allowed: create the database, if necessary; default: True
83 @type create_allowed: bool
84 """
85 jvm = PyLucene.getVMEnv()
86 jvm.attachCurrentThread()
87 super(PyLuceneDatabase, self).__init__(basedir, analyzer=analyzer,
88 create_allowed=create_allowed)
89 self.pyl_analyzer = PyLucene.StandardAnalyzer()
90 self.writer = None
91 self.reader = None
92 self.index_version = None
93 try:
94
95 tempreader = PyLucene.IndexReader.open(self.location)
96 tempreader.close()
97 except PyLucene.JavaError, err_msg:
98
99
100
101
102
103 if not create_allowed:
104 raise OSError("Indexer: skipping database creation")
105 try:
106
107 parent_path = os.path.dirname(self.location)
108 if not os.path.isdir(parent_path):
109
110 os.makedirs(parent_path)
111 except IOError, err_msg:
112 raise OSError("Indexer: failed to create the parent " \
113 + "directory (%s) of the indexing database: %s" \
114 % (parent_path, err_msg))
115 try:
116 tempwriter = PyLucene.IndexWriter(self.location,
117 self.pyl_analyzer, True)
118 tempwriter.close()
119 except PyLucene.JavaError, err_msg:
120 raise OSError("Indexer: failed to open or create a Lucene" \
121 + " database (%s): %s" % (self.location, err_msg))
122
123
124 numtries = 0
125
126
127 try:
128 while numtries < 10:
129 try:
130 self.reader = PyLucene.IndexReader.open(self.location)
131 self.indexVersion = self.reader.getCurrentVersion(
132 self.location)
133 self.searcher = PyLucene.IndexSearcher(self.reader)
134 break
135 except PyLucene.JavaError, e:
136
137 lock_error_msg = e
138 time.sleep(0.01)
139 numtries += 1
140 else:
141
142 raise OSError("Indexer: failed to lock index database" \
143 + " (%s)" % lock_error_msg)
144 finally:
145 pass
146
147
148 self._index_refresh()
149
151 """remove lock and close writer after loosing the last reference"""
152 jvm = PyLucene.getVMEnv()
153 jvm.attachCurrentThread()
154 self._writer_close()
155 if hasattr(self, "reader") and self.reader is not None:
156 self.reader.close()
157 self.reader = None
158 if hasattr(self, "searcher") and self.searcher is not None:
159 self.searcher.close()
160 self.searcher = None
161
162 - def flush(self, optimize=False):
163 """flush the content of the database - to force changes to be written
164 to disk
165
166 some databases also support index optimization
167
168 @param optimize: should the index be optimized if possible?
169 @type optimize: bool
170 """
171 keep_open = self._writer_is_open()
172 self._writer_open()
173 try:
174 if optimize:
175 self.writer.optimize()
176 finally:
177 self.writer.flush()
178 if not keep_open:
179 self._writer_close()
180
185
187 """generate a query based on an existing query object
188
189 basically this function should just create a copy of the original
190
191 @param query: the original query object
192 @type query: PyLucene.Query
193 @return: resulting query object
194 @rtype: PyLucene.Query
195 """
196
197
198 return query
199
202 """generate a query for a plain term of a string query
203
204 basically this function parses the string and returns the resulting
205 query
206
207 @param text: the query string
208 @type text: str
209 @param require_all: boolean operator
210 (True -> AND (default) / False -> OR)
211 @type require_all: bool
212 @param analyzer: the analyzer to be used
213 possible analyzers are:
214 - L{CommonDatabase.ANALYZER_TOKENIZE}
215 the field value is splitted to be matched word-wise
216 - L{CommonDatabase.ANALYZER_PARTIAL}
217 the field value must start with the query string
218 - L{CommonDatabase.ANALYZER_EXACT}
219 keep special characters and the like
220 @type analyzer: bool
221 @return: resulting query object
222 @rtype: PyLucene.Query
223 """
224 if analyzer is None:
225 analyzer = self.analyzer
226 if analyzer == self.ANALYZER_EXACT:
227 analyzer_obj = PyLucene.KeywordAnalyzer()
228 else:
229 text = _escape_term_value(text)
230 analyzer_obj = PyLucene.StandardAnalyzer()
231 qp = PyLucene.QueryParser(UNNAMED_FIELD_NAME, analyzer_obj)
232 if (analyzer & self.ANALYZER_PARTIAL > 0):
233
234 text += "*"
235 if require_all:
236 qp.setDefaultOperator(qp.Operator.AND)
237 else:
238 qp.setDefaultOperator(qp.Operator.OR)
239 return qp.parse(text)
240
242 """generate a field query
243
244 this functions creates a field->value query
245
246 @param field: the fieldname to be used
247 @type field: str
248 @param value: the wanted value of the field
249 @type value: str
250 @param analyzer: the analyzer to be used
251 possible analyzers are:
252 - L{CommonDatabase.ANALYZER_TOKENIZE}
253 the field value is splitted to be matched word-wise
254 - L{CommonDatabase.ANALYZER_PARTIAL}
255 the field value must start with the query string
256 - L{CommonDatabase.ANALYZER_EXACT}
257 keep special characters and the like
258 @type analyzer: bool
259 @return: resulting query object
260 @rtype: PyLucene.Query
261 """
262 if analyzer is None:
263 analyzer = self.analyzer
264 if analyzer == self.ANALYZER_EXACT:
265 analyzer_obj = PyLucene.KeywordAnalyzer()
266 else:
267 value = _escape_term_value(value)
268 analyzer_obj = PyLucene.StandardAnalyzer()
269 qp = PyLucene.QueryParser(field, analyzer_obj)
270 if (analyzer & self.ANALYZER_PARTIAL > 0):
271
272 value += "*"
273 return qp.parse(value)
274
276 """generate a combined query
277
278 @param queries: list of the original queries
279 @type queries: list of PyLucene.Query
280 @param require_all: boolean operator
281 (True -> AND (default) / False -> OR)
282 @type require_all: bool
283 @return: the resulting combined query object
284 @rtype: PyLucene.Query
285 """
286 combined_query = PyLucene.BooleanQuery()
287 for query in queries:
288 combined_query.add(
289 PyLucene.BooleanClause(query, _occur(require_all, False)))
290 return combined_query
291
293 """create an empty document to be filled and added to the index later
294
295 @return: the new document object
296 @rtype: PyLucene.Document
297 """
298 return PyLucene.Document()
299
301 """add a term to a document
302
303 @param document: the document to be changed
304 @type document: PyLucene.Document
305 @param term: a single term to be added
306 @type term: str
307 @param tokenize: should the term be tokenized automatically
308 @type tokenize: bool
309 """
310 if tokenize:
311 token_flag = PyLucene.Field.Index.TOKENIZED
312 else:
313 token_flag = PyLucene.Field.Index.UN_TOKENIZED
314 document.add(PyLucene.Field(str(UNNAMED_FIELD_NAME), term,
315 PyLucene.Field.Store.YES, token_flag))
316
318 """add a field term to a document
319
320 @param document: the document to be changed
321 @type document: PyLucene.Document
322 @param field: name of the field
323 @type field: str
324 @param term: term to be associated to the field
325 @type term: str
326 @param tokenize: should the term be tokenized automatically
327 @type tokenize: bool
328 """
329 if tokenize:
330 token_flag = PyLucene.Field.Index.TOKENIZED
331 else:
332 token_flag = PyLucene.Field.Index.UN_TOKENIZED
333 document.add(PyLucene.Field(str(field), term,
334 PyLucene.Field.Store.YES, token_flag))
335
337 """add a prepared document to the index database
338
339 @param document: the document to be added
340 @type document: PyLucene.Document
341 """
342 self._writer_open()
343 self.writer.addDocument(document)
344
346 """PyLucene does not support transactions
347
348 Thus this function just opens the database for write access.
349 Call "cancel_transaction" or "commit_transaction" to close write
350 access in order to remove the exclusive lock from the database
351 directory.
352 """
353 jvm = PyLucene.getVMEnv()
354 jvm.attachCurrentThread()
355 self._writer_open()
356
358 """PyLucene does not support transactions
359
360 Thus this function just closes the database write access and removes
361 the exclusive lock.
362
363 See 'start_transaction' for details.
364 """
365 if self._writer_is_open():
366 self.writer.abort()
367 self._writer_close()
368
370 """PyLucene does not support transactions
371
372 Thus this function just closes the database write access and removes
373 the exclusive lock.
374
375 See 'start_transaction' for details.
376 """
377 self._writer_close()
378 self._index_refresh()
379
381 """return an object containing the results of a query
382
383 @param query: a pre-compiled query
384 @type query: a query object of the real implementation
385 @return: an object that allows access to the results
386 @rtype: subclass of CommonEnquire
387 """
388 return PyLuceneHits(self.searcher.search(query))
389
394
396 """delete a specified document
397
398 @param docid: the document ID to be deleted
399 @type docid: int
400 """
401 if self._writer_is_open():
402 self._writer_close()
403 try:
404 self.reader.deleteDocument(docid)
405 except PyLucene.JavaError:
406 self._index_refresh()
407 self.reader.deleteDocument(docid)
408
409 - def search(self, query, fieldnames):
410 """return a list of the contents of specified fields for all matches of
411 a query
412
413 @param query: the query to be issued
414 @type query: a query object of the real implementation
415 @param fieldnames: the name(s) of a field of the document content
416 @type fieldnames: string | list of strings
417 @return: a list of dicts containing the specified field(s)
418 @rtype: list of dicts
419 """
420 if isinstance(fieldnames, basestring):
421 fieldnames = [fieldnames]
422 hits = self.searcher.search(query)
423 if _COMPILER == 'jcc':
424
425 hits = [(hit, hits.doc(hit)) for hit in range(hits.length())]
426 result = []
427 for hit, doc in hits:
428 fields = {}
429 for fieldname in fieldnames:
430
431 if fieldname is None:
432 pyl_fieldname = UNNAMED_FIELD_NAME
433 else:
434 pyl_fieldname = fieldname
435 fields[fieldname] = doc.getValues(pyl_fieldname)
436 result.append(fields)
437 return result
438
440 if self.reader.isLocked(self.location):
441
442
443 try:
444
445 stat = os.stat(os.path.join(self.location, 'write.lock'))
446 age = (time.time() - stat.st_mtime) / 60
447 if age > 15:
448 logging.warning("stale lock found in %s, removing.", self.location)
449 self.reader.unlock(self.reader.directory())
450 except:
451 pass
452
454 """open write access for the indexing database and acquire an
455 exclusive lock
456 """
457 if not self._writer_is_open():
458 self._delete_stale_lock()
459 self.writer = PyLucene.IndexWriter(self.location, self.pyl_analyzer,
460 False)
461
462
463
464 if hasattr(self.writer, "setMaxFieldLength"):
465 self.writer.setMaxFieldLength(MAX_FIELD_SIZE)
466
467
469 """close indexing write access and remove the database lock"""
470 if self._writer_is_open():
471 self.writer.close()
472 self.writer = None
473
475 """check if the indexing write access is currently open"""
476 return hasattr(self, "writer") and not self.writer is None
477
479 """re-read the indexer database"""
480 try:
481 if self.reader is None or self.searcher is None:
482 self.reader = PyLucene.IndexReader.open(self.location)
483 self.searcher = PyLucene.IndexSearcher(self.reader)
484 elif self.index_version != self.reader.getCurrentVersion( \
485 self.location):
486 self.searcher.close()
487 self.reader.close()
488 self.reader = PyLucene.IndexReader.open(self.location)
489 self.searcher = PyLucene.IndexSearcher(self.reader)
490 self.index_version = self.reader.getCurrentVersion(self.location)
491 except PyLucene.JavaError, e:
492
493
494 pass
495
496
class PyLuceneHits(CommonIndexer.CommonEnquire):
    """an enquire object contains the information about the result of a request
    """

    def get_matches(self, start, number):
        """return a specified number of qualified matches of a previous query

        @param start: index of the first match to return (starting from zero)
        @type start: int
        @param number: the number of matching entries to return
        @type number: int
        @return: a set of matching entries and some statistics
        @rtype: tuple of (returned number, available number, matches)
                "matches" is a dictionary of::
                        ["rank", "percent", "document", "docid"]
        """
        # clip the requested range to the number of available hits
        stop = start + number
        if stop > self.enquire.length():
            stop = self.enquire.length()
        # empty result, if the requested range contains no hits
        if stop <= start:
            return (0, self.enquire.length(), [])
        result = []
        for index in range(start, stop):
            item = {}
            item["rank"] = index
            item["docid"] = self.enquire.id(index)
            item["percent"] = self.enquire.score(index)
            item["document"] = self.enquire.doc(index)
            result.append(item)
        return (stop - start, self.enquire.length(), result)
530
531
def _occur(required, prohibited):
    """map the (required, prohibited) flag pair to a BooleanClause.Occur value

    @param required: the term must occur
    @type required: bool
    @param prohibited: the term must not occur
    @type prohibited: bool
    @return: the matching Occur constant, or None for the invalid
            required+prohibited combination
    """
    # use truth testing instead of the original "== True/== False" comparisons
    if required and not prohibited:
        return PyLucene.BooleanClause.Occur.MUST
    elif not required and not prohibited:
        return PyLucene.BooleanClause.Occur.SHOULD
    elif not required and prohibited:
        return PyLucene.BooleanClause.Occur.MUST_NOT
    else:
        # "required" and "prohibited" at the same time is not supported
        return None
543
544
def _get_pylucene_version():
    """get the installed pylucene version

    @return: 1 -> PyLucene v1.x / 2 -> PyLucene v2.x / 0 -> unknown
    @rtype: int
    """
    version = PyLucene.VERSION
    if version.startswith("1."):
        return 1
    elif version.startswith("2."):
        return 2
    else:
        return 0
558
559
562