1 package org.kit.furia.index;
2
3 import java.io.ByteArrayInputStream;
4 import java.io.File;
5 import java.io.IOException;
6 import java.util.Iterator;
7 import java.util.LinkedList;
8 import java.util.List;
9 import java.util.Map;
10 import java.util.PriorityQueue;
11
12 import org.ajmm.obsearch.Index;
13 import org.ajmm.obsearch.OB;
14 import org.ajmm.obsearch.Result;
15 import org.ajmm.obsearch.exception.AlreadyFrozenException;
16 import org.ajmm.obsearch.exception.IllegalIdException;
17 import org.ajmm.obsearch.exception.OBException;
18 import org.ajmm.obsearch.exception.OutOfRangeException;
19 import org.ajmm.obsearch.exception.UndefinedPivotsException;
20 import org.apache.log4j.Logger;
21 import org.apache.lucene.analysis.WhitespaceAnalyzer;
22 import org.apache.lucene.document.Field;
23 import org.apache.lucene.index.CorruptIndexException;
24 import org.apache.lucene.index.IndexReader;
25 import org.apache.lucene.index.IndexWriter;
26 import org.apache.lucene.index.Term;
27 import org.apache.lucene.index.TermDocs;
28 import org.apache.lucene.index.TermFreqVector;
29 import org.apache.lucene.search.BooleanClause;
30 import org.apache.lucene.search.BooleanQuery;
31 import org.apache.lucene.search.DefaultSimilarity;
32 import org.apache.lucene.search.Hit;
33 import org.apache.lucene.search.HitCollector;
34 import org.apache.lucene.search.Hits;
35 import org.apache.lucene.search.IndexSearcher;
36 import org.apache.lucene.search.Query;
37 import org.apache.lucene.search.Searcher;
38 import org.apache.lucene.search.Similarity;
39 import org.apache.lucene.search.TermQuery;
40 import org.apache.lucene.search.similar.MoreLikeThis;
41 import org.apache.lucene.search.similar.MoreLikeThisQuery;
42 import org.apache.lucene.search.similar.SimilarityQueries;
43 import org.kit.furia.Document;
44 import org.kit.furia.IRIndex;
45 import org.kit.furia.ResultCandidate;
46 import org.kit.furia.Document.DocumentElement;
47 import org.kit.furia.exceptions.IRException;
48
49 import com.sleepycat.bind.tuple.TupleInput;
50 import com.sleepycat.bind.tuple.TupleOutput;
51 import com.sleepycat.je.DatabaseException;
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85 public abstract class AbstractIRIndex < O extends OB > implements IRIndex < O > {
86
87
88
89
90 private static final Logger logger = Logger
91 .getLogger(AbstractIRIndex.class);
92
93
94
95
96
97
98 protected enum FieldName {
99
100
101
102
103 WORDS,
104
105 DOC_NAME,
106
107 MULTI_SET_SIZE,
108
109
110 DOC_DETAILS,
111
112
113 SET_SIZE,
114 }
115
116
117
118
119 protected IndexWriter indexWriter;
120
121
122
123
124 protected IndexReader indexReader;
125
126
127
128
129 protected Searcher searcher;
130
131
132
133
134
135 protected float mSetScoreThreshold = 0.32f;
136
137
138
139
140 protected float setScoreThreshold = 0.04f;
141
142
143
144
145 protected boolean validationMode = false;
146
147
148
149
150
151
152
153
154
155
156
157
158
159 public AbstractIRIndex(File dbFolder) throws IOException {
160 indexWriter = new IndexWriter(dbFolder, new WhitespaceAnalyzer());
161 indexReader = IndexReader.open(dbFolder);
162 searcher = new IndexSearcher(indexReader);
163 }
164
165 public int delete(String documentName) throws IRException {
166 int res = 0;
167 try {
168 res = indexReader.deleteDocuments(new Term(FieldName.DOC_NAME
169 .toString(), documentName));
170 } catch (Exception e) {
171 throw new IRException(e);
172 }
173 return res;
174 }
175
176
177
178
179
180
181
182
183 public boolean shouldSkipDoc(Document<O> x) throws IOException{
184
185
186
187
188 return this.indexReader.docFreq(new Term(FieldName.DOC_NAME.toString(), x.getName()) )< 1;
189 }
190
191
192
193
194
195 protected ResultCandidate calculateSimilarity( org.apache.lucene.document.Document document , Map < Integer, Integer > normalizedQuery, float score){
196 String docName = document.getField(
197 FieldName.DOC_NAME.toString()).stringValue();
198 TupleInput in = new TupleInput(document.getField(
199 FieldName.MULTI_SET_SIZE.toString()).binaryValue());
200
201 int docSizeMSet = in.readInt();
202
203 int docSizeSet = new TupleInput(document.getField(
204 FieldName.SET_SIZE.toString()).binaryValue())
205 .readInt();
206
207 int cx = 0;
208 int intersectionQueryMSet = 0;
209 int intersectionQuerySet = 0;
210 TupleInput freqs = new TupleInput(document.getField(
211 FieldName.DOC_DETAILS.toString()).binaryValue());
212 while (cx < docSizeSet) {
213 int wordId = freqs.readInt();
214 int frecuency = freqs.readInt();
215 Integer frequencyQuery = normalizedQuery.get(wordId);
216 if (frequencyQuery != null) {
217 intersectionQuerySet++;
218 intersectionQueryMSet += Math.min(frecuency,
219 frequencyQuery);
220 }
221 cx++;
222 }
223 ResultCandidate can = new ResultCandidate(docName, score,
224 intersectionQueryMSet, docSizeMSet,
225 intersectionQuerySet, docSizeSet);
226 return can;
227 }
228
229 private class FHitCollector extends HitCollector{
230
231 PriorityQueue < ResultCandidate > candidates;
232 Map < Integer, Integer > normalizedQuery;
233 private boolean found = false;
234 private String queryName;
235 private ResultCandidate match;
236 public FHitCollector(Map < Integer, Integer > normalizedQuery, String queryName){
237 this.candidates = new PriorityQueue < ResultCandidate >();
238 this.normalizedQuery = normalizedQuery;
239 this.queryName = queryName;
240 }
241
242
243
244 public ResultCandidate getMatch() {
245 return match;
246 }
247
248
249
250 public boolean isFound() {
251 return found;
252 }
253
254 public void collect(int doc, float score) {
255
256 org.apache.lucene.document.Document document = null;
257 try{
258 document = searcher.doc(doc);
259 }catch(IOException e){
260 logger.fatal("This is bad", e);
261 candidates =null;
262 normalizedQuery = null;
263 assert false;
264 }
265
266 ResultCandidate can = calculateSimilarity(document, normalizedQuery, score);
267 if(validationMode){
268 String docName = document.getField(
269 FieldName.DOC_NAME.toString()).stringValue();
270 if(docName.equals(queryName)){
271 found = true;
272 logger.debug("Collector: Found :)" + can.toString());
273 }
274
275
276
277
278 }
279
280 if(shouldAddToTheFinalResult(can)){
281 candidates.add(can);
282 }
283
284 }
285
286 private boolean shouldAddToTheFinalResult(ResultCandidate can){
287 return can.getNaiveScoreMSet() >= mSetScoreThreshold && can.getNaiveScoreSet() >= setScoreThreshold;
288 }
289
290
291
292 public List<ResultCandidate> getResults(){
293 List<ResultCandidate> res = new LinkedList<ResultCandidate>();
294 ResultCandidate cur;
295 int total = 0;
296 while (((cur = candidates.poll()) != null)) {
297 res.add(cur);
298 }
299 return res;
300 }
301 }
302
303
304
305
306
307 public int getSize(){
308 return this.indexReader.numDocs();
309 }
310
311 protected List < ResultCandidate > processQueryResults(
312 Map < Integer, Integer > normalizedQuery, short n, Document query)
313 throws IRException {
314 List < ResultCandidate > res = new LinkedList < ResultCandidate >();
315 try {
316
317
318
319
320
321
322
323
324
325
326 if (normalizedQuery.size() > 0) {
327 PriorityQueue < Word > sorted = createPriorityQueue(normalizedQuery);
328 Query q = createQuery(sorted);
329 FHitCollector collector = new FHitCollector(normalizedQuery, query.getName());
330
331 searcher.search(q,collector);
332
333
334
335
336
337
338
339
340
341
342
343
344 return(collector.getResults());
345 }
346 }catch (IOException e) {
347 throw new IRException(e);
348 }
349 return res;
350 }
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434 private StringBuilder repeat(int x, int times) {
435 int i = 0;
436 assert times > 0;
437 StringBuilder res = new StringBuilder();
438 while (i < times) {
439 res.append(Integer.toString(x));
440 res.append(" ");
441 i++;
442 }
443 return res;
444 }
445
446
447
448 public void insert(Document < O > document) throws IRException {
449
450 assert document.size() != 0;
451 try {
452 Index < O > index = getIndex();
453 Iterator < Document < O >.DocumentElement < O > > it = document
454 .iterator();
455
456 StringBuilder contents = new StringBuilder();
457 int i = 0;
458 long prevTime = System.currentTimeMillis();
459
460 TupleOutput mSetOut = new TupleOutput();
461
462 while (it.hasNext()) {
463 Document < O >.DocumentElement < O > elem = it.next();
464 Result res = index.insert(elem.getObject());
465 assert res.getStatus() == Result.Status.OK
466 || res.getStatus() == Result.Status.EXISTS;
467 contents.append(repeat(res.getId(), elem.getCount()));
468 mSetOut.writeInt(res.getId());
469 mSetOut.writeInt(elem.getCount());
470 i++;
471 }
472
473
474
475 Field contentsField = new Field(FieldName.WORDS.toString(),
476 contents.toString(), Field.Store.NO, Field.Index.TOKENIZED);
477 Field docName = new Field(FieldName.DOC_NAME.toString(), document
478 .getName(), Field.Store.YES, Field.Index.UN_TOKENIZED);
479 TupleOutput out = new TupleOutput();
480 out.writeInt(document.multiSetSize());
481 Field multiSetSize = new Field(FieldName.MULTI_SET_SIZE.toString(),
482 out.getBufferBytes(), Field.Store.YES);
483
484 Field docDetails = new Field(FieldName.DOC_DETAILS.toString(),
485 mSetOut.getBufferBytes(), Field.Store.YES);
486
487 TupleOutput out2 = new TupleOutput();
488 out2.writeInt(document.size());
489 Field setSize = new Field(FieldName.SET_SIZE.toString(), out2
490 .getBufferBytes(), Field.Store.YES);
491
492 org.apache.lucene.document.Document doc = new org.apache.lucene.document.Document();
493 doc.add(contentsField);
494 doc.add(docName);
495 doc.add(multiSetSize);
496 doc.add(setSize);
497 doc.add(docDetails);
498 indexWriter.addDocument(doc);
499 } catch (Exception e) {
500 throw new IRException(e);
501 }
502 }
503
504 public void freeze() throws IRException {
505 try {
506 logger.debug("Freezing index, OB objects: "
507 + this.getIndex().databaseSize());
508 getIndex().freeze();
509 logger.debug("Optimizing Lucene");
510 indexWriter.optimize();
511 } catch (Exception e) {
512 logger.fatal("Error in freeze: ", e);
513 throw new IRException(e);
514 }
515 }
516
517 public void close() throws IRException {
518 try {
519 getIndex().close();
520 indexWriter.close();
521 indexReader.close();
522 } catch (Exception e) {
523 throw new IRException(e);
524 }
525 }
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572 private Query createQuery(PriorityQueue < Word > q) {
573 BooleanQuery query = new BooleanQuery(true);
574
575
576 query.setAllowDocsOutOfOrder(true);
577
578 BooleanQuery.setMaxClauseCount(q.size());
579 boolean boost = true;
580 Word cur;
581 int qterms = 0;
582
583 float bestScore = 0;
584
585 while (((cur = q.poll()) != null)) {
586 TermQuery tq = new TermQuery(new Term(FieldName.WORDS.toString(),
587 cur.getId().toString()));
588
589
590
591
592
593
594 query.add(tq, BooleanClause.Occur.SHOULD);
595 qterms++;
596 }
597 return query;
598 }
599
600
601
602
603
604
605
606
607
608 protected PriorityQueue < Word > createPriorityQueue(
609 Map < Integer, Integer > words) throws IOException {
610
611 assert words.size() > 0;
612 int numDocs = this.indexReader.numDocs();
613 PriorityQueue < Word > res = new PriorityQueue < Word >(words.size());
614 Similarity similarity = new DefaultSimilarity();
615 Iterator < Integer > it = words.keySet().iterator();
616 while (it.hasNext()) {
617 Integer wordId = it.next();
618 Integer repetition = words.get(wordId);
619 int tf = repetition;
620
621 int docFreq = indexReader.docFreq(new Term(FieldName.WORDS
622 .toString(), wordId.toString()));
623
624 if (docFreq == 0) {
625 continue;
626 }
627
628 float idf = similarity.idf(docFreq, numDocs);
629 float score = tf * idf;
630
631
632
633 res.add(new Word(wordId, score, idf, docFreq, tf));
634 }
635 return res;
636 }
637
638
639
640
641 protected class Word implements Comparable < Word > {
642
643
644
645
646 private Integer id;
647
648
649
650
651 private float score;
652
653
654
655
656 private float idf;
657
658
659
660
661 private int docFreq;
662
663
664
665
666 private int tf;
667
668 public Word(Integer id, float score, float idf, int docFreq, int tf) {
669 super();
670 this.id = id;
671 this.score = score;
672 this.idf = idf;
673 this.docFreq = docFreq;
674 this.tf = tf;
675 }
676
677 public Integer getId() {
678 return id;
679 }
680
681 public float getScore() {
682 return score;
683 }
684
685 public float getIdf() {
686 return idf;
687 }
688
689 public int getDocFreq() {
690 return docFreq;
691 }
692
693 public int getTf() {
694 return tf;
695 }
696
697 public int compareTo(Word w) {
698 int res = 0;
699 if (score < w.score) {
700 res = -1;
701 } else if (score > w.score) {
702 res = 1;
703 }
704 return res * -1;
705 }
706
707 }
708
709 public float getMSetScoreThreshold() {
710 return mSetScoreThreshold;
711 }
712
713 public void setMSetScoreThreshold(float setScoreThreshold) {
714 mSetScoreThreshold = setScoreThreshold;
715 }
716
717 public float getSetScoreThreshold() {
718 return setScoreThreshold;
719 }
720
721 public void setSetScoreThreshold(float setScoreThreshold) {
722 this.setScoreThreshold = setScoreThreshold;
723 }
724
725 public boolean isValidationMode() {
726 return validationMode;
727 }
728
729 public void setValidationMode(boolean validationMode) {
730 this.validationMode = validationMode;
731 }
732
733
734 }