1 package org.kit.furia.index;
2
3 import java.io.ByteArrayInputStream;
4 import java.io.File;
5 import java.io.IOException;
6 import java.util.Iterator;
7 import java.util.LinkedList;
8 import java.util.List;
9 import java.util.Map;
10 import java.util.PriorityQueue;
11
12 import org.ajmm.obsearch.Index;
13 import org.ajmm.obsearch.OB;
14 import org.ajmm.obsearch.Result;
15 import org.ajmm.obsearch.exception.AlreadyFrozenException;
16 import org.ajmm.obsearch.exception.IllegalIdException;
17 import org.ajmm.obsearch.exception.OBException;
18 import org.ajmm.obsearch.exception.OutOfRangeException;
19 import org.ajmm.obsearch.exception.UndefinedPivotsException;
20 import org.apache.log4j.Logger;
21 import org.apache.lucene.analysis.WhitespaceAnalyzer;
22 import org.apache.lucene.document.Field;
23 import org.apache.lucene.index.CorruptIndexException;
24 import org.apache.lucene.index.IndexReader;
25 import org.apache.lucene.index.IndexWriter;
26 import org.apache.lucene.index.Term;
27 import org.apache.lucene.index.TermDocs;
28 import org.apache.lucene.index.TermFreqVector;
29 import org.apache.lucene.search.BooleanClause;
30 import org.apache.lucene.search.BooleanQuery;
31 import org.apache.lucene.search.DefaultSimilarity;
32 import org.apache.lucene.search.Hit;
33 import org.apache.lucene.search.HitCollector;
34 import org.apache.lucene.search.Hits;
35 import org.apache.lucene.search.IndexSearcher;
36 import org.apache.lucene.search.Query;
37 import org.apache.lucene.search.Searcher;
38 import org.apache.lucene.search.Similarity;
39 import org.apache.lucene.search.TermQuery;
40 import org.apache.lucene.search.similar.MoreLikeThis;
41 import org.apache.lucene.search.similar.MoreLikeThisQuery;
42 import org.apache.lucene.search.similar.SimilarityQueries;
43 import org.kit.furia.Document;
44 import org.kit.furia.IRIndex;
45 import org.kit.furia.ResultCandidate;
46 import org.kit.furia.Document.DocumentElement;
47 import org.kit.furia.exceptions.IRException;
48
49 import com.sleepycat.bind.tuple.TupleInput;
50 import com.sleepycat.bind.tuple.TupleOutput;
51 import com.sleepycat.je.DatabaseException;
52
53 /*
54 Furia-chan: An Open Source software license violation detector.
55 Copyright (C) 2007 Kyushu Institute of Technology
56
57 This program is free software: you can redistribute it and/or modify
58 it under the terms of the GNU General Public License as published by
59 the Free Software Foundation, either version 3 of the License, or
60 (at your option) any later version.
61
62 This program is distributed in the hope that it will be useful,
63 but WITHOUT ANY WARRANTY; without even the implied warranty of
64 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
65 GNU General Public License for more details.
66
67 You should have received a copy of the GNU General Public License
68 along with this program. If not, see <http://www.gnu.org/licenses/>.
69 */
70
71 /**
72 * AbstractIRIndex holds the basic functionality for an Information Retrieval
73 * system that works on OB objects (please see www.obsearch.net). By using a
74 * distance function d, we transform the queries in terms of the closest
75 * elements that are in the database, and once this transformation is performed,
76 * we utilize an information retrieval system (Apache's Lucene) to perform the
77 * matching.
78 * @author Arnoldo Jose Muller Molina
79 * @param <O>
80 * The basic unit in which all the information is divided. In the
81 * case of natural language documents, this would be a word.
82 * @since 0
83 */
84
85 public abstract class AbstractIRIndex < O extends OB > implements IRIndex < O > {
86
87 /**
88 * Logger.
89 */
90 private static final Logger logger = Logger
91 .getLogger(AbstractIRIndex.class);
92
93 /**
94 * Lucene has the concepts of fields of a document. They are analogous to
95 * columns of a DB table (SQL). This enumeration lists the name of the
96 * fields our IRIndex will use.
97 */
98 protected enum FieldName {
99 /*
100 * The words of the document (data to be stored but used by Lucene to
101 * actually compute the score)
102 */
103 WORDS,
104 /* The name of the document */
105 DOC_NAME,
106 /* Cardinality of the multi-set of OB objects in a document */
107 MULTI_SET_SIZE,
108
109 /* Details of the doc word-freq */
110 DOC_DETAILS,
111
112 /* Cardinality of the set of OB objects in a document */
113 SET_SIZE,
114 }
115
116 /**
117 * This object is used to add elements to the index.
118 */
119 protected IndexWriter indexWriter;
120
121 /**
122 * This object is used to read different data from the index.
123 */
124 protected IndexReader indexReader;
125
126 /**
127 * This object is used to search the index;
128 */
129 protected Searcher searcher;
130
131
132 /**
133 * At least the given naive mset score must be obtained to consider a term in the result.
134 */
135 protected float mSetScoreThreshold = 0.32f;
136
137 /**
138 * At least the given naive set score must be obtained to consider a term in the result.
139 */
140 protected float setScoreThreshold = 0.04f;
141
142 /**
143 * Tells whether or not the index is in validation mode.
144 */
145 protected boolean validationMode = false;
146
147 /**
148 * The index where all the data will be stored.
149 */
150
151 /**
152 * Creates a new IR index if none is available in the given path.
153 * @param dbFolder
154 * The folder in which Lucene's files will be stored
155 * @throws IOException
156 * If the given directory does not exist or if some other IO
157 * error occurs
158 */
159 public AbstractIRIndex(File dbFolder) throws IOException {
160 indexWriter = new IndexWriter(dbFolder, new WhitespaceAnalyzer());
161 indexReader = IndexReader.open(dbFolder);
162 searcher = new IndexSearcher(indexReader);
163 }
164
165 public int delete(String documentName) throws IRException {
166 int res = 0;
167 try {
168 res = indexReader.deleteDocuments(new Term(FieldName.DOC_NAME
169 .toString(), documentName));
170 } catch (Exception e) {
171 throw new IRException(e);
172 }
173 return res;
174 }
175
176 /**
177 * Returns true if the document corresponding
178 * to x's name exists in the DB. This method is intended to be
179 * used in validation mode only.
180 * @param x
181 * @return true if the DB does not contain a document with name x.getName()
182 */
183 public boolean shouldSkipDoc(Document<O> x) throws IOException{
184
185 //TermDocs td = this.indexReader.termDocs(new Term(FieldName.DOC_NAME.toString(), x.getName()));
186 //org.apache.lucene.document.Document document = searcher.doc(td.doc());
187
188 return this.indexReader.docFreq(new Term(FieldName.DOC_NAME.toString(), x.getName()) )< 1;
189 }
190
191 /**
192 * Calculates the ResultCandidate between a normalized query and a Lucene document.
193 * @return A result candidate for the given document and normalized query.
194 */
195 protected ResultCandidate calculateSimilarity( org.apache.lucene.document.Document document , Map < Integer, Integer > normalizedQuery, float score){
196 String docName = document.getField(
197 FieldName.DOC_NAME.toString()).stringValue();
198 TupleInput in = new TupleInput(document.getField(
199 FieldName.MULTI_SET_SIZE.toString()).binaryValue());
200 // size of the doc in the DB.
201 int docSizeMSet = in.readInt();
202
203 int docSizeSet = new TupleInput(document.getField(
204 FieldName.SET_SIZE.toString()).binaryValue())
205 .readInt();
206
207 int cx = 0;
208 int intersectionQueryMSet = 0;
209 int intersectionQuerySet = 0;
210 TupleInput freqs = new TupleInput(document.getField(
211 FieldName.DOC_DETAILS.toString()).binaryValue());
212 while (cx < docSizeSet) {
213 int wordId = freqs.readInt();
214 int frecuency = freqs.readInt();
215 Integer frequencyQuery = normalizedQuery.get(wordId);
216 if (frequencyQuery != null) {
217 intersectionQuerySet++;
218 intersectionQueryMSet += Math.min(frecuency,
219 frequencyQuery);
220 }
221 cx++;
222 }
223 ResultCandidate can = new ResultCandidate(docName, score,
224 intersectionQueryMSet, docSizeMSet,
225 intersectionQuerySet, docSizeSet);
226 return can;
227 }
228
229 private class FHitCollector extends HitCollector{
230
231 PriorityQueue < ResultCandidate > candidates;
232 Map < Integer, Integer > normalizedQuery;
233 private boolean found = false; /* to be used only in validation mode */
234 private String queryName; /* name of the query, only used in validation mode */
235 private ResultCandidate match; /* the document, only used in validation mode */
236 public FHitCollector(Map < Integer, Integer > normalizedQuery, String queryName){
237 this.candidates = new PriorityQueue < ResultCandidate >();
238 this.normalizedQuery = normalizedQuery;
239 this.queryName = queryName;
240 }
241
242
243
244 public ResultCandidate getMatch() {
245 return match;
246 }
247
248
249
250 public boolean isFound() {
251 return found;
252 }
253
254 public void collect(int doc, float score) {
255
256 org.apache.lucene.document.Document document = null;
257 try{
258 document = searcher.doc(doc);
259 }catch(IOException e){
260 logger.fatal("This is bad", e);
261 candidates =null;
262 normalizedQuery = null;
263 assert false;
264 }
265
266 ResultCandidate can = calculateSimilarity(document, normalizedQuery, score);
267 if(validationMode){
268 String docName = document.getField(
269 FieldName.DOC_NAME.toString()).stringValue();
270 if(docName.equals(queryName)){
271 found = true;
272 logger.debug("Collector: Found :)" + can.toString());
273 }
274
275 /* if(! shouldAddToTheFinalResult(can)){
276 candidates.add(can); // only in validation mode, we add the result so that we can analyze everything
277 }*/
278 }
279
280 if(shouldAddToTheFinalResult(can)){
281 candidates.add(can);
282 }
283
284 }
285
286 private boolean shouldAddToTheFinalResult(ResultCandidate can){
287 return can.getNaiveScoreMSet() >= mSetScoreThreshold && can.getNaiveScoreSet() >= setScoreThreshold;
288 }
289
290
291
292 public List<ResultCandidate> getResults(){
293 List<ResultCandidate> res = new LinkedList<ResultCandidate>();
294 ResultCandidate cur;
295 int total = 0;
296 while (((cur = candidates.poll()) != null)) {
297 res.add(cur);
298 }
299 return res;
300 }
301 }
302
303 /**
304 * Returns the # of documents in this DB.
305 * @return
306 */
307 public int getSize(){
308 return this.indexReader.numDocs();
309 }
310
311 protected List < ResultCandidate > processQueryResults(
312 Map < Integer, Integer > normalizedQuery, short n, Document query)
313 throws IRException {
314 List < ResultCandidate > res = new LinkedList < ResultCandidate >();
315 try {
316 // at this stage we have created a map that holds all the matched
317 // elements,
318 // the initial document that is in terms of the original objects
319 // (obfuscated fragments in the case of furia)
320 // is now transformed in terms of the database.
321 // we now generate a priority queue with the terms, in order to sort
322 // them and put the most
323 // relevant at the beginning of the query.
324
325
326 if (normalizedQuery.size() > 0) {
327 PriorityQueue < Word > sorted = createPriorityQueue(normalizedQuery);
328 Query q = createQuery(sorted);
329 FHitCollector collector = new FHitCollector(normalizedQuery, query.getName());
330 // now we just perform the search and return the results.
331 searcher.search(q,collector);
332 /*if(validationMode && ! collector.isFound()){
333 // we did not found the doc. Print the stats of the doc so we can understand what is going on.
334 TermDocs td = this.indexReader.termDocs(new Term(FieldName.DOC_NAME.toString(), query.getName()));
335 org.apache.lucene.document.Document document = null;
336 try{
337 document = searcher.doc(td.doc());
338 }catch(IOException e){
339 logger.fatal("This is bad", e);
340 throw new IRException(e);
341 }
342 logger.info("Forced: " + calculateSimilarity(document, normalizedQuery, -1.0f).toString());
343 }*/
344 return(collector.getResults());
345 }
346 }catch (IOException e) {
347 throw new IRException(e);
348 }
349 return res;
350 }
351
352 /**
353 * Receives a map with normalized OB_id -> freq and returns the closest
354 * elements to the original query
355 * @param normalizedQuery
356 * A map that contains OB_id -> freq
357 * @param n
358 * Return the top n results only.
359 * @return The top n candidates.
360 */
361 /*protected List < ResultCandidate > processQueryResults(
362 Map < Integer, Integer > normalizedQuery, short n)
363 throws IRException {
364 try {
365 // at this stage we have created a map that holds all the matched
366 // elements,
367 // the initial document that is in terms of the original objects
368 // (obfuscated fragments in the case of furia)
369 // is now transformed in terms of the database.
370 // we now generate a priority queue with the terms, in order to sort
371 // them and put the most
372 // relevant at the beginning of the query.
373
374 List < ResultCandidate > res = new LinkedList < ResultCandidate >();
375 if (normalizedQuery.size() > 0) {
376 PriorityQueue < Word > sorted = createPriorityQueue(normalizedQuery);
377 Query q = createQuery(sorted);
378
379 // now we just perform the search and return the results.
380 Hits hits = searcher.search(q);
381 if(logger.isDebugEnabled()){
382 logger.info("Hits size: " + hits.length());
383 }
384
385 Iterator < Hit > it = hits.iterator();
386 int i = 0;
387 while (it.hasNext() && i < n) {
388 Hit h = it.next();
389 String docName = h.getDocument().getField(
390 FieldName.DOC_NAME.toString()).stringValue();
391 TupleInput in = new TupleInput(h.getDocument().getField(
392 FieldName.MULTI_SET_SIZE.toString()).binaryValue());
393 // size of the doc in the DB.
394 int docSizeMSet = in.readInt();
395
396 int docSizeSet = new TupleInput(h.getDocument().getField(
397 FieldName.SET_SIZE.toString()).binaryValue())
398 .readInt();
399
400 int cx = 0;
401 int intersectionQueryMSet = 0;
402 int intersectionQuerySet = 0;
403 TupleInput freqs = new TupleInput(h.getDocument().getField(
404 FieldName.DOC_DETAILS.toString()).binaryValue());
405 while (cx < docSizeSet) {
406 int wordId = freqs.readInt();
407 int frecuency = freqs.readInt();
408 Integer frequencyQuery = normalizedQuery.get(wordId);
409 if (frequencyQuery != null) {
410 intersectionQuerySet++;
411 intersectionQueryMSet += Math.min(frecuency,
412 frequencyQuery);
413 }
414 cx++;
415 }
416 res.add(new ResultCandidate(docName, h.getScore(),
417 intersectionQueryMSet, docSizeMSet,
418 intersectionQuerySet, docSizeSet));
419 i++;
420 }
421 }
422 return res;
423 } catch (IOException e) {
424 throw new IRException(e);
425 }
426 }*/
427
428 /**
429 * Repeats times times the given string, separates everything with a space
430 * @param x
431 * @param times
432 * @return
433 */
434 private StringBuilder repeat(int x, int times) {
435 int i = 0;
436 assert times > 0;
437 StringBuilder res = new StringBuilder();
438 while (i < times) {
439 res.append(Integer.toString(x));
440 res.append(" ");
441 i++;
442 }
443 return res;
444 }
445
446
447
448 public void insert(Document < O > document) throws IRException {
449 // OBSearch index that we use for "stemming".
450 assert document.size() != 0;
451 try {
452 Index < O > index = getIndex();
453 Iterator < Document < O >.DocumentElement < O > > it = document
454 .iterator();
455
456 StringBuilder contents = new StringBuilder();
457 int i = 0;
458 long prevTime = System.currentTimeMillis();
459 // write the multi-set representation of the document into lucene.
460 TupleOutput mSetOut = new TupleOutput();
461
462 while (it.hasNext()) {
463 Document < O >.DocumentElement < O > elem = it.next();
464 Result res = index.insert(elem.getObject());
465 assert res.getStatus() == Result.Status.OK
466 || res.getStatus() == Result.Status.EXISTS;
467 contents.append(repeat(res.getId(), elem.getCount()));
468 mSetOut.writeInt(res.getId());
469 mSetOut.writeInt(elem.getCount());
470 i++;
471 }
472
473
474 // now we just have to create the fields
475 Field contentsField = new Field(FieldName.WORDS.toString(),
476 contents.toString(), Field.Store.NO, Field.Index.TOKENIZED);
477 Field docName = new Field(FieldName.DOC_NAME.toString(), document
478 .getName(), Field.Store.YES, Field.Index.UN_TOKENIZED);
479 TupleOutput out = new TupleOutput();
480 out.writeInt(document.multiSetSize());
481 Field multiSetSize = new Field(FieldName.MULTI_SET_SIZE.toString(),
482 out.getBufferBytes(), Field.Store.YES);
483
484 Field docDetails = new Field(FieldName.DOC_DETAILS.toString(),
485 mSetOut.getBufferBytes(), Field.Store.YES);
486
487 TupleOutput out2 = new TupleOutput();
488 out2.writeInt(document.size());
489 Field setSize = new Field(FieldName.SET_SIZE.toString(), out2
490 .getBufferBytes(), Field.Store.YES);
491
492 org.apache.lucene.document.Document doc = new org.apache.lucene.document.Document();
493 doc.add(contentsField);
494 doc.add(docName);
495 doc.add(multiSetSize);
496 doc.add(setSize);
497 doc.add(docDetails);
498 indexWriter.addDocument(doc);
499 } catch (Exception e) {
500 throw new IRException(e);
501 }
502 }
503
504 public void freeze() throws IRException {
505 try {
506 logger.debug("Freezing index, OB objects: "
507 + this.getIndex().databaseSize());
508 getIndex().freeze();
509 logger.debug("Optimizing Lucene");
510 indexWriter.optimize();
511 } catch (Exception e) {
512 logger.fatal("Error in freeze: ", e);
513 throw new IRException(e);
514 }
515 }
516
517 public void close() throws IRException {
518 try {
519 getIndex().close();
520 indexWriter.close();
521 indexReader.close();
522 } catch (Exception e) {
523 throw new IRException(e);
524 }
525 }
526 /*
527 private Query createQuery(PriorityQueue < Word > q) {
528 StringBuilder contents = new StringBuilder();
529 Word cur;
530 int total = 0;
531 while (((cur = q.poll()) != null)) {
532 total += cur.getDocFreq();
533 contents.append(repeat(cur.getId(), cur.getDocFreq()));
534 //contents.append(cur.getId());
535 //contents.append(" ");
536 }
537
538 MoreLikeThis yay = new MoreLikeThis(indexReader);
539 yay.setAnalyzer(new WhitespaceAnalyzer());
540 yay.setBoost(true);
541 yay.setMaxNumTokensParsed(total);
542 yay.setMaxQueryTerms(0);
543
544 yay.setFieldNames(new String[] { FieldName.WORDS.toString() });
545 Query res = null;
546 try{
547 res = yay.like(new ByteArrayInputStream(contents.toString().getBytes()));
548 }catch(IOException e){
549 logger.fatal("Error while creating query", e);
550 }
551 return res;
552
553 //MoreLikeThisQuery res = new MoreLikeThisQuery(contents.toString(), new String[]{ FieldName.WORDS.toString() }, new WhitespaceAnalyzer() );
554
555 //res.setMaxQueryTerms(100);
556 //res.setPercentTermsToMatch(0.3f);
557 //return res;
558
559 //Query res = null;
560 //try{
561 // res = SimilarityQueries.formSimilarQuery(contents.toString(), new WhitespaceAnalyzer(), FieldName.WORDS.toString(), null);
562 //}catch(IOException e){
563 // logger.fatal("Problem while creating Query", e);
564 //}
565 //return res;
566 }*/
567
568 /**
569 * Create the "more like" query from a PriorityQueue. (This code was
570 * borrowed from lucene-contrib)
571 */
572 private Query createQuery(PriorityQueue < Word > q) {
573 BooleanQuery query = new BooleanQuery(true); // aqui va el true, da
574 // mejores resultados.
575 // query.setUseScorer14(true); // no effect
576 query.setAllowDocsOutOfOrder(true);
577 //query.setMinimumNumberShouldMatch(100);
578 BooleanQuery.setMaxClauseCount(q.size());
579 boolean boost = true;
580 Word cur;
581 int qterms = 0;
582 // float bestScore = 0;
583 float bestScore = 0;
584 //while (((cur = q.poll()) != null) && qterms < 100) {
585 while (((cur = q.poll()) != null)) {
586 TermQuery tq = new TermQuery(new Term(FieldName.WORDS.toString(),
587 cur.getId().toString()));
588
589 //if (boost) { if (qterms == 0) { bestScore = cur.getScore(); }
590 //float myScore = cur.getScore(); tq.setBoost(myScore / bestScore); }
591 //tq.setBoost( (float)cur.getDocFreq()/ (float)q.size());
592
593
594 query.add(tq, BooleanClause.Occur.SHOULD);
595 qterms++;
596 }
597 return query;
598 }
599
600 /**
601 * Create a PriorityQueue from a word->tf map. (This code was borrowed from
602 * lucene-contrib)
603 * @param words
604 * a map of words keyed on the word(String) with Int objects
605 * as the values.
606 * @return A priority queue ordered by the most important word.
607 */
608 protected PriorityQueue < Word > createPriorityQueue(
609 Map < Integer, Integer > words) throws IOException {
610 // have collected all words in doc and their freqs
611 assert words.size() > 0;
612 int numDocs = this.indexReader.numDocs();
613 PriorityQueue < Word > res = new PriorityQueue < Word >(words.size());
614 Similarity similarity = new DefaultSimilarity();
615 Iterator < Integer > it = words.keySet().iterator();
616 while (it.hasNext()) { // for every word
617 Integer wordId = it.next();
618 Integer repetition = words.get(wordId);
619 int tf = repetition; // term freq in the source doc
620
621 int docFreq = indexReader.docFreq(new Term(FieldName.WORDS
622 .toString(), wordId.toString()));
623
624 if (docFreq == 0) {
625 continue; // index update problem?
626 }
627
628 float idf = similarity.idf(docFreq, numDocs);
629 float score = tf * idf;
630
631 // only really need 1st 3 entries, other ones are for
632 // troubleshooting
633 res.add(new Word(wordId, score, idf, docFreq, tf));
634 }
635 return res;
636 }
637
638 /**
639 * Represents an OB object.
640 */
641 protected class Word implements Comparable < Word > {
642
643 /**
644 * Id of the word.
645 */
646 private Integer id;
647
648 /**
649 * Overall score.
650 */
651 private float score;
652
653 /**
654 * idf.
655 */
656 private float idf;
657
658 /**
659 * # of docs where this word appears.
660 */
661 private int docFreq;
662
663 /**
664 * tf.
665 */
666 private int tf;
667
668 public Word(Integer id, float score, float idf, int docFreq, int tf) {
669 super();
670 this.id = id;
671 this.score = score;
672 this.idf = idf;
673 this.docFreq = docFreq;
674 this.tf = tf;
675 }
676
677 public Integer getId() {
678 return id;
679 }
680
681 public float getScore() {
682 return score;
683 }
684
685 public float getIdf() {
686 return idf;
687 }
688
689 public int getDocFreq() {
690 return docFreq;
691 }
692
693 public int getTf() {
694 return tf;
695 }
696
697 public int compareTo(Word w) {
698 int res = 0;
699 if (score < w.score) {
700 res = -1;
701 } else if (score > w.score) {
702 res = 1;
703 }// else they are equal
704 return res * -1;// invert the result
705 }
706
707 }
708
709 public float getMSetScoreThreshold() {
710 return mSetScoreThreshold;
711 }
712
713 public void setMSetScoreThreshold(float setScoreThreshold) {
714 mSetScoreThreshold = setScoreThreshold;
715 }
716
717 public float getSetScoreThreshold() {
718 return setScoreThreshold;
719 }
720
721 public void setSetScoreThreshold(float setScoreThreshold) {
722 this.setScoreThreshold = setScoreThreshold;
723 }
724
725 public boolean isValidationMode() {
726 return validationMode;
727 }
728
729 public void setValidationMode(boolean validationMode) {
730 this.validationMode = validationMode;
731 }
732
733
734 }