View Javadoc

1   package org.kit.furia;
2   
3   import org.ajmm.obsearch.OB;
4   
5   import java.io.IOException;
6   import java.util.List;
7   import org.ajmm.obsearch.Index;
8   import org.ajmm.obsearch.exception.AlreadyFrozenException;
9   import org.ajmm.obsearch.exception.IllegalIdException;
10  import org.ajmm.obsearch.exception.OBException;
11  import org.ajmm.obsearch.exception.OutOfRangeException;
12  import org.ajmm.obsearch.exception.UndefinedPivotsException;
13  import org.apache.lucene.index.CorruptIndexException;
14  import org.kit.furia.exceptions.IRException;
15  
16  import com.sleepycat.je.DatabaseException;
17  
18  /*
19   Furia-chan: An Open Source software license violation detector.
20   Copyright (C) 2007 Kyushu Institute of Technology
21  
22   This program is free software: you can redistribute it and/or modify
23   it under the terms of the GNU General Public License as published by
24   the Free Software Foundation, either version 3 of the License, or
25   (at your option) any later version.
26  
27   This program is distributed in the hope that it will be useful,
28   but WITHOUT ANY WARRANTY; without even the implied warranty of
29   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
30   GNU General Public License for more details.
31  
32   You should have received a copy of the GNU General Public License
33   along with this program.  If not, see <http://www.gnu.org/licenses/>.
34   */
35  
36  /**
37   * IRIndex holds the basic functionality for an Information Retrieval system
38   * that works on OB objects (please see obsearch.berlios.de). By using a
39   * distance function d, we transform the queries in terms of the closest
40   * elements that are in the database, and once this transformation is performed,
41   * we utilize an information retrieval system to perform the matching. Because
42   * our documents are multi-sets, the distribution of OB objects inside a
43   * document is taken into account. So, instead of matching a huge syntax tree of
44   * for example, music, we cut a song into pieces, match the pieces and then the
45   * overall finger-print of the multi-set of OB objects is matched.
46   * @author Arnoldo Jose Muller Molina
47   * @since 0
48   */
49  public interface IRIndex < O extends OB > {
50  
51      /**
52       * Inserts a new document into the database.
53       * @param document
54       *                The document to be inserted.
55       * @throws IRException
56       *                 If something goes wrong with the IR engine or with
57       *                 OBSearch.
58       */
59      void insert(Document < O > document) throws IRException;
60  
61      /**
62       * Deletes the given string document from the database. If more than one
63       * documents have the same name, all the documents will be erased.
64       * @return The number of documents deleted.
65       * @throws IRException
66       *                 If something goes wrong with the IR engine or with
67       *                 OBSearch.
68       */
69      int delete(String documentName) throws IRException;
70  
71      /**
72       * Returns the underlying OBSearch index.
73       * @return the underlying OBSearch index.
74       */
75      Index < O > getIndex();
76  
77      /**
78       * Freezes the index. From this point data can be inserted, searched and
79       * deleted. The index might deteriorate at some point so every once in a
80       * while it is a good idea to rebuild the index. This method will also
81       * @throws IRException
82       *                 If something goes wrong with the IR engine or with
83       *                 OBSearch.
84       */
85      void freeze() throws IRException;
86  
87      /**
88       * Closes the databases. You *should* close the databases after using an
89       * IRIndex.
90       * @throws IRException
91       *                 If something goes wrong with the IR engine or with
92       *                 OBSearch.
93       */
94      void close() throws IRException;
95  
96      /**
97       * Returns the number of documents stored in this index.
98       * @return the number of documents stored in this index.
99       */
100     int getSize();
101 
102     /**
103      * Returns true if the document corresponding to x's name exists in the DB.
104      * This method is intended to be used in validation mode only.
105      * @param x
106      * @return true if the DB does not contain a document with name x.getName()
107      */
108     boolean shouldSkipDoc(Document<O> x) throws IOException;
109     
110     /**
111      * The M-set score threshold is the minimum naive score for multi-sets
112      * that the index will accept.
113      * @return Returns the current M-set score threshold.
114      */
115     float getMSetScoreThreshold();
116 
117     /**
118      * The M-set score threshold is the minimum naive score for multi-sets
119      * that the index will accept.
120      * @param setScoreThreshold the new threshold
121      */
122     void setMSetScoreThreshold(float setScoreThreshold);
123 
124     /**
125      * * The Set score threshold is the minimum naive score for Sets
126      * that the index will accept.
127      * @return Returns the current Set score threshold.
128      */
129     float getSetScoreThreshold();
130     /**
131      * The Set score threshold is the minimum naive score for Sets
132      * that the index will accept.
133      * @param setScoreThreshold the new threshold
134      */
135     void setSetScoreThreshold(float setScoreThreshold);
136     
137     /**
138      * Returns the count different words that
139      * are used by the documents indexed. 
140      * @return the count different words that
141      * are used by the documents indexed. 
142      */
143     int getWordsSize() throws DatabaseException;
144     
145     /**
146      * Tells whether or not the index is in validation mode. 
147      * In validation mode we assume that documents with the same name are equal.
148      * This helps us to add additional statistics on the performance of the scoring technique.
149      * @return true if this index is in validation mode.
150      */
151     boolean isValidationMode();
152     /**
153      * Sets whether or not the index is in validation mode. 
154      * In validation mode we assume that documents with the same name are equal.
155      * This helps us to add additional statistics on the performance of the scoring technique.
156      * @param validationMode The new validation mode.
157      * */
158     void setValidationMode(boolean validationMode);
159 }