1 package org.kit.furia; 2 3 import org.ajmm.obsearch.OB; 4 5 import java.io.IOException; 6 import java.util.List; 7 import org.ajmm.obsearch.Index; 8 import org.ajmm.obsearch.exception.AlreadyFrozenException; 9 import org.ajmm.obsearch.exception.IllegalIdException; 10 import org.ajmm.obsearch.exception.OBException; 11 import org.ajmm.obsearch.exception.OutOfRangeException; 12 import org.ajmm.obsearch.exception.UndefinedPivotsException; 13 import org.apache.lucene.index.CorruptIndexException; 14 import org.kit.furia.exceptions.IRException; 15 16 import com.sleepycat.je.DatabaseException; 17 18 /* 19 Furia-chan: An Open Source software license violation detector. 20 Copyright (C) 2007 Kyushu Institute of Technology 21 22 This program is free software: you can redistribute it and/or modify 23 it under the terms of the GNU General Public License as published by 24 the Free Software Foundation, either version 3 of the License, or 25 (at your option) any later version. 26 27 This program is distributed in the hope that it will be useful, 28 but WITHOUT ANY WARRANTY; without even the implied warranty of 29 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 30 GNU General Public License for more details. 31 32 You should have received a copy of the GNU General Public License 33 along with this program. If not, see <http://www.gnu.org/licenses/>. 34 */ 35 36 /** 37 * IRIndex holds the basic functionality for an Information Retrieval system 38 * that works on OB objects (please see obsearch.berlios.de). By using a 39 * distance function d, we transform the queries in terms of the closest 40 * elements that are in the database, and once this transformation is performed, 41 * we utilize an information retrieval system to perform the matching. Because 42 * our documents are multi-sets, the distribution of OB objects inside a 43 * document is taken into account. So, instead of matching a huge syntax tree of 44 * for example, music, we cut a song into pieces, match the pieces and then the 45 * overall finger-print of the multi-set of OB objects is matched. 46 * @author Arnoldo Jose Muller Molina 47 * @since 0 48 */ 49 public interface IRIndex < O extends OB > { 50 51 /** 52 * Inserts a new document into the database. 53 * @param document 54 * The document to be inserted. 55 * @throws IRException 56 * If something goes wrong with the IR engine or with 57 * OBSearch. 58 */ 59 void insert(Document < O > document) throws IRException; 60 61 /** 62 * Deletes the given string document from the database. If more than one 63 * documents have the same name, all the documents will be erased. 64 * @return The number of documents deleted. 65 * @throws IRException 66 * If something goes wrong with the IR engine or with 67 * OBSearch. 68 */ 69 int delete(String documentName) throws IRException; 70 71 /** 72 * Returns the underlying OBSearch index. 73 * @return the underlying OBSearch index. 74 */ 75 Index < O > getIndex(); 76 77 /** 78 * Freezes the index. From this point data can be inserted, searched and 79 * deleted. The index might deteriorate at some point so every once in a 80 * while it is a good idea to rebuild the index. This method will also 81 * @throws IRException 82 * If something goes wrong with the IR engine or with 83 * OBSearch. 84 */ 85 void freeze() throws IRException; 86 87 /** 88 * Closes the databases. You *should* close the databases after using an 89 * IRIndex. 90 * @throws IRException 91 * If something goes wrong with the IR engine or with 92 * OBSearch. 93 */ 94 void close() throws IRException; 95 96 /** 97 * Returns the number of documents stored in this index. 98 * @return the number of documents stored in this index. 99 */ 100 int getSize(); 101 102 /** 103 * Returns true if the document corresponding to x's name exists in the DB. 104 * This method is intended to be used in validation mode only. 105 * @param x 106 * @return true if the DB does not contain a document with name x.getName() 107 */ 108 boolean shouldSkipDoc(Document<O> x) throws IOException; 109 110 /** 111 * The M-set score threshold is the minimum naive score for multi-sets 112 * that the index will accept. 113 * @return Returns the current M-set score threshold. 114 */ 115 float getMSetScoreThreshold(); 116 117 /** 118 * The M-set score threshold is the minimum naive score for multi-sets 119 * that the index will accept. 120 * @param setScoreThreshold the new threshold 121 */ 122 void setMSetScoreThreshold(float setScoreThreshold); 123 124 /** 125 * * The Set score threshold is the minimum naive score for Sets 126 * that the index will accept. 127 * @return Returns the current Set score threshold. 128 */ 129 float getSetScoreThreshold(); 130 /** 131 * The Set score threshold is the minimum naive score for Sets 132 * that the index will accept. 133 * @param setScoreThreshold the new threshold 134 */ 135 void setSetScoreThreshold(float setScoreThreshold); 136 137 /** 138 * Returns the count different words that 139 * are used by the documents indexed. 140 * @return the count different words that 141 * are used by the documents indexed. 142 */ 143 int getWordsSize() throws DatabaseException; 144 145 /** 146 * Tells whether or not the index is in validation mode. 147 * In validation mode we assume that documents with the same name are equal. 148 * This helps us to add additional statistics on the performance of the scoring technique. 149 * @return true if this index is in validation mode. 150 */ 151 boolean isValidationMode(); 152 /** 153 * Sets whether or not the index is in validation mode. 154 * In validation mode we assume that documents with the same name are equal. 155 * This helps us to add additional statistics on the performance of the scoring technique. 156 * @param validationMode The new validation mode. 157 * */ 158 void setValidationMode(boolean validationMode); 159 }