package org.kit.furia;

import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;

import org.ajmm.obsearch.OB;
import org.kit.furia.misc.IntegerHolder;

/*
 Furia-chan: An Open Source software license violation detector.
 Copyright (C) 2007 Kyushu Institute of Technology

 This program is free software: you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation, either version 3 of the License, or
 (at your option) any later version.

 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.

 You should have received a copy of the GNU General Public License
 along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

/**
 * This class is a multi-set of OB objects. If we were to store natural
 * language, a document is just a multi-set of natural language words.
 * Relationships of the words within the document are not stored.
 * @param <O>
 *                The type of OB object that will be stored in this document.
 * @author Arnoldo Jose Muller Molina
 * @since 0
 */
public class Document < O extends OB > {

    /**
     * Contains each of the objects and the amount of times they appear in the
     * document.
     */
    private final Map < O, DocumentElement < O > > data;

    /**
     * The name (identification string) of this document.
     */
    private final String name;

    /**
     * The size of the multi-set of the words of this document, i.e. the sum
     * of the multiplicities of every word stored in {@link #data}.
     */
    private int wordCountMultiSet;

    /**
     * @return The name (identification string) of this document.
     */
    public String getName() {
        return name;
    }

    /**
     * Creates a document with an initial estimate of 2000 elements.
     * @param id
     *                The id of the document.
     */
    public Document(String id) {
        this(id, 2000);
    }

    /**
     * @return The size of the set of words contained in this document
     *         (number of distinct words).
     */
    public int size() {
        return data.size();
    }

    /**
     * @return The size of the multi-set of words contained in this document
     *         (every word counted as many times as it appears).
     */
    public int multiSetSize() {
        return wordCountMultiSet;
    }

    /**
     * Creates a document.
     * @param id
     *                The id of the document.
     * @param initialCapacity
     *                The number of elements that we are expecting to hold.
     *                This is for efficiency reasons, as the Document will
     *                grow automatically if the number of elements exceeds
     *                this initial estimate. It is handed directly to the
     *                backing {@link HashMap}.
     */
    public Document(String id, int initialCapacity) {
        data = new HashMap < O, DocumentElement < O > >(initialCapacity);
        this.name = id;
        wordCountMultiSet = 0;
    }

    /**
     * Adds one occurrence of a word to the document.
     * @param word
     *                The word that will be added.
     */
    public void addWord(O word) {
        DocumentElement < O > r = data.get(word);
        if (r == null) {
            // this is the first time we add this word, so
            // we should initialize the counter for "word"
            r = new DocumentElement < O >(word, new IntegerHolder(0));
            data.put(word, r);
        }
        // one more occurrence of this word, and one more word in the
        // multi-set as a whole.
        r.inc();
        wordCountMultiSet++;
    }

    /**
     * Sets the multiplicity for the given word.
     * <p>
     * The word is expected not to be in the document yet; this is checked
     * with an assertion. When assertions are disabled and the word is
     * already present, its element is replaced and the multi-set size is
     * adjusted so that it only reflects the new multiplicity.
     * </p>
     * @param word
     *                The word whose multiplicity is being set.
     * @param multiplicity
     *                The number of times the word appears in the document.
     */
    public void setWord(O word, int multiplicity) {
        // we cannot have an existing word here, because the Furia-chan file
        // format holds one item per line. All the items are different.
        assert data.get(word) == null;
        DocumentElement < O > previous = data.put(word,
                new DocumentElement < O >(word, new IntegerHolder(multiplicity)));
        if (previous != null) {
            // Only reachable with assertions disabled: drop the replaced
            // element's contribution so the total stays consistent with
            // the contents of the map.
            wordCountMultiSet -= previous.getCount();
        }
        wordCountMultiSet += multiplicity;
    }

    /**
     * @return An iterator with all the elements of this document.
     */
    public Iterator < DocumentElement < O > > iterator() {
        return data.values().iterator();
    }

    /**
     * This class is used by the iterator of the Document class. It holds the
     * object and the number of times it appears in this document.
     * @param <T>
     *                The type of object that is held by this element.
     * @author Arnoldo Jose Muller Molina
     * @since 0
     */
    public class DocumentElement < T > {

        /** The object this element stands for. */
        private final T object;

        /** Holder of the number of times the object appears. */
        private final IntegerHolder count;

        /**
         * Creates an element for the given object.
         * @param object
         *                The object that composes this element.
         * @param count
         *                Holder with the starting count for the object.
         */
        public DocumentElement(T object, IntegerHolder count) {
            this.object = object;
            this.count = count;
        }

        /**
         * @return The object that composes this element of the document.
         */
        public T getObject() {
            return object;
        }

        /**
         * @return The # of times this object has appeared in the document.
         */
        public int getCount() {
            return count.getValue();
        }

        /**
         * Increments the count for object.
         */
        protected void inc() {
            count.inc();
        }
    }

}