1 package org.kit.furia;
2
3 import java.util.HashMap;
4 import java.util.Iterator;
5 import java.util.Map;
6
7 import org.ajmm.obsearch.OB;
8 import org.kit.furia.misc.IntegerHolder;
9
10 /*
11 Furia-chan: An Open Source software license violation detector.
12 Copyright (C) 2007 Kyushu Institute of Technology
13
14 This program is free software: you can redistribute it and/or modify
15 it under the terms of the GNU General Public License as published by
16 the Free Software Foundation, either version 3 of the License, or
17 (at your option) any later version.
18
19 This program is distributed in the hope that it will be useful,
20 but WITHOUT ANY WARRANTY; without even the implied warranty of
21 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22 GNU General Public License for more details.
23
24 You should have received a copy of the GNU General Public License
25 along with this program. If not, see <http://www.gnu.org/licenses/>.
26 */
27
28 /**
29 * This class is a multi-set of OB objects. If we were to store natural
30 * language, a document is just a multi-set of natural language words.
31 * Relationships of the words within the document are not stored.
32 * @param <O>
33 * The type of OB object that will be stored in this document.
34 * @author Arnoldo Jose Muller Molina
35 * @since 0
36 */
37
38 public class Document < O extends OB > {
39
40 /**
41 * Contains each of the objects and the amount of times they appear in the
42 * document.
43 */
44 private Map < O, DocumentElement < O > > data;
45
46 /**
47 * The name (identification string) of this document.
48 */
49 private String name;
50
51 /**
52 * The size of the multi-set of the words of this document.
53 */
54 private int wordCountMultiSet;
55
56
57 public String getName() {
58 return name;
59 }
60
61 /**
62 * Creates a document with an initial estimate of 2000 elements.
63 * @param id
64 * The id of the document.
65 */
66 public Document(String id) {
67 this(id, 2000);
68 }
69
70 /**
71 * @return The size of the set of words contained in this document.
72 */
73 public int size(){
74 return data.size();
75 }
76
77 /**
78 *
79 * @return The size of the multi-set of words contained in this document.
80 */
81 public int multiSetSize(){
82 return wordCountMultiSet;
83 }
84
85 /**
86 * Creates a document.
87 * @param initialCapacity
88 * The number of elements that we are expecting to hold. This
89 * is for efficiency reasons, as the Document will grow
90 * automatically if the number of elements exceeds this
91 * initial estimate.
92 * @param id
93 * The id of the document.
94 */
95 public Document(String id, int initialCapacity) {
96 data = new HashMap < O, DocumentElement < O > >(initialCapacity);
97 this.name = id;
98 wordCountMultiSet = 0;
99 }
100
101 /**
102 * Adds a word to the document.
103 * @param word
104 * The word that will be added.
105 */
106 public void addWord(O word) {
107 DocumentElement < O > r = data.get(word);
108 if (r == null) {
109 // this is the first time we add this word, so
110 // we should initialize the counter for "word"
111 r = new DocumentElement < O >(word, new IntegerHolder(0));
112 data.put(word, r);
113 }
114 // increment the number of words in the document.
115 r.inc();
116 wordCountMultiSet++;
117 }
118
119 /**
120 * Sets the multiplicity for the given word.
121 * @param word
122 * @param multiplicity
123 */
124 public void setWord(O word, int multiplicity){
125 // we cannot have an existing word here, because the Furia-chan file format
126 // holds one item per line. All the items are different.
127 assert data.get(word) == null;
128 data.put(word, new DocumentElement<O>(word, new IntegerHolder(multiplicity)));
129 wordCountMultiSet += multiplicity;
130 }
131
132 /**
133 * @return An iterator with all the elements of this document.
134 */
135 public Iterator < DocumentElement < O >> iterator() {
136 return data.values().iterator();
137 }
138
139 /**
140 * This class is used by the iterator of the Document class. It holds the O
141 * object and the number of times it appears in this document.
142 * @param <O>
143 * The type of OB object that will be stored in this
144 * document.
145 * @author Arnoldo Jose Muller Molina
146 * @since 0
147 */
148 public class DocumentElement < T > {
149 private T object;
150
151 private IntegerHolder count;
152
153 public DocumentElement(T object, IntegerHolder count) {
154 super();
155 this.object = object;
156 this.count = count;
157 }
158
159 /**
160 * @return The object that composes this element of the document.
161 */
162 public T getObject() {
163 return object;
164 }
165
166 /**
167 * @return The # of times this object has appeared in the document.
168 */
169 public int getCount() {
170 return count.getValue();
171 }
172
173 /**
174 * Increments the count for object.
175 */
176 protected void inc() {
177 count.inc();
178 }
179 }
180
181 }