View Javadoc

1   package org.kit.furia;
2   
3   import java.util.HashMap;
4   import java.util.Iterator;
5   import java.util.Map;
6   
7   import org.ajmm.obsearch.OB;
8   import org.kit.furia.misc.IntegerHolder;
9   
10  /*
11   Furia-chan: An Open Source software license violation detector. 
12   Copyright (C) 2007 Kyushu Institute of Technology
13  
14   This program is free software: you can redistribute it and/or modify
15   it under the terms of the GNU General Public License as published by
16   the Free Software Foundation, either version 3 of the License, or
17   (at your option) any later version.
18  
19   This program is distributed in the hope that it will be useful,
20   but WITHOUT ANY WARRANTY; without even the implied warranty of
21   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
22   GNU General Public License for more details.
23  
24   You should have received a copy of the GNU General Public License
25   along with this program.  If not, see <http://www.gnu.org/licenses/>.
26   */
27  
28  /**
29   * This class is a multi-set of OB objects. If we were to store natural
30   * language, a document is just a multi-set of natural language words.
31   * Relationships of the words within the document are not stored.
32   * @param <O>
33   *                The type of OB object that will be stored in this document.
34   * @author Arnoldo Jose Muller Molina
35   * @since 0
36   */
37  
38  public class Document < O extends OB > {
39  
40      /**
41       * Contains each of the objects and the amount of times they appear in the
42       * document.
43       */
44      private Map < O, DocumentElement < O > > data;
45  
46      /**
47       * The name (identification string) of this document.
48       */
49      private String name;
50      
51      /**
52       * The size of the multi-set of the words of this document.
53       */
54      private int wordCountMultiSet;
55  
56  
57      public String getName() {
58          return name;
59      }
60  
61      /**
62       * Creates a document with an initial estimate of 2000 elements.
63       * @param id
64       *                The id of the document.
65       */
66      public Document(String id) {
67          this(id, 2000);
68      }
69      
70      /**
71       * @return The size of the set of words contained in this document.
72       */
73      public int size(){
74          return data.size();
75      }
76      
77      /**
78       * 
79       * @return The size of the multi-set of words contained in this document.
80       */
81      public int multiSetSize(){
82          return wordCountMultiSet;
83      }
84  
85      /**
86       * Creates a document.
87       * @param initialCapacity
88       *                The number of elements that we are expecting to hold. This
89       *                is for efficiency reasons, as the Document will grow
90       *                automatically if the number of elements exceeds this
91       *                initial estimate.
92       * @param id
93       *                The id of the document.
94       */
95      public Document(String id, int initialCapacity) {
96          data = new HashMap < O, DocumentElement < O > >(initialCapacity);
97          this.name = id;
98          wordCountMultiSet = 0;
99      }
100 
101     /**
102      * Adds a word to the document.
103      * @param word
104      *                The word that will be added.
105      */
106     public void addWord(O word) {
107         DocumentElement < O > r = data.get(word);
108         if (r == null) {
109             // this is the first time we add this word, so
110             // we should initialize the counter for "word"
111             r = new DocumentElement < O >(word, new IntegerHolder(0));
112             data.put(word, r);
113         }
114         // increment the number of words in the document.
115         r.inc();
116         wordCountMultiSet++;
117     }
118     
119     /**
120      * Sets the multiplicity for the given word. 
121      * @param word
122      * @param multiplicity
123      */
124     public void setWord(O word, int multiplicity){
125         // we cannot have an existing word here, because the Furia-chan file format
126         // holds one item per line. All the items are different.
127         assert data.get(word) == null; 
128         data.put(word, new DocumentElement<O>(word, new IntegerHolder(multiplicity)));
129         wordCountMultiSet += multiplicity;
130     }
131 
132     /**
133      * @return An iterator with all the elements of this document.
134      */
135     public Iterator < DocumentElement < O >> iterator() {
136         return data.values().iterator();
137     }
138 
139     /**
140      * This class is used by the iterator of the Document class. It holds the O
141      * object and the number of times it appears in this document.
142      * @param <O>
143      *                The type of OB object that will be stored in this
144      *                document.
145      * @author Arnoldo Jose Muller Molina
146      * @since 0
147      */
148     public class DocumentElement < T > {
149         private T object;
150 
151         private IntegerHolder count;
152 
153         public DocumentElement(T object, IntegerHolder count) {
154             super();
155             this.object = object;
156             this.count = count;
157         }
158 
159         /**
160          * @return The object that composes this element of the document.
161          */
162         public T getObject() {
163             return object;
164         }
165 
166         /**
167          * @return The # of times this object has appeared in the document.
168          */
169         public int getCount() {
170             return count.getValue();
171         }
172 
173         /**
174          * Increments the count for object.
175          */
176         protected void inc() {
177             count.inc();
178         }
179     }
180 
181 }