Attachment 'deduplicator-0.3.0-20061218-patch-lucene-OutOfMemory.patch'
Download 1 diff -Naur deduplicator-0.3.0-20061218/pom.xml deduplicator-0.3.0-20061218-patch-lucene-OutOfMemory/pom.xml
2 --- deduplicator-0.3.0-20061218/pom.xml 2006-12-18 09:15:54.000000000 +0100
3 +++ deduplicator-0.3.0-20061218-patch-lucene-OutOfMemory/pom.xml 2008-05-22 18:21:38.000000000 +0200
4 @@ -64,6 +64,14 @@
5 <locales>en</locales>
6 </configuration>
7 </plugin>
8 + <plugin>
9 + <groupId>org.apache.maven.plugins</groupId>
10 + <artifactId>maven-compiler-plugin</artifactId>
11 + <configuration>
12 + <source>1.5</source>
13 + <target>1.5</target>
14 + </configuration>
15 + </plugin>
16 </plugins>
17 </build>
18
19 @@ -142,4 +150,4 @@
20 </plugins>
21 </reporting>
22
23 -</project>
24 \ No newline at end of file
25 +</project>
26 diff -Naur deduplicator-0.3.0-20061218/src/main/java/dk/netarkivet/common/utils/SparseBitSet.java deduplicator-0.3.0-20061218-patch-lucene-OutOfMemory/src/main/java/dk/netarkivet/common/utils/SparseBitSet.java
27 --- deduplicator-0.3.0-20061218/src/main/java/dk/netarkivet/common/utils/SparseBitSet.java 1970-01-01 01:00:00.000000000 +0100
28 +++ deduplicator-0.3.0-20061218-patch-lucene-OutOfMemory/src/main/java/dk/netarkivet/common/utils/SparseBitSet.java 2008-05-22 17:14:40.000000000 +0200
29 @@ -0,0 +1,401 @@
30 +/* $Id: SparseBitSet.java 379 2008-05-22 13:14:35Z kfc $
31 + * $Revision: 379 $
32 + * $Date: 2008-05-22 15:14:35 +0200 (Thu, 22 May 2008) $
33 + * $Author: kfc $
34 + *
35 + * The Netarchive Suite - Software to harvest and preserve websites
36 + * Copyright 2004-2007 Det Kongelige Bibliotek and Statsbiblioteket, Denmark
37 + *
38 + * This library is free software; you can redistribute it and/or
39 + * modify it under the terms of the GNU Lesser General Public
40 + * License as published by the Free Software Foundation; either
41 + * version 2.1 of the License, or (at your option) any later version.
42 + *
43 + * This library is distributed in the hope that it will be useful,
44 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
45 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
46 + * Lesser General Public License for more details.
47 + *
48 + * You should have received a copy of the GNU Lesser General Public
49 + * License along with this library; if not, write to the Free Software
50 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
51 + */
52 +package dk.netarkivet.common.utils;
53 +
54 +import java.util.BitSet;
55 +import java.util.HashSet;
56 +import java.util.Set;
57 +
58 +/** A sparse implementation of a BitSet that does not require memory linear
59 + * in the largest index. This is done at the cost of performance, but should
60 + * be fairly efficient when few bits are set. */
61 +public class SparseBitSet extends BitSet {
62 + /** A set of the indices of bits that are set in this BitSet. */
63 + private Set<Integer> setbits = new HashSet<Integer>();
64 +
65 + /**
66 + * Initialise the bitset.
67 + */
68 + public SparseBitSet() {
69 + super(0);
70 + }
71 +
72 + /**
73 + * @see BitSet#flip(int)
74 + */
75 + public void flip(int bitIndex) {
76 + if (bitIndex < 0) {
77 + throw new IndexOutOfBoundsException("bitIndex < 0: " + bitIndex);
78 + }
79 + if (setbits.contains(bitIndex)) {
80 + setbits.remove(bitIndex);
81 + } else {
82 + setbits.add(bitIndex);
83 + }
84 + }
85 +
86 + /**
87 + * @see BitSet#flip(int, int)
88 + */
89 + public void flip(int fromIndex, int toIndex) {
90 + if (fromIndex < 0) {
91 + throw new IndexOutOfBoundsException("fromIndex < 0: " + fromIndex);
92 + }
93 + if (toIndex < 0) {
94 + throw new IndexOutOfBoundsException("toIndex < 0: " + toIndex);
95 + }
96 + if (fromIndex > toIndex) {
97 + throw new IndexOutOfBoundsException("fromIndex: " + fromIndex +
98 + " > toIndex: " + toIndex);
99 + }
100 + for (int i = fromIndex; i < toIndex; i++) {
101 + flip(i);
102 + }
103 + }
104 +
105 + /**
106 + * @see BitSet#set(int)
107 + */
108 + public void set(int bitIndex) {
109 + if (bitIndex < 0) {
110 + throw new IndexOutOfBoundsException("bitIndex < 0: " + bitIndex);
111 + }
112 + setbits.add(bitIndex);
113 + }
114 +
115 + /**
116 + * @see BitSet#set(int, boolean)
117 + */
118 + public void set(int bitIndex, boolean value) {
119 + if (bitIndex < 0) {
120 + throw new IndexOutOfBoundsException("bitIndex < 0: " + bitIndex);
121 + }
122 + if (value) {
123 + setbits.add(bitIndex);
124 + } else {
125 + setbits.remove(bitIndex);
126 + }
127 + }
128 +
129 + /**
130 + * @see BitSet#set(int, int)
131 + */
132 + public void set(int fromIndex, int toIndex) {
133 + if (fromIndex < 0) {
134 + throw new IndexOutOfBoundsException("fromIndex < 0: " + fromIndex);
135 + }
136 + if (toIndex < 0) {
137 + throw new IndexOutOfBoundsException("toIndex < 0: " + toIndex);
138 + }
139 + if (fromIndex > toIndex) {
140 + throw new IndexOutOfBoundsException("fromIndex: " + fromIndex +
141 + " > toIndex: " + toIndex);
142 + }
143 + for (int i = fromIndex; i < toIndex; i++) {
144 + set(i);
145 + }
146 + }
147 +
148 + /**
149 + * @see BitSet#set(int, int, boolean)
150 + */
151 + public void set(int fromIndex, int toIndex, boolean value) {
152 + if (fromIndex < 0) {
153 + throw new IndexOutOfBoundsException("fromIndex < 0: " + fromIndex);
154 + }
155 + if (toIndex < 0) {
156 + throw new IndexOutOfBoundsException("toIndex < 0: " + toIndex);
157 + }
158 + if (fromIndex > toIndex) {
159 + throw new IndexOutOfBoundsException("fromIndex: " + fromIndex +
160 + " > toIndex: " + toIndex);
161 + }
162 + for (int i = fromIndex; i < toIndex; i++) {
163 + set(i, value);
164 + }
165 + }
166 +
167 + /**
168 + * @see BitSet#clear(int)
169 + */
170 + public void clear(int bitIndex) {
171 + if (bitIndex < 0) {
172 + throw new IndexOutOfBoundsException("bitIndex < 0: " + bitIndex);
173 + }
174 + setbits.remove(bitIndex);
175 + }
176 +
177 + /**
178 + * @see BitSet#clear(int, int)
179 + */
180 + public void clear(int fromIndex, int toIndex) {
181 + if (fromIndex < 0) {
182 + throw new IndexOutOfBoundsException("fromIndex < 0: " + fromIndex);
183 + }
184 + if (toIndex < 0) {
185 + throw new IndexOutOfBoundsException("toIndex < 0: " + toIndex);
186 + }
187 + if (fromIndex > toIndex) {
188 + throw new IndexOutOfBoundsException("fromIndex: " + fromIndex +
189 + " > toIndex: " + toIndex);
190 + }
191 + for (int i = fromIndex; i < toIndex; i++) {
192 + clear(i);
193 + }
194 + }
195 +
196 + /**
197 + * @see BitSet#clear()
198 + */
199 + public void clear() {
200 + setbits.clear();
201 + }
202 +
203 + /**
204 + * @see BitSet#get(int)
205 + */
206 + public boolean get(int bitIndex) {
207 + if (bitIndex < 0) {
208 + throw new IndexOutOfBoundsException("bitIndex < 0: " + bitIndex);
209 + }
210 + return (setbits.contains(bitIndex));
211 + }
212 +
213 + /**
214 + * @see BitSet#get(int, int)
215 + */
216 + public BitSet get(int fromIndex, int toIndex) {
217 + if (fromIndex < 0) {
218 + throw new IndexOutOfBoundsException("fromIndex < 0: " + fromIndex);
219 + }
220 + if (toIndex < 0) {
221 + throw new IndexOutOfBoundsException("toIndex < 0: " + toIndex);
222 + }
223 + if (fromIndex > toIndex) {
224 + throw new IndexOutOfBoundsException("fromIndex: " + fromIndex +
225 + " > toIndex: " + toIndex);
226 + }
227 + SparseBitSet bitsubset = new SparseBitSet();
228 + for (int i : setbits) {
229 + if (i >= fromIndex && i < toIndex) {
230 + bitsubset.set(i - fromIndex);
231 + }
232 + }
233 + return bitsubset;
234 + }
235 +
236 + /**
237 + * @see BitSet#nextSetBit(int)
238 + */
239 + public int nextSetBit(int fromIndex) {
240 + if (fromIndex < 0) {
241 + throw new IndexOutOfBoundsException("fromIndex < 0: " + fromIndex);
242 + }
243 + int index = -1;
244 + for (Integer i : setbits) {
245 + if ((i >= fromIndex) && ((index == -1) || (i < index))) {
246 + index = i;
247 + }
248 + }
249 + return index;
250 + }
251 +
252 + /** Return the index of the first clear bit at or after fromIndex.
253 + * @see BitSet#nextClearBit(int)
254 + */
255 + public int nextClearBit(int fromIndex) {
256 + if (fromIndex < 0) {
257 + throw new IndexOutOfBoundsException("fromIndex < 0: " + fromIndex);
258 + }
259 + for (int i = fromIndex; i >= 0; i++) { // i >= 0 so index 0 is tested; ends when i++ overflows
260 + if (!get(i)) {
261 + return i;
262 + }
263 + }
264 + return Integer.MIN_VALUE; // every index in [fromIndex, Integer.MAX_VALUE] is set
265 + }
266 +
267 + /**
268 + * @see BitSet#length()
269 + */
270 + public int length() {
271 + int index = -1;
272 + for (Integer i : setbits) {
273 + if (i > index) {
274 + index = i;
275 + }
276 + }
277 + return index + 1;
278 + }
279 +
280 + /**
281 + * @see BitSet#isEmpty()
282 + */
283 + public boolean isEmpty() {
284 + return setbits.isEmpty();
285 + }
286 +
287 + /**
288 + * @see BitSet#intersects(BitSet)
289 + */
290 + public boolean intersects(BitSet set) {
291 + for (Integer index : setbits) {
292 + if (set.get(index)) {
293 + return true;
294 + }
295 + }
296 + return false;
297 + }
298 +
299 +
300 + /**
301 + * @see BitSet#cardinality()
302 + */
303 + public int cardinality() {
304 + return setbits.size();
305 + }
306 +
307 + /**
308 + * @see BitSet#and(BitSet)
309 + */
310 + public void and(BitSet set) {
311 + Set<Integer> andbits = new HashSet<Integer>();
312 + for (Integer index : setbits) {
313 + if (set.get(index)) {
314 + andbits.add(index);
315 + }
316 + }
317 + setbits = andbits;
318 + }
319 +
320 + /**
321 + * @see BitSet#or(BitSet)
322 + */
323 + public void or(BitSet set) {
324 + Set<Integer> orbits = new HashSet<Integer>(setbits);
325 + for (int index = set.nextSetBit(0); index != -1;
326 + index = set.nextSetBit(index + 1)) {
327 + orbits.add(index);
328 + }
329 + setbits = orbits;
330 + }
331 +
332 + /**
333 + * @see BitSet#xor(BitSet)
334 + */
335 + public void xor(BitSet set) {
336 + Set<Integer> xorbits = new HashSet<Integer>();
337 + for (Integer index : setbits) {
338 + if (!set.get(index)) {
339 + xorbits.add(index);
340 + }
341 + }
342 + for (int index = set.nextSetBit(0); index != -1;
343 + index = set.nextSetBit(index + 1)) {
344 + if (!setbits.contains(index)) {
345 + xorbits.add(index);
346 + }
347 + }
348 + setbits = xorbits;
349 + }
350 +
351 + /**
352 + * @see BitSet#andNot(BitSet)
353 + */
354 + public void andNot(BitSet set) {
355 + Set<Integer> andnotbits = new HashSet<Integer>(setbits);
356 + for (Integer index : setbits) {
357 + if (set.get(index)) {
358 + andnotbits.remove(index);
359 + }
360 + }
361 + setbits = andnotbits;
362 + }
363 +
364 + /**
365 + * A hash code for this bit set.
366 + * Note: The hash codes are not implemented to be compatible with
367 + * java.util.BitSet#hashCode().
368 + */
369 + public int hashCode() {
370 + return setbits.hashCode();
371 + }
372 +
373 + /**
374 + * In contrast with {@link java.util.BitSet#size()} this does not return the
375 + * number of bits of space used to represent this set. Instead, it returns the
376 + * same as {@link #length()} for compatibility with
377 + * {@link java.util.BitSet}. The actual space used is a hashset of size
378 + * {@link #cardinality()}.
379 + */
380 + public int size() {
381 + return length();
382 + }
383 +
384 + /**
385 + * Two SparseBitSets are considered equal if they contain the same bits.
386 + *
387 + * Note: Equality is not implemented to be compatible with
388 + * java.util.BitSet#equals(Object).
389 + */
390 + public boolean equals(Object obj) {
391 + if (obj instanceof SparseBitSet) {
392 + return setbits.equals(((SparseBitSet) obj).setbits);
393 + } else if (obj instanceof BitSet) {
394 +// NOTE: Do not re-add this code, unless you reimplement hashcode
395 +// to be consistent with java.util.BitSet
396 +// Do not mix java.util.BitSet with SparseBitSet in containers
397 +// until this code is implemented.
398 +// BitSet bitset = (BitSet) obj;
399 +// if (cardinality() != bitset.cardinality()) {
400 +// return false;
401 +// }
402 +// for (int i : setbits) {
403 +// if (!bitset.get(i)) {
404 +// return false;
405 +// }
406 +// }
407 +// return true;
408 + return false;
409 + } else {
410 + return false;
411 + }
412 + }
413 +
414 + /** Return a copy of this bit set that has its own independent set of bit indices.
415 + * @see BitSet#clone()
416 + */
417 + public Object clone() {
418 + // Use the object returned by super.clone() so subclasses keep their runtime type.
419 + SparseBitSet newSparseBitSet = (SparseBitSet) super.clone();
420 + newSparseBitSet.setbits = new HashSet<Integer>(setbits);
421 + return newSparseBitSet;
422 + }
423 +
424 + /**
425 + * @see BitSet#toString()
426 + */
427 + public String toString() {
428 + return setbits.toString();
429 + }
430 +}
431 diff -Naur deduplicator-0.3.0-20061218/src/main/java/dk/netarkivet/common/utils/SparseRangeFilter.java deduplicator-0.3.0-20061218-patch-lucene-OutOfMemory/src/main/java/dk/netarkivet/common/utils/SparseRangeFilter.java
432 --- deduplicator-0.3.0-20061218/src/main/java/dk/netarkivet/common/utils/SparseRangeFilter.java 1970-01-01 01:00:00.000000000 +0100
433 +++ deduplicator-0.3.0-20061218-patch-lucene-OutOfMemory/src/main/java/dk/netarkivet/common/utils/SparseRangeFilter.java 2008-05-22 17:14:40.000000000 +0200
434 @@ -0,0 +1,134 @@
435 +/* $Id: SparseRangeFilter.java 379 2008-05-22 13:14:35Z kfc $
436 + * $Revision: 379 $
437 + * $Date: 2008-05-22 15:14:35 +0200 (Thu, 22 May 2008) $
438 + * $Author: kfc $
439 + *
440 + * The Netarchive Suite - Software to harvest and preserve websites
441 + * Copyright 2004-2007 Det Kongelige Bibliotek and Statsbiblioteket, Denmark
442 + *
443 + * This library is free software; you can redistribute it and/or
444 + * modify it under the terms of the GNU Lesser General Public
445 + * License as published by the Free Software Foundation; either
446 + * version 2.1 of the License, or (at your option) any later version.
447 + *
448 + * This library is distributed in the hope that it will be useful,
449 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
450 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
451 + * Lesser General Public License for more details.
452 + *
453 + * You should have received a copy of the GNU Lesser General Public
454 + * License along with this library; if not, write to the Free Software
455 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
456 + */
457 +/**
458 + * Copyright 2004 The Apache Software Foundation
459 + *
460 + * Licensed under the Apache License, Version 2.0 (the "License");
461 + * you may not use this file except in compliance with the License.
462 + * You may obtain a copy of the License at
463 + *
464 + * http://www.apache.org/licenses/LICENSE-2.0
465 + *
466 + * Unless required by applicable law or agreed to in writing, software
467 + * distributed under the License is distributed on an "AS IS" BASIS,
468 + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
469 + * See the License for the specific language governing permissions and
470 + * limitations under the License.
471 + */
472 +package dk.netarkivet.common.utils;
473 +
474 +import java.io.IOException;
475 +
476 +import org.apache.lucene.index.IndexReader;
477 +import org.apache.lucene.index.Term;
478 +import org.apache.lucene.index.TermDocs;
479 +import org.apache.lucene.index.TermEnum;
480 +import org.apache.lucene.search.RangeFilter;
481 +
482 +/**
483 + * Identical subclass of {@link RangeFilter}, except the bitset returned by
484 + * {@link #bits(IndexReader)} is sparse.
485 + */
486 +public class SparseRangeFilter extends RangeFilter {
487 + private String fieldName;
488 + private String lowerTerm;
489 + private String upperTerm;
490 + private boolean includeLower;
491 + private boolean includeUpper;
492 +
493 + /**
494 + * @see RangeFilter#RangeFilter(String, String, String, boolean, boolean)
495 + */
496 + public SparseRangeFilter(String fieldName, String lowerTerm,
497 + String upperTerm, boolean includeLower,
498 + boolean includeUpper) {
499 + super(fieldName, lowerTerm, upperTerm, includeLower, includeUpper);
500 + this.fieldName = fieldName;
501 + this.lowerTerm = lowerTerm;
502 + this.upperTerm = upperTerm;
503 + this.includeLower = includeLower;
504 + this.includeUpper = includeUpper;
505 + }
506 +
507 + /**
508 + * Identical to {@link RangeFilter#bits(IndexReader)}, except a SparseBitSet
509 + * is returned.
510 + * @see RangeFilter#bits(IndexReader)
511 + */
512 + public SparseBitSet bits(IndexReader reader) throws IOException {
513 + SparseBitSet bits = new SparseBitSet();
514 + TermEnum enumerator =
515 + (null != lowerTerm
516 + ? reader.terms(new Term(fieldName, lowerTerm))
517 + : reader.terms(new Term(fieldName,"")));
518 +
519 + try {
520 +
521 + if (enumerator.term() == null) {
522 + return bits;
523 + }
524 +
525 + boolean checkLower = false;
526 + if (!includeLower) // make adjustments to set to exclusive
527 + checkLower = true;
528 +
529 + TermDocs termDocs = reader.termDocs();
530 + try {
531 +
532 + do {
533 + Term term = enumerator.term();
534 + if (term != null && term.field().equals(fieldName)) {
535 + if (!checkLower || null==lowerTerm || term.text().compareTo(lowerTerm) > 0) {
536 + checkLower = false;
537 + if (upperTerm != null) {
538 + int compare = upperTerm.compareTo(term.text());
539 + /* if beyond the upper term, or is exclusive and
540 + * this is equal to the upper term, break out */
541 + if ((compare < 0) ||
542 + (!includeUpper && compare==0)) {
543 + break;
544 + }
545 + }
546 + /* we have a good term, find the docs */
547 +
548 + termDocs.seek(enumerator.term());
549 + while (termDocs.next()) {
550 + bits.set(termDocs.doc());
551 + }
552 + }
553 + } else {
554 + break;
555 + }
556 + }
557 + while (enumerator.next());
558 +
559 + } finally {
560 + termDocs.close();
561 + }
562 + } finally {
563 + enumerator.close();
564 + }
565 +
566 + return bits;
567 + }
568 +}
569 diff -Naur deduplicator-0.3.0-20061218/src/main/java/is/hi/bok/deduplicator/DeDupFetchHTTP.java deduplicator-0.3.0-20061218-patch-lucene-OutOfMemory/src/main/java/is/hi/bok/deduplicator/DeDupFetchHTTP.java
570 --- deduplicator-0.3.0-20061218/src/main/java/is/hi/bok/deduplicator/DeDupFetchHTTP.java 2006-08-09 10:12:50.000000000 +0200
571 +++ deduplicator-0.3.0-20061218-patch-lucene-OutOfMemory/src/main/java/is/hi/bok/deduplicator/DeDupFetchHTTP.java 2008-05-22 18:24:32.000000000 +0200
572 @@ -27,6 +27,7 @@
573 import java.util.logging.Level;
574 import java.util.logging.Logger;
575
576 +import dk.netarkivet.common.utils.SparseRangeFilter;
577 import org.apache.commons.httpclient.HttpConnection;
578 import org.apache.commons.httpclient.HttpMethod;
579 import org.apache.lucene.document.Document;
580 @@ -34,7 +35,7 @@
581 import org.apache.lucene.search.Hits;
582 import org.apache.lucene.search.IndexSearcher;
583 import org.apache.lucene.search.Query;
584 -import org.apache.lucene.search.TermQuery;
585 +import org.apache.lucene.search.ConstantScoreQuery;
586 import org.archive.crawler.datamodel.CrawlURI;
587 import org.archive.crawler.fetcher.FetchHTTP;
588 import org.archive.crawler.frontier.AdaptiveRevisitAttributeConstants;
589 @@ -310,8 +311,9 @@
590 */
591 protected Document lookup(CrawlURI curi) {
592 try{
593 - Query query = new TermQuery(new Term(
594 - DigestIndexer.FIELD_URL,curi.toString()));
595 + Query query = new ConstantScoreQuery(new SparseRangeFilter(
596 + DigestIndexer.FIELD_URL,curi.toString(),curi.toString(),
597 + true,true));
598 Hits hits = index.search(query);
599 Document doc = null;
600 if(hits != null && hits.length() > 0){
601 diff -Naur deduplicator-0.3.0-20061218/src/main/java/is/hi/bok/deduplicator/DeDuplicator.java deduplicator-0.3.0-20061218-patch-lucene-OutOfMemory/src/main/java/is/hi/bok/deduplicator/DeDuplicator.java
602 --- deduplicator-0.3.0-20061218/src/main/java/is/hi/bok/deduplicator/DeDuplicator.java 2006-08-22 12:26:54.000000000 +0200
603 +++ deduplicator-0.3.0-20061218-patch-lucene-OutOfMemory/src/main/java/is/hi/bok/deduplicator/DeDuplicator.java 2008-05-22 18:22:06.000000000 +0200
604 @@ -32,13 +32,14 @@
605 import java.util.logging.Level;
606 import java.util.logging.Logger;
607
608 +import dk.netarkivet.common.utils.SparseRangeFilter;
609 import org.apache.commons.httpclient.HttpMethod;
610 import org.apache.lucene.document.Document;
611 import org.apache.lucene.index.Term;
612 import org.apache.lucene.search.Hits;
613 import org.apache.lucene.search.IndexSearcher;
614 import org.apache.lucene.search.Query;
615 -import org.apache.lucene.search.TermQuery;
616 +import org.apache.lucene.search.ConstantScoreQuery;
617 import org.archive.crawler.datamodel.CoreAttributeConstants;
618 import org.archive.crawler.datamodel.CrawlOrder;
619 import org.archive.crawler.datamodel.CrawlURI;
620 @@ -570,8 +571,8 @@
621 protected Document lookupByURL(CrawlURI curi, Statistics currHostStats){
622 // Look the CrawlURI's URL up in the index.
623 try {
624 - Query query = new TermQuery(
625 - new Term(DigestIndexer.FIELD_URL,curi.toString()));
626 + Query query = queryField(DigestIndexer.FIELD_URL,
627 + curi.toString());
628 Hits hits = index.search(query);
629 Document doc = null;
630 String currentDigest = getDigestAsString(curi);
631 @@ -601,9 +602,8 @@
632 if(equivalent) {
633 // No exact hits. Let's try lenient matching.
634 String normalizedURL = DigestIndexer.stripURL(curi.toString());
635 - query = new TermQuery(new Term(
636 - DigestIndexer.FIELD_URL_NORMALIZED,
637 - normalizedURL));
638 + query = queryField(DigestIndexer.FIELD_URL_NORMALIZED,
639 + normalizedURL);
640 hits = index.search(query);
641 for(int i=0 ; i<hits.length() ; i++){
642 String indexDigest = hits.doc(i).get(DigestIndexer.FIELD_DIGEST);
643 @@ -650,7 +650,7 @@
644 if (digest != null) {
645 currentDigest = Base32.encode((byte[])digest);
646 }
647 - Query query = new TermQuery(new Term(DigestIndexer.FIELD_DIGEST,currentDigest));
648 + Query query = queryField(DigestIndexer.FIELD_DIGEST, currentDigest);
649 try {
650 Hits hits = index.search(query);
651 StringBuffer mirrors = new StringBuffer();
652 @@ -832,8 +832,8 @@
653 protected void doAnalysis(CrawlURI curi, Statistics currHostStats,
654 boolean isDuplicate) {
655 try{
656 - Query query = new TermQuery(new Term(
657 - DigestIndexer.FIELD_URL,curi.toString()));
658 + Query query = queryField(DigestIndexer.FIELD_URL,
659 + curi.toString());
660 Hits hits = index.search(query);
661 Document doc = null;
662 if(hits != null && hits.length() > 0){
663 @@ -946,6 +946,20 @@
664
665 }
666
667 + /** Build a simple Lucene query for a single term in a single field.
668 + *
669 + * This method does not load norms, so its memory usage is less than
670 + * O(n) where n is the total number of entries in the index.
671 + *
672 + * @param fieldName name of the field to look in.
673 + * @param value The value to query for
674 + * @return A Query for the given value in the given field.
675 + */
676 + protected Query queryField(String fieldName, String value) {
677 + return new ConstantScoreQuery(
678 + new SparseRangeFilter(fieldName, value, value, true, true));
679 + }
680 +
681 protected void finalTasks() {
682 try {
683 index.close();
Attached Files
To refer to attachments on a page, use attachment:filename, as shown below in the list of files. Do NOT use the URL of the [get] link, since this is subject to change and can break easily. You are not allowed to attach a file to this page.