Attachment 'deduplicator-0.3.0-20061218-patch-lucene-OutOfMemory.patch'

Download

   1 diff -Naur deduplicator-0.3.0-20061218/pom.xml deduplicator-0.3.0-20061218-patch-lucene-OutOfMemory/pom.xml
   2 --- deduplicator-0.3.0-20061218/pom.xml	2006-12-18 09:15:54.000000000 +0100
   3 +++ deduplicator-0.3.0-20061218-patch-lucene-OutOfMemory/pom.xml	2008-05-22 18:21:38.000000000 +0200
   4 @@ -64,6 +64,14 @@
   5            <locales>en</locales>
   6          </configuration>
   7        </plugin>
   8 +      <plugin>
   9 +        <groupId>org.apache.maven.plugins</groupId>
  10 +          <artifactId>maven-compiler-plugin</artifactId>
  11 +          <configuration>
  12 +          <source>1.5</source>
  13 +          <target>1.5</target>
  14 +          </configuration>
  15 +        </plugin>
  16      </plugins>
  17    </build>
  18    
  19 @@ -142,4 +150,4 @@
  20      </plugins>
  21    </reporting>
  22    
  23 -</project>
  24 \ No newline at end of file
  25 +</project>
  26 diff -Naur deduplicator-0.3.0-20061218/src/main/java/dk/netarkivet/common/utils/SparseBitSet.java deduplicator-0.3.0-20061218-patch-lucene-OutOfMemory/src/main/java/dk/netarkivet/common/utils/SparseBitSet.java
  27 --- deduplicator-0.3.0-20061218/src/main/java/dk/netarkivet/common/utils/SparseBitSet.java	1970-01-01 01:00:00.000000000 +0100
  28 +++ deduplicator-0.3.0-20061218-patch-lucene-OutOfMemory/src/main/java/dk/netarkivet/common/utils/SparseBitSet.java	2008-05-22 17:14:40.000000000 +0200
  29 @@ -0,0 +1,401 @@
  30 +/* $Id: SparseBitSet.java 379 2008-05-22 13:14:35Z kfc $
  31 + * $Revision: 379 $
  32 + * $Date: 2008-05-22 15:14:35 +0200 (Thu, 22 May 2008) $
  33 + * $Author: kfc $
  34 + *
  35 + * The Netarchive Suite - Software to harvest and preserve websites
  36 + * Copyright 2004-2007 Det Kongelige Bibliotek and Statsbiblioteket, Denmark
  37 + *
  38 + * This library is free software; you can redistribute it and/or
  39 + * modify it under the terms of the GNU Lesser General Public
  40 + * License as published by the Free Software Foundation; either
  41 + * version 2.1 of the License, or (at your option) any later version.
  42 + *
  43 + * This library is distributed in the hope that it will be useful,
  44 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
  45 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  46 + * Lesser General Public License for more details.
  47 + *
  48 + * You should have received a copy of the GNU Lesser General Public
  49 + * License along with this library; if not, write to the Free Software
  50 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
  51 + */
  52 +package dk.netarkivet.common.utils;
  53 +
  54 +import java.util.BitSet;
  55 +import java.util.HashSet;
  56 +import java.util.Set;
  57 +
  58 +/** A sparse implementation of a BitSet, that does not require memory linear
  59 + * to the largest index. This is done at the cost of performance, but should
  60 + * be fairly efficient on few set bits. */
  61 +public class SparseBitSet extends BitSet {
  62 +    /** A set of the indices of bits that are set in this BitSet. */
  63 +    private Set<Integer> setbits = new HashSet<Integer>();
  64 +
  65 +    /**
  66 +     * Initialise the bitset.
  67 +     */
  68 +    public SparseBitSet() {
  69 +        super(0);
  70 +    }
  71 +
  72 +    /**
  73 +     * @see BitSet#flip(int)
  74 +     */
  75 +    public void flip(int bitIndex) {
  76 +        if (bitIndex < 0) {
  77 +            throw new IndexOutOfBoundsException("bitIndex < 0: " + bitIndex);
  78 +        }
  79 +        if (setbits.contains(bitIndex)) {
  80 +            setbits.remove(bitIndex);
  81 +        } else {
  82 +            setbits.add(bitIndex);
  83 +        }
  84 +    }
  85 +
  86 +    /**
  87 +     * @see BitSet#flip(int, int)
  88 +     */
  89 +    public void flip(int fromIndex, int toIndex) {
  90 +        if (fromIndex < 0) {
  91 +            throw new IndexOutOfBoundsException("fromIndex < 0: " + fromIndex);
  92 +        }
  93 +        if (toIndex < 0) {
  94 +            throw new IndexOutOfBoundsException("toIndex < 0: " + toIndex);
  95 +        }
  96 +        if (fromIndex > toIndex) {
  97 +            throw new IndexOutOfBoundsException("fromIndex: " + fromIndex +
  98 +                                                    " > toIndex: " + toIndex);
  99 +        }
 100 +        for (int i = fromIndex; i < toIndex; i++) {
 101 +            flip(i);
 102 +        }
 103 +    }
 104 +
 105 +    /**
 106 +     * @see BitSet#set(int)
 107 +     */
 108 +    public void set(int bitIndex) {
 109 +        if (bitIndex < 0) {
 110 +            throw new IndexOutOfBoundsException("bitIndex < 0: " + bitIndex);
 111 +        }
 112 +        setbits.add(bitIndex);
 113 +    }
 114 +
 115 +    /**
 116 +     * @see BitSet#set(int, boolean)
 117 +     */
 118 +    public void set(int bitIndex, boolean value) {
 119 +        if (bitIndex < 0) {
 120 +            throw new IndexOutOfBoundsException("bitIndex < 0: " + bitIndex);
 121 +        }
 122 +        if (value) {
 123 +            setbits.add(bitIndex);
 124 +        } else {
 125 +            setbits.remove(bitIndex);
 126 +        }
 127 +    }
 128 +
 129 +    /**
 130 +     * @see BitSet#set(int, int)
 131 +     */
 132 +    public void set(int fromIndex, int toIndex) {
 133 +        if (fromIndex < 0) {
 134 +            throw new IndexOutOfBoundsException("fromIndex < 0: " + fromIndex);
 135 +        }
 136 +        if (toIndex < 0) {
 137 +            throw new IndexOutOfBoundsException("toIndex < 0: " + toIndex);
 138 +        }
 139 +        if (fromIndex > toIndex) {
 140 +            throw new IndexOutOfBoundsException("fromIndex: " + fromIndex +
 141 +                                                    " > toIndex: " + toIndex);
 142 +        }
 143 +        for (int i = fromIndex; i < toIndex; i++) {
 144 +            set(i);
 145 +        }
 146 +    }
 147 +
 148 +    /**
 149 +     * @see BitSet#set(int, int, boolean)
 150 +     */
 151 +    public void set(int fromIndex, int toIndex, boolean value) {
 152 +        if (fromIndex < 0) {
 153 +            throw new IndexOutOfBoundsException("fromIndex < 0: " + fromIndex);
 154 +        }
 155 +        if (toIndex < 0) {
 156 +            throw new IndexOutOfBoundsException("toIndex < 0: " + toIndex);
 157 +        }
 158 +        if (fromIndex > toIndex) {
 159 +            throw new IndexOutOfBoundsException("fromIndex: " + fromIndex +
 160 +                                                    " > toIndex: " + toIndex);
 161 +        }
 162 +        for (int i = fromIndex; i < toIndex; i++) {
 163 +            set(i, value);
 164 +        }
 165 +    }
 166 +
 167 +    /**
 168 +     * @see BitSet#clear(int)
 169 +     */
 170 +    public void clear(int bitIndex) {
 171 +        if (bitIndex < 0) {
 172 +            throw new IndexOutOfBoundsException("bitIndex < 0: " + bitIndex);
 173 +        }
 174 +        setbits.remove(bitIndex);
 175 +    }
 176 +
 177 +    /**
 178 +     * @see BitSet#clear(int, int)
 179 +     */
 180 +    public void clear(int fromIndex, int toIndex) {
 181 +        if (fromIndex < 0) {
 182 +            throw new IndexOutOfBoundsException("fromIndex < 0: " + fromIndex);
 183 +        }
 184 +        if (toIndex < 0) {
 185 +            throw new IndexOutOfBoundsException("toIndex < 0: " + toIndex);
 186 +        }
 187 +        if (fromIndex > toIndex) {
 188 +            throw new IndexOutOfBoundsException("fromIndex: " + fromIndex +
 189 +                                                    " > toIndex: " + toIndex);
 190 +        }
 191 +        for (int i = fromIndex; i < toIndex; i++) {
 192 +            clear(i);
 193 +        }
 194 +    }
 195 +
 196 +    /**
 197 +     * @see BitSet#clear()
 198 +     */
 199 +    public void clear() {
 200 +        setbits.clear();
 201 +    }
 202 +
 203 +    /**
 204 +     * @see BitSet#get(int)
 205 +     */
 206 +    public boolean get(int bitIndex) {
 207 +        if (bitIndex < 0) {
 208 +            throw new IndexOutOfBoundsException("bitIndex < 0: " + bitIndex);
 209 +        }
 210 +        return (setbits.contains(bitIndex));
 211 +    }
 212 +
 213 +    /**
 214 +     * @see BitSet#get(int, int)
 215 +     */
 216 +    public BitSet get(int fromIndex, int toIndex) {
 217 +        if (fromIndex < 0) {
 218 +            throw new IndexOutOfBoundsException("fromIndex < 0: " + fromIndex);
 219 +        }
 220 +        if (toIndex < 0) {
 221 +            throw new IndexOutOfBoundsException("toIndex < 0: " + toIndex);
 222 +        }
 223 +        if (fromIndex > toIndex) {
 224 +            throw new IndexOutOfBoundsException("fromIndex: " + fromIndex +
 225 +                                                    " > toIndex: " + toIndex);
 226 +        }
 227 +        SparseBitSet bitsubset = new SparseBitSet();
 228 +        for (int i : setbits) {
 229 +            if (i >= fromIndex && i < toIndex) {
 230 +                bitsubset.set(i - fromIndex);
 231 +            }
 232 +        }
 233 +        return bitsubset;
 234 +    }
 235 +
 236 +    /**
 237 +     * @see BitSet#nextSetBit(int)
 238 +     */
 239 +    public int nextSetBit(int fromIndex) {
 240 +        if (fromIndex < 0) {
 241 +            throw new IndexOutOfBoundsException("fromIndex < 0: " + fromIndex);
 242 +        }
 243 +        int index = -1;
 244 +        for (Integer i : setbits) {
 245 +            if ((i >= fromIndex) && ((index == -1) || (i < index))) {
 246 +                index = i;
 247 +            }
 248 +        }
 249 +        return index;
 250 +    }
 251 +
 252 +    /**
 253 +     * @see BitSet#nextClearBit(int)
 254 +     */
 255 +    public int nextClearBit(int fromIndex) {
 256 +        if (fromIndex < 0) {
 257 +            throw new IndexOutOfBoundsException("fromIndex < 0: " + fromIndex);
 258 +        }
 259 +        for (int i = fromIndex; i > 0; i++) {
 260 +            if (!get(i)) {
 261 +                return i;
 262 +            }
 263 +        }
 264 +        return Integer.MIN_VALUE;
 265 +    }
 266 +
 267 +    /**
 268 +     * @see BitSet#length()
 269 +     */
 270 +    public int length() {
 271 +        int index = -1;
 272 +        for (Integer i : setbits) {
 273 +            if (i > index) {
 274 +                index = i;
 275 +            }
 276 +        }
 277 +        return index + 1;
 278 +    }
 279 +
 280 +    /**
 281 +     * @see BitSet#isEmpty()
 282 +     */
 283 +    public boolean isEmpty() {
 284 +        return setbits.isEmpty();
 285 +    }
 286 +
 287 +    /**
 288 +     * @see BitSet#intersects(BitSet)
 289 +     */
 290 +    public boolean intersects(BitSet set) {
 291 +        for (Integer index : setbits) {
 292 +            if (set.get(index)) {
 293 +                return true;
 294 +            }
 295 +        }
 296 +        return false;
 297 +    }
 298 +
 299 +
 300 +    /**
 301 +     * @see BitSet#cardinality()
 302 +     */
 303 +    public int cardinality() {
 304 +        return setbits.size();
 305 +    }
 306 +
 307 +    /**
 308 +     * @see BitSet#and(BitSet)
 309 +     */
 310 +    public void and(BitSet set) {
 311 +        Set<Integer> andbits = new HashSet<Integer>();
 312 +        for (Integer index : setbits) {
 313 +            if (set.get(index)) {
 314 +                andbits.add(index);
 315 +            }
 316 +        }
 317 +        setbits = andbits;
 318 +    }
 319 +
 320 +    /**
 321 +     * @see BitSet#or(BitSet)
 322 +     */
 323 +    public void or(BitSet set) {
 324 +        Set<Integer> orbits = new HashSet<Integer>(setbits);
 325 +        for (int index = set.nextSetBit(0); index != -1;
 326 +             index = set.nextSetBit(index + 1)) {
 327 +            orbits.add(index);
 328 +        }
 329 +        setbits = orbits;
 330 +    }
 331 +
 332 +    /**
 333 +     * @see BitSet#xor(BitSet)
 334 +     */
 335 +    public void xor(BitSet set) {
 336 +        Set<Integer> xorbits = new HashSet<Integer>();
 337 +        for (Integer index : setbits) {
 338 +            if (!set.get(index)) {
 339 +                xorbits.add(index);
 340 +            }
 341 +        }
 342 +        for (int index = set.nextSetBit(0); index != -1;
 343 +             index = set.nextSetBit(index + 1)) {
 344 +            if (!setbits.contains(index)) {
 345 +                xorbits.add(index);
 346 +            }
 347 +        }
 348 +        setbits = xorbits;
 349 +    }
 350 +
 351 +    /**
 352 +     * @see BitSet#andNot(BitSet)
 353 +     */
 354 +    public void andNot(BitSet set) {
 355 +        Set<Integer> andnotbits = new HashSet<Integer>(setbits);
 356 +        for (Integer index : setbits) {
 357 +            if (set.get(index)) {
 358 +                andnotbits.remove(index);
 359 +            }
 360 +        }
 361 +        setbits = andnotbits;
 362 +    }
 363 +
 364 +    /**
 365 +     * A hash code for this bit set.
 366 +     * Note: The hash codes are not implemented to be compatible with
 367 +     * java.util.BitSet#hashCode().
 368 +     */
 369 +    public int hashCode() {
 370 +        return setbits.hashCode();
 371 +    }
 372 +
  373 +    /**
  374 +     * In contrast with {@link java.util.BitSet#size()} this does not return the
  375 +     * number of bits of space used to represent this set. Instead, it returns the
  376 +     * same as {@link #length()} for compatibility with
  377 +     * {@link java.util.BitSet}. The actual space used is a hashset of size
  378 +     * {@link #cardinality()}.
  379 +     */
  380 +    public int size() {
  381 +        return length();
  382 +    }
 383 +
  384 +    /**
  385 +     * Two SparseBitSets are considered equal if they contain the same bits.
  386 +     *
  387 +     * Note: Equality is not implemented to be compatible with
  388 +     * java.util.BitSet#equals(Object): a non-sparse BitSet is never equal to this.
  389 +     */
  390 +    public boolean equals(Object obj) {
  391 +        if (obj instanceof SparseBitSet) {
  392 +            return setbits.equals(((SparseBitSet) obj).setbits);
  393 +        } else if (obj instanceof BitSet) {
  394 +//        NOTE: Do not re-add this code, unless you reimplement hashcode
  395 +//              to be consistent with java.util.BitSet
  396 +//              Do not mix java.util.BitSet with SparseBitSet in containers
  397 +//              until this code is implemented.
  398 +//            BitSet bitset = (BitSet) obj;
  399 +//            if (cardinality() != bitset.cardinality()) {
  400 +//                return false;
  401 +//            }
  402 +//            for (int i : setbits) {
  403 +//                if (!bitset.get(i)) {
  404 +//                    return false;
  405 +//                }
  406 +//            }
  407 +//            return true;
  408 +            return false;
  409 +        } else {
  410 +            return false;
  411 +        }
  412 +    }
 413 +
 414 +    /**
 415 +     * @see BitSet#clone()
 416 +     */
 417 +    public Object clone() {
 418 +        super.clone();
 419 +        SparseBitSet newSparseBitSet = new SparseBitSet();
 420 +        newSparseBitSet.setbits = new HashSet<Integer>(setbits);
 421 +        return newSparseBitSet;
 422 +    }
 423 +
 424 +    /**
 425 +     * @see BitSet#toString()
 426 +     */
 427 +    public String toString() {
 428 +        return setbits.toString();
 429 +    }
 430 +}
 431 diff -Naur deduplicator-0.3.0-20061218/src/main/java/dk/netarkivet/common/utils/SparseRangeFilter.java deduplicator-0.3.0-20061218-patch-lucene-OutOfMemory/src/main/java/dk/netarkivet/common/utils/SparseRangeFilter.java
 432 --- deduplicator-0.3.0-20061218/src/main/java/dk/netarkivet/common/utils/SparseRangeFilter.java	1970-01-01 01:00:00.000000000 +0100
 433 +++ deduplicator-0.3.0-20061218-patch-lucene-OutOfMemory/src/main/java/dk/netarkivet/common/utils/SparseRangeFilter.java	2008-05-22 17:14:40.000000000 +0200
 434 @@ -0,0 +1,134 @@
 435 +/* $Id: SparseRangeFilter.java 379 2008-05-22 13:14:35Z kfc $
 436 + * $Revision: 379 $
 437 + * $Date: 2008-05-22 15:14:35 +0200 (Thu, 22 May 2008) $
 438 + * $Author: kfc $
 439 + *
 440 + * The Netarchive Suite - Software to harvest and preserve websites
 441 + * Copyright 2004-2007 Det Kongelige Bibliotek and Statsbiblioteket, Denmark
 442 + *
 443 + * This library is free software; you can redistribute it and/or
 444 + * modify it under the terms of the GNU Lesser General Public
 445 + * License as published by the Free Software Foundation; either
 446 + * version 2.1 of the License, or (at your option) any later version.
 447 + *
 448 + * This library is distributed in the hope that it will be useful,
 449 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
 450 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 451 + * Lesser General Public License for more details.
 452 + *
 453 + * You should have received a copy of the GNU Lesser General Public
 454 + * License along with this library; if not, write to the Free Software
 455 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 456 + */
 457 +/**
 458 + * Copyright 2004 The Apache Software Foundation
 459 + *
 460 + * Licensed under the Apache License, Version 2.0 (the "License");
 461 + * you may not use this file except in compliance with the License.
 462 + * You may obtain a copy of the License at
 463 + *
 464 + *     http://www.apache.org/licenses/LICENSE-2.0
 465 + *
 466 + * Unless required by applicable law or agreed to in writing, software
 467 + * distributed under the License is distributed on an "AS IS" BASIS,
 468 + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 469 + * See the License for the specific language governing permissions and
 470 + * limitations under the License.
 471 + */
 472 +package dk.netarkivet.common.utils;
 473 +
 474 +import java.io.IOException;
 475 +
 476 +import org.apache.lucene.index.IndexReader;
 477 +import org.apache.lucene.index.Term;
 478 +import org.apache.lucene.index.TermDocs;
 479 +import org.apache.lucene.index.TermEnum;
 480 +import org.apache.lucene.search.RangeFilter;
 481 +
 482 +/**
 483 + * Identical subclass of {@link RangeFilter}, except the bitset returned by
 484 + * {@link #bits(IndexReader)} is sparse.
 485 + */
 486 +public class SparseRangeFilter extends RangeFilter {
 487 +    private String fieldName;
 488 +    private String lowerTerm;
 489 +    private String upperTerm;
 490 +    private boolean includeLower;
 491 +    private boolean includeUpper;
 492 +
 493 +    /**
 494 +     * @see RangeFilter#RangeFilter(String, String, String, boolean, boolean)
 495 +     */
 496 +    public SparseRangeFilter(String fieldName, String lowerTerm,
 497 +                             String upperTerm, boolean includeLower,
 498 +                             boolean includeUpper) {
 499 +        super(fieldName, lowerTerm, upperTerm, includeLower, includeUpper);
 500 +        this.fieldName = fieldName;
 501 +        this.lowerTerm = lowerTerm;
 502 +        this.upperTerm = upperTerm;
 503 +        this.includeLower = includeLower;
 504 +        this.includeUpper = includeUpper;
 505 +    }
 506 +
 507 +    /**
 508 +     * Identical to {@link RangeFilter#bits(IndexReader)}, except a SparseBitSet
 509 +     * is returned.
 510 +     * @see RangeFilter#bits(IndexReader)
 511 +     */
 512 +    public SparseBitSet bits(IndexReader reader) throws IOException {
 513 +        SparseBitSet bits = new SparseBitSet();
 514 +        TermEnum enumerator =
 515 +            (null != lowerTerm
 516 +             ? reader.terms(new Term(fieldName, lowerTerm))
 517 +             : reader.terms(new Term(fieldName,"")));
 518 +
 519 +        try {
 520 +
 521 +            if (enumerator.term() == null) {
 522 +                return bits;
 523 +            }
 524 +
 525 +            boolean checkLower = false;
 526 +            if (!includeLower) // make adjustments to set to exclusive
 527 +                checkLower = true;
 528 +
 529 +            TermDocs termDocs = reader.termDocs();
 530 +            try {
 531 +
 532 +                do {
 533 +                    Term term = enumerator.term();
 534 +                    if (term != null && term.field().equals(fieldName)) {
 535 +                        if (!checkLower || null==lowerTerm || term.text().compareTo(lowerTerm) > 0) {
 536 +                            checkLower = false;
 537 +                            if (upperTerm != null) {
 538 +                                int compare = upperTerm.compareTo(term.text());
 539 +                                /* if beyond the upper term, or is exclusive and
 540 +                                 * this is equal to the upper term, break out */
 541 +                                if ((compare < 0) ||
 542 +                                    (!includeUpper && compare==0)) {
 543 +                                    break;
 544 +                                }
 545 +                            }
 546 +                            /* we have a good term, find the docs */
 547 +
 548 +                            termDocs.seek(enumerator.term());
 549 +                            while (termDocs.next()) {
 550 +                                bits.set(termDocs.doc());
 551 +                            }
 552 +                        }
 553 +                    } else {
 554 +                        break;
 555 +                    }
 556 +                }
 557 +                while (enumerator.next());
 558 +
 559 +            } finally {
 560 +                termDocs.close();
 561 +            }
 562 +        } finally {
 563 +            enumerator.close();
 564 +        }
 565 +
 566 +        return bits;
 567 +    }
 568 +}
 569 diff -Naur deduplicator-0.3.0-20061218/src/main/java/is/hi/bok/deduplicator/DeDupFetchHTTP.java deduplicator-0.3.0-20061218-patch-lucene-OutOfMemory/src/main/java/is/hi/bok/deduplicator/DeDupFetchHTTP.java
 570 --- deduplicator-0.3.0-20061218/src/main/java/is/hi/bok/deduplicator/DeDupFetchHTTP.java	2006-08-09 10:12:50.000000000 +0200
 571 +++ deduplicator-0.3.0-20061218-patch-lucene-OutOfMemory/src/main/java/is/hi/bok/deduplicator/DeDupFetchHTTP.java	2008-05-22 18:24:32.000000000 +0200
 572 @@ -27,6 +27,7 @@
 573  import java.util.logging.Level;
 574  import java.util.logging.Logger;
 575  
 576 +import dk.netarkivet.common.utils.SparseRangeFilter;
 577  import org.apache.commons.httpclient.HttpConnection;
 578  import org.apache.commons.httpclient.HttpMethod;
 579  import org.apache.lucene.document.Document;
 580 @@ -34,7 +35,7 @@
 581  import org.apache.lucene.search.Hits;
 582  import org.apache.lucene.search.IndexSearcher;
 583  import org.apache.lucene.search.Query;
 584 -import org.apache.lucene.search.TermQuery;
 585 +import org.apache.lucene.search.ConstantScoreQuery;
 586  import org.archive.crawler.datamodel.CrawlURI;
 587  import org.archive.crawler.fetcher.FetchHTTP;
 588  import org.archive.crawler.frontier.AdaptiveRevisitAttributeConstants;
 589 @@ -310,8 +311,9 @@
 590       */
 591      protected Document lookup(CrawlURI curi) {
 592          try{
 593 -            Query query = new TermQuery(new Term(
 594 -                    DigestIndexer.FIELD_URL,curi.toString()));
 595 +            Query query = new ConstantScoreQuery(new SparseRangeFilter(
 596 +                    DigestIndexer.FIELD_URL,curi.toString(),curi.toString(),
 597 +                    true,true));
 598              Hits hits = index.search(query);
 599              Document doc = null;
 600              if(hits != null && hits.length() > 0){
 601 diff -Naur deduplicator-0.3.0-20061218/src/main/java/is/hi/bok/deduplicator/DeDuplicator.java deduplicator-0.3.0-20061218-patch-lucene-OutOfMemory/src/main/java/is/hi/bok/deduplicator/DeDuplicator.java
 602 --- deduplicator-0.3.0-20061218/src/main/java/is/hi/bok/deduplicator/DeDuplicator.java	2006-08-22 12:26:54.000000000 +0200
 603 +++ deduplicator-0.3.0-20061218-patch-lucene-OutOfMemory/src/main/java/is/hi/bok/deduplicator/DeDuplicator.java	2008-05-22 18:22:06.000000000 +0200
 604 @@ -32,13 +32,14 @@
 605  import java.util.logging.Level;
 606  import java.util.logging.Logger;
 607  
 608 +import dk.netarkivet.common.utils.SparseRangeFilter;
 609  import org.apache.commons.httpclient.HttpMethod;
 610  import org.apache.lucene.document.Document;
 611  import org.apache.lucene.index.Term;
 612  import org.apache.lucene.search.Hits;
 613  import org.apache.lucene.search.IndexSearcher;
 614  import org.apache.lucene.search.Query;
 615 -import org.apache.lucene.search.TermQuery;
 616 +import org.apache.lucene.search.ConstantScoreQuery;
 617  import org.archive.crawler.datamodel.CoreAttributeConstants;
 618  import org.archive.crawler.datamodel.CrawlOrder;
 619  import org.archive.crawler.datamodel.CrawlURI;
 620 @@ -570,8 +571,8 @@
 621      protected Document lookupByURL(CrawlURI curi, Statistics currHostStats){
 622          // Look the CrawlURI's URL up in the index.
 623          try {
 624 -            Query query = new TermQuery(
 625 -                    new Term(DigestIndexer.FIELD_URL,curi.toString()));
 626 +            Query query = queryField(DigestIndexer.FIELD_URL,
 627 +                curi.toString());
 628              Hits hits = index.search(query);
 629              Document doc = null;
 630              String currentDigest = getDigestAsString(curi);
 631 @@ -601,9 +602,8 @@
 632              if(equivalent) {
 633                  // No exact hits. Let's try lenient matching.
 634                  String normalizedURL = DigestIndexer.stripURL(curi.toString());
 635 -                query = new TermQuery(new Term(
 636 -                            DigestIndexer.FIELD_URL_NORMALIZED,
 637 -                            normalizedURL));
 638 +                query = queryField(DigestIndexer.FIELD_URL_NORMALIZED,
 639 +                                normalizedURL);
 640                  hits = index.search(query);
 641                  for(int i=0 ; i<hits.length() ; i++){
 642                      String indexDigest = hits.doc(i).get(DigestIndexer.FIELD_DIGEST);
 643 @@ -650,7 +650,7 @@
 644          if (digest != null) {
 645              currentDigest = Base32.encode((byte[])digest);
 646          }
 647 -        Query query = new TermQuery(new Term(DigestIndexer.FIELD_DIGEST,currentDigest));
 648 +        Query query = queryField(DigestIndexer.FIELD_DIGEST, currentDigest);
 649          try {
 650              Hits hits = index.search(query);
 651              StringBuffer mirrors = new StringBuffer();
 652 @@ -832,8 +832,8 @@
 653  	protected void doAnalysis(CrawlURI curi, Statistics currHostStats,
 654              boolean isDuplicate) {
 655  		try{
 656 -    		Query query = new TermQuery(new Term(
 657 -                    DigestIndexer.FIELD_URL,curi.toString()));
 658 +    		Query query = queryField(DigestIndexer.FIELD_URL,
 659 +    		                         curi.toString());
 660      		Hits hits = index.search(query);
 661      		Document doc = null;
 662      		if(hits != null && hits.length() > 0){
 663 @@ -946,6 +946,20 @@
 664          
 665  	}
 666  
  667 +    /** Build a simple Lucene query for a single term in a single field.
  668 +     *
  669 +     * This method does not load norms, so its memory usage is less than
  670 +     * O(n) where n is the total number of entries in the index.
  671 +     *
  672 +     * @param fieldName name of the field to look in.
  673 +     * @param value The value to query for
  674 +     * @return A Query for the given value in the given field.
  675 +     */
  676 +    protected Query queryField(String fieldName, String value) {
  677 +        return new ConstantScoreQuery(
  678 +                new SparseRangeFilter(fieldName, value, value, true, true));
  679 +    }
 680 +
 681  	protected void finalTasks() {
 682  		try {
 683  			index.close();

Attached Files

To refer to attachments on a page, use attachment:filename, as shown below in the list of files. Do NOT use the URL of the [get] link, since this is subject to change and can break easily.
  • [get | view] (2008-05-26 11:44:45, 2621.4 KB) [[attachment:deduplicator-0.3.0-20061218-patch-heritrix-1.12.1b.patch]]
  • [get | view] (2008-05-26 11:44:23, 0.7 KB) [[attachment:deduplicator-0.3.0-20061218-patch-index-NPE.patch]]
  • [get | view] (2008-05-26 11:44:30, 2.5 KB) [[attachment:deduplicator-0.3.0-20061218-patch-local-dateformat.patch]]
  • [get | view] (2008-05-27 08:54:28, 235.4 KB) [[attachment:deduplicator-0.3.0-20061218-patch-lucene-OutOfMemory-2.patch]]
  • [get | view] (2008-05-26 11:44:14, 24.0 KB) [[attachment:deduplicator-0.3.0-20061218-patch-lucene-OutOfMemory.patch]]
  • [get | view] (2008-05-26 11:44:38, 2648.3 KB) [[attachment:deduplicator-0.3.0-20061218-patched-20080522-cumulative.patch]]
  • [get | view] (2008-05-26 11:45:11, 0.9 KB) [[attachment:deduplicator-0.3.0-20061218-patched-20080522.patch]]
  • [get | view] (2008-05-27 08:54:35, 2859.6 KB) [[attachment:deduplicator-0.3.0-20061218-patched-20080527-cumulative.patch]]
  • [get | view] (2008-05-27 08:55:25, 0.9 KB) [[attachment:deduplicator-0.3.0-20061218-patched-20080527.patch]]
  • [get | view] (2008-05-26 11:43:51, 1929.8 KB) [[attachment:deduplicator-0.3.0-20061218-src.zip]]
 All files | Selected Files: delete move to page copy to page

You are not allowed to attach a file to this page.