Attachment 'default_orderxml.xml'
Download 1 <?xml version="1.0" encoding="UTF-8"?>
2 <crawl-order xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:noNamespaceSchemaLocation="heritrix_settings.xsd">
3 <meta> <!-- Descriptive metadata for this crawl order: name, description and operator are set; organization and audience are left empty. -->
4 <name>default_orderxml</name>
5 <description>Default Profile</description>
6 <operator>Admin</operator>
7 <organization/>
8 <audience></audience>
9 <date>20080118111217</date> <!-- 14 digits; presumably a yyyyMMddHHmmss timestamp (2008-01-18 11:12:17). Confirm the expected format before editing. -->
10 </meta>
11 <controller>
12 <string name="settings-directory">settings</string> <!-- Directory and path settings follow; every value is a bare relative name and disk-path is left empty. -->
13 <string name="disk-path"/>
14 <string name="logs-path">logs</string>
15 <string name="checkpoints-path">checkpoints</string>
16 <string name="state-path">state</string>
17 <string name="scratch-path">scratch</string>
18 <long name="max-bytes-download">0</long> <!-- NOTE(review): the three max-* limits below are all 0; presumably 0 means "no limit". Confirm against the Heritrix documentation. -->
19 <long name="max-document-download">0</long>
20 <long name="max-time-sec">0</long>
21 <integer name="max-toe-threads">50</integer>
22 <integer name="recorder-out-buffer-bytes">4096</integer>
23 <integer name="recorder-in-buffer-bytes">65536</integer>
24 <integer name="bdb-cache-percent">0</integer> <!-- NOTE(review): presumably 0 leaves the BDB cache size at its default. Confirm. -->
25 <!-- DecidingScope migrated from DomainScope -->
26 <newObject name="scope" class="org.archive.crawler.deciderules.DecidingScope">
27 <boolean name="enabled">true</boolean>
28 <string name="seedsfile">seeds.txt</string>
29 <boolean name="reread-seeds-on-config">true</boolean>
30 <!-- DecideRuleSequence: multiple DecideRules are applied in order; the last non-PASS result is the final decision. -->
31 <newObject name="decide-rules" class="org.archive.crawler.deciderules.DecideRuleSequence">
32
33 <map name="rules">
34 <newObject name="rejectByDefault" class="org.archive.crawler.deciderules.RejectDecideRule"/> <!-- First entry in the rules map. Rules are applied in order, so this presumably makes reject the outcome unless a later rule decides otherwise. -->
35 <newObject name="acceptURIFromSeedDomains" class="dk.netarkivet.harvester.harvesting.OnNSDomainsDecideRule"> <!-- Rule implemented by a dk.netarkivet class rather than an org.archive one. Decision is ACCEPT and seeds are used as SURT prefixes; no SURT source or dump file is configured. Presumably accepts URIs in the seeds' domains; confirm against the class. -->
36 <string name="decision">ACCEPT</string>
37 <string name="surts-source-file"></string>
38 <boolean name="seeds-as-surt-prefixes">true</boolean>
39 <string name="surts-dump-file"/>
40 <boolean name="also-check-via">false</boolean>
41 <boolean name="rebuild-on-reconfig">true</boolean>
42 </newObject>
43
44 <newObject name="rejectIfTooManyHops" class="org.archive.crawler.deciderules.TooManyHopsDecideRule"> <!-- Hop-count limit rule with max-hops set to 25. -->
45 <integer name="max-hops">25</integer>
46 </newObject>
47 <newObject name="rejectIfPathological" class="org.archive.crawler.deciderules.PathologicalPathDecideRule"> <!-- Pathological-path rule with max-repetitions set to 3; presumably rejects URIs whose path repeats a segment more often than that. Confirm. -->
48 <integer name="max-repetitions">3</integer>
49 </newObject>
50 <newObject name="acceptIfTranscluded" class="org.archive.crawler.deciderules.TransclusionDecideRule"> <!-- Transclusion rule: up to 25 trans hops and 1 speculative hop. Presumably accepts embedded resources reached from in-scope pages; confirm. -->
51 <integer name="max-trans-hops">25</integer>
52 <integer name="max-speculative-hops">1</integer>
53 </newObject>
54 <newObject name="pathdepthfilter" class="org.archive.crawler.deciderules.TooManyPathSegmentsDecideRule"> <!-- Path-segment limit rule with max-path-depth set to 20. -->
55 <integer name="max-path-depth">20</integer>
56 </newObject>
57
58 <newObject name="global_crawlertraps" class="org.archive.crawler.deciderules.MatchesListRegExpDecideRule"> <!-- Crawler-trap filter: decision REJECT with list-logic OR, so presumably a URI matching any one pattern below is rejected (confirm against the Heritrix docs). Patterns target calendars, shopping carts, login loops and repeated path segments. Literal ampersands are written as &amp; to keep the file well-formed; the parsed regex text is unchanged. -->
59 <string name="decision">REJECT</string>
60 <string name="list-logic">OR</string>
61 <stringList name="regexp-list">
62 <string>.*core\.UserAdmin.*core\.UserLogin.*</string>
63 <string>.*core\.UserAdmin.*register\.UserSelfRegistration.*</string>
64 <string>.*\/w\/index\.php\?title=Speci[ae]l:Recentchanges.*</string>
65 <string>.*act=calendar&amp;cal_id=.*</string>
66 <string>.*advCalendar_pi.*</string>
67 <string>.*cal\.asp\?date=.*</string>
68 <string>.*cal\.asp\?view=monthly&amp;date=.*</string>
69 <string>.*cal\.asp\?view=weekly&amp;date=.*</string>
70 <string>.*cal\.asp\?view=yearly&amp;date=.*</string>
71 <string>.*cal\.asp\?view=yearly&amp;year=.*</string>
72 <string>.*cal\/cal_day\.php\?op=day&amp;date=.*</string>
73 <string>.*cal\/cal_week\.php\?op=week&amp;date=.*</string>
74 <string>.*cal\/calendar\.php\?op=cal&amp;month=.*</string>
75 <string>.*cal\/yearcal\.php\?op=yearcal&amp;ycyear=.*</string>
76 <string>.*calendar\.asp\?calmonth=.*</string>
77 <string>.*calendar\.asp\?qMonth=.*</string>
78 <string>.*calendar\.php\?sid=.*</string>
79 <string>.*calendar\.php\?start=.*</string>
80 <string>.*calendar\.php\?Y=.*</string>
81 <string>.*calendar\/\?CLmDemo_horizontal=.*</string>
82 <string>.*calendar_menu\/calendar\.php\?.*</string>
83 <string>.*calendar_scheduler\.php\?d=.*</string>
84 <string>.*calendar_year\.asp\?qYear=.*</string>
85 <string>.*calendarix\/calendar\.php\?op=.*</string>
86 <string>.*calendarix\/yearcal\.php\?op=.*</string>
87 <string>.*calender\/default\.asp\?month=.*</string>
88 <string>.*Default\.asp\?month=.*</string>
89 <string>.*events\.asp\?cat=0&amp;mDate=.*</string>
90 <string>.*events\.asp\?cat=1&amp;mDate=.*</string>
91 <string>.*events\.asp\?MONTH=.*</string>
92 <string>.*events\.asp\?month=.*</string>
93 <string>.*index\.php\?iDate=.*</string>
94 <string>.*index\.php\?module=PostCalendar&amp;func=view.*</string>
95 <string>.*index\.php\?option=com_events&amp;task=view.*</string>
96 <string>.*index\.php\?option=com_events&amp;task=view_day&amp;year=.*</string>
97 <string>.*index\.php\?option=com_events&amp;task=view_detail&amp;year=.*</string>
98 <string>.*index\.php\?option=com_events&amp;task=view_month&amp;year=.*</string>
99 <string>.*index\.php\?option=com_events&amp;task=view_week&amp;year=.*</string>
100 <string>.*index\.php\?option=com_events&amp;task=view_year&amp;year=.*</string>
101 <string>.*index\.php\?option=com_extcalendar&amp;Itemid.*</string>
102 <string>.*modules\.php\?name=Calendar&amp;op=modload&amp;file=index.*</string>
103 <string>.*modules\.php\?name=vwar&amp;file=calendar&amp;action=list&amp;month=.*</string>
104 <string>.*modules\.php\?name=vwar&amp;file=calendar.*</string>
105 <string>.*modules\.php\?name=vWar&amp;mod=calendar.*</string>
106 <string>.*modules\/piCal\/index\.php\?caldate=.*</string>
107 <string>.*modules\/piCal\/index\.php\?cid=.*</string>
108 <string>.*option,com_events\/task,view_day\/year.*</string>
109 <string>.*option,com_events\/task,view_month\/year.*</string>
110 <string>.*option,com_extcalendar\/Itemid.*</string>
111 <string>.*task,view_month\/year.*</string>
112 <string>.*shopping_cart\.php.*</string>
113 <string>.*action.add_product.*</string>
114 <string>.*action.remove_product.*</string>
115 <string>.*action.buy_now.*</string>
116 <string>.*checkout_payment\.php.*</string>
117 <string>.*login.*login.*login.*login.*</string>
118 <string>.*homepage_calendar\.asp.*</string>
119 <string>.*MediaWiki.*Movearticle.*</string>
120 <string>.*index\.php.*action=edit.*</string>
121 <string>.*comcast\.net.*othastar.*</string>
122 <string>.*Login.*Login.*Login.*</string>
123 <string>.*redir.*redir.*redir.*</string>
124 <string>.*bookingsystemtime\.asp\?dato=.*</string>
125 <string>.*bookingsystem\.asp\?date=.*</string>
126 <string>.*cart\.asp\?mode=add.*</string>
127 <string>.*\/photo.*\/photo.*\/photo.*</string>
128 <string>.*\/skins.*\/skins.*\/skins.*</string>
129 <string>.*\/scripts.*\/scripts.*\/scripts.*</string>
130 <string>.*\/styles.*\/styles.*\/styles.*</string>
131 <string>.*\/coppermine\/login\.php\?referer=.*</string>
132 <string>.*\/images.*\/images.*\/images.*</string>
133 <string>.*\/stories.*\/stories.*\/stories.*</string>
134 </stringList>
135 </newObject>
136
137 </map> <!-- end rules -->
138 </newObject> <!-- end decide-rules -->
139 </newObject> <!-- End DecidingScope -->
140 <map name="http-headers"> <!-- Values for the user-agent and from entries. NOTE(review): both use placeholder contact details (my_website.com); presumably they must be replaced with the operator's real info page and e-mail before crawling. -->
141 <string name="user-agent">Mozilla/5.0 (compatible; heritrix/1.12.1 +http://my_website.com/my_infopage.html)</string>
142 <string name="from">my_email@my_website.com</string>
143 </map>
144 <newObject name="robots-honoring-policy" class="org.archive.crawler.datamodel.RobotsHonoringPolicy"> <!-- Robots policy: type is "ignore" and masquerade is off; custom-robots and the user-agents list are empty. NOTE(review): presumably "ignore" means robots.txt directives are not obeyed. Confirm this is intended. -->
145 <string name="type">ignore</string>
146 <boolean name="masquerade">false</boolean>
147 <text name="custom-robots"/>
148 <stringList name="user-agents">
149 </stringList>
150 </newObject>
151 <newObject name="frontier" class="org.archive.crawler.frontier.BdbFrontier"> <!-- Frontier configuration: delay and retry settings, bandwidth caps, queue assignment, and per-queue budgeting. -->
152 <float name="delay-factor">1.0</float>
153 <integer name="max-delay-ms">1000</integer>
154 <integer name="min-delay-ms">300</integer>
155 <integer name="max-retries">3</integer>
156 <long name="retry-delay-seconds">300</long>
157 <integer name="preference-embed-hops">1</integer>
158 <integer name="total-bandwidth-usage-KB-sec">1500</integer>
159 <integer name="max-per-host-bandwidth-usage-KB-sec">500</integer>
160
161 <string name="queue-assignment-policy">dk.netarkivet.harvester.harvesting.DomainnameQueueAssignmentPolicy</string> <!-- Policy class from the dk.netarkivet package; presumably assigns URIs to queues by domain name. Confirm against the class. -->
162
163 <string name="force-queue-assignment"/>
164 <boolean name="pause-at-start">false</boolean>
165 <boolean name="pause-at-finish">false</boolean>
166 <boolean name="source-tag-seeds">false</boolean>
167 <boolean name="recovery-log-enabled">false</boolean>
168 <boolean name="hold-queues">true</boolean>
169 <integer name="balance-replenish-amount">3000</integer>
170 <integer name="error-penalty-amount">100</integer>
171 <long name="queue-total-budget">-1</long> <!-- NOTE(review): presumably -1 means no per-queue budget limit. Confirm. -->
172 <string name="cost-policy">org.archive.crawler.frontier.UnitCostAssignmentPolicy</string>
173 <long name="snooze-deactivate-ms">300000</long>
174 <integer name="target-ready-backlog">50</integer>
175 <string name="uri-included-structure">org.archive.crawler.util.BdbUriUniqFilter</string>
176 </newObject>
177
178 <map name="uri-canonicalization-rules"> <!-- Five canonicalization rules. All are enabled except WWW (StripWWWRule), which is disabled. -->
179 <newObject name="Lowercase" class="org.archive.crawler.url.canonicalize.LowercaseRule">
180 <boolean name="enabled">true</boolean>
181 </newObject>
182 <newObject name="Userinfo" class="org.archive.crawler.url.canonicalize.StripUserinfoRule">
183 <boolean name="enabled">true</boolean>
184 </newObject>
185 <newObject name="WWW" class="org.archive.crawler.url.canonicalize.StripWWWRule">
186 <boolean name="enabled">false</boolean>
187 </newObject>
188 <newObject name="SessionIDs" class="org.archive.crawler.url.canonicalize.StripSessionIDs">
189 <boolean name="enabled">true</boolean>
190 </newObject>
191 <newObject name="QueryStrPrefix" class="org.archive.crawler.url.canonicalize.FixupQueryStr">
192 <boolean name="enabled">true</boolean>
193 </newObject>
194 </map>
195 <!-- Heritrix pre-fetch processors -->
196 <map name="pre-fetch-processors">
197
198 <newObject name="QuotaEnforcer" class="org.archive.crawler.prefetch.QuotaEnforcer"> <!-- Enabled quota enforcer with an empty decide-rules map. All twelve server/host/group quotas are -1; presumably -1 means "no quota" and real limits are expected to be overridden per job. Confirm. -->
199 <boolean name="force-retire">false</boolean>
200 <boolean name="enabled">true</boolean>
201 <newObject name="QuotaEnforcer#decide-rules" class="org.archive.crawler.deciderules.DecideRuleSequence">
202 <map name="rules">
203 </map>
204 </newObject>
205 <long name="server-max-fetch-successes">-1</long>
206 <long name="server-max-success-kb">-1</long>
207 <long name="server-max-fetch-responses">-1</long>
208 <long name="server-max-all-kb">-1</long>
209
210 <long name="host-max-fetch-successes">-1</long>
211 <long name="host-max-success-kb">-1</long>
212 <long name="host-max-fetch-responses">-1</long>
213 <long name="host-max-all-kb">-1</long>
214
215 <long name="group-max-fetch-successes">-1</long>
216 <long name="group-max-success-kb">-1</long>
217 <long name="group-max-fetch-responses">-1</long>
218 <long name="group-max-all-kb">-1</long>
219
220 </newObject>
221
222 <newObject name="Preselector" class="org.archive.crawler.prefetch.Preselector"> <!-- Enabled preselector with an empty decide-rules map. recheck-scope is on, block-all is off, and both regexp filters are empty. -->
223 <boolean name="enabled">true</boolean>
224 <newObject name="Preselector#decide-rules" class="org.archive.crawler.deciderules.DecideRuleSequence">
225 <map name="rules">
226 </map>
227 </newObject>
228 <boolean name="override-logger">false</boolean>
229 <boolean name="recheck-scope">true</boolean>
230 <boolean name="block-all">false</boolean>
231 <string name="block-by-regexp"/>
232 <string name="allow-by-regexp"/>
233 </newObject>
234 <newObject name="Preprocessor" class="org.archive.crawler.prefetch.PreconditionEnforcer"> <!-- Enabled precondition enforcer with an empty decide-rules map. Validity durations: 21600 s (6 hours) for IP lookups and 86400 s (24 hours) for robots information. -->
235 <boolean name="enabled">true</boolean>
236 <newObject name="Preprocessor#decide-rules" class="org.archive.crawler.deciderules.DecideRuleSequence">
237 <map name="rules">
238 </map>
239 </newObject>
240 <integer name="ip-validity-duration-seconds">21600</integer>
241 <integer name="robot-validity-duration-seconds">86400</integer>
242 <boolean name="calculate-robots-only">false</boolean>
243 </newObject>
244 </map> <!--End of Heritrix pre-fetch processors -->
245 <!-- Heritrix fetch processors -->
246 <map name="fetch-processors">
247 <newObject name="DNS" class="org.archive.crawler.fetcher.FetchDNS"> <!-- Enabled DNS fetcher with an empty decide-rules map. Content digesting is on, using sha1; accept-non-dns-resolves is off. -->
248 <boolean name="enabled">true</boolean>
249 <newObject name="DNS#decide-rules" class="org.archive.crawler.deciderules.DecideRuleSequence">
250 <map name="rules">
251 </map>
252 </newObject>
253 <boolean name="accept-non-dns-resolves">false</boolean>
254 <boolean name="digest-content">true</boolean>
255 <string name="digest-algorithm">sha1</string>
256
257 </newObject>
258 <newObject name="HTTP" class="org.archive.crawler.fetcher.FetchHTTP"> <!-- Enabled HTTP fetcher. Both the decide-rules and midfetch-decide-rules maps are empty; no proxy, cookie file or bind address is configured. -->
259 <boolean name="enabled">true</boolean>
260 <newObject name="HTTP#decide-rules" class="org.archive.crawler.deciderules.DecideRuleSequence">
261 <map name="rules">
262 </map>
263 </newObject>
264 <newObject name="midfetch-decide-rules" class="org.archive.crawler.deciderules.DecideRuleSequence">
265 <map name="rules">
266 </map>
267 </newObject>
268 <integer name="timeout-seconds">1200</integer> <!-- 1200 s is 20 minutes; the socket timeout on the next line is 20000 ms (20 s). -->
269 <integer name="sotimeout-ms">20000</integer>
270 <integer name="fetch-bandwidth">0</integer> <!-- NOTE(review): fetch-bandwidth and max-length-bytes are 0; presumably 0 means unlimited. Confirm. -->
271 <long name="max-length-bytes">0</long>
272 <boolean name="ignore-cookies">false</boolean>
273 <boolean name="use-bdb-for-cookies">true</boolean>
274 <string name="load-cookies-from-file"/>
275 <string name="save-cookies-to-file"/>
276 <string name="trust-level">open</string>
277 <stringList name="accept-headers">
278 </stringList>
279 <string name="http-proxy-host"/>
280 <string name="http-proxy-port"/>
281 <string name="default-encoding">ISO-8859-1</string>
282 <boolean name="digest-content">true</boolean>
283 <string name="digest-algorithm">sha1</string>
284 <boolean name="send-if-modified-since">true</boolean>
285 <boolean name="send-if-none-match">true</boolean>
286 <boolean name="send-connection-close">true</boolean>
287 <boolean name="send-referer">true</boolean>
288 <boolean name="send-range">false</boolean>
289 <string name="bind-address"/>
290 </newObject>
291 </map> <!-- end of Heritrix Fetch processors -->
292
293 <!-- Heritrix extract processors -->
294 <map name="extract-processors"> <!-- Five extractors (HTTP, HTML, CSS, JS, SWF), each enabled with an empty decide-rules map. Only ExtractorHTML has extra options, and all five of them are set to true. -->
295 <newObject name="ExtractorHTTP" class="org.archive.crawler.extractor.ExtractorHTTP">
296 <boolean name="enabled">true</boolean>
297 <newObject name="ExtractorHTTP#decide-rules" class="org.archive.crawler.deciderules.DecideRuleSequence">
298 <map name="rules">
299 </map>
300 </newObject>
301 </newObject>
302 <newObject name="ExtractorHTML" class="org.archive.crawler.extractor.ExtractorHTML">
303 <boolean name="enabled">true</boolean>
304 <newObject name="ExtractorHTML#decide-rules" class="org.archive.crawler.deciderules.DecideRuleSequence">
305 <map name="rules">
306 </map>
307 </newObject>
308 <boolean name="extract-javascript">true</boolean>
309 <boolean name="treat-frames-as-embed-links">true</boolean>
310 <boolean name="ignore-form-action-urls">true</boolean>
311 <boolean name="overly-eager-link-detection">true</boolean>
312 <boolean name="ignore-unexpected-html">true</boolean>
313 </newObject>
314 <newObject name="ExtractorCSS" class="org.archive.crawler.extractor.ExtractorCSS">
315 <boolean name="enabled">true</boolean>
316 <newObject name="ExtractorCSS#decide-rules" class="org.archive.crawler.deciderules.DecideRuleSequence">
317 <map name="rules">
318 </map>
319 </newObject>
320 </newObject>
321 <newObject name="ExtractorJS" class="org.archive.crawler.extractor.ExtractorJS">
322 <boolean name="enabled">true</boolean>
323 <newObject name="ExtractorJS#decide-rules" class="org.archive.crawler.deciderules.DecideRuleSequence">
324 <map name="rules">
325 </map>
326 </newObject>
327 </newObject>
328 <newObject name="ExtractorSWF" class="org.archive.crawler.extractor.ExtractorSWF">
329 <boolean name="enabled">true</boolean>
330 <newObject name="ExtractorSWF#decide-rules" class="org.archive.crawler.deciderules.DecideRuleSequence">
331 <map name="rules">
332 </map>
333 </newObject>
334 </newObject>
335 </map> <!-- end of Heritrix extract processors -->
336 <!-- Heritrix write processors -->
337 <map name="write-processors">
338 <newObject name="DeDuplicator" class="is.hi.bok.deduplicator.DeDuplicator"> <!-- Enabled de-duplication processor, listed before the Archiver in the write-processors map. Matches "By URL" with try-equivalent on; index-location and origin are empty (presumably filled in per job; confirm). -->
339 <boolean name="enabled">true</boolean>
340 <map name="filters">
341 </map>
342 <string name="index-location"/>
343 <string name="matching-method">By URL</string>
344 <boolean name="try-equivalent">true</boolean>
345 <boolean name="change-content-size">false</boolean>
346 <string name="mime-filter">^text/.*</string> <!-- NOTE(review): combined with filter-mode Blacklist, this presumably excludes text/* content from de-duplication. Confirm. -->
347 <string name="filter-mode">Blacklist</string>
348 <string name="analysis-mode">Timestamp</string>
349 <string name="log-level">SEVERE</string>
350 <string name="origin"/>
351 <string name="origin-handling">Use index information</string>
352 <boolean name="stats-per-host">true</boolean>
353 </newObject>
354 <newObject name="Archiver" class="org.archive.crawler.writer.ARCWriterProcessor"> <!-- Enabled ARC writer with an empty decide-rules map. Compression is off, files use prefix IAH, and the single output path is "arcs". -->
355 <boolean name="enabled">true</boolean>
356 <newObject name="Archiver#decide-rules" class="org.archive.crawler.deciderules.DecideRuleSequence">
357 <map name="rules">
358 </map>
359 </newObject>
360 <boolean name="compress">false</boolean>
361 <string name="prefix">IAH</string>
362 <string name="suffix">${HOSTNAME}</string> <!-- Placeholder token; presumably replaced with the local host name when files are named. Confirm. -->
363 <integer name="max-size-bytes">100000000</integer> <!-- 100,000,000 bytes (100 MB decimal). -->
364 <stringList name="path">
365 <string>arcs</string>
366 </stringList>
367 <integer name="pool-max-active">5</integer>
368 <integer name="pool-max-wait">300000</integer>
369 <long name="total-bytes-to-write">0</long>
370 <boolean name="skip-identical-digests">false</boolean>
371 </newObject>
372
373 </map> <!-- End of Heritrix write processors -->
374 <!-- Heritrix post processors -->
375 <map name="post-processors"> <!-- Four post processors in this order: Updater, LinksScoper, Scheduler, ContentSize. All are enabled and every decide-rules map is empty. -->
376 <newObject name="Updater" class="org.archive.crawler.postprocessor.CrawlStateUpdater">
377 <boolean name="enabled">true</boolean>
378 <newObject name="Updater#decide-rules" class="org.archive.crawler.deciderules.DecideRuleSequence">
379 <map name="rules">
380 </map>
381 </newObject>
382 </newObject>
383 <newObject name="LinksScoper" class="org.archive.crawler.postprocessor.LinksScoper">
384 <boolean name="enabled">true</boolean>
385 <newObject name="LinksScoper#decide-rules" class="org.archive.crawler.deciderules.DecideRuleSequence">
386 <map name="rules">
387 </map>
388 </newObject>
389 <boolean name="override-logger">false</boolean>
390 <boolean name="seed-redirects-new-seed">false</boolean>
391 <integer name="preference-depth-hops">-1</integer>
392
393 <newObject name="scope-rejected-url-rules" class="org.archive.crawler.deciderules.DecideRuleSequence">
394 <map name="rules">
395 </map>
396 </newObject>
397 </newObject>
398
399 <newObject name="Scheduler" class="org.archive.crawler.postprocessor.FrontierScheduler">
400 <boolean name="enabled">true</boolean>
401 <newObject name="Scheduler#decide-rules" class="org.archive.crawler.deciderules.DecideRuleSequence">
402 <map name="rules">
403 </map>
404 </newObject>
405 </newObject>
406
407 <newObject name="ContentSize" class="dk.netarkivet.harvester.harvesting.ContentSizeAnnotationPostProcessor"> <!-- Processor class from the dk.netarkivet package; presumably annotates each URI with its content size. Confirm against the class. -->
408 <boolean name="enabled">true</boolean>
409 <newObject name="ContentSize#decide-rules" class="org.archive.crawler.deciderules.DecideRuleSequence">
410 <map name="rules">
411 </map>
412 </newObject>
413 </newObject>
414
415 </map> <!-- end of Heritrix post processors -->
416
417 <map name="loggers"> <!-- One logger entry: a StatisticsTracker named crawl-statistics with interval-seconds set to 20. -->
418 <newObject name="crawl-statistics" class="org.archive.crawler.admin.StatisticsTracker">
419 <integer name="interval-seconds">20</integer>
420 </newObject>
421 </map>
422 <string name="recover-path"/> <!-- Recovery settings: no recover path is set, BDB JE logs are copied at checkpoint, and failures are not retained on recovery. -->
423 <boolean name="checkpoint-copy-bdbje-logs">true</boolean>
424 <boolean name="recover-retain-failures">false</boolean>
425 <newObject name="credential-store" class="org.archive.crawler.datamodel.CredentialStore"> <!-- Credential store with an empty credentials map, so this profile defines no login credentials. -->
426 <map name="credentials">
427 </map>
428 </newObject>
429 </controller>
430 </crawl-order>
Attached Files
To refer to attachments on a page, use attachment:filename, as shown below in the list of files. Do NOT use the URL of the [get] link, since this is subject to change and can break easily. You are not allowed to attach a file to this page.