Attachment 'default_orderxml.xml'

Download

   1 <?xml version="1.0" encoding="UTF-8"?>
   2 <crawl-order xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:noNamespaceSchemaLocation="heritrix_settings.xsd">
   3     <meta> <!-- Descriptive metadata for this crawl order: profile name, description, operator, organization, audience and date. -->
   4         <name>default_orderxml</name>
   5         <description>Default Profile</description>
   6         <operator>Admin</operator>
   7         <organization/>
   8         <audience></audience>
   9         <date>20080118111217</date> <!-- NOTE(review): looks like a yyyyMMddHHmmss timestamp (2008-01-18 11:12:17); confirm the expected format. -->
  10     </meta>
  11     <controller>
  12         <string name="settings-directory">settings</string> <!-- Directory/path settings follow; all non-empty ones are relative names. -->
  13         <string name="disk-path"/> <!-- Left empty in this profile. NOTE(review): presumably supplied per job by the harvesting application; confirm. -->
  14         <string name="logs-path">logs</string>
  15         <string name="checkpoints-path">checkpoints</string>
  16         <string name="state-path">state</string>
  17         <string name="scratch-path">scratch</string>
  18         <long name="max-bytes-download">0</long> <!-- The three max-* crawl limits are all 0. NOTE(review): presumably 0 means "no limit"; confirm against the Heritrix 1.x manual. -->
  19         <long name="max-document-download">0</long>
  20         <long name="max-time-sec">0</long>
  21         <integer name="max-toe-threads">50</integer>
  22         <integer name="recorder-out-buffer-bytes">4096</integer>
  23         <integer name="recorder-in-buffer-bytes">65536</integer>
  24         <integer name="bdb-cache-percent">0</integer> <!-- NOTE(review): 0 presumably defers to the BDB default cache size; confirm. -->
  25         <!-- DecidingScope migrated from DomainScope. The scope reads its seeds from seeds.txt and decides via the "decide-rules" sequence below. -->
  26         <newObject name="scope" class="org.archive.crawler.deciderules.DecidingScope">
  27             <boolean name="enabled">true</boolean>
  28             <string name="seedsfile">seeds.txt</string>
  29             <boolean name="reread-seeds-on-config">true</boolean>
  30             <!-- DecideRuleSequence. Multiple DecideRules applied in order with last non-PASS the resulting decision. Order in this profile: rejectByDefault, acceptURIFromSeedDomains, rejectIfTooManyHops (25), rejectIfPathological (3 repetitions), acceptIfTranscluded, pathdepthfilter (depth 20), global_crawlertraps. Because order decides the outcome, do not reorder these elements. -->
  31             <newObject name="decide-rules" class="org.archive.crawler.deciderules.DecideRuleSequence">
  32 
  33                 <map name="rules">
  34                     <newObject name="rejectByDefault" class="org.archive.crawler.deciderules.RejectDecideRule"/>
  35                     <newObject name="acceptURIFromSeedDomains" class="dk.netarkivet.harvester.harvesting.OnNSDomainsDecideRule"> <!-- ACCEPT rule implemented by a dk.netarkivet class; seeds are used as SURT prefixes and no external SURT source or dump file is configured. -->
  36                         <string name="decision">ACCEPT</string>
  37                         <string name="surts-source-file"></string>
  38                         <boolean name="seeds-as-surt-prefixes">true</boolean>
  39                         <string name="surts-dump-file"/>
  40                         <boolean name="also-check-via">false</boolean>
  41                         <boolean name="rebuild-on-reconfig">true</boolean>
  42                     </newObject>
  43 
  44                     <newObject name="rejectIfTooManyHops" class="org.archive.crawler.deciderules.TooManyHopsDecideRule">
  45                         <integer name="max-hops">25</integer>
  46                     </newObject>
  47                     <newObject name="rejectIfPathological" class="org.archive.crawler.deciderules.PathologicalPathDecideRule">
  48                         <integer name="max-repetitions">3</integer>
  49                     </newObject>
  50                     <newObject name="acceptIfTranscluded" class="org.archive.crawler.deciderules.TransclusionDecideRule"> <!-- Placed after the hop/path rejections, so (per the last-non-PASS rule) it can override them. -->
  51                         <integer name="max-trans-hops">25</integer>
  52                         <integer name="max-speculative-hops">1</integer>
  53                     </newObject>
  54                     <newObject name="pathdepthfilter" class="org.archive.crawler.deciderules.TooManyPathSegmentsDecideRule"> <!-- No explicit decision is set here. NOTE(review): presumably the class default is REJECT; confirm. -->
  55                         <integer name="max-path-depth">20</integer>
  56                     </newObject>
  57 
  58                     <newObject name="global_crawlertraps" class="org.archive.crawler.deciderules.MatchesListRegExpDecideRule"> <!-- Crawler-trap list: decision REJECT with list-logic OR over the regexps below. "&amp;" in a pattern is the XML escape for a literal ampersand. NOTE(review): some specific patterns are already covered by broader ones in the same list (e.g. the "com_events&amp;task=view.*" pattern covers the view_day/view_month/view_week/view_year variants); harmless under OR, but candidates for cleanup. -->
  59                         <string name="decision">REJECT</string>
  60                         <string name="list-logic">OR</string>
  61                         <stringList name="regexp-list">
  62                             <string>.*core\.UserAdmin.*core\.UserLogin.*</string>
  63                             <string>.*core\.UserAdmin.*register\.UserSelfRegistration.*</string>
  64                             <string>.*\/w\/index\.php\?title=Speci[ae]l:Recentchanges.*</string>
  65                             <string>.*act=calendar&amp;cal_id=.*</string>
  66                             <string>.*advCalendar_pi.*</string>
  67                             <string>.*cal\.asp\?date=.*</string>
  68                             <string>.*cal\.asp\?view=monthly&amp;date=.*</string>
  69                             <string>.*cal\.asp\?view=weekly&amp;date=.*</string>
  70                             <string>.*cal\.asp\?view=yearly&amp;date=.*</string>
  71                             <string>.*cal\.asp\?view=yearly&amp;year=.*</string>
  72                             <string>.*cal\/cal_day\.php\?op=day&amp;date=.*</string>
  73                             <string>.*cal\/cal_week\.php\?op=week&amp;date=.*</string>
  74                             <string>.*cal\/calendar\.php\?op=cal&amp;month=.*</string>
  75                             <string>.*cal\/yearcal\.php\?op=yearcal&amp;ycyear=.*</string>
  76                             <string>.*calendar\.asp\?calmonth=.*</string>
  77                             <string>.*calendar\.asp\?qMonth=.*</string>
  78                             <string>.*calendar\.php\?sid=.*</string>
  79                             <string>.*calendar\.php\?start=.*</string>
  80                             <string>.*calendar\.php\?Y=.*</string>
  81                             <string>.*calendar\/\?CLmDemo_horizontal=.*</string>
  82                             <string>.*calendar_menu\/calendar\.php\?.*</string>
  83                             <string>.*calendar_scheduler\.php\?d=.*</string>
  84                             <string>.*calendar_year\.asp\?qYear=.*</string>
  85                             <string>.*calendarix\/calendar\.php\?op=.*</string>
  86                             <string>.*calendarix\/yearcal\.php\?op=.*</string>
  87                             <string>.*calender\/default\.asp\?month=.*</string>
  88                             <string>.*Default\.asp\?month=.*</string>
  89                             <string>.*events\.asp\?cat=0&amp;mDate=.*</string>
  90                             <string>.*events\.asp\?cat=1&amp;mDate=.*</string>
  91                             <string>.*events\.asp\?MONTH=.*</string>
  92                             <string>.*events\.asp\?month=.*</string>
  93                             <string>.*index\.php\?iDate=.*</string>
  94                             <string>.*index\.php\?module=PostCalendar&amp;func=view.*</string>
  95                             <string>.*index\.php\?option=com_events&amp;task=view.*</string>
  96                             <string>.*index\.php\?option=com_events&amp;task=view_day&amp;year=.*</string>
  97                             <string>.*index\.php\?option=com_events&amp;task=view_detail&amp;year=.*</string>
  98                             <string>.*index\.php\?option=com_events&amp;task=view_month&amp;year=.*</string>
  99                             <string>.*index\.php\?option=com_events&amp;task=view_week&amp;year=.*</string>
 100                             <string>.*index\.php\?option=com_events&amp;task=view_year&amp;year=.*</string>
 101                             <string>.*index\.php\?option=com_extcalendar&amp;Itemid.*</string>
 102                             <string>.*modules\.php\?name=Calendar&amp;op=modload&amp;file=index.*</string>
 103                             <string>.*modules\.php\?name=vwar&amp;file=calendar&amp;action=list&amp;month=.*</string>
 104                             <string>.*modules\.php\?name=vwar&amp;file=calendar.*</string>
 105                             <string>.*modules\.php\?name=vWar&amp;mod=calendar.*</string>
 106                             <string>.*modules\/piCal\/index\.php\?caldate=.*</string>
 107                             <string>.*modules\/piCal\/index\.php\?cid=.*</string>
 108                             <string>.*option,com_events\/task,view_day\/year.*</string>
 109                             <string>.*option,com_events\/task,view_month\/year.*</string>
 110                             <string>.*option,com_extcalendar\/Itemid.*</string>
 111                             <string>.*task,view_month\/year.*</string>
 112                             <string>.*shopping_cart\.php.*</string>
 113                             <string>.*action.add_product.*</string>
 114                             <string>.*action.remove_product.*</string>
 115                             <string>.*action.buy_now.*</string>
 116                             <string>.*checkout_payment\.php.*</string>
 117                             <string>.*login.*login.*login.*login.*</string>
 118                             <string>.*homepage_calendar\.asp.*</string>
 119                             <string>.*MediaWiki.*Movearticle.*</string>
 120                             <string>.*index\.php.*action=edit.*</string>
 121                             <string>.*comcast\.net.*othastar.*</string>
 122                             <string>.*Login.*Login.*Login.*</string>
 123                             <string>.*redir.*redir.*redir.*</string>
 124                             <string>.*bookingsystemtime\.asp\?dato=.*</string>
 125                             <string>.*bookingsystem\.asp\?date=.*</string>
 126                             <string>.*cart\.asp\?mode=add.*</string>
 127                             <string>.*\/photo.*\/photo.*\/photo.*</string>
 128                             <string>.*\/skins.*\/skins.*\/skins.*</string>
 129                             <string>.*\/scripts.*\/scripts.*\/scripts.*</string>
 130                             <string>.*\/styles.*\/styles.*\/styles.*</string>
 131                             <string>.*\/coppermine\/login\.php\?referer=.*</string>
 132                             <string>.*\/images.*\/images.*\/images.*</string>
 133                             <string>.*\/stories.*\/stories.*\/stories.*</string>
 134                         </stringList>
 135                     </newObject>
 136 
 137                 </map> <!-- end rules -->
 138             </newObject> <!-- end decide-rules -->
 139         </newObject> <!-- End DecidingScope -->
 140         <map name="http-headers"> <!-- HTTP header values identifying the crawler. The info-page URL in user-agent and the from address are placeholders (my_website.com); replace both with the operator's real contact details before running a crawl. -->
 141             <string name="user-agent">Mozilla/5.0 (compatible; heritrix/1.12.1 +http://my_website.com/my_infopage.html)</string>
 142             <string name="from">my_email@my_website.com</string>
 143         </map>
 144         <newObject name="robots-honoring-policy" class="org.archive.crawler.datamodel.RobotsHonoringPolicy"> <!-- Robots policy for this profile; no custom robots text and no user-agent list are configured. -->
 145             <string name="type">ignore</string> <!-- NOTE(review): "ignore" presumably means robots.txt directives are not obeyed; confirm this is intended for every harvest that uses this profile. -->
 146             <boolean name="masquerade">false</boolean>
 147             <text name="custom-robots"/>
 148             <stringList name="user-agents">
 149             </stringList>
 150         </newObject>
 151         <newObject name="frontier" class="org.archive.crawler.frontier.BdbFrontier"> <!-- Frontier settings: per-request delays (min 300 ms, max 1000 ms, factor 1.0), retries (3, with 300 s retry delay), bandwidth caps (1500 KB/s total, 500 KB/s per host) and queue budgeting. -->
 152             <float name="delay-factor">1.0</float>
 153             <integer name="max-delay-ms">1000</integer>
 154             <integer name="min-delay-ms">300</integer>
 155             <integer name="max-retries">3</integer>
 156             <long name="retry-delay-seconds">300</long>
 157             <integer name="preference-embed-hops">1</integer>
 158             <integer name="total-bandwidth-usage-KB-sec">1500</integer>
 159             <integer name="max-per-host-bandwidth-usage-KB-sec">500</integer>
 160 
 161             <string name="queue-assignment-policy">dk.netarkivet.harvester.harvesting.DomainnameQueueAssignmentPolicy</string> <!-- Queue assignment is delegated to a dk.netarkivet class, so that jar must be on the crawler's classpath. -->
 162 
 163             <string name="force-queue-assignment"/>
 164             <boolean name="pause-at-start">false</boolean>
 165             <boolean name="pause-at-finish">false</boolean>
 166             <boolean name="source-tag-seeds">false</boolean>
 167             <boolean name="recovery-log-enabled">false</boolean>
 168             <boolean name="hold-queues">true</boolean>
 169             <integer name="balance-replenish-amount">3000</integer>
 170             <integer name="error-penalty-amount">100</integer>
 171             <long name="queue-total-budget">-1</long> <!-- NOTE(review): -1 presumably means "no budget limit"; confirm whether the harvesting application overwrites it per job. -->
 172             <string name="cost-policy">org.archive.crawler.frontier.UnitCostAssignmentPolicy</string>
 173             <long name="snooze-deactivate-ms">300000</long>
 174             <integer name="target-ready-backlog">50</integer>
 175             <string name="uri-included-structure">org.archive.crawler.util.BdbUriUniqFilter</string>
 176         </newObject>
 177 
 178         <map name="uri-canonicalization-rules"> <!-- Canonicalization rules, listed in this order: Lowercase, Userinfo, WWW, SessionIDs, QueryStrPrefix. All are enabled except WWW (StripWWWRule), which is disabled. -->
 179             <newObject name="Lowercase" class="org.archive.crawler.url.canonicalize.LowercaseRule">
 180                 <boolean name="enabled">true</boolean>
 181             </newObject>
 182             <newObject name="Userinfo" class="org.archive.crawler.url.canonicalize.StripUserinfoRule">
 183                 <boolean name="enabled">true</boolean>
 184             </newObject>
 185             <newObject name="WWW" class="org.archive.crawler.url.canonicalize.StripWWWRule">
 186                 <boolean name="enabled">false</boolean> <!-- The only disabled rule in this map. -->
 187             </newObject>
 188             <newObject name="SessionIDs" class="org.archive.crawler.url.canonicalize.StripSessionIDs">
 189                 <boolean name="enabled">true</boolean>
 190             </newObject>
 191             <newObject name="QueryStrPrefix" class="org.archive.crawler.url.canonicalize.FixupQueryStr">
 192                 <boolean name="enabled">true</boolean>
 193             </newObject>
 194         </map>
 195         <!-- Heritrix pre-fetch processors, listed in this order: QuotaEnforcer, Preselector, Preprocessor (PreconditionEnforcer). Each is enabled and carries an empty "#decide-rules" sequence. -->
 196         <map name="pre-fetch-processors">
 197 
 198             <newObject name="QuotaEnforcer" class="org.archive.crawler.prefetch.QuotaEnforcer"> <!-- Every server/host/group quota below is -1. NOTE(review): presumably -1 means "no quota"; confirm whether the harvesting application overwrites these per job. -->
 199                 <boolean name="force-retire">false</boolean>
 200                 <boolean name="enabled">true</boolean>
 201                 <newObject name="QuotaEnforcer#decide-rules" class="org.archive.crawler.deciderules.DecideRuleSequence">
 202                     <map name="rules">
 203                     </map>
 204                 </newObject>
 205                 <long name="server-max-fetch-successes">-1</long>
 206                 <long name="server-max-success-kb">-1</long>
 207                 <long name="server-max-fetch-responses">-1</long>
 208                 <long name="server-max-all-kb">-1</long>
 209 
 210                 <long name="host-max-fetch-successes">-1</long>
 211                 <long name="host-max-success-kb">-1</long>
 212                 <long name="host-max-fetch-responses">-1</long>
 213                 <long name="host-max-all-kb">-1</long>
 214 
 215                 <long name="group-max-fetch-successes">-1</long>
 216                 <long name="group-max-success-kb">-1</long>
 217                 <long name="group-max-fetch-responses">-1</long>
 218                 <long name="group-max-all-kb">-1</long>
 219 
 220             </newObject>
 221 
 222             <newObject name="Preselector" class="org.archive.crawler.prefetch.Preselector"> <!-- recheck-scope is true; block-all is false and both regexp filters are empty. -->
 223                 <boolean name="enabled">true</boolean>
 224                 <newObject name="Preselector#decide-rules" class="org.archive.crawler.deciderules.DecideRuleSequence">
 225                     <map name="rules">
 226                     </map>
 227                 </newObject>
 228                 <boolean name="override-logger">false</boolean>
 229                 <boolean name="recheck-scope">true</boolean>
 230                 <boolean name="block-all">false</boolean>
 231                 <string name="block-by-regexp"/>
 232                 <string name="allow-by-regexp"/>
 233             </newObject>
 234             <newObject name="Preprocessor" class="org.archive.crawler.prefetch.PreconditionEnforcer"> <!-- Validity durations: 21600 s (6 h) for IP lookups, 86400 s (24 h) for robots information. -->
 235                 <boolean name="enabled">true</boolean>
 236                 <newObject name="Preprocessor#decide-rules" class="org.archive.crawler.deciderules.DecideRuleSequence">
 237                     <map name="rules">
 238                     </map>
 239                 </newObject>
 240                 <integer name="ip-validity-duration-seconds">21600</integer>
 241                 <integer name="robot-validity-duration-seconds">86400</integer>
 242                 <boolean name="calculate-robots-only">false</boolean>
 243             </newObject>
 244         </map> <!--End of Heritrix pre-fetch processors -->
 245         <!-- Heritrix fetch processors: DNS (FetchDNS) followed by HTTP (FetchHTTP). Both have digest-content enabled with the sha1 algorithm. -->
 246         <map name="fetch-processors">
 247             <newObject name="DNS" class="org.archive.crawler.fetcher.FetchDNS">
 248                 <boolean name="enabled">true</boolean>
 249                 <newObject name="DNS#decide-rules" class="org.archive.crawler.deciderules.DecideRuleSequence">
 250                     <map name="rules">
 251                     </map>
 252                 </newObject>
 253                 <boolean name="accept-non-dns-resolves">false</boolean>
 254                 <boolean name="digest-content">true</boolean>
 255                 <string name="digest-algorithm">sha1</string>
 256 
 257             </newObject>
 258             <newObject name="HTTP" class="org.archive.crawler.fetcher.FetchHTTP"> <!-- Timeouts: 1200 s overall, 20000 ms socket. fetch-bandwidth and max-length-bytes are 0 (NOTE(review): presumably "unlimited"; confirm). Proxy, cookie-file and bind-address settings are left empty. -->
 259                 <boolean name="enabled">true</boolean>
 260                 <newObject name="HTTP#decide-rules" class="org.archive.crawler.deciderules.DecideRuleSequence">
 261                     <map name="rules">
 262                     </map>
 263                 </newObject>
 264                 <newObject name="midfetch-decide-rules" class="org.archive.crawler.deciderules.DecideRuleSequence">
 265                     <map name="rules">
 266                     </map>
 267                 </newObject>
 268                 <integer name="timeout-seconds">1200</integer>
 269                 <integer name="sotimeout-ms">20000</integer>
 270                 <integer name="fetch-bandwidth">0</integer>
 271                 <long name="max-length-bytes">0</long>
 272                 <boolean name="ignore-cookies">false</boolean>
 273                 <boolean name="use-bdb-for-cookies">true</boolean>
 274                 <string name="load-cookies-from-file"/>
 275                 <string name="save-cookies-to-file"/>
 276                 <string name="trust-level">open</string>
 277                 <stringList name="accept-headers">
 278                 </stringList>
 279                 <string name="http-proxy-host"/>
 280                 <string name="http-proxy-port"/>
 281                 <string name="default-encoding">ISO-8859-1</string>
 282                 <boolean name="digest-content">true</boolean>
 283                 <string name="digest-algorithm">sha1</string>
 284                 <boolean name="send-if-modified-since">true</boolean>
 285                 <boolean name="send-if-none-match">true</boolean>
 286                 <boolean name="send-connection-close">true</boolean>
 287                 <boolean name="send-referer">true</boolean>
 288                 <boolean name="send-range">false</boolean>
 289                 <string name="bind-address"/>
 290             </newObject>
 291         </map> <!-- end of Heritrix Fetch processors -->
 292                
 293         <!-- Heritrix extract processors, listed in this order: ExtractorHTTP, ExtractorHTML, ExtractorCSS, ExtractorJS, ExtractorSWF. All are enabled and each carries an empty "#decide-rules" sequence. -->
 294         <map name="extract-processors">
 295             <newObject name="ExtractorHTTP" class="org.archive.crawler.extractor.ExtractorHTTP">
 296                 <boolean name="enabled">true</boolean>
 297                 <newObject name="ExtractorHTTP#decide-rules" class="org.archive.crawler.deciderules.DecideRuleSequence">
 298                     <map name="rules">
 299                     </map>
 300                 </newObject>
 301             </newObject>
 302             <newObject name="ExtractorHTML" class="org.archive.crawler.extractor.ExtractorHTML"> <!-- The only extractor with extra options: all five flags below are set to true. -->
 303                 <boolean name="enabled">true</boolean>
 304                 <newObject name="ExtractorHTML#decide-rules" class="org.archive.crawler.deciderules.DecideRuleSequence">
 305                     <map name="rules">
 306                     </map>
 307                 </newObject>
 308                 <boolean name="extract-javascript">true</boolean>
 309                 <boolean name="treat-frames-as-embed-links">true</boolean>
 310                 <boolean name="ignore-form-action-urls">true</boolean>
 311                 <boolean name="overly-eager-link-detection">true</boolean>
 312                 <boolean name="ignore-unexpected-html">true</boolean>
 313             </newObject>
 314             <newObject name="ExtractorCSS" class="org.archive.crawler.extractor.ExtractorCSS">
 315                 <boolean name="enabled">true</boolean>
 316                 <newObject name="ExtractorCSS#decide-rules" class="org.archive.crawler.deciderules.DecideRuleSequence">
 317                     <map name="rules">
 318                     </map>
 319                 </newObject>
 320             </newObject>
 321             <newObject name="ExtractorJS" class="org.archive.crawler.extractor.ExtractorJS">
 322                 <boolean name="enabled">true</boolean>
 323                 <newObject name="ExtractorJS#decide-rules" class="org.archive.crawler.deciderules.DecideRuleSequence">
 324                     <map name="rules">
 325                     </map>
 326                 </newObject>
 327             </newObject>
 328             <newObject name="ExtractorSWF" class="org.archive.crawler.extractor.ExtractorSWF">
 329                 <boolean name="enabled">true</boolean>
 330                 <newObject name="ExtractorSWF#decide-rules" class="org.archive.crawler.deciderules.DecideRuleSequence">
 331                     <map name="rules">
 332                     </map>
 333                 </newObject>
 334             </newObject>
 335         </map> <!-- end of Heritrix extract processors -->
 336         <!-- Heritrix write processors: DeDuplicator (an is.hi.bok.deduplicator class) is listed before the ARC writer named "Archiver". -->
 337         <map name="write-processors">
 338             <newObject name="DeDuplicator" class="is.hi.bok.deduplicator.DeDuplicator"> <!-- index-location and origin are empty in this profile. NOTE(review): presumably supplied per job by the harvesting application; confirm. mime-filter "^text/.*" is combined with filter-mode Blacklist. -->
 339                 <boolean name="enabled">true</boolean>
 340                 <map name="filters">
 341                 </map>
 342                 <string name="index-location"/>
 343                 <string name="matching-method">By URL</string>
 344                 <boolean name="try-equivalent">true</boolean>
 345                 <boolean name="change-content-size">false</boolean>
 346                 <string name="mime-filter">^text/.*</string>
 347                 <string name="filter-mode">Blacklist</string>
 348                 <string name="analysis-mode">Timestamp</string>
 349                 <string name="log-level">SEVERE</string>
 350                 <string name="origin"/>
 351                 <string name="origin-handling">Use index information</string>
 352                 <boolean name="stats-per-host">true</boolean>
 353             </newObject>
 354             <newObject name="Archiver" class="org.archive.crawler.writer.ARCWriterProcessor"> <!-- Writes uncompressed ARC files (compress=false) under "arcs", prefix IAH, suffix ${HOSTNAME}, max-size-bytes 100000000. -->
 355                 <boolean name="enabled">true</boolean>
 356                 <newObject name="Archiver#decide-rules" class="org.archive.crawler.deciderules.DecideRuleSequence">
 357                     <map name="rules">
 358                     </map>
 359                 </newObject>
 360                 <boolean name="compress">false</boolean>
 361                 <string name="prefix">IAH</string>
 362                 <string name="suffix">${HOSTNAME}</string>
 363                 <integer name="max-size-bytes">100000000</integer>
 364                 <stringList name="path">
 365                     <string>arcs</string>
 366                 </stringList>
 367                 <integer name="pool-max-active">5</integer>
 368                 <integer name="pool-max-wait">300000</integer>
 369                 <long name="total-bytes-to-write">0</long>
 370                 <boolean name="skip-identical-digests">false</boolean>
 371             </newObject>
 372 
 373         </map> <!-- End of Heritrix write processors -->
 374         <!-- Heritrix post processors, listed in this order: Updater, LinksScoper, Scheduler, ContentSize. ContentSize is a dk.netarkivet class, so that jar must be on the crawler's classpath. -->
 375         <map name="post-processors">
 376             <newObject name="Updater" class="org.archive.crawler.postprocessor.CrawlStateUpdater">
 377                 <boolean name="enabled">true</boolean>
 378                 <newObject name="Updater#decide-rules" class="org.archive.crawler.deciderules.DecideRuleSequence">
 379                     <map name="rules">
 380                     </map>
 381                 </newObject>
 382             </newObject>
 383             <newObject name="LinksScoper" class="org.archive.crawler.postprocessor.LinksScoper"> <!-- Besides its own decide-rules it has a second, also empty, "scope-rejected-url-rules" sequence. -->
 384                 <boolean name="enabled">true</boolean>
 385                 <newObject name="LinksScoper#decide-rules" class="org.archive.crawler.deciderules.DecideRuleSequence">
 386                     <map name="rules">
 387                     </map>
 388                 </newObject>
 389                 <boolean name="override-logger">false</boolean>
 390                 <boolean name="seed-redirects-new-seed">false</boolean>
 391                 <integer name="preference-depth-hops">-1</integer>
 392 
 393                 <newObject name="scope-rejected-url-rules" class="org.archive.crawler.deciderules.DecideRuleSequence">
 394                     <map name="rules">
 395                     </map>
 396                 </newObject>
 397             </newObject>
 398 
 399             <newObject name="Scheduler" class="org.archive.crawler.postprocessor.FrontierScheduler">
 400                 <boolean name="enabled">true</boolean>
 401                 <newObject name="Scheduler#decide-rules" class="org.archive.crawler.deciderules.DecideRuleSequence">
 402                     <map name="rules">
 403                     </map>
 404                 </newObject>
 405             </newObject>
 406 
 407             <newObject name="ContentSize" class="dk.netarkivet.harvester.harvesting.ContentSizeAnnotationPostProcessor">
 408                 <boolean name="enabled">true</boolean>
 409                 <newObject name="ContentSize#decide-rules" class="org.archive.crawler.deciderules.DecideRuleSequence">
 410                     <map name="rules">
 411                     </map>
 412                 </newObject>
 413             </newObject>
 414 
 415         </map>  <!-- end of Heritrix post processors -->
 416 
 417         <map name="loggers"> <!-- Single logger entry: a StatisticsTracker named "crawl-statistics" with interval-seconds set to 20. -->
 418             <newObject name="crawl-statistics" class="org.archive.crawler.admin.StatisticsTracker">
 419                 <integer name="interval-seconds">20</integer>
 420             </newObject>
 421         </map>
 422         <string name="recover-path"/> <!-- Empty: no recovery path is configured in this profile. -->
 423         <boolean name="checkpoint-copy-bdbje-logs">true</boolean>
 424         <boolean name="recover-retain-failures">false</boolean>
 425         <newObject name="credential-store" class="org.archive.crawler.datamodel.CredentialStore"> <!-- The credentials map is empty: this profile defines no login credentials. -->
 426             <map name="credentials">
 427             </map>
 428         </newObject>
 429     </controller>
 430 </crawl-order>

Attached Files

To refer to attachments on a page, use attachment:filename, as shown below in the list of files. Do NOT use the URL of the [get] link, since this is subject to change and can break easily.
  • [get | view] (2010-02-08 16:53:14, 0.8 KB) [[attachment:HarvestTemplateApplication.sh.txt]]
  • [get | view] (2009-11-17 08:45:59, 22.9 KB) [[attachment:default_orderxml.xml]]
 All files | Selected Files: delete move to page copy to page

You are not allowed to attach a file to this page.