diff --git a/cronfile b/cronfile
index 74a629a..f78cac3 100644
--- a/cronfile
+++ b/cronfile
@@ -14,7 +14,6 @@ SHELL=/bin/bash
 
 ### PLACE ALL CRON TASKS BELOW
 
-# removes unresponsive users from the subscriber list to decrease bounce rates
-23 2 * * 0 dokku dokku --rm enter findthatcharity_scrape sh ./scrape_all.sh
+23 2 * * 0 dokku dokku --rm enter ftc-scrapers sh ./crawl_all.sh
 
 ### PLACE ALL CRON TASKS ABOVE, DO NOT REMOVE THE WHITESPACE AFTER THIS LINE
diff --git a/findthatcharity_import/settings.py b/findthatcharity_import/settings.py
index 88c0634..cad6be9 100644
--- a/findthatcharity_import/settings.py
+++ b/findthatcharity_import/settings.py
@@ -94,7 +94,7 @@
 # Enable and configure HTTP caching (disabled by default)
 # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
 HTTPCACHE_ENABLED = True
-HTTPCACHE_EXPIRATION_SECS = 60 * 60 * 24 * 7 # one week
+HTTPCACHE_EXPIRATION_SECS = 60 * 60 * 3 # three hours
 HTTPCACHE_DIR = 'httpcache'
 HTTPCACHE_IGNORE_HTTP_CODES = []
 HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
diff --git a/readme.md b/readme.md
index 5281143..69663e8 100644
--- a/readme.md
+++ b/readme.md
@@ -190,8 +190,8 @@ git push dokku master
 
 ## Other settings
 
-By default, the `HTTPCACHE` extension is enabled, with resources cached for one week.
-This means that any data downloaded or websites visited are cached for one week to prevent
+By default, the `HTTPCACHE` extension is enabled, with resources cached for three hours.
+This means that any data downloaded or websites visited are cached for three hours to prevent
 overload of the sites. This means it is relatively risk-free to rerun scraping after
 adjusting other settings for e.g. saving to a database. These settings can be changed
 if needed.