diff --git a/src/main/java/org/dstadler/commoncrawl/Utils.java b/src/main/java/org/dstadler/commoncrawl/Utils.java index 12efad50..b1ad4473 100644 --- a/src/main/java/org/dstadler/commoncrawl/Utils.java +++ b/src/main/java/org/dstadler/commoncrawl/Utils.java @@ -31,6 +31,10 @@ public class Utils { private final static Logger log = LoggerFactory.make(); + // downloading from common-crawl S3 buckets is now heavily throttled, let's add some + // delay for each file to not hit the rate-limits very quickly + private static final int THROTTLE_DELAY_MS = 10_000; + // avoid having to read the header data always during testing, can be removed later... public static final int INDEX_BLOCK_COUNT = 2644; public static final int BLOCK_SIZE = 65536; @@ -214,4 +218,20 @@ public static void ensureDownloadDir() { } } } + + /** + * Sleeps for {@link #THROTTLE_DELAY_MS} milliseconds to throttle downloads: downloading from + * common-crawl S3 buckets is heavily throttled, so a delay per file avoids hitting the rate-limits + * very quickly. If the sleep is interrupted, the interrupt status of the thread is restored and + * the {@link InterruptedException} is rethrown wrapped in a {@link RuntimeException}. + */ + public static void throttleDownloads() { + try { + Thread.sleep(THROTTLE_DELAY_MS); + } catch (InterruptedException e) { + // restore the interrupt status, it is cleared when InterruptedException is thrown + Thread.currentThread().interrupt(); + throw new RuntimeException(e); + } + } } diff --git a/src/main/java/org/dstadler/commoncrawl/index/DownloadFromCommonCrawl.java b/src/main/java/org/dstadler/commoncrawl/index/DownloadFromCommonCrawl.java index c2c0f6cf..d70b7633 100644 --- a/src/main/java/org/dstadler/commoncrawl/index/DownloadFromCommonCrawl.java +++ b/src/main/java/org/dstadler/commoncrawl/index/DownloadFromCommonCrawl.java @@ -50,13 +50,7 @@ public static void main(String[] args) throws Exception { if (file != null) { downloaded++; - // downloading from common-crawl S3 buckets is now heavily throttled, let's add some - // delay for each file to not hit the rate-limits very quickly - try { - Thread.sleep(10_000); - } catch (InterruptedException e) { - throw new RuntimeException(e); - } + Utils.throttleDownloads(); } } catch (IOException e) { // skip files that we cannot store locally, diff --git 
a/src/main/java/org/dstadler/commoncrawl/index/DownloadURLIndex.java b/src/main/java/org/dstadler/commoncrawl/index/DownloadURLIndex.java index 9cae99fc..7797f344 100644 --- a/src/main/java/org/dstadler/commoncrawl/index/DownloadURLIndex.java +++ b/src/main/java/org/dstadler/commoncrawl/index/DownloadURLIndex.java @@ -17,6 +17,7 @@ import org.archive.util.zip.GZIPMembersInputStream; import org.dstadler.commoncrawl.Extensions; import org.dstadler.commoncrawl.MimeTypes; +import org.dstadler.commoncrawl.Utils; import org.dstadler.commons.collections.MappedCounter; import org.dstadler.commons.collections.MappedCounterImpl; import org.dstadler.commons.http.HttpClientWrapper; @@ -143,13 +144,7 @@ protected static void handleInputStream(String url, InputStream stream, int inde StringUtils.abbreviate(FOUND_MIME_TYPES.sortedMap().toString(), 95)); lastLog = System.currentTimeMillis(); - // downloading from common-crawl S3 buckets is now heavily throttled, let's add some - // delay for each file to not hit the rate-limits very quickly - try { - Thread.sleep(10_000); - } catch (InterruptedException e) { - throw new RuntimeException(e); - } + Utils.throttleDownloads(); } } } diff --git a/src/main/java/org/dstadler/commoncrawl/oldindex/ProcessAndDownload.java b/src/main/java/org/dstadler/commoncrawl/oldindex/ProcessAndDownload.java index 499f569c..3a55b912 100644 --- a/src/main/java/org/dstadler/commoncrawl/oldindex/ProcessAndDownload.java +++ b/src/main/java/org/dstadler/commoncrawl/oldindex/ProcessAndDownload.java @@ -31,13 +31,7 @@ protected void handle(String url, byte[] block, int headerStart, long blockIndex Utils.downloadFileFromCommonCrawl(client.getHttpClient(), url, header, false); - // downloading from common-crawl S3 buckets is now heavily throttled, let's add some - // delay for each file to not hit the rate-limits very quickly - try { - Thread.sleep(10_000); - } catch (InterruptedException e) { - throw new RuntimeException(e); - } + Utils.throttleDownloads(); } 
@Override