diff --git a/src/main/java/org/dstadler/commoncrawl/index/DownloadFromCommonCrawl.java b/src/main/java/org/dstadler/commoncrawl/index/DownloadFromCommonCrawl.java
index 3a6f091b..c2c0f6cf 100644
--- a/src/main/java/org/dstadler/commoncrawl/index/DownloadFromCommonCrawl.java
+++ b/src/main/java/org/dstadler/commoncrawl/index/DownloadFromCommonCrawl.java
@@ -49,6 +49,14 @@ public static void main(String[] args) throws Exception {
                 File file = Utils.downloadFileFromCommonCrawl(client.getHttpClient(), item.url, item.getDocumentLocation(), true);
                 if (file != null) {
                     downloaded++;
+
+                    // downloading from common-crawl S3 buckets is now heavily throttled, let's add some
+                    // delay for each file to not hit the rate-limits very quickly
+                    try {
+                        Thread.sleep(10_000);
+                    } catch (InterruptedException e) {
+                        throw new RuntimeException(e);
+                    }
                 }
             } catch (IOException e) {
                 // skip files that we cannot store locally,
diff --git a/src/main/java/org/dstadler/commoncrawl/index/DownloadURLIndex.java b/src/main/java/org/dstadler/commoncrawl/index/DownloadURLIndex.java
index a4c4d2a1..9cae99fc 100644
--- a/src/main/java/org/dstadler/commoncrawl/index/DownloadURLIndex.java
+++ b/src/main/java/org/dstadler/commoncrawl/index/DownloadURLIndex.java
@@ -142,6 +142,14 @@ protected static void handleInputStream(String url, InputStream stream, int inde
                         "linesPerSecond: " + linesPerSecond +
                         ": " + StringUtils.abbreviate(FOUND_MIME_TYPES.sortedMap().toString(), 95));
                 lastLog = System.currentTimeMillis();
+
+                // downloading from common-crawl S3 buckets is now heavily throttled, let's add some
+                // delay for each file to not hit the rate-limits very quickly
+                try {
+                    Thread.sleep(10_000);
+                } catch (InterruptedException e) {
+                    throw new RuntimeException(e);
+                }
             }
         }
     }
diff --git a/src/main/java/org/dstadler/commoncrawl/oldindex/ProcessAndDownload.java b/src/main/java/org/dstadler/commoncrawl/oldindex/ProcessAndDownload.java
index 1fcb2063..499f569c 100644
--- a/src/main/java/org/dstadler/commoncrawl/oldindex/ProcessAndDownload.java
+++ b/src/main/java/org/dstadler/commoncrawl/oldindex/ProcessAndDownload.java
@@ -30,7 +30,15 @@ protected void handle(String url, byte[] block, int headerStart, long blockIndex
         DocumentLocation header = DocumentLocation.readFromOldIndexBlock(block, headerStart);
 
         Utils.downloadFileFromCommonCrawl(client.getHttpClient(), url, header, false);
-    }
+
+        // downloading from common-crawl S3 buckets is now heavily throttled, let's add some
+        // delay for each file to not hit the rate-limits very quickly
+        try {
+            Thread.sleep(10_000);
+        } catch (InterruptedException e) {
+            throw new RuntimeException(e);
+        }
+    }
 
     @Override
     public void close() throws IOException {
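The same change is applied at all three call sites: after each document fetched from the Common Crawl S3 buckets, the thread sleeps for a fixed ten seconds so the rate limits are not exhausted as quickly. The sketch below isolates that throttling pattern outside the diff; the ThrottleSketch class, the throttle() helper, and the DOWNLOAD_DELAY_MS constant are illustrative names only and do not exist in the repository.

import java.util.concurrent.TimeUnit;

// Minimal standalone sketch of the throttling pattern added in the diff above.
public class ThrottleSketch {
    // Illustrative constant: the patch hard-codes a 10 second pause per downloaded file.
    private static final long DOWNLOAD_DELAY_MS = TimeUnit.SECONDS.toMillis(10);

    // Pause between downloads so the Common Crawl S3 rate limits are not hit too quickly.
    static void throttle() {
        try {
            Thread.sleep(DOWNLOAD_DELAY_MS);
        } catch (InterruptedException e) {
            // The patch wraps the exception in a RuntimeException; restoring the
            // interrupt flag first additionally preserves the thread's interrupted state.
            Thread.currentThread().interrupt();
            throw new RuntimeException(e);
        }
    }

    public static void main(String[] args) {
        for (int i = 1; i <= 3; i++) {
            System.out.println("downloaded file " + i + ", pausing before the next request");
            throttle();
        }
    }
}

A fixed sleep is the simplest way to stay under the limit; making the delay configurable or adding backoff on failed requests would be natural extensions of the same pattern.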