Skip to content

Commit

Permalink
Try to add some delay to downloading as common-crawl S3 buckets are now heavily throttled
Browse files Browse the repository at this point in the history
  • Loading branch information
centic9 committed Mar 29, 2024
1 parent 429c9ad commit c8658be
Show file tree
Hide file tree
Showing 3 changed files with 25 additions and 1 deletion.
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,14 @@ public static void main(String[] args) throws Exception {
File file = Utils.downloadFileFromCommonCrawl(client.getHttpClient(), item.url, item.getDocumentLocation(), true);
if (file != null) {
downloaded++;

// downloading from common-crawl S3 buckets is now heavily throttled, let's add some
// delay for each file to not hit the rate-limits very quickly
try {
Thread.sleep(10_000);
} catch (InterruptedException e) {
throw new RuntimeException(e);
}
}
} catch (IOException e) {
// skip files that we cannot store locally,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,14 @@ protected static void handleInputStream(String url, InputStream stream, int inde
"linesPerSecond: " + linesPerSecond + ": " +
StringUtils.abbreviate(FOUND_MIME_TYPES.sortedMap().toString(), 95));
lastLog = System.currentTimeMillis();

// downloading from common-crawl S3 buckets is now heavily throttled, let's add some
// delay for each file to not hit the rate-limits very quickly
try {
Thread.sleep(10_000);
} catch (InterruptedException e) {
throw new RuntimeException(e);
}
}
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,15 @@ protected void handle(String url, byte[] block, int headerStart, long blockIndex
DocumentLocation header = DocumentLocation.readFromOldIndexBlock(block, headerStart);

Utils.downloadFileFromCommonCrawl(client.getHttpClient(), url, header, false);
}

// downloading from common-crawl S3 buckets is now heavily throttled, let's add some
// delay for each file to not hit the rate-limits very quickly
try {
Thread.sleep(10_000);
} catch (InterruptedException e) {
throw new RuntimeException(e);
}
}

@Override
public void close() throws IOException {
Expand Down

0 comments on commit c8658be

Please sign in to comment.