Skip to content

Commit

Permalink
Use a common method for throttling to make it easy to adjust in the future
Browse files Browse the repository at this point in the history
  • Loading branch information
centic9 committed Mar 29, 2024
1 parent c8658be commit d7c2a48
Show file tree
Hide file tree
Showing 4 changed files with 18 additions and 21 deletions.
14 changes: 14 additions & 0 deletions src/main/java/org/dstadler/commoncrawl/Utils.java
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,10 @@
public class Utils {
private final static Logger log = LoggerFactory.make();

// Delay, in milliseconds, that throttleDownloads() sleeps for on each call.
// Downloading from common-crawl S3 buckets is now heavily throttled, so some
// delay is added for each file to not hit the rate-limits very quickly.
private static final int THROTTLE_DELAY_MS = 10_000;

// avoid having to read the header data always during testing, can be removed later...
public static final int INDEX_BLOCK_COUNT = 2644;
public static final int BLOCK_SIZE = 65536;
Expand Down Expand Up @@ -214,4 +218,14 @@ public static void ensureDownloadDir() {
}
}
}

/**
 * Blocks the calling thread for {@code THROTTLE_DELAY_MS} milliseconds.
 *
 * Downloading from common-crawl S3 buckets is now heavily throttled, so
 * callers invoke this after each file to avoid hitting the rate-limits
 * very quickly.
 *
 * @throws RuntimeException wrapping the {@link InterruptedException} if the
 *         thread is interrupted while sleeping; the thread's interrupt
 *         status is restored before the exception is thrown
 */
public static void throttleDownloads() {
    try {
        Thread.sleep(THROTTLE_DELAY_MS);
    } catch (InterruptedException e) {
        // Thread.sleep() clears the interrupt status when it throws, so set it
        // again to let code further up the stack still observe the interrupt
        Thread.currentThread().interrupt();
        throw new RuntimeException(e);
    }
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -50,13 +50,7 @@ public static void main(String[] args) throws Exception {
if (file != null) {
downloaded++;

// downloading from common-crawl S3 buckets is now heavily throttled, let's add some
// delay for each file to not hit the rate-limits very quickly
try {
Thread.sleep(10_000);
} catch (InterruptedException e) {
throw new RuntimeException(e);
}
Utils.throttleDownloads();
}
} catch (IOException e) {
// skip files that we cannot store locally,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
import org.archive.util.zip.GZIPMembersInputStream;
import org.dstadler.commoncrawl.Extensions;
import org.dstadler.commoncrawl.MimeTypes;
import org.dstadler.commoncrawl.Utils;
import org.dstadler.commons.collections.MappedCounter;
import org.dstadler.commons.collections.MappedCounterImpl;
import org.dstadler.commons.http.HttpClientWrapper;
Expand Down Expand Up @@ -143,13 +144,7 @@ protected static void handleInputStream(String url, InputStream stream, int inde
StringUtils.abbreviate(FOUND_MIME_TYPES.sortedMap().toString(), 95));
lastLog = System.currentTimeMillis();

// downloading from common-crawl S3 buckets is now heavily throttled, let's add some
// delay for each file to not hit the rate-limits very quickly
try {
Thread.sleep(10_000);
} catch (InterruptedException e) {
throw new RuntimeException(e);
}
Utils.throttleDownloads();
}
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,13 +31,7 @@ protected void handle(String url, byte[] block, int headerStart, long blockIndex

Utils.downloadFileFromCommonCrawl(client.getHttpClient(), url, header, false);

// downloading from common-crawl S3 buckets is now heavily throttled, let's add some
// delay for each file to not hit the rate-limits very quickly
try {
Thread.sleep(10_000);
} catch (InterruptedException e) {
throw new RuntimeException(e);
}
Utils.throttleDownloads();
}

@Override
Expand Down

0 comments on commit d7c2a48

Please sign in to comment.