Skip to content

Commit

Permalink
Update to Tika 3.0.0
Browse files (browse the repository at this point in the history)
  • Loading branch information
tballison committed Oct 22, 2024
1 parent f2565fb commit 767653a
Show file tree
Hide file tree
Showing 7 changed files with 20 additions and 14 deletions.
3 changes: 2 additions & 1 deletion src/main/java/org/tallison/cc/index/IndexIterator.java
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
import org.apache.tika.exception.TikaConfigException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.pipes.FetchEmitTuple;
import org.apache.tika.pipes.emitter.EmitKey;
import org.apache.tika.pipes.fetcher.FetchKey;
Expand Down Expand Up @@ -90,7 +91,7 @@ public IndexIterator(@JsonProperty("profile") String profile,
private static void addIndexPaths(Fetcher fetcher, String path, List<String> indexPaths)
throws IOException, TikaException {

try (InputStream is = fetcher.fetch(path, new Metadata())) {
try (InputStream is = fetcher.fetch(path, new Metadata(), new ParseContext())) {
try (BufferedReader reader = getReader(is, path)) {
String line = reader.readLine();
while (line != null) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.pipes.FetchEmitTuple;
import org.apache.tika.pipes.fetcher.Fetcher;
import org.apache.tika.pipes.pipesiterator.CallablePipesIterator;
Expand Down Expand Up @@ -170,7 +171,7 @@ private boolean processFile(FetchEmitTuple fetchEmitTuple,
LOGGER.info("starting to fetch index gz: {}",
fetchEmitTuple.getFetchKey().getFetchKey());
try (TikaInputStream tis = (TikaInputStream) indexFetcher.fetch(
fetchEmitTuple.getFetchKey().getFetchKey(), new Metadata())) {
fetchEmitTuple.getFetchKey().getFetchKey(), new Metadata(), new ParseContext())) {
try (InputStream is = new BufferedInputStream(new GZIPInputStream(tis))) {
try (BufferedReader reader = new BufferedReader(
new InputStreamReader(is, StandardCharsets.UTF_8))) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@

import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.pipes.FetchEmitTuple;
import org.apache.tika.pipes.emitter.StreamEmitter;
import org.apache.tika.pipes.fetcher.Fetcher;
Expand Down Expand Up @@ -132,8 +133,8 @@ public Long call() throws Exception {
private void fetch(FetchEmitTuple t, Fetcher fetcher, StreamEmitter streamEmitter) {

LOGGER.info("about to download: " + t.getFetchKey().getFetchKey());
try (InputStream is = fetcher.fetch(t.getFetchKey().getFetchKey(), new Metadata())) {
streamEmitter.emit(t.getFetchKey().getFetchKey(), is, new Metadata());
try (InputStream is = fetcher.fetch(t.getFetchKey().getFetchKey(), new Metadata(), new ParseContext())) {
streamEmitter.emit(t.getFetchKey().getFetchKey(), is, new Metadata(), new ParseContext());
LOGGER.info("successfully downloaded: " + t.getFetchKey().getFetchKey());
} catch (TikaException | IOException e) {
LOGGER.error("failed to copy " + t.getFetchKey().getFetchKey(), e);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.pipes.FetchEmitTuple;
import org.apache.tika.pipes.fetcher.Fetcher;
import org.apache.tika.pipes.pipesiterator.CallablePipesIterator;
Expand Down Expand Up @@ -239,7 +240,7 @@ private boolean processFile(FetchEmitTuple fetchEmitTuple,
.getFetchKey(), fetcher.getClass());
try (TikaInputStream tis = (TikaInputStream) fetcher.fetch(fetchEmitTuple
.getFetchKey()
.getFetchKey(), new Metadata())) {
.getFetchKey(), new Metadata(), new ParseContext())) {
try (InputStream is = new BufferedInputStream(new GZIPInputStream(tis))) {
try (BufferedReader reader = new BufferedReader(
new InputStreamReader(is, StandardCharsets.UTF_8))) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -206,9 +206,7 @@ Fetcher newFetcher() throws TikaConfigException {
((S3Fetcher) fetcher).setCredentialsProvider("profile");
((S3Fetcher) fetcher).setBucket(ExtractorConfig.CC_S3_BUCKET);
((S3Fetcher) fetcher).setRegion(ExtractorConfig.CC_REGION);
//Update and make configurable once TIKA-3993 is fixed
((S3Fetcher) fetcher).setRetries(3);
((S3Fetcher) fetcher).setSleepBeforeRetryMillis(30000);
((S3Fetcher) fetcher).setThrottleSeconds(throttleSeconds);
} else if (basePath != null) {
fetcher = new FileSystemFetcher();
((FileSystemFetcher) fetcher).setBasePath(basePath);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.pipes.FetchEmitTuple;
import org.apache.tika.pipes.emitter.EmitKey;
import org.apache.tika.pipes.emitter.StreamEmitter;
Expand Down Expand Up @@ -147,7 +148,7 @@ private void fetchPayload(String id, CCIndexRecord ccIndexRecord, WarcRecord rec
String targetPath = targetPathRewriter.rewrite(targetDigest);
Metadata metadata = new Metadata();
try (InputStream is = TikaInputStream.get(tmp, metadata)) {
emitter.emit(targetPath, is, new Metadata());
emitter.emit(targetPath, is, new Metadata(), new ParseContext());
logSuccess(ccIndexRecord, targetDigest, length, targetPath);
} catch (IOException | TikaException e) {
LOGGER.warn("problem writing id={}", id, e);
Expand Down
13 changes: 8 additions & 5 deletions src/main/java/org/tallison/cc/index/io/BackoffHttpFetcher.java
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.pipes.fetcher.FetchKey;
import org.apache.tika.pipes.fetcher.http.HttpFetcher;

Expand All @@ -47,15 +48,16 @@ public BackoffHttpFetcher(long[] throttleSeconds) {
}

@Override
public InputStream fetch(String fetchKey, Metadata metadata) throws TikaException, IOException {
return fetchWithBackOff(new FetchKey("name", getUrl(fetchKey)), metadata);
public InputStream fetch(String fetchKey, Metadata metadata, ParseContext parseContext)
throws TikaException, IOException {
return fetchWithBackOff(new FetchKey("name", getUrl(fetchKey)), metadata, parseContext);
}

@Override
public InputStream fetch(String fetchkey, long rangeStart, long rangeEnd, Metadata metadata)
throws IOException {
return fetchWithBackOff(new FetchKey("name", getUrl(fetchkey), rangeStart, rangeEnd),
metadata);
metadata, new ParseContext());
}

private String getUrl(String fetchKey) {
Expand All @@ -69,7 +71,8 @@ private String getUrl(String fetchKey) {
return fetchKey;
}

private InputStream fetchWithBackOff(FetchKey fetchKey, Metadata metadata) throws IOException {
private InputStream fetchWithBackOff(FetchKey fetchKey, Metadata metadata,
ParseContext parseContext) throws IOException {
int tries = 0;
while (tries < throttleSeconds.length) {
try {
Expand Down Expand Up @@ -109,7 +112,7 @@ private TikaInputStream _fetch(FetchKey fetchKey, Metadata metadata)
return (TikaInputStream) super.fetch(fetchKey.getFetchKey(), fetchKey.getRangeStart(),
fetchKey.getRangeEnd(), metadata);
} else {
return (TikaInputStream) super.fetch(fetchKey.getFetchKey(), metadata);
return (TikaInputStream) super.fetch(fetchKey.getFetchKey(), metadata, new ParseContext());
}

}
Expand Down

0 comments on commit 767653a

Please sign in to comment.