From 9315d66523bdeeaa4d54f746313e8939ed82ae06 Mon Sep 17 00:00:00 2001 From: tallison Date: Fri, 18 Oct 2024 09:52:43 -0400 Subject: [PATCH] This fixes issue #48 --- pom.xml | 21 +++---- .../cc/index/extractor/CCMimeCounter.java | 2 +- .../cc/index/selector/ExtensionsSelector.java | 57 +++++++++++++++++++ .../cc/index/selector/SelectorClause.java | 4 +- .../selector/IndexRecordSelectorTest.java | 21 +++---- src/test/resources/selectors/basic.json | 7 ++- src/test/resources/selectors/extensions.json | 16 ++++++ 7 files changed, 105 insertions(+), 23 deletions(-) create mode 100644 src/main/java/org/tallison/cc/index/selector/ExtensionsSelector.java create mode 100644 src/test/resources/selectors/extensions.json diff --git a/pom.xml b/pom.xml index f679f19..c710fcf 100644 --- a/pom.xml +++ b/pom.xml @@ -28,15 +28,16 @@ 4.4.14 2.18.0 5.11.2 + 2.24.1 2.9.2 - 3.4.1 + 3.6.0 commons-logging commons-logging - 1.2 + 1.3.4 org.apache.httpcomponents @@ -76,7 +77,7 @@ org.apache.commons commons-compress - 1.23.0 + 1.27.1 commons-io @@ -142,12 +143,12 @@ org.apache.logging.log4j log4j-core - 2.20.0 + ${log4j2.version} org.apache.logging.log4j log4j-slf4j2-impl - 2.20.0 + ${log4j2.version} org.apache.commons @@ -201,7 +202,7 @@ org.apache.maven.plugins maven-checkstyle-plugin - 3.2.2 + 3.5.0 com.puppycrawl.tools @@ -232,7 +233,7 @@ de.thetaphi forbiddenapis - 3.5.1 + 3.8 ${maven.compiler.target} true @@ -263,7 +264,7 @@ org.codehaus.mojo versions-maven-plugin - 2.16.0 + 2.17.1 false @@ -271,7 +272,7 @@ org.apache.maven.plugins maven-enforcer-plugin - 3.3.0 + 3.5.0 enforce-maven @@ -279,7 +280,7 @@ - 3.5 + 3.7 diff --git a/src/main/java/org/tallison/cc/index/extractor/CCMimeCounter.java b/src/main/java/org/tallison/cc/index/extractor/CCMimeCounter.java index 690d179..2ffe6e2 100644 --- a/src/main/java/org/tallison/cc/index/extractor/CCMimeCounter.java +++ b/src/main/java/org/tallison/cc/index/extractor/CCMimeCounter.java @@ -207,7 +207,7 @@ private static class IndexWorker implements Callable { AbstractRecordProcessor recordProcessor) throws TikaException { this.indexUrls = indexUrls; this.recordProcessor = recordProcessor; - this.fetcher = fetcherConfig.newFetcher(); + this.fetcher = fetcherConfig.newIndexFetcher(); } @Override diff --git a/src/main/java/org/tallison/cc/index/selector/ExtensionsSelector.java b/src/main/java/org/tallison/cc/index/selector/ExtensionsSelector.java new file mode 100644 index 0000000..618a2ac --- /dev/null +++ b/src/main/java/org/tallison/cc/index/selector/ExtensionsSelector.java @@ -0,0 +1,57 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.tallison.cc.index.selector; + +import java.util.HashSet; +import java.util.Locale; +import java.util.Set; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonProperty; +import org.apache.commons.io.FilenameUtils; + +import org.apache.tika.utils.StringUtils; + +/** + * Comma delimited list of extensions to accept. Extensions are currently matched case insensitively. + */ +public class ExtensionsSelector extends AbstractSamplingSelector { + final Set extensions = new HashSet<>(); + + @JsonCreator + public ExtensionsSelector(@JsonProperty("extensions") String commaDelimitedExtensions, + @JsonProperty("sample") Double sample) { + super(sample == null ? new SampleAll() : new SampleSome(sample)); + for (String ext : commaDelimitedExtensions.split(",")) { + if (!StringUtils.isBlank(ext)) { + extensions.add(ext.toLowerCase(Locale.ROOT)); + } + } + } + + @Override + public boolean select(String val) { + String ext = FilenameUtils.getExtension(val); + if (! StringUtils.isBlank(ext)) { + ext = ext.toLowerCase(Locale.ROOT); + if (extensions.contains(ext)) { + return sampler.select(val); + } + } + return false; + } +} diff --git a/src/main/java/org/tallison/cc/index/selector/SelectorClause.java b/src/main/java/org/tallison/cc/index/selector/SelectorClause.java index 32f2536..cd59f2f 100644 --- a/src/main/java/org/tallison/cc/index/selector/SelectorClause.java +++ b/src/main/java/org/tallison/cc/index/selector/SelectorClause.java @@ -20,7 +20,9 @@ import com.fasterxml.jackson.annotation.JsonTypeInfo; @JsonTypeInfo(use = JsonTypeInfo.Id.DEDUCTION) -@JsonSubTypes({@JsonSubTypes.Type(MatchSelector.class), @JsonSubTypes.Type(RegexSelector.class)}) +@JsonSubTypes( + {@JsonSubTypes.Type(MatchSelector.class), @JsonSubTypes.Type(RegexSelector.class), + @JsonSubTypes.Type(ExtensionsSelector.class)}) public interface SelectorClause { boolean select(String val); diff --git a/src/test/java/org/tallison/cc/index/selector/IndexRecordSelectorTest.java b/src/test/java/org/tallison/cc/index/selector/IndexRecordSelectorTest.java index 4da23dd..d895bb2 100644 --- a/src/test/java/org/tallison/cc/index/selector/IndexRecordSelectorTest.java +++ b/src/test/java/org/tallison/cc/index/selector/IndexRecordSelectorTest.java @@ -30,15 +30,12 @@ import org.junit.jupiter.api.Test; import org.tallison.cc.index.CCIndexRecord; -import org.apache.tika.utils.StringUtils; - - public class IndexRecordSelectorTest { @Test public void testBasic() throws Exception { ObjectMapper mapper = new ObjectMapper(); - + //this just tests that the deserialization works RecordSelector recordSelector = mapper.readValue(getClass().getResourceAsStream("/selectors/basic.json"), RecordSelector.class); @@ -47,7 +44,12 @@ public void testBasic() throws Exception { @Test @Disabled("for development only") public void testIndexFile() throws Exception { - Path p = Paths.get("/Users/allison/data/cc/index-work/cdx-00000.gz"); + Path p = Paths.get("/...CC-MAIN-2023-06/indexes/cdx-00000.gz"); + ObjectMapper mapper = new ObjectMapper(); + //this just tests that the deserialization works + RecordSelector recordSelector = + mapper.readValue(getClass().getResourceAsStream("/selectors/extensions.json"), + RecordSelector.class); try (BufferedReader r = new BufferedReader( new InputStreamReader(new GZIPInputStream(Files.newInputStream(p)), StandardCharsets.UTF_8))) { @@ -56,11 +58,10 @@ public void testIndexFile() throws Exception { Optional record = CCIndexRecord.parseRecord(line); if (record.isPresent()) { CCIndexRecord indexRecord = record.get(); - if (!indexRecord.getMime().equals(indexRecord.getMimeDetected())) { - System.out.println(line); - } - if (!StringUtils.isBlank(indexRecord.getTruncated())) { - + if (recordSelector.select(indexRecord)) { + System.out.println("Selected: " + indexRecord); + } else { + //System.out.println("Rejected: " + indexRecord.getUrl()); } } line = r.readLine(); diff --git a/src/test/resources/selectors/basic.json b/src/test/resources/selectors/basic.json index 25c15c0..6a6c0bb 100644 --- a/src/test/resources/selectors/basic.json +++ b/src/test/resources/selectors/basic.json @@ -5,7 +5,12 @@ "match": "application/pdf", "sample": 0.8 } - ] + ], + "url": [ + { + "extensions": "exe,bat" + } + ] }, "must": { "status": [ diff --git a/src/test/resources/selectors/extensions.json b/src/test/resources/selectors/extensions.json new file mode 100644 index 0000000..dbede2d --- /dev/null +++ b/src/test/resources/selectors/extensions.json @@ -0,0 +1,16 @@ +{ + "should": { + "url": [ + { + "extensions": "exe,bat" + } + ] + }, + "must": { + "status": [ + { + "match": "200" + } + ] + } +} \ No newline at end of file