Skip to content

Commit

Permalink
Merge pull request #50 from tballison/add-extensions-selector
Browse files Browse the repository at this point in the history
This fixes issue #48
  • Loading branch information
tballison authored Oct 18, 2024
2 parents 8d061ef + 9315d66 commit e9f20bf
Show file tree
Hide file tree
Showing 7 changed files with 104 additions and 22 deletions.
19 changes: 10 additions & 9 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -28,15 +28,16 @@
<httpcomponents.version>4.4.14</httpcomponents.version>
<jackson.version>2.18.0</jackson.version>
<jupiter.version>5.11.2</jupiter.version>
<log4j2.version>2.24.1</log4j2.version>
<tika.version>2.9.2</tika.version>
<maven.shade.version>3.4.1</maven.shade.version>
<maven.shade.version>3.6.0</maven.shade.version>
</properties>
<dependencyManagement>
<dependencies>
<dependency>
<groupId>commons-logging</groupId>
<artifactId>commons-logging</artifactId>
<version>1.2</version>
<version>1.3.4</version>
</dependency>
<dependency>
<groupId>org.apache.httpcomponents</groupId>
Expand Down Expand Up @@ -76,7 +77,7 @@
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-compress</artifactId>
<version>1.23.0</version>
<version>1.27.1</version>
</dependency>
<dependency>
<groupId>commons-io</groupId>
Expand Down Expand Up @@ -142,12 +143,12 @@
<dependency>
<groupId>org.apache.logging.log4j</groupId>
<artifactId>log4j-core</artifactId>
<version>2.20.0</version>
<version>${log4j2.version}</version>
</dependency>
<dependency>
<groupId>org.apache.logging.log4j</groupId>
<artifactId>log4j-slf4j2-impl</artifactId>
<version>2.20.0</version>
<version>${log4j2.version}</version>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
Expand Down Expand Up @@ -201,7 +202,7 @@
<!-- mvn validate -->
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-checkstyle-plugin</artifactId>
<version>3.2.2</version>
<version>3.5.0</version>
<dependencies>
<dependency>
<groupId>com.puppycrawl.tools</groupId>
Expand Down Expand Up @@ -232,7 +233,7 @@
<plugin>
<groupId>de.thetaphi</groupId>
<artifactId>forbiddenapis</artifactId>
<version>3.5.1</version>
<version>3.8</version>
<configuration>
<targetVersion>${maven.compiler.target}</targetVersion>
<ignoreSignaturesOfMissingClasses>true</ignoreSignaturesOfMissingClasses>
Expand Down Expand Up @@ -271,15 +272,15 @@
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-enforcer-plugin</artifactId>
<version>3.3.0</version> <!-- can't update to 3.1.0, see MENFORCER-393 -->
<version>3.5.0</version>
<executions>
<execution>
<id>enforce-maven</id>
<configuration>
<rules>
<dependencyConvergence />
<requireMavenVersion>
<version>3.5</version>
<version>3.7</version>
</requireMavenVersion>
</rules>
</configuration>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -207,7 +207,7 @@ private static class IndexWorker implements Callable<Long> {
AbstractRecordProcessor recordProcessor) throws TikaException {
this.indexUrls = indexUrls;
this.recordProcessor = recordProcessor;
this.fetcher = fetcherConfig.newFetcher();
this.fetcher = fetcherConfig.newIndexFetcher();
}

@Override
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.tallison.cc.index.selector;

import java.util.HashSet;
import java.util.Locale;
import java.util.Set;

import com.fasterxml.jackson.annotation.JsonCreator;
import com.fasterxml.jackson.annotation.JsonProperty;
import org.apache.commons.io.FilenameUtils;

import org.apache.tika.utils.StringUtils;

/**
 * Selector clause that accepts a value when its file extension is one of a configured set.
 *
 * <p>The set is configured as a comma delimited list, e.g. {@code "exe,bat"}. Entries are
 * trimmed, a single leading {@code '.'} is dropped (so {@code ".exe"} and {@code "exe"} are
 * equivalent), and matching is case insensitive. Blank entries are ignored.
 *
 * <p>The extension of the candidate value is extracted with
 * {@link FilenameUtils#getExtension(String)}.
 * NOTE(review): that helper is file-name oriented; a URL carrying a query string or fragment
 * after the extension (e.g. {@code a.exe?x=1}) presumably will not match -- confirm whether
 * callers need such values handled.
 */
public class ExtensionsSelector extends AbstractSamplingSelector {
    /** Accepted extensions, lower-cased with {@link Locale#ROOT}, without a leading dot. */
    final Set<String> extensions = new HashSet<>();

    /**
     * @param commaDelimitedExtensions comma delimited list of extensions to accept;
     *                                 must not be {@code null}
     * @param sample value handed to {@code SampleSome}; if {@code null}, {@code SampleAll}
     *               is used instead
     * @throws NullPointerException if {@code commaDelimitedExtensions} is {@code null}
     */
    @JsonCreator
    public ExtensionsSelector(@JsonProperty("extensions") String commaDelimitedExtensions,
                              @JsonProperty("sample") Double sample) {
        super(sample == null ? new SampleAll() : new SampleSome(sample));
        if (commaDelimitedExtensions == null) {
            // Same exception type as before (split() on null), but with an actionable message.
            throw new NullPointerException(
                    "'extensions' must be a comma delimited string, but was null");
        }
        for (String ext : commaDelimitedExtensions.split(",")) {
            if (StringUtils.isBlank(ext)) {
                continue;
            }
            // Trim so that "exe, bat" registers "bat" rather than " bat", which could never
            // equal an extension extracted in select().
            String normalized = ext.trim();
            // Tolerate ".exe": an entry starting with a dot could never match otherwise.
            if (normalized.startsWith(".")) {
                normalized = normalized.substring(1);
            }
            if (!normalized.isEmpty()) {
                extensions.add(normalized.toLowerCase(Locale.ROOT));
            }
        }
    }

    /**
     * @param val value to test (e.g. a url or file name); may be {@code null}
     * @return {@code false} if {@code val} has no extension or the extension is not in the
     *         configured set; otherwise whatever the sampler decides for {@code val}
     */
    @Override
    public boolean select(String val) {
        String ext = FilenameUtils.getExtension(val);
        if (StringUtils.isBlank(ext)) {
            return false;
        }
        if (!extensions.contains(ext.toLowerCase(Locale.ROOT))) {
            return false;
        }
        // Only values with an accepted extension are offered to the sampler.
        return sampler.select(val);
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,9 @@
import com.fasterxml.jackson.annotation.JsonTypeInfo;

@JsonTypeInfo(use = JsonTypeInfo.Id.DEDUCTION)
@JsonSubTypes({@JsonSubTypes.Type(MatchSelector.class), @JsonSubTypes.Type(RegexSelector.class)})
@JsonSubTypes(
{@JsonSubTypes.Type(MatchSelector.class), @JsonSubTypes.Type(RegexSelector.class),
@JsonSubTypes.Type(ExtensionsSelector.class)})
public interface SelectorClause {

boolean select(String val);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,15 +30,12 @@
import org.junit.jupiter.api.Test;
import org.tallison.cc.index.CCIndexRecord;

import org.apache.tika.utils.StringUtils;


public class IndexRecordSelectorTest {

@Test
public void testBasic() throws Exception {
ObjectMapper mapper = new ObjectMapper();

//this just tests that the deserialization works
RecordSelector recordSelector =
mapper.readValue(getClass().getResourceAsStream("/selectors/basic.json"),
RecordSelector.class);
Expand All @@ -47,7 +44,12 @@ public void testBasic() throws Exception {
@Test
@Disabled("for development only")
public void testIndexFile() throws Exception {
Path p = Paths.get("/Users/allison/data/cc/index-work/cdx-00000.gz");
Path p = Paths.get("/...CC-MAIN-2023-06/indexes/cdx-00000.gz");
ObjectMapper mapper = new ObjectMapper();
//this just tests that the deserialization works
RecordSelector recordSelector =
mapper.readValue(getClass().getResourceAsStream("/selectors/extensions.json"),
RecordSelector.class);
try (BufferedReader r = new BufferedReader(
new InputStreamReader(new GZIPInputStream(Files.newInputStream(p)),
StandardCharsets.UTF_8))) {
Expand All @@ -56,11 +58,10 @@ public void testIndexFile() throws Exception {
Optional<CCIndexRecord> record = CCIndexRecord.parseRecord(line);
if (record.isPresent()) {
CCIndexRecord indexRecord = record.get();
if (!indexRecord.getMime().equals(indexRecord.getMimeDetected())) {
System.out.println(line);
}
if (!StringUtils.isBlank(indexRecord.getTruncated())) {

if (recordSelector.select(indexRecord)) {
System.out.println("Selected: " + indexRecord);
} else {
//System.out.println("Rejected: " + indexRecord.getUrl());
}
}
line = r.readLine();
Expand Down
7 changes: 6 additions & 1 deletion src/test/resources/selectors/basic.json
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,12 @@
"match": "application/pdf",
"sample": 0.8
}
]
],
"url": [
{
"extensions": "exe,bat"
}
]
},
"must": {
"status": [
Expand Down
16 changes: 16 additions & 0 deletions src/test/resources/selectors/extensions.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
{
"should": {
"url": [
{
"extensions": "exe,bat"
}
]
},
"must": {
"status": [
{
"match": "200"
}
]
}
}

0 comments on commit e9f20bf

Please sign in to comment.