From 23dca324b61ebae370541b4b2a81a521e2d36636 Mon Sep 17 00:00:00 2001 From: param-jot Date: Sun, 26 May 2019 19:00:47 +0200 Subject: [PATCH 001/102] created SparqlConfiguration class and chnages in docker file --- docker-compose-sparql.yml | 6 +- .../components/FrontierComponent.java | 3 + .../configurator/SparqlConfiguration.java | 102 ++++++++++++++++++ 3 files changed, 110 insertions(+), 1 deletion(-) create mode 100644 squirrel.frontier/src/main/java/org/dice_research/squirrel/configurator/SparqlConfiguration.java diff --git a/docker-compose-sparql.yml b/docker-compose-sparql.yml index f7ec5f9a9..5d24cf0b8 100644 --- a/docker-compose-sparql.yml +++ b/docker-compose-sparql.yml @@ -16,6 +16,10 @@ services: - HOBBIT_RABBIT_HOST=rabbit - URI_WHITELIST_FILE=/var/squirrel/whitelist.txt - SEED_FILE=/var/squirrel/seeds.txt + - SPARQL_HOST_NAME=my_virtdb/sparql-auth + - SPARQL_HOST_PORT=8890 + - SPARQL_HOST_USER=DBA + - SPARQL_HOST_PASSWD=123pwd - MDB_HOST_NAME=mongodb - MDB_PORT=27017 - MDB_CONNECTION_TIME_OUT=5000 @@ -157,4 +161,4 @@ services: SERVICE_PRECONDITION: "rethinkdb:28015 rabbit:5672" volumes: - ./data/deduplicator:/var/squirrel/data - command: java -cp squirrel.jar org.hobbit.core.run.ComponentStarter org.aksw.simba.squirrel.components.DeduplicatorComponent + command: java -cp squirrel.jar org.hobbit.core.run.ComponentStarter org.aksw.simba.squirrel.components.DeduplicatorComponent \ No newline at end of file diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/components/FrontierComponent.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/components/FrontierComponent.java index acffbc0ef..6f36803ef 100644 --- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/components/FrontierComponent.java +++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/components/FrontierComponent.java @@ -13,6 +13,7 @@ import org.dice_research.squirrel.Constants; import org.dice_research.squirrel.configurator.MongoConfiguration; import org.dice_research.squirrel.configurator.SeedConfiguration; +import org.dice_research.squirrel.configurator.SparqlConfiguration; import org.dice_research.squirrel.configurator.WebConfiguration; import org.dice_research.squirrel.configurator.WhiteListConfiguration; import org.dice_research.squirrel.data.uri.CrawleableUri; @@ -64,6 +65,7 @@ public class FrontierComponent extends AbstractComponent implements RespondingDa private final WorkerGuard workerGuard = new WorkerGuard(this); private final boolean doRecrawling = true; private long recrawlingTime = 1000L * 60L * 60L * 24L * 30; + String sparqlEndpointUrl = "http://localhost:8890/sparql"; public static final boolean RECRAWLING_ACTIVE = true; @@ -73,6 +75,7 @@ public void init() throws Exception { serializer = new GzipJavaUriSerializer(); MongoConfiguration mongoConfiguration = MongoConfiguration.getMDBConfiguration(); WebConfiguration webConfiguration = WebConfiguration.getWebConfiguration(); + SparqlConfiguration sparqlConfiguration = SparqlConfiguration.create(sparqlEndpointUrl); if (mongoConfiguration != null) { String dbHostName = mongoConfiguration.getMDBHostName(); Integer dbPort = mongoConfiguration.getMDBPort(); diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/configurator/SparqlConfiguration.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/configurator/SparqlConfiguration.java new file mode 100644 index 000000000..dde2aa4f2 --- /dev/null +++ 
b/squirrel.frontier/src/main/java/org/dice_research/squirrel/configurator/SparqlConfiguration.java @@ -0,0 +1,102 @@ +package org.dice_research.squirrel.configurator; + +import java.net.URI; + +import org.aksw.jena_sparql_api.core.QueryExecutionFactory; +import org.aksw.jena_sparql_api.core.UpdateExecutionFactory; +import org.aksw.jena_sparql_api.core.UpdateExecutionFactoryHttp; +import org.aksw.jena_sparql_api.http.QueryExecutionFactoryHttp; +import org.apache.http.auth.AuthScope; +import org.apache.http.auth.Credentials; +import org.apache.http.auth.UsernamePasswordCredentials; +import org.apache.http.client.CredentialsProvider; +import org.apache.http.impl.client.AbstractHttpClient; +import org.apache.http.protocol.HttpContext; +import org.apache.jena.atlas.web.auth.HttpAuthenticator; +import org.apache.jena.sparql.core.DatasetDescription; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +@SuppressWarnings("deprecation") +public class SparqlConfiguration { + + private static final Logger LOGGER = LoggerFactory.getLogger(SparqlConfiguration.class); + + /** + * The Query factory used to query the SPARQL endpoint. + */ + protected static QueryExecutionFactory queryExecFactory = null; + + protected UpdateExecutionFactory updateExecFactory = null; + + + protected SparqlConfiguration(QueryExecutionFactory queryExecFactory, UpdateExecutionFactory updateExecFactory) { + this.queryExecFactory = queryExecFactory; + this.updateExecFactory = updateExecFactory; + } + + public static SparqlConfiguration create(String sparqlEndpointUrl) { + + return create(sparqlEndpointUrl, null, null); + } + + public static SparqlConfiguration create(String sparqlEndpointUrl, String username, String password) { + QueryExecutionFactory queryExecFactory = null; + UpdateExecutionFactory updateExecFactory = null; + if (username != null && password != null) { + // Create the factory with the credentials + final Credentials credentials = new UsernamePasswordCredentials(username, password); + HttpAuthenticator authenticator = new HttpAuthenticator() { + @Override + public void invalidate() { + } + + @Override + public void apply(AbstractHttpClient client, HttpContext httpContext, URI target) { + client.setCredentialsProvider(new CredentialsProvider() { + @Override + public void clear() { + } + + @Override + public Credentials getCredentials(AuthScope scope) { + return credentials; + } + + @Override + public void setCredentials(AuthScope arg0, Credentials arg1) { + LOGGER.error("I am a read-only credential provider but got a call to set credentials."); + } + }); + } + }; + queryExecFactory = new QueryExecutionFactoryHttp(sparqlEndpointUrl, new DatasetDescription(), + authenticator); + updateExecFactory = new UpdateExecutionFactoryHttp(sparqlEndpointUrl, authenticator); + } else { + queryExecFactory = new QueryExecutionFactoryHttp(sparqlEndpointUrl); + updateExecFactory = new UpdateExecutionFactoryHttp(sparqlEndpointUrl); + } + return new SparqlConfiguration(queryExecFactory, updateExecFactory); + } +//public static void main(String args[]) { +// String sparqlEndpointUrl = "http://localhost:8890/sparql/"; +// SparqlConfiguration.create(sparqlEndpointUrl); +// +// String queryString = "select distinct ?Concept where {[] a ?Concept} LIMIT 100"; +// +// QueryExecution qe = queryExecFactory.createQueryExecution(queryString); +// System.out.println(qe); +// ResultSet rs = qe.execSelect(); +// System.out.println("rs"+rs); +// +// while (rs.hasNext()) { +// QuerySolution sol = rs.nextSolution(); +// RDFNode subject = 
sol.get("Concept"); +// System.out.println(subject); +// } +// +//} +} + + \ No newline at end of file From d890afcfecad30ebd94ddb9bc057882a3050fdbe Mon Sep 17 00:00:00 2001 From: param-jot Date: Wed, 29 May 2019 14:27:08 +0200 Subject: [PATCH 002/102] change sparqlendpoint in docker-compose-sparql --- docker-compose-sparql.yml | 53 +++++++++++++++++---------------------- 1 file changed, 23 insertions(+), 30 deletions(-) diff --git a/docker-compose-sparql.yml b/docker-compose-sparql.yml index 5d24cf0b8..261fa824e 100644 --- a/docker-compose-sparql.yml +++ b/docker-compose-sparql.yml @@ -16,10 +16,6 @@ services: - HOBBIT_RABBIT_HOST=rabbit - URI_WHITELIST_FILE=/var/squirrel/whitelist.txt - SEED_FILE=/var/squirrel/seeds.txt - - SPARQL_HOST_NAME=my_virtdb/sparql-auth - - SPARQL_HOST_PORT=8890 - - SPARQL_HOST_USER=DBA - - SPARQL_HOST_PASSWD=123pwd - MDB_HOST_NAME=mongodb - MDB_PORT=27017 - MDB_CONNECTION_TIME_OUT=5000 @@ -45,16 +41,16 @@ services: - Driver=/usr/local/lib/virtodbc_32.so - DBA_PASSWORD=123pwd -# sparqlhost: -# image: stain/jena-fuseki -# container_name: sparqlhost -# ports: -# - "3030:3030" -# volumes: -# - ./data/sparqlhost/sparqlhost_data:/fuseki -# environment: -# - ADMIN_PASSWORD=pw123 -# - JVM_ARGS=-Xmx2g + sparqlhost: + image: stain/jena-fuseki + container_name: sparqlhost + ports: + - "3030:3030" + volumes: + - ./data/sparqlhost/sparqlhost_data:/fuseki + environment: + - ADMIN_PASSWORD=pw123 + - JVM_ARGS=-Xmx2g mongodb: image: mongo:4.0.0 @@ -89,10 +85,9 @@ services: - OUTPUT_FOLDER=/var/squirrel/data - HTML_SCRAPER_YAML_PATH=/var/squirrel/yaml - CONTEXT_CONFIG_FILE=/var/squirrel/spring-config/context-sparql.xml - - SPARQL_HOST_NAME=my_virtdb/sparql-auth - - SPARQL_HOST_PORT=8890 - - SPARQL_HOST_USER=DBA - - SPARQL_HOST_PASSWD=123pwd + - SPARQL_URL=http://sparqlhost:3030/squirrel/update + - SPARQL_HOST_USER=admin + - SPARQL_HOST_PASSWD=pw123 - DEDUPLICATION_ACTIVE=false - MDB_HOST_NAME=mongodb - MDB_PORT=27017 @@ -111,10 +106,9 @@ services: - OUTPUT_FOLDER=/var/squirrel/data - HTML_SCRAPER_YAML_PATH=/var/squirrel/yaml - CONTEXT_CONFIG_FILE=/var/squirrel/spring-config/context-sparql.xml - - SPARQL_HOST_NAME=my_virtdb/sparql-auth - - SPARQL_HOST_PORT=8890 - - SPARQL_HOST_USER=DBA - - SPARQL_HOST_PASSWD=123pwd + - SPARQL_URL=http://sparqlhost:3030/squirrel/update + - SPARQL_HOST_USER=admin + - SPARQL_HOST_PASSWD=pw123 - DEDUPLICATION_ACTIVE=false - MDB_HOST_NAME=mongodb - MDB_PORT=27017 @@ -133,10 +127,9 @@ services: - OUTPUT_FOLDER=/var/squirrel/data - HTML_SCRAPER_YAML_PATH=/var/squirrel/yaml - CONTEXT_CONFIG_FILE=/var/squirrel/spring-config/context-sparql.xml - - SPARQL_HOST_NAME=my_virtdb/sparql-auth - - SPARQL_HOST_PORT=8890 - - SPARQL_HOST_USER=DBA - - SPARQL_HOST_PASSWD=123pwd + - SPARQL_URL=http://sparqlhost:3030/squirrel/update + - SPARQL_HOST_USER=admin + - SPARQL_HOST_PASSWD=pw123 - DEDUPLICATION_ACTIVE=true - MDB_HOST_NAME=mongodb - MDB_PORT=27017 @@ -148,17 +141,17 @@ services: command: java -cp squirrel.jar org.dice_research.squirrel.components.WorkerComponentStarter deduplicator: - image: squirrel + image: squirrel.deduplication container_name: deduplicator environment: DEDUPLICATION_ACTIVE: "true" HOBBIT_RABBIT_HOST: rabbit OUTPUT_FOLDER: /var/squirrel/data + CONTEXT_CONFIG_FILE: /var/squirrel/spring-config/context-deduplicator.xml MDB_HOST_NAME: mongodb MDB_PORT: 27017 SPARQL_HOST_NAME: sparqlhost SPARQL_HOST_PORT: 3030 - SERVICE_PRECONDITION: "rethinkdb:28015 rabbit:5672" + SERVICE_PRECONDITION: "mongodb:27017 rabbit:5672" volumes: - - 
./data/deduplicator:/var/squirrel/data - command: java -cp squirrel.jar org.hobbit.core.run.ComponentStarter org.aksw.simba.squirrel.components.DeduplicatorComponent \ No newline at end of file + - ./data/deduplicator:/var/squirrel/data \ No newline at end of file From f7e9b3f181a11d6b971fed57b2d078cd5d2b14c8 Mon Sep 17 00:00:00 2001 From: param-jot Date: Wed, 29 May 2019 14:32:16 +0200 Subject: [PATCH 003/102] change sparqlendpoint in docker-compose-sparql --- docker-compose-sparql.yml | 60 +++++++++++++++++++++------------------ 1 file changed, 32 insertions(+), 28 deletions(-) diff --git a/docker-compose-sparql.yml b/docker-compose-sparql.yml index 261fa824e..dbca40045 100644 --- a/docker-compose-sparql.yml +++ b/docker-compose-sparql.yml @@ -15,7 +15,11 @@ services: environment: - HOBBIT_RABBIT_HOST=rabbit - URI_WHITELIST_FILE=/var/squirrel/whitelist.txt + # - CONTEXT_CONFIG_FILE=/var/squirrel/spring-config/context-sparql.xml - SEED_FILE=/var/squirrel/seeds.txt + - SPARQL_URL=http://sparqlhost:3030/Metadata/query + - SPARQL_HOST_USER=admin + - SPARQL_HOST_PASSWD=pw123 - MDB_HOST_NAME=mongodb - MDB_PORT=27017 - MDB_CONNECTION_TIME_OUT=5000 @@ -30,16 +34,16 @@ services: - ./whitelist/whitelist.txt:/var/squirrel/whitelist.txt:ro command: java -cp squirrel.jar org.hobbit.core.run.ComponentStarter org.dice_research.squirrel.components.FrontierComponent - virtuosohost: - image: openlink/virtuoso-opensource-7:latest - container_name: virtuosohost - ports: - - "8890:8890" - volumes: - - ./data/sparqlhost/sparqlhost_data:/virtuoso - environment: - - Driver=/usr/local/lib/virtodbc_32.so - - DBA_PASSWORD=123pwd +# virtuosohost: +# image: openlink/virtuoso-opensource-7:latest +# container_name: virtuosohost +# ports: +# - "8890:8890" +# volumes: +# - ./data/sparqlhost/sparqlhost_data:/virtuoso +# environment: +# - Driver=/usr/local/lib/virtodbc_32.so +# - DBA_PASSWORD=123pwd sparqlhost: image: stain/jena-fuseki @@ -85,7 +89,7 @@ services: - OUTPUT_FOLDER=/var/squirrel/data - HTML_SCRAPER_YAML_PATH=/var/squirrel/yaml - CONTEXT_CONFIG_FILE=/var/squirrel/spring-config/context-sparql.xml - - SPARQL_URL=http://sparqlhost:3030/squirrel/update + - SPARQL_URL=http://sparqlhost:3030/Metadata/update - SPARQL_HOST_USER=admin - SPARQL_HOST_PASSWD=pw123 - DEDUPLICATION_ACTIVE=false @@ -106,7 +110,7 @@ services: - OUTPUT_FOLDER=/var/squirrel/data - HTML_SCRAPER_YAML_PATH=/var/squirrel/yaml - CONTEXT_CONFIG_FILE=/var/squirrel/spring-config/context-sparql.xml - - SPARQL_URL=http://sparqlhost:3030/squirrel/update + - SPARQL_URL=http://sparqlhost:3030/Metadata/update - SPARQL_HOST_USER=admin - SPARQL_HOST_PASSWD=pw123 - DEDUPLICATION_ACTIVE=false @@ -127,7 +131,7 @@ services: - OUTPUT_FOLDER=/var/squirrel/data - HTML_SCRAPER_YAML_PATH=/var/squirrel/yaml - CONTEXT_CONFIG_FILE=/var/squirrel/spring-config/context-sparql.xml - - SPARQL_URL=http://sparqlhost:3030/squirrel/update + - SPARQL_URL=http://sparqlhost:3030/Metadata/update - SPARQL_HOST_USER=admin - SPARQL_HOST_PASSWD=pw123 - DEDUPLICATION_ACTIVE=true @@ -140,18 +144,18 @@ services: - ./spring-config:/var/squirrel/spring-config command: java -cp squirrel.jar org.dice_research.squirrel.components.WorkerComponentStarter - deduplicator: - image: squirrel.deduplication - container_name: deduplicator - environment: - DEDUPLICATION_ACTIVE: "true" - HOBBIT_RABBIT_HOST: rabbit - OUTPUT_FOLDER: /var/squirrel/data - CONTEXT_CONFIG_FILE: /var/squirrel/spring-config/context-deduplicator.xml - MDB_HOST_NAME: mongodb - MDB_PORT: 27017 - SPARQL_HOST_NAME: sparqlhost - 
SPARQL_HOST_PORT: 3030 - SERVICE_PRECONDITION: "mongodb:27017 rabbit:5672" - volumes: - - ./data/deduplicator:/var/squirrel/data \ No newline at end of file +# deduplicator: +# image: squirrel.deduplication +# container_name: deduplicator +# environment: +# DEDUPLICATION_ACTIVE: "true" +# HOBBIT_RABBIT_HOST: rabbit +# OUTPUT_FOLDER: /var/squirrel/data +# CONTEXT_CONFIG_FILE: /var/squirrel/spring-config/context-deduplicator.xml +# MDB_HOST_NAME: mongodb +# MDB_PORT: 27017 +# SPARQL_HOST_NAME: sparqlhost +# SPARQL_HOST_PORT: 3030 +# SERVICE_PRECONDITION: "mongodb:27017 rabbit:5672" +#volumes: +# - ./data/deduplicator:/var/squirrel/data \ No newline at end of file From 8c5ee7bc8b4d5006d9ccd137f48173827ddd3acb Mon Sep 17 00:00:00 2001 From: param-jot Date: Sun, 2 Jun 2019 10:57:32 +0200 Subject: [PATCH 004/102] add FrontierQueryGenerator class to generate timestamp retreival query and changes in frontier-context.xml to set env variables --- bin/docker-compose-sparql.yml | 200 ++++++++ docker-compose-sparql.yml | 78 +-- spring-config/frontier-context.xml | 10 +- .../components/FrontierComponent.java | 475 +++++++++--------- .../configurator/SparqlConfiguration.java | 221 +++++--- .../frontier/impl/FrontierQueryGenerator.java | 192 +++++++ 6 files changed, 801 insertions(+), 375 deletions(-) create mode 100644 bin/docker-compose-sparql.yml create mode 100644 squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/FrontierQueryGenerator.java diff --git a/bin/docker-compose-sparql.yml b/bin/docker-compose-sparql.yml new file mode 100644 index 000000000..1cebd57d6 --- /dev/null +++ b/bin/docker-compose-sparql.yml @@ -0,0 +1,200 @@ +version: "2" + +services: + #debugger: + # image: sjourdan/toolbox + # container_name: debugger + # networks: + # - squirrel:latest + # dns: 8.8.8.8 + # command: nc -l 50070 + + frontier: + image: squirrel.frontier:latest + container_name: frontier + environment: + - HOBBIT_RABBIT_HOST=rabbit + - URI_WHITELIST_FILE=/var/squirrel/whitelist.txt +<<<<<<< HEAD + # - CONTEXT_CONFIG_FILE=/var/squirrel/spring-config/context-sparql.xml +======= + - FRONTIER_CONTEXT_CONFIG_FILE=/var/squirrel/spring-config/frontier-context.xml +>>>>>>> 26bb54a0debdb8e8695329a0c0659fb85a0a1858 + - SEED_FILE=/var/squirrel/seeds.txt + - SPARQL_URL=http://sparqlhost:3030/Metadata/query + - SPARQL_HOST_USER=admin + - SPARQL_HOST_PASSWD=pw123 + - MDB_HOST_NAME=mongodb + - MDB_PORT=27017 + - MDB_CONNECTION_TIME_OUT=5000 + - MDB_SOCKET_TIME_OUT=10000 + - MDB_SERVER_TIME_OUT=10000 + - QUEUE_FILTER_PERSIST=true + - COMMUNICATION_WITH_WEBSERVICE=false + - VISUALIZATION_OF_CRAWLED_GRAPH=false + - JVM_ARGS=-Xmx8g + volumes: + - ./data/frontier:/var/squirrel/data + - ./seed/seeds.txt:/var/squirrel/seeds.txt:ro + - ./whitelist/whitelist.txt:/var/squirrel/whitelist.txt:ro + command: java -cp squirrel.jar org.dice_research.squirrel.components.FrontierComponentStarter + +# virtuosohost: +# image: openlink/virtuoso-opensource-7:latest +# container_name: virtuosohost +# ports: +# - "8890:8890" +# volumes: +# - ./data/sparqlhost/sparqlhost_data:/virtuoso +# environment: +# - Driver=/usr/local/lib/virtodbc_32.so +# - DBA_PASSWORD=123pwd + + sparqlhost: + image: stain/jena-fuseki + container_name: sparqlhost + ports: + - "3030:3030" + volumes: + - ./data/sparqlhost/sparqlhost_data:/fuseki + environment: +<<<<<<< HEAD + - ADMIN_PASSWORD=pw123 + - JVM_ARGS=-Xmx2g +======= + - Driver=/usr/local/lib/virtodbc_32.so + - DBA_PASSWORD=pw123 + +# sparqlhost: +# image: stain/jena-fuseki +# container_name: 
sparqlhost +# ports: +# - "3030:3030" +# volumes: +# - ./data/sparqlhost/sparqlhost_data:/fuseki +# environment: +# - ADMIN_PASSWORD=pw123 +# - JVM_ARGS=-Xmx2g +>>>>>>> 26bb54a0debdb8e8695329a0c0659fb85a0a1858 + + mongodb: + image: mongo:4.0.0 + volumes: + - ./data/mongodb:/data + ports: + - "27017:27017" + + rethinkdb: + image: rethinkdb:2.3.5 + volumes: + - ./data/rethinkdb:/data + ports: + - "8080:8080" + command: rethinkdb --bind all + + # message bus + rabbit: + image: rabbitmq:management + container_name: rabbit + hostname: rabbit + ports: + - "8081:15672" + # Forwarding the port for testing + - "5672:5672" + + worker1: + image: squirrel.worker:latest + container_name: worker1 + environment: + - HOBBIT_RABBIT_HOST=rabbit + - OUTPUT_FOLDER=/var/squirrel/data + - HTML_SCRAPER_YAML_PATH=/var/squirrel/yaml +<<<<<<< HEAD + - CONTEXT_CONFIG_FILE=/var/squirrel/spring-config/context-sparql.xml + - SPARQL_URL=http://sparqlhost:3030/Metadata/update + - SPARQL_HOST_USER=admin +======= + - CONTEXT_CONFIG_FILE=/var/squirrel/spring-config/worker-context-sparql.xml + - SPARQL_URL=http://virtuosohost:8890/sparql-auth/ + - SPARQL_HOST_USER=dba +>>>>>>> 26bb54a0debdb8e8695329a0c0659fb85a0a1858 + - SPARQL_HOST_PASSWD=pw123 + - DEDUPLICATION_ACTIVE=false + - MDB_HOST_NAME=mongodb + - MDB_PORT=27017 + - JVM_ARGS=-Xmx8g + volumes: + - ./data/worker1:/var/squirrel/data + - ./yaml:/var/squirrel/yaml + - ./spring-config:/var/squirrel/spring-config + command: java -cp squirrel.jar org.dice_research.squirrel.components.WorkerComponentStarter + + worker2: + image: squirrel.worker:latest + container_name: worker2 + environment: + - HOBBIT_RABBIT_HOST=rabbit + - OUTPUT_FOLDER=/var/squirrel/data + - HTML_SCRAPER_YAML_PATH=/var/squirrel/yaml +<<<<<<< HEAD + - CONTEXT_CONFIG_FILE=/var/squirrel/spring-config/context-sparql.xml + - SPARQL_URL=http://sparqlhost:3030/Metadata/update + - SPARQL_HOST_USER=admin +======= + - CONTEXT_CONFIG_FILE=/var/squirrel/spring-config/worker-context-sparql.xml + - SPARQL_URL=http://virtuosohost:8890/sparql-auth/ + - SPARQL_HOST_USER=dba +>>>>>>> 26bb54a0debdb8e8695329a0c0659fb85a0a1858 + - SPARQL_HOST_PASSWD=pw123 + - DEDUPLICATION_ACTIVE=false + - MDB_HOST_NAME=mongodb + - MDB_PORT=27017 + - JVM_ARGS=-Xmx8g + volumes: + - ./data/worker2:/var/squirrel/data + - ./yaml:/var/squirrel/yaml + - ./spring-config:/var/squirrel/spring-config + command: java -cp squirrel.jar org.dice_research.squirrel.components.WorkerComponentStarter + + worker3: + image: squirrel.worker:latest + container_name: worker3 + environment: + - HOBBIT_RABBIT_HOST=rabbit + - OUTPUT_FOLDER=/var/squirrel/data + - HTML_SCRAPER_YAML_PATH=/var/squirrel/yaml +<<<<<<< HEAD + - CONTEXT_CONFIG_FILE=/var/squirrel/spring-config/context-sparql.xml + - SPARQL_URL=http://sparqlhost:3030/Metadata/update + - SPARQL_HOST_USER=admin +======= + - CONTEXT_CONFIG_FILE=/var/squirrel/spring-config/worker-context-sparql.xml + - SPARQL_URL=http://virtuosohost:8890/sparql-auth/ + - SPARQL_HOST_USER=dba +>>>>>>> 26bb54a0debdb8e8695329a0c0659fb85a0a1858 + - SPARQL_HOST_PASSWD=pw123 + - DEDUPLICATION_ACTIVE=true + - MDB_HOST_NAME=mongodb + - MDB_PORT=27017 + - JVM_ARGS=-Xmx8g + volumes: + - ./data/worker3:/var/squirrel/data + - ./yaml:/var/squirrel/yaml + - ./spring-config:/var/squirrel/spring-config + command: java -cp squirrel.jar org.dice_research.squirrel.components.WorkerComponentStarter + +# deduplicator: +# image: squirrel.deduplication +# container_name: deduplicator +# environment: +# DEDUPLICATION_ACTIVE: "true" +# HOBBIT_RABBIT_HOST: 
rabbit +# OUTPUT_FOLDER: /var/squirrel/data +# CONTEXT_CONFIG_FILE: /var/squirrel/spring-config/context-deduplicator.xml +# MDB_HOST_NAME: mongodb +# MDB_PORT: 27017 +# SPARQL_HOST_NAME: sparqlhost +# SPARQL_HOST_PORT: 3030 +# SERVICE_PRECONDITION: "mongodb:27017 rabbit:5672" +#volumes: +# - ./data/deduplicator:/var/squirrel/data \ No newline at end of file diff --git a/docker-compose-sparql.yml b/docker-compose-sparql.yml index 1cebd57d6..8ce9dbd0f 100644 --- a/docker-compose-sparql.yml +++ b/docker-compose-sparql.yml @@ -15,14 +15,10 @@ services: environment: - HOBBIT_RABBIT_HOST=rabbit - URI_WHITELIST_FILE=/var/squirrel/whitelist.txt -<<<<<<< HEAD - # - CONTEXT_CONFIG_FILE=/var/squirrel/spring-config/context-sparql.xml -======= - FRONTIER_CONTEXT_CONFIG_FILE=/var/squirrel/spring-config/frontier-context.xml ->>>>>>> 26bb54a0debdb8e8695329a0c0659fb85a0a1858 - SEED_FILE=/var/squirrel/seeds.txt - - SPARQL_URL=http://sparqlhost:3030/Metadata/query - - SPARQL_HOST_USER=admin + - SPARQL_URL=http://virtuosohost:8890/sparql-auth/ + - SPARQL_HOST_USER=dba - SPARQL_HOST_PASSWD=pw123 - MDB_HOST_NAME=mongodb - MDB_PORT=27017 @@ -39,29 +35,14 @@ services: - ./whitelist/whitelist.txt:/var/squirrel/whitelist.txt:ro command: java -cp squirrel.jar org.dice_research.squirrel.components.FrontierComponentStarter -# virtuosohost: -# image: openlink/virtuoso-opensource-7:latest -# container_name: virtuosohost -# ports: -# - "8890:8890" -# volumes: -# - ./data/sparqlhost/sparqlhost_data:/virtuoso -# environment: -# - Driver=/usr/local/lib/virtodbc_32.so -# - DBA_PASSWORD=123pwd - - sparqlhost: - image: stain/jena-fuseki - container_name: sparqlhost + virtuosohost: + image: openlink/virtuoso-opensource-7:latest + container_name: virtuosohost ports: - - "3030:3030" + - "8890:8890" volumes: - - ./data/sparqlhost/sparqlhost_data:/fuseki + - ./data/sparqlhost/sparqlhost_data:/virtuoso environment: -<<<<<<< HEAD - - ADMIN_PASSWORD=pw123 - - JVM_ARGS=-Xmx2g -======= - Driver=/usr/local/lib/virtodbc_32.so - DBA_PASSWORD=pw123 @@ -75,7 +56,6 @@ services: # environment: # - ADMIN_PASSWORD=pw123 # - JVM_ARGS=-Xmx2g ->>>>>>> 26bb54a0debdb8e8695329a0c0659fb85a0a1858 mongodb: image: mongo:4.0.0 @@ -109,15 +89,9 @@ services: - HOBBIT_RABBIT_HOST=rabbit - OUTPUT_FOLDER=/var/squirrel/data - HTML_SCRAPER_YAML_PATH=/var/squirrel/yaml -<<<<<<< HEAD - - CONTEXT_CONFIG_FILE=/var/squirrel/spring-config/context-sparql.xml - - SPARQL_URL=http://sparqlhost:3030/Metadata/update - - SPARQL_HOST_USER=admin -======= - CONTEXT_CONFIG_FILE=/var/squirrel/spring-config/worker-context-sparql.xml - SPARQL_URL=http://virtuosohost:8890/sparql-auth/ - SPARQL_HOST_USER=dba ->>>>>>> 26bb54a0debdb8e8695329a0c0659fb85a0a1858 - SPARQL_HOST_PASSWD=pw123 - DEDUPLICATION_ACTIVE=false - MDB_HOST_NAME=mongodb @@ -136,15 +110,9 @@ services: - HOBBIT_RABBIT_HOST=rabbit - OUTPUT_FOLDER=/var/squirrel/data - HTML_SCRAPER_YAML_PATH=/var/squirrel/yaml -<<<<<<< HEAD - - CONTEXT_CONFIG_FILE=/var/squirrel/spring-config/context-sparql.xml - - SPARQL_URL=http://sparqlhost:3030/Metadata/update - - SPARQL_HOST_USER=admin -======= - CONTEXT_CONFIG_FILE=/var/squirrel/spring-config/worker-context-sparql.xml - SPARQL_URL=http://virtuosohost:8890/sparql-auth/ - SPARQL_HOST_USER=dba ->>>>>>> 26bb54a0debdb8e8695329a0c0659fb85a0a1858 - SPARQL_HOST_PASSWD=pw123 - DEDUPLICATION_ACTIVE=false - MDB_HOST_NAME=mongodb @@ -163,15 +131,9 @@ services: - HOBBIT_RABBIT_HOST=rabbit - OUTPUT_FOLDER=/var/squirrel/data - HTML_SCRAPER_YAML_PATH=/var/squirrel/yaml -<<<<<<< HEAD - - 
CONTEXT_CONFIG_FILE=/var/squirrel/spring-config/context-sparql.xml - - SPARQL_URL=http://sparqlhost:3030/Metadata/update - - SPARQL_HOST_USER=admin -======= - CONTEXT_CONFIG_FILE=/var/squirrel/spring-config/worker-context-sparql.xml - SPARQL_URL=http://virtuosohost:8890/sparql-auth/ - SPARQL_HOST_USER=dba ->>>>>>> 26bb54a0debdb8e8695329a0c0659fb85a0a1858 - SPARQL_HOST_PASSWD=pw123 - DEDUPLICATION_ACTIVE=true - MDB_HOST_NAME=mongodb @@ -184,17 +146,17 @@ services: command: java -cp squirrel.jar org.dice_research.squirrel.components.WorkerComponentStarter # deduplicator: -# image: squirrel.deduplication -# container_name: deduplicator -# environment: -# DEDUPLICATION_ACTIVE: "true" -# HOBBIT_RABBIT_HOST: rabbit -# OUTPUT_FOLDER: /var/squirrel/data -# CONTEXT_CONFIG_FILE: /var/squirrel/spring-config/context-deduplicator.xml -# MDB_HOST_NAME: mongodb +# image: squirrel +# container_name: deduplicator +# environment: +# DEDUPLICATION_ACTIVE: "true" +# HOBBIT_RABBIT_HOST: rabbit +# OUTPUT_FOLDER: /var/squirrel/data +# MDB_HOST_NAME: mongodb # MDB_PORT: 27017 -# SPARQL_HOST_NAME: sparqlhost -# SPARQL_HOST_PORT: 3030 -# SERVICE_PRECONDITION: "mongodb:27017 rabbit:5672" -#volumes: -# - ./data/deduplicator:/var/squirrel/data \ No newline at end of file +# SPARQL_HOST_NAME: sparqlhost +# SPARQL_HOST_PORT: 3030 +# SERVICE_PRECONDITION: "rethinkdb:28015 rabbit:5672" +# volumes: +# - ./data/deduplicator:/var/squirrel/data +# command: java -cp squirrel.jar org.hobbit.core.run.ComponentStarter org.aksw.simba.squirrel.components.DeduplicatorComponent \ No newline at end of file diff --git a/spring-config/frontier-context.xml b/spring-config/frontier-context.xml index 9c84264a1..b9ce4fd57 100644 --- a/spring-config/frontier-context.xml +++ b/spring-config/frontier-context.xml @@ -25,8 +25,8 @@ - - + + + + + diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/components/FrontierComponent.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/components/FrontierComponent.java index ad17ab04c..545992ef3 100644 --- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/components/FrontierComponent.java +++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/components/FrontierComponent.java @@ -11,6 +11,8 @@ import java.util.TimerTask; import java.util.concurrent.Semaphore; +import org.aksw.jena_sparql_api.core.QueryExecutionFactory; +import org.aksw.jena_sparql_api.core.UpdateExecutionFactory; import org.apache.commons.io.FileUtils; import org.dice_research.squirrel.Constants; import org.dice_research.squirrel.configurator.MongoConfiguration; @@ -57,244 +59,247 @@ @Qualifier("frontierComponent") public class FrontierComponent extends AbstractComponent implements RespondingDataHandler { - private static final Logger LOGGER = LoggerFactory.getLogger(FrontierComponent.class); - - @Qualifier("queueBean") - @Autowired - protected UriQueue queue; - @Qualifier("knowUriFilterBean") - @Autowired - private KnownUriFilter knownUriFilter; - private URIReferences uriReferences = null; - private Frontier frontier; - private RabbitQueue rabbitQueue; - private DataReceiver receiver; - @Qualifier("serializerBean") - @Autowired - private Serializer serializer; - private final Semaphore terminationMutex = new Semaphore(0); - private final WorkerGuard workerGuard = new WorkerGuard(this); - private final boolean doRecrawling = true; - private long recrawlingTime = 1000L * 60L * 60L * 24L * 30; -<<<<<<< HEAD - String sparqlEndpointUrl = "http://localhost:8890/sparql"; -======= - - 
private Map hasUrisToCrawl; ->>>>>>> 26bb54a0debdb8e8695329a0c0659fb85a0a1858 - - public static final boolean RECRAWLING_ACTIVE = true; - - - @Override - public void init() throws Exception { - super.init(); - serializer = new GzipJavaUriSerializer(); - MongoConfiguration mongoConfiguration = MongoConfiguration.getMDBConfiguration(); - WebConfiguration webConfiguration = WebConfiguration.getWebConfiguration(); -<<<<<<< HEAD - SparqlConfiguration sparqlConfiguration = SparqlConfiguration.create(sparqlEndpointUrl); -======= - hasUrisToCrawl = new HashMap(); ->>>>>>> 26bb54a0debdb8e8695329a0c0659fb85a0a1858 - if (mongoConfiguration != null) { - - queue.open(); - knownUriFilter.open(); - - - WhiteListConfiguration whiteListConfiguration = WhiteListConfiguration.getWhiteListConfiguration(); - if (whiteListConfiguration != null) { - File whitelistFile = new File(whiteListConfiguration.getWhiteListURI()); - knownUriFilter = RegexBasedWhiteListFilter.create(knownUriFilter, whitelistFile); - } - - // TODO Reactivate me but with a different configuration - // if (webConfiguration.isVisualizationOfCrawledGraphEnabled()) { - // uriReferences = new RDBURIReferences(rdbHostName, rdbPort); - // uriReferences.open(); - // } - } else { - LOGGER.warn("Couldn't get MDBConfiguration. An in-memory queue will be used."); - queue = new InMemoryQueue(); - knownUriFilter = new InMemoryKnownUriFilter(doRecrawling, recrawlingTime); - } - - // Build frontier - frontier = new ExtendedFrontierImpl(new NormalizerImpl(), knownUriFilter, uriReferences, queue, doRecrawling); - - rabbitQueue = this.incomingDataQueueFactory.createDefaultRabbitQueue(Constants.FRONTIER_QUEUE_NAME); - receiver = (new RPCServer.Builder()).responseQueueFactory(outgoingDataQueuefactory).dataHandler(this) - .maxParallelProcessedMsgs(100).queue(rabbitQueue).build(); - - SeedConfiguration seedConfiguration = SeedConfiguration.getSeedConfiguration(); - if (seedConfiguration != null) { - processSeedFile(seedConfiguration.getSeedFile()); - } - - LOGGER.info("Frontier initialized."); - - if (webConfiguration.isCommunicationWithWebserviceEnabled()) { - final FrontierSenderToWebservice sender = new FrontierSenderToWebservice(outgoingDataQueuefactory, - workerGuard, queue, knownUriFilter, uriReferences); - LOGGER.trace("FrontierSenderToWebservice -> sendCrawledGraph is set to " - + webConfiguration.isVisualizationOfCrawledGraphEnabled()); - Thread senderThread = new Thread(sender); - senderThread.setName("Sender to the Webservice via RabbitMQ (current information from the Frontier)"); - senderThread.start(); - LOGGER.info("Started thread [" + senderThread.getName() + "] "); - } else { - LOGGER.info("webConfiguration.isCommunicationWithWebserviceEnabled is set to " - + webConfiguration.isCommunicationWithWebserviceEnabled() + "/" - + webConfiguration.isVisualizationOfCrawledGraphEnabled() - + ". 
No WebServiceSenderThread will be started!"); - } - } - - @Override - public void run() throws Exception { - TimerTask terminatorTask = new TerminatorTask(queue, terminationMutex); - Timer timer = new Timer(); - timer.schedule(terminatorTask, 5000,5000); - terminationMutex.acquire(); - timer.cancel(); - } - - @Override - public void close() throws IOException { - LOGGER.info("Closing Frontier Component."); - if (receiver != null) - // Force the receiver to close - receiver.close(); -// receiver.closeWhenFinished(); - - if (queue != null) - queue.close(); - if (uriReferences != null) - uriReferences.close(); - if (knownUriFilter instanceof Closeable) { - ((Closeable) knownUriFilter).close(); - } - workerGuard.shutdown(); - if (frontier != null) - frontier.close(); - super.close(); - LOGGER.info("Frontier Component Closed."); - } - - @Override - public void handleData(byte[] data) { - handleData(data, null, null, null); - } - - @Override - public void handleData(byte[] data, ResponseHandler handler, String responseQueueName, String correlId) { - Object deserializedData; - try { - deserializedData = serializer.deserialize(data); - } catch (IOException e) { - // try to convert the string into a single URI, that maybe comes from the - // WebService - // CrawleableUri uri = new CrawleableUriFactoryImpl().create(new String(data)); - // if (uri != null) { - // LOGGER.warn("Received a single URI " + uri.getUri() + " without a wrapping of - // \"org.aksw.simba.squirrel.rabbit.frontier\". We converted it into a - // UriSet."); - // deserializedData = new UriSet(Collections.singletonList(uri)); - // } else { - LOGGER.error("Error while trying to deserialize incoming data. It will be ignored.", e); - return; - // } - } - - if (deserializedData != null) { - if (deserializedData instanceof UriSetRequest) { - responseToUriSetRequest(handler, responseQueueName, correlId, (UriSetRequest) deserializedData); - } else if (deserializedData instanceof UriSet) { -// LOGGER.warn("Received a set of URIs (size={}).", ((UriSet) deserializedData).uris.size()); - frontier.addNewUris(((UriSet) deserializedData).uris); - } else if (deserializedData instanceof CrawlingResult) { - CrawlingResult crawlingResult = (CrawlingResult) deserializedData; - LOGGER.warn("Received the message that the crawling for {} URIs is done.", crawlingResult.uris.size()); - frontier.crawlingDone(crawlingResult.uris); - workerGuard.removeUrisForWorker(crawlingResult.idOfWorker, crawlingResult.uris); - } else if (deserializedData instanceof AliveMessage) { - AliveMessage message = (AliveMessage) deserializedData; - String idReceived = message.getWorkerId(); - LOGGER.warn("Received alive message from worker with id " + idReceived); - workerGuard.putNewTimestamp(idReceived); - } else { - LOGGER.warn("Received an unknown object {}. It will be ignored.", deserializedData.toString()); - } - } - } - - private void responseToUriSetRequest(ResponseHandler handler, String responseQueueName, String correlId, - UriSetRequest uriSetRequest) { - if (handler != null) { - // get next UriSet - try { - List uris = frontier.getNextUris(); - LOGGER.trace("Responding with a list of {} uris.", - uris == null ? 
"null" : Integer.toString(uris.size())); - handler.sendResponse(serializer.serialize(new UriSet(uris)), responseQueueName, correlId); - if (uris != null && uris.size() > 0) { - hasUrisToCrawl .put(uriSetRequest.getWorkerId(), true); - workerGuard.putUrisForWorker(uriSetRequest.getWorkerId(), - uriSetRequest.workerSendsAliveMessages(), uris); - }else { - hasUrisToCrawl .put(uriSetRequest.getWorkerId(), false); - } - } catch (IOException e) { - LOGGER.error("Couldn't serialize new URI set.", e); - } - } else { - LOGGER.warn("Got a UriSetRequest object without a ResponseHandler. No response will be sent."); - } - } - - protected void processSeedFile(String seedFile) { - try { - List lines = FileUtils.readLines(new File(seedFile), StandardCharsets.UTF_8); - frontier.addNewUris(UriUtils.createCrawleableUriList(lines)); - } catch (Exception e) { - LOGGER.error("Couldn't process seed file. It will be ignored.", e); - } - } - - public void informFrontierAboutDeadWorker(String idOfWorker, List lstUrisToReassign) { - if (frontier instanceof ExtendedFrontier) { - ((ExtendedFrontier) frontier).informAboutDeadWorker(idOfWorker, lstUrisToReassign); - } - } - - public void setFrontier(FrontierImpl frontier) { - this.frontier = frontier; - } - - public WorkerGuard getWorkerGuard() { - return workerGuard; - } - - private class TerminatorTask extends TimerTask{ - - private UriQueue queue; - private TerminationCheck terminationCheck = new QueueBasedTerminationCheck(); - private Semaphore terminationMutex; - - public TerminatorTask(UriQueue queue, Semaphore terminationMutex) { - this.queue = queue; - this.terminationMutex = terminationMutex; + private static final Logger LOGGER = LoggerFactory.getLogger(FrontierComponent.class); + + @Qualifier("queueBean") + @Autowired + protected UriQueue queue; + @Qualifier("knowUriFilterBean") + @Autowired + private KnownUriFilter knownUriFilter; + private URIReferences uriReferences = null; + private Frontier frontier; + private RabbitQueue rabbitQueue; + private DataReceiver receiver; + @Qualifier("sparqlBean") + @Autowired + private static SparqlConfiguration sparqlConfig; + @Qualifier("serializerBean") + @Autowired + private Serializer serializer; + private final Semaphore terminationMutex = new Semaphore(0); + private final WorkerGuard workerGuard = new WorkerGuard(this); + private final boolean doRecrawling = true; + private long recrawlingTime = 1000L * 60L * 60L * 24L * 30; + + private Map hasUrisToCrawl; + + public static final boolean RECRAWLING_ACTIVE = true; + protected String dataSetQuery = "select ?s ?p ?o where {?s ?p ?o} LIMIT 100 "; + + protected static QueryExecutionFactory queryExecFactory = null; + protected UpdateExecutionFactory updateExecFactory = null; + + + @Override + public void init() throws Exception { + super.init(); + serializer = new GzipJavaUriSerializer(); + MongoConfiguration mongoConfiguration = MongoConfiguration.getMDBConfiguration(); + WebConfiguration webConfiguration = WebConfiguration.getWebConfiguration(); + hasUrisToCrawl = new HashMap(); + + if (mongoConfiguration != null) { + + queue.open(); + knownUriFilter.open(); + + + WhiteListConfiguration whiteListConfiguration = WhiteListConfiguration.getWhiteListConfiguration(); + if (whiteListConfiguration != null) { + File whitelistFile = new File(whiteListConfiguration.getWhiteListURI()); + knownUriFilter = RegexBasedWhiteListFilter.create(knownUriFilter, whitelistFile); + } + + // TODO Reactivate me but with a different configuration + // if 
(webConfiguration.isVisualizationOfCrawledGraphEnabled()) { + // uriReferences = new RDBURIReferences(rdbHostName, rdbPort); + // uriReferences.open(); + // } + } else { + LOGGER.warn("Couldn't get MDBConfiguration. An in-memory queue will be used."); + queue = new InMemoryQueue(); + knownUriFilter = new InMemoryKnownUriFilter(doRecrawling, recrawlingTime); + } + + // Build frontier + frontier = new ExtendedFrontierImpl(new NormalizerImpl(), knownUriFilter, uriReferences, queue, doRecrawling); + + rabbitQueue = this.incomingDataQueueFactory.createDefaultRabbitQueue(Constants.FRONTIER_QUEUE_NAME); + receiver = (new RPCServer.Builder()).responseQueueFactory(outgoingDataQueuefactory).dataHandler(this) + .maxParallelProcessedMsgs(100).queue(rabbitQueue).build(); + + SeedConfiguration seedConfiguration = SeedConfiguration.getSeedConfiguration(); + if (seedConfiguration != null) { + processSeedFile(seedConfiguration.getSeedFile()); + } + + LOGGER.info("Frontier initialized."); + + if (webConfiguration.isCommunicationWithWebserviceEnabled()) { + final FrontierSenderToWebservice sender = new FrontierSenderToWebservice(outgoingDataQueuefactory, + workerGuard, queue, knownUriFilter, uriReferences); + LOGGER.trace("FrontierSenderToWebservice -> sendCrawledGraph is set to " + + webConfiguration.isVisualizationOfCrawledGraphEnabled()); + Thread senderThread = new Thread(sender); + senderThread.setName("Sender to the Webservice via RabbitMQ (current information from the Frontier)"); + senderThread.start(); + LOGGER.info("Started thread [" + senderThread.getName() + "] "); + } else { + LOGGER.info("webConfiguration.isCommunicationWithWebserviceEnabled is set to " + + webConfiguration.isCommunicationWithWebserviceEnabled() + "/" + + webConfiguration.isVisualizationOfCrawledGraphEnabled() + + ". No WebServiceSenderThread will be started!"); + } + + } + + + + @Override + public void run() throws Exception { + TimerTask terminatorTask = new TerminatorTask(queue, terminationMutex); + Timer timer = new Timer(); + timer.schedule(terminatorTask, 5000,5000); + terminationMutex.acquire(); + timer.cancel(); + } + + @Override + public void close() throws IOException { + LOGGER.info("Closing Frontier Component."); + if (receiver != null) + // Force the receiver to close + receiver.close(); + // receiver.closeWhenFinished(); + + if (queue != null) + queue.close(); + if (uriReferences != null) + uriReferences.close(); + if (knownUriFilter instanceof Closeable) { + ((Closeable) knownUriFilter).close(); + } + workerGuard.shutdown(); + if (frontier != null) + frontier.close(); + super.close(); + LOGGER.info("Frontier Component Closed."); + } + + @Override + public void handleData(byte[] data) { + handleData(data, null, null, null); + } + + @Override + public void handleData(byte[] data, ResponseHandler handler, String responseQueueName, String correlId) { + Object deserializedData; + try { + deserializedData = serializer.deserialize(data); + } catch (IOException e) { + // try to convert the string into a single URI, that maybe comes from the + // WebService + // CrawleableUri uri = new CrawleableUriFactoryImpl().create(new String(data)); + // if (uri != null) { + // LOGGER.warn("Received a single URI " + uri.getUri() + " without a wrapping of + // \"org.aksw.simba.squirrel.rabbit.frontier\". We converted it into a + // UriSet."); + // deserializedData = new UriSet(Collections.singletonList(uri)); + // } else { + LOGGER.error("Error while trying to deserialize incoming data. 
It will be ignored.", e); + return; + // } + } + + if (deserializedData != null) { + if (deserializedData instanceof UriSetRequest) { + responseToUriSetRequest(handler, responseQueueName, correlId, (UriSetRequest) deserializedData); + } else if (deserializedData instanceof UriSet) { + // LOGGER.warn("Received a set of URIs (size={}).", ((UriSet) deserializedData).uris.size()); + frontier.addNewUris(((UriSet) deserializedData).uris); + } else if (deserializedData instanceof CrawlingResult) { + CrawlingResult crawlingResult = (CrawlingResult) deserializedData; + LOGGER.warn("Received the message that the crawling for {} URIs is done.", crawlingResult.uris.size()); + frontier.crawlingDone(crawlingResult.uris); + workerGuard.removeUrisForWorker(crawlingResult.idOfWorker, crawlingResult.uris); + } else if (deserializedData instanceof AliveMessage) { + AliveMessage message = (AliveMessage) deserializedData; + String idReceived = message.getWorkerId(); + LOGGER.warn("Received alive message from worker with id " + idReceived); + workerGuard.putNewTimestamp(idReceived); + } else { + LOGGER.warn("Received an unknown object {}. It will be ignored.", deserializedData.toString()); + } + } + } + + private void responseToUriSetRequest(ResponseHandler handler, String responseQueueName, String correlId, + UriSetRequest uriSetRequest) { + if (handler != null) { + // get next UriSet + try { + List uris = frontier.getNextUris(); + LOGGER.trace("Responding with a list of {} uris.", + uris == null ? "null" : Integer.toString(uris.size())); + handler.sendResponse(serializer.serialize(new UriSet(uris)), responseQueueName, correlId); + if (uris != null && uris.size() > 0) { + hasUrisToCrawl .put(uriSetRequest.getWorkerId(), true); + workerGuard.putUrisForWorker(uriSetRequest.getWorkerId(), + uriSetRequest.workerSendsAliveMessages(), uris); + }else { + hasUrisToCrawl .put(uriSetRequest.getWorkerId(), false); + } + } catch (IOException e) { + LOGGER.error("Couldn't serialize new URI set.", e); + } + } else { + LOGGER.warn("Got a UriSetRequest object without a ResponseHandler. No response will be sent."); + } + } + + protected void processSeedFile(String seedFile) { + try { + List lines = FileUtils.readLines(new File(seedFile), StandardCharsets.UTF_8); + frontier.addNewUris(UriUtils.createCrawleableUriList(lines)); + } catch (Exception e) { + LOGGER.error("Couldn't process seed file. It will be ignored.", e); + } + } + + public void informFrontierAboutDeadWorker(String idOfWorker, List lstUrisToReassign) { + if (frontier instanceof ExtendedFrontier) { + ((ExtendedFrontier) frontier).informAboutDeadWorker(idOfWorker, lstUrisToReassign); + } + } + + public void setFrontier(FrontierImpl frontier) { + this.frontier = frontier; + } + + public WorkerGuard getWorkerGuard() { + return workerGuard; + } + + private class TerminatorTask extends TimerTask{ + + private UriQueue queue; + private TerminationCheck terminationCheck = new QueueBasedTerminationCheck(); + private Semaphore terminationMutex; + + public TerminatorTask(UriQueue queue, Semaphore terminationMutex) { + this.queue = queue; + this.terminationMutex = terminationMutex; } @Override public void run() { if(!hasUrisToCrawl.values().contains(true) && terminationCheck.shouldFrontierTerminate(queue)) { - LOGGER.info(" << FRONTIER IS TERMINATING! >> "); - terminationMutex.release(); - } + LOGGER.info(" << FRONTIER IS TERMINATING! 
>> "); + terminationMutex.release(); + } } - - } -} \ No newline at end of file + + } +} diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/configurator/SparqlConfiguration.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/configurator/SparqlConfiguration.java index dde2aa4f2..17363a0e5 100644 --- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/configurator/SparqlConfiguration.java +++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/configurator/SparqlConfiguration.java @@ -1,6 +1,8 @@ package org.dice_research.squirrel.configurator; import java.net.URI; +import java.util.ArrayList; +import java.util.List; import org.aksw.jena_sparql_api.core.QueryExecutionFactory; import org.aksw.jena_sparql_api.core.UpdateExecutionFactory; @@ -13,90 +15,149 @@ import org.apache.http.impl.client.AbstractHttpClient; import org.apache.http.protocol.HttpContext; import org.apache.jena.atlas.web.auth.HttpAuthenticator; +import org.apache.jena.graph.Triple; +import org.apache.jena.query.Query; +import org.apache.jena.query.QueryExecution; +import org.apache.jena.query.QuerySolution; +import org.apache.jena.query.ResultSet; +import org.apache.jena.rdf.model.RDFNode; import org.apache.jena.sparql.core.DatasetDescription; +import org.dice_research.squirrel.Constants; +import org.dice_research.squirrel.data.uri.CrawleableUri; +import org.dice_research.squirrel.frontier.impl.FrontierQueryGenerator; +import org.dice_research.squirrel.sink.tripleBased.AdvancedTripleBasedSink; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @SuppressWarnings("deprecation") -public class SparqlConfiguration { - - private static final Logger LOGGER = LoggerFactory.getLogger(SparqlConfiguration.class); - - /** - * The Query factory used to query the SPARQL endpoint. 
- */ - protected static QueryExecutionFactory queryExecFactory = null; - - protected UpdateExecutionFactory updateExecFactory = null; - - - protected SparqlConfiguration(QueryExecutionFactory queryExecFactory, UpdateExecutionFactory updateExecFactory) { - this.queryExecFactory = queryExecFactory; - this.updateExecFactory = updateExecFactory; - } - - public static SparqlConfiguration create(String sparqlEndpointUrl) { - - return create(sparqlEndpointUrl, null, null); - } - - public static SparqlConfiguration create(String sparqlEndpointUrl, String username, String password) { - QueryExecutionFactory queryExecFactory = null; - UpdateExecutionFactory updateExecFactory = null; - if (username != null && password != null) { - // Create the factory with the credentials - final Credentials credentials = new UsernamePasswordCredentials(username, password); - HttpAuthenticator authenticator = new HttpAuthenticator() { - @Override - public void invalidate() { - } - - @Override - public void apply(AbstractHttpClient client, HttpContext httpContext, URI target) { - client.setCredentialsProvider(new CredentialsProvider() { - @Override - public void clear() { - } - - @Override - public Credentials getCredentials(AuthScope scope) { - return credentials; - } - - @Override - public void setCredentials(AuthScope arg0, Credentials arg1) { - LOGGER.error("I am a read-only credential provider but got a call to set credentials."); - } - }); - } - }; - queryExecFactory = new QueryExecutionFactoryHttp(sparqlEndpointUrl, new DatasetDescription(), - authenticator); - updateExecFactory = new UpdateExecutionFactoryHttp(sparqlEndpointUrl, authenticator); - } else { - queryExecFactory = new QueryExecutionFactoryHttp(sparqlEndpointUrl); - updateExecFactory = new UpdateExecutionFactoryHttp(sparqlEndpointUrl); - } - return new SparqlConfiguration(queryExecFactory, updateExecFactory); - } -//public static void main(String args[]) { -// String sparqlEndpointUrl = "http://localhost:8890/sparql/"; -// SparqlConfiguration.create(sparqlEndpointUrl); -// -// String queryString = "select distinct ?Concept where {[] a ?Concept} LIMIT 100"; -// -// QueryExecution qe = queryExecFactory.createQueryExecution(queryString); -// System.out.println(qe); -// ResultSet rs = qe.execSelect(); -// System.out.println("rs"+rs); -// -// while (rs.hasNext()) { -// QuerySolution sol = rs.nextSolution(); -// RDFNode subject = sol.get("Concept"); -// System.out.println(subject); -// } -// -//} +public class SparqlConfiguration implements AdvancedTripleBasedSink{ + + private static final Logger LOGGER = LoggerFactory.getLogger(SparqlConfiguration.class); + /** + * The Query factory used to query the SPARQL endpoint. 
+ */ + protected static QueryExecutionFactory queryExecFactory = null; + + protected UpdateExecutionFactory updateExecFactory = null; + protected static CrawleableUri metadataGraphUri = null; + + + public SparqlConfiguration(QueryExecutionFactory queryExecFactory, UpdateExecutionFactory updateExecFactory) { + this.queryExecFactory = queryExecFactory; + this.updateExecFactory = updateExecFactory; + } + + public static SparqlConfiguration create(String sparqlEndpointUrl) { + + return create(sparqlEndpointUrl, null, null); + } + + public static SparqlConfiguration create(String sparqlEndpointUrl, String username, String password) { + QueryExecutionFactory queryExecFactory = null; + UpdateExecutionFactory updateExecFactory = null; + if (username != null && password != null) { + // Create the factory with the credentials + final Credentials credentials = new UsernamePasswordCredentials(username, password); + HttpAuthenticator authenticator = new HttpAuthenticator() { + @Override + public void invalidate() { + } + + @Override + public void apply(AbstractHttpClient client, HttpContext httpContext, URI target) { + client.setCredentialsProvider(new CredentialsProvider() { + @Override + public void clear() { + } + + @Override + public Credentials getCredentials(AuthScope scope) { + return credentials; + } + + @Override + public void setCredentials(AuthScope arg0, Credentials arg1) { + LOGGER.error("I am a read-only credential provider but got a call to set credentials."); + } + }); + } + }; + queryExecFactory = new QueryExecutionFactoryHttp(sparqlEndpointUrl, new DatasetDescription(), + authenticator); + updateExecFactory = new UpdateExecutionFactoryHttp(sparqlEndpointUrl, authenticator); + } else { + queryExecFactory = new QueryExecutionFactoryHttp(sparqlEndpointUrl); + updateExecFactory = new UpdateExecutionFactoryHttp(sparqlEndpointUrl); + } + return new SparqlConfiguration(queryExecFactory, updateExecFactory); + } + + public static void main(String args[]) { + + String sparqlEndpointUrl = "http://localhost:8890/sparql"; + SparqlConfiguration.create(sparqlEndpointUrl); + Query selectQuery = FrontierQueryGenerator.getInstance().getTimeStampQuery(); + System.out.println(selectQuery); + + QueryExecution qe = queryExecFactory.createQueryExecution(selectQuery); + ResultSet rs = qe.execSelect(); + List triplesFound = new ArrayList<>(); + while (rs.hasNext()) { + QuerySolution sol = rs.nextSolution(); + RDFNode subject = sol.get("subject"); + RDFNode predicate = sol.get("predicate"); + RDFNode object = sol.get("object"); + triplesFound.add(Triple.create(subject.asNode(), predicate.asNode(), object.asNode())); + System.out.println(subject+" "+ predicate+" "+object); + } + qe.close(); + } + + + + @Override + public List getTriplesForGraph(CrawleableUri uri) { + Query selectQuery = null; + // if (uri.equals(metaDataGraphUri)) { + selectQuery = FrontierQueryGenerator.getInstance().getSelectQuery(); + // } else { + // selectQuery = QueryGenerator.getInstance().getSelectQuery(getGraphId(uri)); + // } + + QueryExecution qe = queryExecFactory.createQueryExecution(selectQuery); + ResultSet rs = qe.execSelect(); + List triplesFound = new ArrayList<>(); + while (rs.hasNext()) { + QuerySolution sol = rs.nextSolution(); + RDFNode subject = sol.get("subject"); + RDFNode predicate = sol.get("predicate"); + RDFNode object = sol.get("object"); + triplesFound.add(Triple.create(subject.asNode(), predicate.asNode(), object.asNode())); + } + qe.close(); + return triplesFound; + } + public static String 
getGraphId(CrawleableUri uri) { + return Constants.DEFAULT_RESULT_GRAPH_URI_PREFIX + uri.getData(Constants.UUID_KEY).toString(); + } + + + @Override + public void addTriple(CrawleableUri uri, Triple triple) { + // TODO Auto-generated method stub + + } + + @Override + public void openSinkForUri(CrawleableUri uri) { + // TODO Auto-generated method stub + + } + + @Override + public void closeSinkForUri(CrawleableUri uri) { + // TODO Auto-generated method stub + + } } - \ No newline at end of file diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/FrontierQueryGenerator.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/FrontierQueryGenerator.java new file mode 100644 index 000000000..a5ae0e614 --- /dev/null +++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/FrontierQueryGenerator.java @@ -0,0 +1,192 @@ +package org.dice_research.squirrel.frontier.impl; +import java.util.Collection; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.jena.graph.Node; +import org.apache.jena.graph.Triple; +import org.apache.jena.query.Query; +import org.apache.jena.query.QueryFactory; + +public class FrontierQueryGenerator { + /** + * The instance of the class QueryGenerator. + */ + private static final FrontierQueryGenerator instance = new FrontierQueryGenerator(); + String PREFIX= "PREFIX xsd: "; + @SuppressWarnings("unused") + private static final Logger LOGGER = LoggerFactory.getLogger(FrontierQueryGenerator.class); + + private FrontierQueryGenerator() { + } + + /** + * Getter for {@link #instance}. + * + * @return instannce of the class. + */ + public static FrontierQueryGenerator getInstance() { + return instance; + } + + /** + * Return an Add Query for the default uri and its triples. + * + * @param listBufferedTriples the given list of triples. + * @return The generated query. + */ + public String getAddQuery(Collection listBufferedTriples) { + return getAddQuery(null, listBufferedTriples, true); + } + + /** + * Return an Add Query for the given uri and its triples. + * + * @param graphId the graph id where the triples are stored. + * @param listBufferedTriples the given list of triples. + * @return The generated query. + */ + public String getAddQuery(String graphId, Collection listBufferedTriples) { + return getAddQuery(graphId, listBufferedTriples, false); + } + + /** + * Return an Add Query for the given uri or default graph and its triples. + * + * @param graphId the graph id where the triples are stored. + * @param listBufferedTriples the given list of triples. + * @param defaultGraph Identify if query is for the default graph. + * @return The generated query. + */ + public String getAddQuery(String graphId, Collection listBufferedTriples, boolean defaultGraph) { + StringBuilder stringBuilder = new StringBuilder(); + stringBuilder.append("INSERT DATA { "); + if (!defaultGraph) { + stringBuilder.append("Graph <"); + stringBuilder.append(graphId); + stringBuilder.append("> { "); + } + for (Triple triple : listBufferedTriples) { + stringBuilder.append(formatNodeToString(triple.getSubject())); + stringBuilder.append(formatNodeToString(triple.getPredicate())); + stringBuilder.append(formatNodeToString(triple.getObject())); + stringBuilder.append(". "); + } + if (!defaultGraph) { + stringBuilder.append("} "); + } + stringBuilder.append("}"); + return stringBuilder.toString(); + } + + /** + * Return a time stamp query for the default graph. 
+ * It will return triples with time stamp contained in the default graph. + * @return All triples with time stamp in the default graph. + */ + + public Query getTimeStampQuery() { + return getTimeStampQuery(null, true); + } + public Query getTimeStampQuery(String graphID, boolean defaultGraph) { + StringBuilder stringBuilder = new StringBuilder(); + stringBuilder.append("PREFIX xsd: SELECT ?subject ?predicate ?object WHERE { "); + if (!defaultGraph) { + stringBuilder.append("GRAPH <"); + stringBuilder.append(graphID); + stringBuilder.append("> { "); + } + stringBuilder.append("?subject ?predicate ?object "); + if (!defaultGraph) { + stringBuilder.append("} "); + } + stringBuilder.append("FILTER ( xsd:time(?object))"); + if (!defaultGraph) { + stringBuilder.append("} "); + } + stringBuilder.append("}"); + Query query = QueryFactory.create(stringBuilder.toString()); + return query; + } + + public Query getSelectQuery() { + return getSelectQuery(null, true); + } + /** + * Return a select query for the given graphID or default graph. + * It will return all triples contained in the graph. + * @return All triples contained in the default graph. + * @param graphID The id of the graph from which you want to select. + * @param defaultGraph Identify if query is for the default graph + * @return All triples contained in the graph. + */ + public Query getSelectQuery(String graphID, boolean defaultGraph) { + StringBuilder stringBuilder = new StringBuilder(); + stringBuilder.append("SELECT ?subject ?predicate ?object WHERE { "); + if (!defaultGraph) { + stringBuilder.append("GRAPH <"); + stringBuilder.append(graphID); + stringBuilder.append("> { "); + } + stringBuilder.append("?subject ?predicate ?object "); + if (!defaultGraph) { + stringBuilder.append("} "); + } + stringBuilder.append("}"); + Query query = QueryFactory.create(stringBuilder.toString()); + return query; + } + + /** + * Return a select query for the given graphID. + * It will return all triples contained in the graph. + * @param graphID The id of the graph from which you want to select. + * @return All triples contained in the graph. + */ + public Query getSelectQuery(String graphID) { + return getSelectQuery(graphID, false); + } + + /** + * Formats the node for a query + * + * @param node The node which should formated + * @return a robust representation of the node + *

+    /**
+     * Formats the node for a query.
+     *
+     * @param node The node which should be formatted.
+     * @return a robust representation of the node.
+     *
+     * Note: Should be updated in relation to the robustness of parsing.
+     */
+    public static String formatNodeToString(Node node) {
+        StringBuilder stringBuilder = new StringBuilder();
+        if (node.isURI()) {
+            stringBuilder.append("<");
+            // Should possibly be further improved
+            stringBuilder.append(node.getURI().replace(" ", ""));
+            stringBuilder.append(">");
+        } else if (node.isBlank()) {
+            stringBuilder.append("_:");
+            // Should possibly be further improved
+            String label = node.getBlankNodeId().getLabelString().replace(":", "");
+            if (label.startsWith("-")) {
+                label = label.substring(1);
+            }
+            stringBuilder.append(label);
+        } else if (node.isLiteral()) {
+            stringBuilder.append("\"");
+            // Should possibly be further improved
+            stringBuilder.append(node.getLiteral().getLexicalForm().replace("\n", "").replace("\"", "'").replace("\r", ""));
+            stringBuilder.append("\"");
+            if (node.getLiteralLanguage() != null && !node.getLiteralLanguage().isEmpty()) {
+                stringBuilder.append("@");
+                stringBuilder.append(node.getLiteralLanguage());
+            } else if (node.getLiteralDatatype() != null) {
+                stringBuilder.append("^^");
+                stringBuilder.append("<");
+                stringBuilder.append(node.getLiteralDatatype().getURI());
+                stringBuilder.append(">");
+            }
+        }
+        stringBuilder.append(" ");
+        return stringBuilder.toString();
+    }
+}

From 8b7f33f441a0f9272413fb4f2e0e41ffd7199e38 Mon Sep 17 00:00:00 2001
From: param-jot
Date: Sun, 9 Jun 2019 16:03:13 +0200
Subject: [PATCH 005/102] minor changes in files to get correct triples from
 the SPARQL endpoint and changes in the SPARQL query to retrieve endTimeStamps

---
 docker-compose-sparql.yml                     | 16 ++----
 spring-config/worker-context.xml              |  4 +-
 .../configurator/SparqlConfiguration.java     | 56 ++++++++++++++++---
 .../frontier/impl/FrontierQueryGenerator.java | 12 ++--
 4 files changed, 63 insertions(+), 25 deletions(-)

diff --git a/docker-compose-sparql.yml b/docker-compose-sparql.yml
index 8ce9dbd0f..663419a6e 100644
--- a/docker-compose-sparql.yml
+++ b/docker-compose-sparql.yml
@@ -94,8 +94,6 @@ services:
       - SPARQL_HOST_USER=dba
       - SPARQL_HOST_PASSWD=pw123
       - DEDUPLICATION_ACTIVE=false
-      - MDB_HOST_NAME=mongodb
-      - MDB_PORT=27017
       - JVM_ARGS=-Xmx8g
     volumes:
       - ./data/worker1:/var/squirrel/data
@@ -115,8 +113,6 @@
       - SPARQL_HOST_NAME=sparqlhost
       - SPARQL_HOST_PORT=3030
      - SPARQL_HOST_USER=dba
       - SPARQL_HOST_PASSWD=pw123
       - DEDUPLICATION_ACTIVE=false
-      - MDB_HOST_NAME=mongodb
-      - MDB_PORT=27017
       - JVM_ARGS=-Xmx8g
     volumes:
       - ./data/worker2:/var/squirrel/data
@@ -136,8 +132,6 @@
       - SPARQL_HOST_NAME=sparqlhost
       - SPARQL_HOST_PORT=3030
       - SPARQL_HOST_USER=dba
       - SPARQL_HOST_PASSWD=pw123
       - DEDUPLICATION_ACTIVE=true
-      - MDB_HOST_NAME=mongodb
-      - MDB_PORT=27017
       - JVM_ARGS=-Xmx8g
     volumes:
       - ./data/worker3:/var/squirrel/data
@@ -146,17 +140,17 @@
     command: java -cp squirrel.jar org.dice_research.squirrel.components.WorkerComponentStarter

 # deduplicator:
-#   image: squirrel
+#   image: squirrel.deduplication
 #   container_name: deduplicator
 #   environment:
 #     DEDUPLICATION_ACTIVE: "true"
 #     HOBBIT_RABBIT_HOST: rabbit
-#     OUTPUT_FOLDER: /var/squirrel/data
+#     OUTPUT_FOLDER: /var/squirrel/data
+#     CONTEXT_CONFIG_FILE: /var/squirrel/spring-config/context-deduplicator.xml
 #     MDB_HOST_NAME: mongodb
 #     MDB_PORT: 27017
 #     SPARQL_HOST_NAME: sparqlhost
 #     SPARQL_HOST_PORT: 3030
-#     SERVICE_PRECONDITION: "rethinkdb:28015 rabbit:5672"
+#     SERVICE_PRECONDITION: "mongodb:27017 rabbit:5672"
 #   volumes:
 #     - 
./data/deduplicator:/var/squirrel/data diff --git a/spring-config/worker-context.xml b/spring-config/worker-context.xml index 697778d19..5d60e003c 100644 --- a/spring-config/worker-context.xml +++ b/spring-config/worker-context.xml @@ -45,9 +45,11 @@ - + - + + + + + diff --git a/spring-config/worker-context.xml b/spring-config/worker-context.xml index 4d65af297..f27c706ee 100644 --- a/spring-config/worker-context.xml +++ b/spring-config/worker-context.xml @@ -45,9 +45,19 @@ +<<<<<<< HEAD +======= + + + + - + - - - - + diff --git a/squirrel.api/src/main/java/org/dice_research/squirrel/data/uri/filter/KnownOutDatedUriFilter.java b/squirrel.api/src/main/java/org/dice_research/squirrel/data/uri/filter/OutDatedUris.java similarity index 87% rename from squirrel.api/src/main/java/org/dice_research/squirrel/data/uri/filter/KnownOutDatedUriFilter.java rename to squirrel.api/src/main/java/org/dice_research/squirrel/data/uri/filter/OutDatedUris.java index 21d7835db..7bf589d25 100644 --- a/squirrel.api/src/main/java/org/dice_research/squirrel/data/uri/filter/KnownOutDatedUriFilter.java +++ b/squirrel.api/src/main/java/org/dice_research/squirrel/data/uri/filter/OutDatedUris.java @@ -4,14 +4,14 @@ import org.dice_research.squirrel.data.uri.CrawleableUri; -public interface KnownOutDatedUriFilter { +public interface OutDatedUris { + - /** * Returns all {@link CrawleableUri}s which have to be recrawled. This means their time to next crawl has passed. * * @return The outdated {@link CrawleableUri}s. */ public List getUriToRecrawl(); - -} \ No newline at end of file + +} diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/components/FrontierComponent.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/components/FrontierComponent.java index 9c4c3ce2c..4c904f9a4 100644 --- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/components/FrontierComponent.java +++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/components/FrontierComponent.java @@ -1,31 +1,15 @@ package org.dice_research.squirrel.components; -import java.io.Closeable; -import java.io.File; -import java.io.IOException; -import java.nio.charset.StandardCharsets; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.Timer; -import java.util.TimerTask; -import java.util.concurrent.Semaphore; - import org.aksw.jena_sparql_api.core.QueryExecutionFactory; import org.aksw.jena_sparql_api.core.UpdateExecutionFactory; import org.apache.commons.io.FileUtils; -import org.apache.jena.base.Sys; import org.dice_research.squirrel.Constants; -import org.dice_research.squirrel.configurator.MongoConfiguration; -import org.dice_research.squirrel.configurator.SeedConfiguration; -import org.dice_research.squirrel.configurator.SparqlConfiguration; -import org.dice_research.squirrel.configurator.WebConfiguration; -import org.dice_research.squirrel.configurator.WhiteListConfiguration; +import org.dice_research.squirrel.configurator.*; import org.dice_research.squirrel.data.uri.CrawleableUri; import org.dice_research.squirrel.data.uri.UriUtils; import org.dice_research.squirrel.data.uri.filter.InMemoryKnownUriFilter; -import org.dice_research.squirrel.data.uri.filter.KnownOutDatedUriFilter; import org.dice_research.squirrel.data.uri.filter.KnownUriFilter; +import org.dice_research.squirrel.data.uri.filter.OutDatedUris; import org.dice_research.squirrel.data.uri.filter.RegexBasedWhiteListFilter; import org.dice_research.squirrel.data.uri.info.URIReferences; import 
org.dice_research.squirrel.data.uri.norm.NormalizerImpl; @@ -33,12 +17,7 @@ import org.dice_research.squirrel.data.uri.serialize.java.GzipJavaUriSerializer; import org.dice_research.squirrel.frontier.ExtendedFrontier; import org.dice_research.squirrel.frontier.Frontier; -import org.dice_research.squirrel.frontier.impl.ExtendedFrontierImpl; -import org.dice_research.squirrel.frontier.impl.FrontierImpl; -import org.dice_research.squirrel.frontier.impl.FrontierSenderToWebservice; -import org.dice_research.squirrel.frontier.impl.QueueBasedTerminationCheck; -import org.dice_research.squirrel.frontier.impl.TerminationCheck; -import org.dice_research.squirrel.frontier.impl.WorkerGuard; +import org.dice_research.squirrel.frontier.impl.*; import org.dice_research.squirrel.queue.InMemoryQueue; import org.dice_research.squirrel.queue.UriQueue; import org.dice_research.squirrel.rabbit.RPCServer; @@ -56,254 +35,245 @@ import org.springframework.beans.factory.annotation.Autowired; import org.springframework.beans.factory.annotation.Qualifier; import org.springframework.stereotype.Component; +import java.io.Closeable; +import java.io.File; +import java.io.IOException; +import java.nio.charset.StandardCharsets; +import java.util.*; +import java.util.concurrent.Semaphore; @Component @Qualifier("frontierComponent") public class FrontierComponent extends AbstractComponent implements RespondingDataHandler { - private static final Logger LOGGER = LoggerFactory.getLogger(FrontierComponent.class); - - @Qualifier("queueBean") - @Autowired - protected UriQueue queue; - @Qualifier("knowUriFilterBean") - @Autowired - private KnownUriFilter knownUriFilter; - private KnownOutDatedUriFilter knownOutDatedUriFilter; - private URIReferences uriReferences = null; - private Frontier frontier; - private RabbitQueue rabbitQueue; - private DataReceiver receiver; - @Qualifier("serializerBean") - @Autowired - private Serializer serializer; - private final Semaphore terminationMutex = new Semaphore(0); - private final WorkerGuard workerGuard = new WorkerGuard(this); - private final boolean doRecrawling = true; - private long recrawlingTime = 1000L * 60L * 60L * 24L * 30; - - private Map hasUrisToCrawl; - - public static final boolean RECRAWLING_ACTIVE = true; - protected String dataSetQuery = "select ?s ?p ?o where {?s ?p ?o} LIMIT 100 "; - - protected static QueryExecutionFactory queryExecFactory = null; - - protected UpdateExecutionFactory updateExecFactory = null; - - - - - @Override - public void init() throws Exception { - super.init(); - serializer = new GzipJavaUriSerializer(); - MongoConfiguration mongoConfiguration = MongoConfiguration.getMDBConfiguration(); - WebConfiguration webConfiguration = WebConfiguration.getWebConfiguration(); - SparqlConfiguration sp = SparqlConfiguration.create("http://localhost:8890/sparql-auth","dba","pw123"); - hasUrisToCrawl = new HashMap(); - - - if (mongoConfiguration != null) { - - queue.open(); - knownUriFilter.open(); - - - WhiteListConfiguration whiteListConfiguration = WhiteListConfiguration.getWhiteListConfiguration(); - if (whiteListConfiguration != null) { - File whitelistFile = new File(whiteListConfiguration.getWhiteListURI()); - knownUriFilter = RegexBasedWhiteListFilter.create(knownUriFilter, whitelistFile); - } - - // TODO Reactivate me but with a different configuration - // if (webConfiguration.isVisualizationOfCrawledGraphEnabled()) { - // uriReferences = new RDBURIReferences(rdbHostName, rdbPort); - // uriReferences.open(); - // } - } else { - LOGGER.warn("Couldn't get 
MDBConfiguration. An in-memory queue will be used."); - queue = new InMemoryQueue(); - knownUriFilter = new InMemoryKnownUriFilter(doRecrawling, recrawlingTime); - } - - // Build frontier - frontier = new ExtendedFrontierImpl(new NormalizerImpl(),knownUriFilter, uriReferences, queue, doRecrawling, knownOutDatedUriFilter); - rabbitQueue = this.incomingDataQueueFactory.createDefaultRabbitQueue(Constants.FRONTIER_QUEUE_NAME); - receiver = (new RPCServer.Builder()).responseQueueFactory(outgoingDataQueuefactory).dataHandler(this) - .maxParallelProcessedMsgs(100).queue(rabbitQueue).build(); - - SeedConfiguration seedConfiguration = SeedConfiguration.getSeedConfiguration(); - if (seedConfiguration != null) { - processSeedFile(seedConfiguration.getSeedFile()); - } - - LOGGER.info("Frontier initialized."); - - if (webConfiguration.isCommunicationWithWebserviceEnabled()) { - final FrontierSenderToWebservice sender = new FrontierSenderToWebservice(outgoingDataQueuefactory, - workerGuard, queue, knownUriFilter, uriReferences); - LOGGER.trace("FrontierSenderToWebservice -> sendCrawledGraph is set to " - + webConfiguration.isVisualizationOfCrawledGraphEnabled()); - Thread senderThread = new Thread(sender); - senderThread.setName("Sender to the Webservice via RabbitMQ (current information from the Frontier)"); - senderThread.start(); - LOGGER.info("Started thread [" + senderThread.getName() + "] "); - } else { - LOGGER.info("webConfiguration.isCommunicationWithWebserviceEnabled is set to " - + webConfiguration.isCommunicationWithWebserviceEnabled() + "/" - + webConfiguration.isVisualizationOfCrawledGraphEnabled() - + ". No WebServiceSenderThread will be started!"); - } - - } - - - - @Override - public void run() throws Exception { - TimerTask terminatorTask = new TerminatorTask(queue, terminationMutex); - Timer timer = new Timer(); - timer.schedule(terminatorTask, 5000,5000); - terminationMutex.acquire(); - timer.cancel(); - } - - @Override - public void close() throws IOException { - LOGGER.info("Closing Frontier Component."); - if (receiver != null) - // Force the receiver to close - receiver.close(); - // receiver.closeWhenFinished(); - - if (queue != null) - queue.close(); - if (uriReferences != null) - uriReferences.close(); - if (knownUriFilter instanceof Closeable) { - ((Closeable) knownUriFilter).close(); - } - workerGuard.shutdown(); - if (frontier != null) - frontier.close(); - super.close(); - LOGGER.info("Frontier Component Closed."); - } - - @Override - public void handleData(byte[] data) { - handleData(data, null, null, null); - } - - @Override - public void handleData(byte[] data, ResponseHandler handler, String responseQueueName, String correlId) { - Object deserializedData; - try { - deserializedData = serializer.deserialize(data); - } catch (IOException e) { - // try to convert the string into a single URI, that maybe comes from the - // WebService - // CrawleableUri uri = new CrawleableUriFactoryImpl().create(new String(data)); - // if (uri != null) { - // LOGGER.warn("Received a single URI " + uri.getUri() + " without a wrapping of - // \"org.aksw.simba.squirrel.rabbit.frontier\". We converted it into a - // UriSet."); - // deserializedData = new UriSet(Collections.singletonList(uri)); - // } else { - LOGGER.error("Error while trying to deserialize incoming data. 
It will be ignored.", e); - return; - // } - } - - if (deserializedData != null) { - if (deserializedData instanceof UriSetRequest) { - responseToUriSetRequest(handler, responseQueueName, correlId, (UriSetRequest) deserializedData); - } else if (deserializedData instanceof UriSet) { - // LOGGER.warn("Received a set of URIs (size={}).", ((UriSet) deserializedData).uris.size()); - frontier.addNewUris(((UriSet) deserializedData).uris); - } else if (deserializedData instanceof CrawlingResult) { - CrawlingResult crawlingResult = (CrawlingResult) deserializedData; - LOGGER.warn("Received the message that the crawling for {} URIs is done.", crawlingResult.uris.size()); - frontier.crawlingDone(crawlingResult.uris); - workerGuard.removeUrisForWorker(crawlingResult.idOfWorker, crawlingResult.uris); - } else if (deserializedData instanceof AliveMessage) { - AliveMessage message = (AliveMessage) deserializedData; - String idReceived = message.getWorkerId(); - LOGGER.warn("Received alive message from worker with id " + idReceived); - workerGuard.putNewTimestamp(idReceived); - } else { - LOGGER.warn("Received an unknown object {}. It will be ignored.", deserializedData.toString()); - } - } - } - - private void responseToUriSetRequest(ResponseHandler handler, String responseQueueName, String correlId, - UriSetRequest uriSetRequest) { - if (handler != null) { - // get next UriSet - try { - List uris = frontier.getNextUris(); - LOGGER.trace("Responding with a list of {} uris.", - uris == null ? "null" : Integer.toString(uris.size())); - handler.sendResponse(serializer.serialize(new UriSet(uris)), responseQueueName, correlId); - if (uris != null && uris.size() > 0) { - hasUrisToCrawl .put(uriSetRequest.getWorkerId(), true); - workerGuard.putUrisForWorker(uriSetRequest.getWorkerId(), - uriSetRequest.workerSendsAliveMessages(), uris); - }else { - hasUrisToCrawl .put(uriSetRequest.getWorkerId(), false); - } - } catch (IOException e) { - LOGGER.error("Couldn't serialize new URI set.", e); - } - } else { - LOGGER.warn("Got a UriSetRequest object without a ResponseHandler. No response will be sent."); - } - } - - protected void processSeedFile(String seedFile) { - try { - List lines = FileUtils.readLines(new File(seedFile), StandardCharsets.UTF_8); - frontier.addNewUris(UriUtils.createCrawleableUriList(lines)); - } catch (Exception e) { - LOGGER.error("Couldn't process seed file. It will be ignored.", e); - } - } - - public void informFrontierAboutDeadWorker(String idOfWorker, List lstUrisToReassign) { - if (frontier instanceof ExtendedFrontier) { - ((ExtendedFrontier) frontier).informAboutDeadWorker(idOfWorker, lstUrisToReassign); - } - } - - public void setFrontier(FrontierImpl frontier) { - this.frontier = frontier; - } - - public WorkerGuard getWorkerGuard() { - return workerGuard; - } - - private class TerminatorTask extends TimerTask{ - - private UriQueue queue; - private TerminationCheck terminationCheck = new QueueBasedTerminationCheck(); - private Semaphore terminationMutex; - - public TerminatorTask(UriQueue queue, Semaphore terminationMutex) { - this.queue = queue; - this.terminationMutex = terminationMutex; - } - - @Override - public void run() { - if(!hasUrisToCrawl.values().contains(true) && terminationCheck.shouldFrontierTerminate(queue)) { - LOGGER.info(" << FRONTIER IS TERMINATING! 
>> "); - terminationMutex.release(); - } - } - - } + public static final boolean RECRAWLING_ACTIVE = true; + private static final Logger LOGGER = LoggerFactory.getLogger(FrontierComponent.class); + protected static QueryExecutionFactory queryExecFactory = null; + private final Semaphore terminationMutex = new Semaphore(0); + private final WorkerGuard workerGuard = new WorkerGuard(this); + private final boolean doRecrawling = true; + @Qualifier("queueBean") + @Autowired + protected UriQueue queue; + protected String dataSetQuery = "select ?s ?p ?o where {?s ?p ?o} LIMIT 100 "; + protected UpdateExecutionFactory updateExecFactory = null; + @Qualifier("knowUriFilterBean") + @Autowired + private KnownUriFilter knownUriFilter; + private OutDatedUris outDatedUris; + private URIReferences uriReferences = null; + private Frontier frontier; + private RabbitQueue rabbitQueue; + private DataReceiver receiver; + @Qualifier("serializerBean") + @Autowired + private Serializer serializer; + private long recrawlingTime = 1000L * 60L * 60L * 24L * 30; + private Map hasUrisToCrawl; + + @Override + public void init() throws Exception { + super.init(); + serializer = new GzipJavaUriSerializer(); + MongoConfiguration mongoConfiguration = MongoConfiguration.getMDBConfiguration(); + WebConfiguration webConfiguration = WebConfiguration.getWebConfiguration(); + SparqlConnector sp = SparqlConnector.create("http://localhost:8890/sparql-auth", "dba", "pw123"); + hasUrisToCrawl = new HashMap(); + if (mongoConfiguration != null) { + + queue.open(); + knownUriFilter.open(); + WhiteListConfiguration whiteListConfiguration = WhiteListConfiguration.getWhiteListConfiguration(); + if (whiteListConfiguration != null) { + File whitelistFile = new File(whiteListConfiguration.getWhiteListURI()); + knownUriFilter = RegexBasedWhiteListFilter.create(knownUriFilter, whitelistFile); + } + // TODO Reactivate me but with a different configuration + // if (webConfiguration.isVisualizationOfCrawledGraphEnabled()) { + // uriReferences = new RDBURIReferences(rdbHostName, rdbPort); + // uriReferences.open(); + // } + } else { + LOGGER.warn("Couldn't get MDBConfiguration. 
An in-memory queue will be used."); + queue = new InMemoryQueue(); + knownUriFilter = new InMemoryKnownUriFilter(doRecrawling, recrawlingTime); + } + // Build frontier + frontier = new ExtendedFrontierImpl(new NormalizerImpl(), knownUriFilter, uriReferences, queue, doRecrawling, outDatedUris); + rabbitQueue = this.incomingDataQueueFactory.createDefaultRabbitQueue(Constants.FRONTIER_QUEUE_NAME); + receiver = (new RPCServer.Builder()).responseQueueFactory(outgoingDataQueuefactory).dataHandler(this) + .maxParallelProcessedMsgs(100).queue(rabbitQueue).build(); + + SeedConfiguration seedConfiguration = SeedConfiguration.getSeedConfiguration(); + if (seedConfiguration != null) { + processSeedFile(seedConfiguration.getSeedFile()); + } + + LOGGER.info("Frontier initialized."); + + if (webConfiguration.isCommunicationWithWebserviceEnabled()) { + final FrontierSenderToWebservice sender = new FrontierSenderToWebservice(outgoingDataQueuefactory, + workerGuard, queue, knownUriFilter, uriReferences); + LOGGER.trace("FrontierSenderToWebservice -> sendCrawledGraph is set to " + + webConfiguration.isVisualizationOfCrawledGraphEnabled()); + Thread senderThread = new Thread(sender); + senderThread.setName("Sender to the Webservice via RabbitMQ (current information from the Frontier)"); + senderThread.start(); + LOGGER.info("Started thread [" + senderThread.getName() + "] "); + } else { + LOGGER.info("webConfiguration.isCommunicationWithWebserviceEnabled is set to " + + webConfiguration.isCommunicationWithWebserviceEnabled() + "/" + + webConfiguration.isVisualizationOfCrawledGraphEnabled() + + ". No WebServiceSenderThread will be started!"); + } + + } + + + @Override + public void run() throws Exception { + TimerTask terminatorTask = new TerminatorTask(queue, terminationMutex); + Timer timer = new Timer(); + timer.schedule(terminatorTask, 5000, 5000); + terminationMutex.acquire(); + timer.cancel(); + } + + @Override + public void close() throws IOException { + LOGGER.info("Closing Frontier Component."); + if (receiver != null) + // Force the receiver to close + receiver.close(); + // receiver.closeWhenFinished(); + + if (queue != null) + queue.close(); + if (uriReferences != null) + uriReferences.close(); + if (knownUriFilter instanceof Closeable) { + ((Closeable) knownUriFilter).close(); + } + workerGuard.shutdown(); + if (frontier != null) + frontier.close(); + super.close(); + LOGGER.info("Frontier Component Closed."); + } + + @Override + public void handleData(byte[] data) { + handleData(data, null, null, null); + } + + @Override + public void handleData(byte[] data, ResponseHandler handler, String responseQueueName, String correlId) { + Object deserializedData; + try { + deserializedData = serializer.deserialize(data); + } catch (IOException e) { + // try to convert the string into a single URI, that maybe comes from the + // WebService + // CrawleableUri uri = new CrawleableUriFactoryImpl().create(new String(data)); + // if (uri != null) { + // LOGGER.warn("Received a single URI " + uri.getUri() + " without a wrapping of + // \"org.aksw.simba.squirrel.rabbit.frontier\". We converted it into a + // UriSet."); + // deserializedData = new UriSet(Collections.singletonList(uri)); + // } else { + LOGGER.error("Error while trying to deserialize incoming data. 
It will be ignored.", e); + return; + // } + } + + if (deserializedData != null) { + if (deserializedData instanceof UriSetRequest) { + responseToUriSetRequest(handler, responseQueueName, correlId, (UriSetRequest) deserializedData); + } else if (deserializedData instanceof UriSet) { + // LOGGER.warn("Received a set of URIs (size={}).", ((UriSet) deserializedData).uris.size()); + frontier.addNewUris(((UriSet) deserializedData).uris); + } else if (deserializedData instanceof CrawlingResult) { + CrawlingResult crawlingResult = (CrawlingResult) deserializedData; + LOGGER.warn("Received the message that the crawling for {} URIs is done.", crawlingResult.uris.size()); + frontier.crawlingDone(crawlingResult.uris); + workerGuard.removeUrisForWorker(crawlingResult.idOfWorker, crawlingResult.uris); + } else if (deserializedData instanceof AliveMessage) { + AliveMessage message = (AliveMessage) deserializedData; + String idReceived = message.getWorkerId(); + LOGGER.warn("Received alive message from worker with id " + idReceived); + workerGuard.putNewTimestamp(idReceived); + } else { + LOGGER.warn("Received an unknown object {}. It will be ignored.", deserializedData.toString()); + } + } + } + + private void responseToUriSetRequest(ResponseHandler handler, String responseQueueName, String correlId, + UriSetRequest uriSetRequest) { + if (handler != null) { + // get next UriSet + try { + List uris = frontier.getNextUris(); + LOGGER.trace("Responding with a list of {} uris.", + uris == null ? "null" : Integer.toString(uris.size())); + handler.sendResponse(serializer.serialize(new UriSet(uris)), responseQueueName, correlId); + if (uris != null && uris.size() > 0) { + hasUrisToCrawl.put(uriSetRequest.getWorkerId(), true); + workerGuard.putUrisForWorker(uriSetRequest.getWorkerId(), + uriSetRequest.workerSendsAliveMessages(), uris); + } else { + hasUrisToCrawl.put(uriSetRequest.getWorkerId(), false); + } + } catch (IOException e) { + LOGGER.error("Couldn't serialize new URI set.", e); + } + } else { + LOGGER.warn("Got a UriSetRequest object without a ResponseHandler. No response will be sent."); + } + } + + protected void processSeedFile(String seedFile) { + try { + List lines = FileUtils.readLines(new File(seedFile), StandardCharsets.UTF_8); + frontier.addNewUris(UriUtils.createCrawleableUriList(lines)); + } catch (Exception e) { + LOGGER.error("Couldn't process seed file. It will be ignored.", e); + } + } + + public void informFrontierAboutDeadWorker(String idOfWorker, List lstUrisToReassign) { + if (frontier instanceof ExtendedFrontier) { + ((ExtendedFrontier) frontier).informAboutDeadWorker(idOfWorker, lstUrisToReassign); + } + } + + public void setFrontier(FrontierImpl frontier) { + this.frontier = frontier; + } + + public WorkerGuard getWorkerGuard() { + return workerGuard; + } + + private class TerminatorTask extends TimerTask { + + private UriQueue queue; + private TerminationCheck terminationCheck = new QueueBasedTerminationCheck(); + private Semaphore terminationMutex; + + public TerminatorTask(UriQueue queue, Semaphore terminationMutex) { + this.queue = queue; + this.terminationMutex = terminationMutex; + } + + @Override + public void run() { + if (!hasUrisToCrawl.values().contains(true) && terminationCheck.shouldFrontierTerminate(queue)) { + LOGGER.info(" << FRONTIER IS TERMINATING! 
>> "); + terminationMutex.release(); + } + } + + } } diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/components/FrontierComponentStarter.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/components/FrontierComponentStarter.java index e8499af29..f769cd7db 100644 --- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/components/FrontierComponentStarter.java +++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/components/FrontierComponentStarter.java @@ -12,45 +12,41 @@ /** * This is the main method creating and starting an instance of a * {@link Component} with the given class name. - * */ public class FrontierComponentStarter { - + private static final int ERROR_EXIT_CODE = 1; - + private static FileSystemXmlApplicationContext context; private static Component component; private static boolean closed = false; - - private static final Logger LOGGER = LoggerFactory.getLogger(FrontierComponentStarter.class); + private static final Logger LOGGER = LoggerFactory.getLogger(FrontierComponentStarter.class); - public static void main(String[] args) { addShutdownHook(); boolean success = true; try { - context = new FileSystemXmlApplicationContext(File.separator + System.getenv("FRONTIER_CONTEXT_CONFIG_FILE")); - component = (Component) context.getBean("frontierComponent"); - component.init(); - - component.run(); + context = new FileSystemXmlApplicationContext(File.separator + System.getenv("FRONTIER_CONTEXT_CONFIG_FILE")); + component = (Component) context.getBean("frontierComponent"); + component.init(); + + component.run(); } catch (Throwable t) { LOGGER.error("Exception while executing component. Exiting with error code.", t); success = false; } finally { closeComponent(); } - if (!success) { System.exit(ERROR_EXIT_CODE); } } - + private static synchronized void closeComponent() { if (!closed) { Closer.close(component, LOGGER); diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/configurator/SparqlConfiguration.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/configurator/SparqlConfiguration.java deleted file mode 100644 index 8139c33db..000000000 --- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/configurator/SparqlConfiguration.java +++ /dev/null @@ -1,120 +0,0 @@ -package org.dice_research.squirrel.configurator; - -import java.net.URI; -import java.net.URISyntaxException; -import java.util.ArrayList; -import java.util.List; - -import org.aksw.jena_sparql_api.core.QueryExecutionFactory; -import org.aksw.jena_sparql_api.core.UpdateExecutionFactory; -import org.aksw.jena_sparql_api.core.UpdateExecutionFactoryHttp; -import org.aksw.jena_sparql_api.http.QueryExecutionFactoryHttp; -import org.apache.http.auth.AuthScope; -import org.apache.http.auth.Credentials; -import org.apache.http.auth.UsernamePasswordCredentials; -import org.apache.http.client.CredentialsProvider; -import org.apache.http.impl.client.AbstractHttpClient; -import org.apache.http.protocol.HttpContext; -import org.apache.jena.atlas.web.auth.HttpAuthenticator; -import org.apache.jena.graph.Triple; -import org.apache.jena.query.Query; -import org.apache.jena.query.QueryExecution; -import org.apache.jena.query.QuerySolution; -import org.apache.jena.query.ResultSet; -import org.apache.jena.rdf.model.RDFNode; -import org.apache.jena.sparql.core.DatasetDescription; -import org.dice_research.squirrel.data.uri.CrawleableUri; -import org.dice_research.squirrel.data.uri.filter.KnownOutDatedUriFilter; -import 
org.dice_research.squirrel.data.uri.filter.KnownUriFilter; -import org.dice_research.squirrel.frontier.impl.FrontierQueryGenerator; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -@SuppressWarnings("deprecation") -public class SparqlConfiguration implements KnownOutDatedUriFilter { - - private static final Logger LOGGER = LoggerFactory.getLogger(SparqlConfiguration.class); - - /** - * The Query factory used to query the SPARQL endpoint. - */ - protected static QueryExecutionFactory queryExecFactory = null; - protected UpdateExecutionFactory updateExecFactory = null; - List urisToRecrawl = new ArrayList<>(); - - public SparqlConfiguration(QueryExecutionFactory queryExecFactory, UpdateExecutionFactory updateExecFactory) { - this.queryExecFactory = queryExecFactory; - this.updateExecFactory = updateExecFactory; - LOGGER.info("Connected"); - } - - public static SparqlConfiguration create(String sparqlEndpointUrl) { - - return create(sparqlEndpointUrl, null, null); - } - - public static SparqlConfiguration create(String sparqlEndpointUrl, String username, String password) { - QueryExecutionFactory queryExecFactory = null; - UpdateExecutionFactory updateExecFactory = null; - if (username != null && password != null) { - // Create the factory with the credentials - final Credentials credentials = new UsernamePasswordCredentials(username, password); - HttpAuthenticator authenticator = new HttpAuthenticator() { - @Override - public void invalidate() { - } - - @Override - public void apply(AbstractHttpClient client, HttpContext httpContext, URI target) { - client.setCredentialsProvider(new CredentialsProvider() { - @Override - public void clear() { - } - - @Override - public Credentials getCredentials(AuthScope scope) { - return credentials; - } - - @Override - public void setCredentials(AuthScope arg0, Credentials arg1) { - LOGGER.error("I am a read-only credential provider but got a call to set credentials."); - } - }); - } - }; - queryExecFactory = new QueryExecutionFactoryHttp(sparqlEndpointUrl, new DatasetDescription(), - authenticator); - updateExecFactory = new UpdateExecutionFactoryHttp(sparqlEndpointUrl, authenticator); - } else { - queryExecFactory = new QueryExecutionFactoryHttp(sparqlEndpointUrl); - updateExecFactory = new UpdateExecutionFactoryHttp(sparqlEndpointUrl); - } - return new SparqlConfiguration(queryExecFactory, updateExecFactory); - } - - - @Override - public List getUriToRecrawl() { - SparqlConfiguration.create("http://localhost:8890/sparql-auth", "dba", "pw123"); - Query getOutdatedUrisQuery = FrontierQueryGenerator.getInstance().getOutdatedUrisQuery(); - System.out.println(getOutdatedUrisQuery); - QueryExecution qe = queryExecFactory.createQueryExecution(getOutdatedUrisQuery); - ResultSet rs = qe.execSelect(); - while (rs.hasNext()) { - QuerySolution sol = rs.nextSolution(); - RDFNode outdatedUri = sol.get("uri"); - try { - urisToRecrawl.add(new CrawleableUri(new URI((outdatedUri.toString())))); - } catch (URISyntaxException e) { - e.printStackTrace(); - } - }qe.close(); - return urisToRecrawl; -} - - - -} - - diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/configurator/SparqlConnector.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/configurator/SparqlConnector.java new file mode 100644 index 000000000..02afa0b07 --- /dev/null +++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/configurator/SparqlConnector.java @@ -0,0 +1,118 @@ +package org.dice_research.squirrel.configurator; + +import java.net.URI; 
+import java.net.URISyntaxException; +import java.util.ArrayList; +import java.util.List; + +import org.aksw.jena_sparql_api.core.QueryExecutionFactory; +import org.aksw.jena_sparql_api.core.UpdateExecutionFactory; +import org.aksw.jena_sparql_api.core.UpdateExecutionFactoryHttp; +import org.aksw.jena_sparql_api.http.QueryExecutionFactoryHttp; +import org.apache.http.auth.AuthScope; +import org.apache.http.auth.Credentials; +import org.apache.http.auth.UsernamePasswordCredentials; +import org.apache.http.client.CredentialsProvider; +import org.apache.http.impl.client.AbstractHttpClient; +import org.apache.http.protocol.HttpContext; +import org.apache.jena.atlas.web.auth.HttpAuthenticator; +import org.apache.jena.graph.Triple; +import org.apache.jena.query.Query; +import org.apache.jena.query.QueryExecution; +import org.apache.jena.query.QuerySolution; +import org.apache.jena.query.ResultSet; +import org.apache.jena.rdf.model.RDFNode; +import org.apache.jena.sparql.core.DatasetDescription; +import org.dice_research.squirrel.data.uri.CrawleableUri; +import org.dice_research.squirrel.data.uri.filter.OutDatedUris; +import org.dice_research.squirrel.data.uri.filter.KnownUriFilter; +import org.dice_research.squirrel.frontier.impl.FrontierQueryGenerator; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +@SuppressWarnings("deprecation") +public class SparqlConnector implements OutDatedUris { + + private static final Logger LOGGER = LoggerFactory.getLogger(SparqlConnector.class); + + /** + * The Query factory used to query the SPARQL endpoint. + */ + protected static QueryExecutionFactory queryExecFactory = null; + protected UpdateExecutionFactory updateExecFactory = null; + List urisToRecrawl = new ArrayList<>(); + + public SparqlConnector(QueryExecutionFactory queryExecFactory, UpdateExecutionFactory updateExecFactory) { + this.queryExecFactory = queryExecFactory; + this.updateExecFactory = updateExecFactory; + LOGGER.info("Connected"); + } + + public static SparqlConnector create(String sparqlEndpointUrl) { + return create(sparqlEndpointUrl, null, null); + } + + public static SparqlConnector create(String sparqlEndpointUrl, String username, String password) { + QueryExecutionFactory queryExecFactory = null; + UpdateExecutionFactory updateExecFactory = null; + if (username != null && password != null) { + // Create the factory with the credentials + final Credentials credentials = new UsernamePasswordCredentials(username, password); + HttpAuthenticator authenticator = new HttpAuthenticator() { + @Override + public void invalidate() { + } + + @Override + public void apply(AbstractHttpClient client, HttpContext httpContext, URI target) { + client.setCredentialsProvider(new CredentialsProvider() { + @Override + public void clear() { + } + + @Override + public Credentials getCredentials(AuthScope scope) { + return credentials; + } + + @Override + public void setCredentials(AuthScope arg0, Credentials arg1) { + LOGGER.error("I am a read-only credential provider but got a call to set credentials."); + } + }); + } + }; + queryExecFactory = new QueryExecutionFactoryHttp(sparqlEndpointUrl, new DatasetDescription(), + authenticator); + updateExecFactory = new UpdateExecutionFactoryHttp(sparqlEndpointUrl, authenticator); + } else { + queryExecFactory = new QueryExecutionFactoryHttp(sparqlEndpointUrl); + updateExecFactory = new UpdateExecutionFactoryHttp(sparqlEndpointUrl); + } + return new SparqlConnector(queryExecFactory, updateExecFactory); + } + + + @Override + public List getUriToRecrawl() { 
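+        // Query the endpoint for every URI whose next crawl time has passed and wrap
+        // each binding of ?uri in a CrawleableUri so the frontier can queue it again.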
+ SparqlConnector.create("http://localhost:8890/sparql-auth", "dba", "pw123"); + Query getOutdatedUrisQuery = FrontierQueryGenerator.getInstance().getOutdatedUrisQuery(); + System.out.println(getOutdatedUrisQuery); + QueryExecution qe = queryExecFactory.createQueryExecution(getOutdatedUrisQuery); + ResultSet rs = qe.execSelect(); + while (rs.hasNext()) { + QuerySolution sol = rs.nextSolution(); + RDFNode outdatedUri = sol.get("uri"); + try { + urisToRecrawl.add(new CrawleableUri(new URI((outdatedUri.toString())))); + } catch (URISyntaxException e) { + e.printStackTrace(); + } + } + qe.close(); + return urisToRecrawl; + } + + +} + diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/data/uri/filter/MongoDBKnowUriFilter.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/data/uri/filter/MongoDBKnowUriFilter.java index f9a8e022c..4891e4400 100644 --- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/data/uri/filter/MongoDBKnowUriFilter.java +++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/data/uri/filter/MongoDBKnowUriFilter.java @@ -21,7 +21,6 @@ import org.dice_research.squirrel.frontier.impl.FrontierImpl; import org.slf4j.Logger; import org.slf4j.LoggerFactory; - import com.mongodb.BasicDBObject; import com.mongodb.MongoClient; import com.mongodb.MongoClientOptions; @@ -33,15 +32,13 @@ import com.mongodb.client.model.Indexes; /** - * * Filter implementation for use with MongoDB - * + *

* * @author Geraldo Souza Junior (gsjunior@mail.uni-paderborn.de) - * */ @SuppressWarnings("deprecation") -public class MongoDBKnowUriFilter implements KnownUriFilter, Cloneable, Closeable,UriHashCustodian { +public class MongoDBKnowUriFilter implements KnownUriFilter, Cloneable, Closeable, UriHashCustodian { private static final Logger LOGGER = LoggerFactory.getLogger(MongoDBKnowUriFilter.class); @@ -50,7 +47,6 @@ public class MongoDBKnowUriFilter implements KnownUriFilter, Cloneable, Closeabl public static final String DB_NAME = "squirrel"; private Integer recrawlEveryWeek = 60 * 60 * 24 * 7 * 1000; // in miiliseconds public static final String COLLECTION_NAME = "knownurifilter"; - public static final String COLUMN_TIMESTAMP_LAST_CRAWL = "timestampLastCrawl"; public static final String COLUMN_URI = "uri"; public static final String COLUMN_CRAWLING_IN_PROCESS = "crawlingInProcess"; @@ -65,32 +61,24 @@ public class MongoDBKnowUriFilter implements KnownUriFilter, Cloneable, Closeabl private static final String DUMMY_HASH_VALUE = "dummyValue"; public MongoDBKnowUriFilter(String hostName, Integer port) { - LOGGER.info("Filter Persistance: " + PERSIST); - - - MongoClientOptions.Builder optionsBuilder = MongoClientOptions.builder(); + MongoClientOptions.Builder optionsBuilder = MongoClientOptions.builder(); MongoConfiguration mongoConfiguration = MongoConfiguration.getMDBConfiguration(); - - if(mongoConfiguration != null &&(mongoConfiguration.getConnectionTimeout() != null && mongoConfiguration.getSocketTimeout() != null && mongoConfiguration.getServerTimeout() != null)) { - optionsBuilder.connectTimeout(mongoConfiguration.getConnectionTimeout()); - optionsBuilder.socketTimeout(mongoConfiguration.getSocketTimeout()); - optionsBuilder.serverSelectionTimeout(mongoConfiguration.getServerTimeout()); - - MongoClientOptions options = optionsBuilder.build(); - - client = new MongoClient(new ServerAddress(hostName, port),options); - - }else { - client = new MongoClient(hostName, port); - } + if (mongoConfiguration != null && (mongoConfiguration.getConnectionTimeout() != null && mongoConfiguration.getSocketTimeout() != null && mongoConfiguration.getServerTimeout() != null)) { + optionsBuilder.connectTimeout(mongoConfiguration.getConnectionTimeout()); + optionsBuilder.socketTimeout(mongoConfiguration.getSocketTimeout()); + optionsBuilder.serverSelectionTimeout(mongoConfiguration.getServerTimeout()); + MongoClientOptions options = optionsBuilder.build(); + client = new MongoClient(new ServerAddress(hostName, port), options); + } else { + client = new MongoClient(hostName, port); + } } @Override public boolean isUriGood(CrawleableUri uri) { MongoCursor cursor = mongoDB.getCollection(COLLECTION_NAME) - .find(new Document("uri", uri.getUri().toString())).iterator(); - + .find(new Document("uri", uri.getUri().toString())).iterator(); if (cursor.hasNext()) { LOGGER.debug("URI {} is not good", uri.toString()); Document doc = cursor.next(); @@ -111,20 +99,18 @@ public boolean isUriGood(CrawleableUri uri) { @Override public void add(CrawleableUri uri, long nextCrawlTimestamp) { - add(uri, System.currentTimeMillis(), nextCrawlTimestamp); + add(uri, System.currentTimeMillis(), nextCrawlTimestamp); } public Document crawleableUriToMongoDocument(CrawleableUri uri) { - UriType uriType = uri.getType(); - return new Document("uri", uri.getUri().toString()).append("type", uriType.toString()); } @Override public void close() throws IOException { - if(!PERSIST) { + if (!PERSIST) { 
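+            // without persistence, the collection of known URIs is dropped when the filter is closed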
 mongoDB.getCollection(COLLECTION_NAME).drop();
 }
@@ -153,38 +139,37 @@ public boolean knowUriTableExists() {

     @Override
     public void add(CrawleableUri uri, long lastCrawlTimestamp, long nextCrawlTimestamp) {
-        mongoDB.getCollection(COLLECTION_NAME)
-        .insertOne(crawleableUriToMongoDocument(uri)
-        .append(COLUMN_TIMESTAMP_LAST_CRAWL, lastCrawlTimestamp)
-        .append(COLUMN_TIMESTAMP_NEXT_CRAWL, nextCrawlTimestamp)
-        .append(COLUMN_CRAWLING_IN_PROCESS, false)
-        .append(COLUMN_HASH_VALUE, DUMMY_HASH_VALUE)
-        );
-        LOGGER.debug("Adding URI {} to the known uri filter list", uri.toString());
+        mongoDB.getCollection(COLLECTION_NAME)
+            .insertOne(crawleableUriToMongoDocument(uri)
+                .append(COLUMN_TIMESTAMP_LAST_CRAWL, lastCrawlTimestamp)
+                .append(COLUMN_TIMESTAMP_NEXT_CRAWL, nextCrawlTimestamp)
+                .append(COLUMN_CRAWLING_IN_PROCESS, false)
+                .append(COLUMN_HASH_VALUE, DUMMY_HASH_VALUE)
+            );
+        LOGGER.debug("Adding URI {} to the known uri filter list", uri.toString());
     }
-
+
     @Override
     public void addHashValuesForUris(List<CrawleableUri> uris) {
     }
-
-
+
     public void purge() {
-        mongoDB.getCollection(COLLECTION_NAME).drop();
+        mongoDB.getCollection(COLLECTION_NAME).drop();
     }

     @Override
     public List<CrawleableUri> getOutdatedUris() {
-        // get all uris with the following property:
+        // get all uris with the following property:
         // (nextCrawlTimestamp has passed) AND (crawlingInProcess==false OR lastCrawlTimestamp is 3 times older than generalRecrawlTime)
-
-        long generalRecrawlTime = Math.max(FrontierImpl.DEFAULT_GENERAL_RECRAWL_TIME, FrontierImpl.getGeneralRecrawlTime());
-        Bson filter = Filters.and(Filters.eq("COLUMN_TIMESTAMP_NEXT_CRAWL", System.currentTimeMillis()),
-            Filters.or(
-                Filters.eq("COLUMN_CRAWLING_IN_PROCESS", false),
-                Filters.eq("COLUMN_TIMESTAMP_LAST_CRAWL", System.currentTimeMillis() - generalRecrawlTime * 3)
-            ));
+        long generalRecrawlTime = Math.max(FrontierImpl.DEFAULT_GENERAL_RECRAWL_TIME, FrontierImpl.getGeneralRecrawlTime());
+
+        Bson filter = Filters.and(Filters.lte(COLUMN_TIMESTAMP_NEXT_CRAWL, System.currentTimeMillis()),
+            Filters.or(
+                Filters.eq(COLUMN_CRAWLING_IN_PROCESS, false),
+                Filters.lte(COLUMN_TIMESTAMP_LAST_CRAWL, System.currentTimeMillis() - generalRecrawlTime * 3)
+            ));

         Iterator<Document> uriDocs = mongoDB.getCollection(COLLECTION_NAME).find(filter).iterator();
@@ -204,14 +189,11 @@ public List<CrawleableUri> getOutdatedUris() {

         // mark that the uris are in process now
         for (CrawleableUri uri : urisToRecrawl) {
-
-            BasicDBObject newDocument = new BasicDBObject();
-            newDocument.append("$set", new BasicDBObject().append(COLUMN_CRAWLING_IN_PROCESS, true));
-
-            BasicDBObject searchQuery = new BasicDBObject().append(COLUMN_URI, uri.getUri().toString());
-
-            mongoDB.getCollection(COLLECTION_NAME).updateMany(searchQuery, newDocument);
-
+            BasicDBObject newDocument = new BasicDBObject();
+            newDocument.append("$set", new BasicDBObject().append(COLUMN_CRAWLING_IN_PROCESS, true));
+            BasicDBObject searchQuery = new BasicDBObject().append(COLUMN_URI, uri.getUri().toString());
+            mongoDB.getCollection(COLLECTION_NAME).updateMany(searchQuery, newDocument);
+
         }

         // cursor.close();
@@ -224,10 +206,10 @@ public long count() {
         return 0;
     }

-    @Override
-    public Set<CrawleableUri> getUrisWithSameHashValues(Set<String> hashValuesForComparison) {
-        // TODO Auto-generated method stub
-        return null;
-    }
+    @Override
+    public Set<CrawleableUri> getUrisWithSameHashValues(Set<String> hashValuesForComparison) {
+        // TODO Auto-generated method stub
+        return null;
+    }
 }
diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/ExtendedFrontierImpl.java 
b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/ExtendedFrontierImpl.java index 0ebc5d286..0427ff5f1 100644 --- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/ExtendedFrontierImpl.java +++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/ExtendedFrontierImpl.java @@ -6,8 +6,8 @@ import java.util.Set; import org.dice_research.squirrel.data.uri.CrawleableUri; -import org.dice_research.squirrel.data.uri.filter.KnownOutDatedUriFilter; import org.dice_research.squirrel.data.uri.filter.KnownUriFilter; +import org.dice_research.squirrel.data.uri.filter.OutDatedUris; import org.dice_research.squirrel.data.uri.filter.UriFilter; import org.dice_research.squirrel.data.uri.info.URIReferences; import org.dice_research.squirrel.data.uri.norm.UriNormalizer; @@ -33,8 +33,8 @@ public class ExtendedFrontierImpl extends FrontierImpl implements ExtendedFronti */ @SuppressWarnings("unused") public ExtendedFrontierImpl(UriNormalizer normalizer, KnownUriFilter knownUriFilter, UriQueue queue, boolean doesRecrawling, - long generalRecrawlTime, long timerPeriod, UriHashCustodian uriHashCustodian, KnownOutDatedUriFilter knownOutDatedUriFilter) { - super(normalizer, knownUriFilter, queue, doesRecrawling, generalRecrawlTime, timerPeriod, uriHashCustodian, knownOutDatedUriFilter); + long generalRecrawlTime, long timerPeriod, UriHashCustodian uriHashCustodian, OutDatedUris outDatedUris) { + super(normalizer, knownUriFilter, queue, doesRecrawling, generalRecrawlTime, timerPeriod, uriHashCustodian, outDatedUris); } /** @@ -47,7 +47,7 @@ public ExtendedFrontierImpl(UriNormalizer normalizer, KnownUriFilter knownUriFil * crawled. * @param doesRecrawling used to select if URIs should be recrawled. */ - public ExtendedFrontierImpl(UriNormalizer normalizer, KnownUriFilter knownUriFilter, IpAddressBasedQueue queue, boolean doesRecrawling, KnownOutDatedUriFilter knownOutDatedUriFilter) { + public ExtendedFrontierImpl(UriNormalizer normalizer, KnownUriFilter knownUriFilter, IpAddressBasedQueue queue, boolean doesRecrawling, OutDatedUris knownOutDatedUriFilter) { super(normalizer, knownUriFilter, queue, doesRecrawling, knownOutDatedUriFilter); } @@ -62,7 +62,7 @@ public ExtendedFrontierImpl(UriNormalizer normalizer, KnownUriFilter knownUriFil * crawled. * @param doesRecrawling used to select if URIs should be recrawled. 
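+     * @param knownOutDatedUriFilter used to fetch the outdated URIs whose next crawl time has passed and which should be recrawled.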
*/ - public ExtendedFrontierImpl(UriNormalizer normalizer, KnownUriFilter knownUriFilter, URIReferences uriReferences, UriQueue queue, boolean doesRecrawling, KnownOutDatedUriFilter knownOutDatedUriFilter) { + public ExtendedFrontierImpl(UriNormalizer normalizer, KnownUriFilter knownUriFilter, URIReferences uriReferences, UriQueue queue, boolean doesRecrawling, OutDatedUris knownOutDatedUriFilter) { super(normalizer, knownUriFilter, uriReferences, queue, doesRecrawling,knownOutDatedUriFilter); } @@ -79,4 +79,4 @@ public void informAboutDeadWorker(String idOfWorker, List lstUris setIps.forEach(ip -> ipQueue.markIpAddressAsAccessible(ip)); } } -} \ No newline at end of file +} diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/FrontierImpl.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/FrontierImpl.java index 0981cb943..6ddc26297 100644 --- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/FrontierImpl.java +++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/FrontierImpl.java @@ -7,11 +7,10 @@ import java.util.Set; import java.util.Timer; import java.util.TimerTask; - import org.dice_research.squirrel.Constants; import org.dice_research.squirrel.data.uri.CrawleableUri; -import org.dice_research.squirrel.data.uri.filter.KnownOutDatedUriFilter; import org.dice_research.squirrel.data.uri.filter.KnownUriFilter; +import org.dice_research.squirrel.data.uri.filter.OutDatedUris; import org.dice_research.squirrel.data.uri.filter.SchemeBasedUriFilter; import org.dice_research.squirrel.data.uri.filter.UriFilter; import org.dice_research.squirrel.data.uri.info.URIReferences; @@ -33,272 +32,269 @@ */ public class FrontierImpl implements Frontier { - private static final Logger LOGGER = LoggerFactory.getLogger(FrontierImpl.class); - - /** - * {@link UriNormalizer} used to transform given URIs into a normal form. - */ - protected UriNormalizer normalizer; - - /** - * {@link KnownUriFilter} used to identify URIs that already have been crawled. - */ - protected KnownUriFilter knownUriFilter; - - protected KnownOutDatedUriFilter knownOutDatedUriFilter; - - /** - * {@link org.dice_research.squirrel.data.uri.info.URIReferences} used to - * identify URIs that already have been crawled. - */ - protected URIReferences uriReferences = null; - - /** - * {@link SchemeBasedUriFilter} used to identify URIs with known protocol. - */ - protected SchemeBasedUriFilter schemeUriFilter = new SchemeBasedUriFilter(); - /** - * {@link UriQueue} used to manage the URIs that should be crawled. - */ - protected UriQueue queue; - /** - * {@link UriProcessor} used to identify the type of incoming URIs: DUMP, - * SPARQL, DEREFERENCEABLE or UNKNOWN - */ - protected UriProcessor uriProcessor; - /** - * {@link GraphLogger} that can be added to log the crawled graph. - */ - protected GraphLogger graphLogger; - - /** - * Indicates whether recrawling is active. - */ - private boolean doesRecrawling; - - /** - * The timer that schedules the recrawling. - */ - private Timer timerRecrawling; - - /** - * Time (in milliseconds) after which uris will be recrawled (only used if no - * specific time is configured for a URI). - */ - private static long generalRecrawlTime; - - /** - * Time interval(in milliseconds) at which the check for outdated uris is - * performed. - */ - private long timerPeriod; - - /** - * Default value for {@link #generalRecrawlTime} (one week). 
- */ - public static final long DEFAULT_GENERAL_RECRAWL_TIME =18000 ; + private static final Logger LOGGER = LoggerFactory.getLogger(FrontierImpl.class); + + /** + * {@link UriNormalizer} used to transform given URIs into a normal form. + */ + protected UriNormalizer normalizer; + + /** + * {@link KnownUriFilter} used to identify URIs that already have been crawled. + */ + protected KnownUriFilter knownUriFilter; + + protected OutDatedUris outDatedUris; + + /** + * {@link org.dice_research.squirrel.data.uri.info.URIReferences} used to + * identify URIs that already have been crawled. + */ + protected URIReferences uriReferences = null; + + /** + * {@link SchemeBasedUriFilter} used to identify URIs with known protocol. + */ + protected SchemeBasedUriFilter schemeUriFilter = new SchemeBasedUriFilter(); + /** + * {@link UriQueue} used to manage the URIs that should be crawled. + */ + protected UriQueue queue; + /** + * {@link UriProcessor} used to identify the type of incoming URIs: DUMP, + * SPARQL, DEREFERENCEABLE or UNKNOWN + */ + protected UriProcessor uriProcessor; + /** + * {@link GraphLogger} that can be added to log the crawled graph. + */ + protected GraphLogger graphLogger; + + /** + * Indicates whether recrawling is active. + */ + private boolean doesRecrawling; + + /** + * The timer that schedules the recrawling. + */ + private Timer timerRecrawling; + + /** + * Time (in milliseconds) after which uris will be recrawled (only used if no + * specific time is configured for a URI). + */ + private static long generalRecrawlTime; + + /** + * Time interval(in milliseconds) at which the check for outdated uris is + * performed. + */ + private long timerPeriod; + + /** + * Default value for {@link #generalRecrawlTime} (one week). + */ + public static final long DEFAULT_GENERAL_RECRAWL_TIME =18000 ; /** * Default value for {@link #timerPeriod}. */ private static final long DEFAULT_TIMER_PERIOD = 18000; - /** - * Constructor. - * - * @param normalizer {@link UriNormalizer} used to transform given URIs - * into a normal form - * @param knownUriFilter {@link UriFilter} used to identify URIs that - * already have been crawled. - * @param queue {@link UriQueue} used to manage the URIs that - * should be crawled. - * @param graphLogger {@link GraphLogger} used to log graphs. - * @param doesRecrawling used to select if URIs should be recrawled. - * @param generalRecrawlTime used to select the general Time after URIs should - * be recrawled. If Value is null the default Time is - * used. - * @param timerPeriod used to select if URIs should be recrawled. - */ - public FrontierImpl(UriNormalizer normalizer, KnownUriFilter knownUriFilter, UriQueue queue, GraphLogger graphLogger, boolean doesRecrawling, - long generalRecrawlTime, long timerPeriod,KnownOutDatedUriFilter knownOutDatedUriFilter) { - this(normalizer, knownUriFilter, null, queue, graphLogger, doesRecrawling, - generalRecrawlTime, timerPeriod,knownOutDatedUriFilter); - } - - /** - * Constructor. - * - * @param normalizer {@link UriNormalizer} used to transform given URIs - * into a normal form - * @param knownUriFilter {@link UriFilter} used to identify URIs that - * already have been crawled. - * @param queue {@link UriQueue} used to manage the URIs that - * should be crawled. - * @param doesRecrawling used to select if URIs should be recrawled. - * @param generalRecrawlTime used to select the general Time after URIs should - * be recrawled. If Value is null the default Time is - * used. 
- * @param timerPeriod used to select if URIs should be recrawled. - */ - public FrontierImpl(UriNormalizer normalizer, - KnownUriFilter knownUriFilter, UriQueue queue, boolean doesRecrawling, long generalRecrawlTime, - long timerPeriod, UriHashCustodian uriHashCustodian, KnownOutDatedUriFilter knownOutDatedUriFilter) { - this(normalizer, knownUriFilter, queue, null, doesRecrawling, generalRecrawlTime, - timerPeriod, knownOutDatedUriFilter); - } - - /** - * Constructor. - * - * @param normalizer {@link UriNormalizer} used to transform given URIs into - * a normal form - * @param knownUriFilter {@link UriFilter} used to identify URIs that already - * have been crawled. - * @param uriReferences {@link URIReferences} used to manage URI references - * @param queue {@link UriQueue} used to manage the URIs that should be - * crawled. - * @param doesRecrawling Value for {@link #doesRecrawling}. - */ - public FrontierImpl(UriNormalizer normalizer, - KnownUriFilter knownUriFilter, URIReferences uriReferences, UriQueue queue, boolean doesRecrawling,KnownOutDatedUriFilter knownOutDatedUriFilter) { - this(normalizer, knownUriFilter, uriReferences, queue, null, doesRecrawling, - DEFAULT_GENERAL_RECRAWL_TIME, DEFAULT_TIMER_PERIOD, knownOutDatedUriFilter); - } - - /** - * Constructor. - * - * @param normalizer {@link UriNormalizer} used to transform given URIs into - * a normal form - * @param knownUriFilter {@link UriFilter} used to identify URIs that already - * have been crawled. - * @param queue {@link UriQueue} used to manage the URIs that should be - * crawled. - * @param doesRecrawling Value for {@link #doesRecrawling}. - */ - public FrontierImpl(UriNormalizer normalizer, - KnownUriFilter knownUriFilter, UriQueue queue, boolean doesRecrawling, KnownOutDatedUriFilter knownOutDatedUriFilter) { - this(normalizer, knownUriFilter, queue, null, doesRecrawling, - DEFAULT_GENERAL_RECRAWL_TIME, DEFAULT_TIMER_PERIOD, knownOutDatedUriFilter); - } - - /** - * Constructor. - * - * @param normalizer {@link UriNormalizer} used to transform given URIs into - * a normal form - * @param knownUriFilter {@link UriFilter} used to identify URIs that already - * have been crawled. - * @param queue {@link UriQueue} used to manage the URIs that should be - * crawled. - */ - public FrontierImpl(UriNormalizer normalizer, - KnownUriFilter knownUriFilter, UriQueue queue, KnownOutDatedUriFilter knownOutDatedUriFilter) { - this(normalizer, knownUriFilter, queue, null, false, DEFAULT_GENERAL_RECRAWL_TIME, - DEFAULT_TIMER_PERIOD, knownOutDatedUriFilter); - } - - /** - * Constructor. - * - * @param normalizer {@link UriNormalizer} used to transform given URIs - * into a normal form - * @param knownUriFilter {@link UriFilter} used to identify URIs that - * already have been crawled. - * @param uriReferences {@link URIReferences} used to manage URI references - * @param queue {@link UriQueue} used to manage the URIs that - * should be crawled. - * @param graphLogger {@link GraphLogger} used to log graphs. - * @param doesRecrawling used to select if URIs should be recrawled. - * @param generalRecrawlTime used to select the general Time after URIs should - * be recrawled. If Value is null the default Time is - * used. - * @param timerPeriod used to select if URIs should be recrawled. 
- * @param knownOutDatedUriFilter - */ - public FrontierImpl(UriNormalizer normalizer, - KnownUriFilter knownUriFilter, URIReferences uriReferences, UriQueue queue, GraphLogger graphLogger, - boolean doesRecrawling, long generalRecrawlTime, long timerPeriod, KnownOutDatedUriFilter knownOutDatedUriFilter) { - this.normalizer = normalizer; - this.knownUriFilter = knownUriFilter; - this.uriReferences = uriReferences; - this.queue = queue; - this.uriProcessor = new UriProcessor(); - this.graphLogger = graphLogger; - this.knownOutDatedUriFilter = knownOutDatedUriFilter; - - this.queue.open(); - this.doesRecrawling = doesRecrawling; - this.timerPeriod = timerPeriod; - FrontierImpl.generalRecrawlTime = generalRecrawlTime; - - if (this.doesRecrawling) { - timerRecrawling = new Timer(); - timerRecrawling.schedule(new TimerTask() { - @Override - public void run() { - List urisToRecrawl = knownOutDatedUriFilter.getUriToRecrawl(); - System.out.println("Frontier uri to recrawl: " +urisToRecrawl); - urisToRecrawl.forEach(uri -> queue.addUri(uriProcessor.recognizeUriType(uri))); - } - }, this.timerPeriod, this.timerPeriod); - } - } - - @Override - public List getNextUris() { + /** + * Constructor. + * + * @param normalizer {@link UriNormalizer} used to transform given URIs + * into a normal form + * @param knownUriFilter {@link UriFilter} used to identify URIs that + * already have been crawled. + * @param queue {@link UriQueue} used to manage the URIs that + * should be crawled. + * @param graphLogger {@link GraphLogger} used to log graphs. + * @param doesRecrawling used to select if URIs should be recrawled. + * @param generalRecrawlTime used to select the general Time after URIs should + * be recrawled. If Value is null the default Time is + * used. + * @param timerPeriod used to select if URIs should be recrawled. + */ + public FrontierImpl(UriNormalizer normalizer, KnownUriFilter knownUriFilter, UriQueue queue, GraphLogger graphLogger, boolean doesRecrawling, + long generalRecrawlTime, long timerPeriod,OutDatedUris outDatedUris) { + this(normalizer, knownUriFilter, null, queue, graphLogger, doesRecrawling, + generalRecrawlTime, timerPeriod,outDatedUris); + } + + /** + * Constructor. + * + * @param normalizer {@link UriNormalizer} used to transform given URIs + * into a normal form + * @param knownUriFilter {@link UriFilter} used to identify URIs that + * already have been crawled. + * @param queue {@link UriQueue} used to manage the URIs that + * should be crawled. + * @param doesRecrawling used to select if URIs should be recrawled. + * @param generalRecrawlTime used to select the general Time after URIs should + * be recrawled. If Value is null the default Time is + * used. + * @param timerPeriod used to select if URIs should be recrawled. + */ + public FrontierImpl(UriNormalizer normalizer, + KnownUriFilter knownUriFilter, UriQueue queue, boolean doesRecrawling, long generalRecrawlTime, + long timerPeriod, UriHashCustodian uriHashCustodian, OutDatedUris knownOutDatedUriFilter) { + this(normalizer, knownUriFilter, queue, null, doesRecrawling, generalRecrawlTime, + timerPeriod, knownOutDatedUriFilter); + } + + /** + * Constructor. + * + * @param normalizer {@link UriNormalizer} used to transform given URIs into + * a normal form + * @param knownUriFilter {@link UriFilter} used to identify URIs that already + * have been crawled. + * @param uriReferences {@link URIReferences} used to manage URI references + * @param queue {@link UriQueue} used to manage the URIs that should be + * crawled. 
+ * @param doesRecrawling Value for {@link #doesRecrawling}. + */ + public FrontierImpl(UriNormalizer normalizer, + KnownUriFilter knownUriFilter, URIReferences uriReferences, UriQueue queue, boolean doesRecrawling,OutDatedUris outDatedUris) { + this(normalizer, knownUriFilter, uriReferences, queue, null, doesRecrawling, + DEFAULT_GENERAL_RECRAWL_TIME, DEFAULT_TIMER_PERIOD, outDatedUris); + } + + /** + * Constructor. + * + * @param normalizer {@link UriNormalizer} used to transform given URIs into + * a normal form + * @param knownUriFilter {@link UriFilter} used to identify URIs that already + * have been crawled. + * @param queue {@link UriQueue} used to manage the URIs that should be + * crawled. + * @param doesRecrawling Value for {@link #doesRecrawling}. + */ + public FrontierImpl(UriNormalizer normalizer, + KnownUriFilter knownUriFilter, UriQueue queue, boolean doesRecrawling, OutDatedUris outDatedUris) { + this(normalizer, knownUriFilter, queue, null, doesRecrawling, + DEFAULT_GENERAL_RECRAWL_TIME, DEFAULT_TIMER_PERIOD, outDatedUris); + } + + /** + * Constructor. + * + * @param normalizer {@link UriNormalizer} used to transform given URIs into + * a normal form + * @param knownUriFilter {@link UriFilter} used to identify URIs that already + * have been crawled. + * @param queue {@link UriQueue} used to manage the URIs that should be + * crawled. + */ + public FrontierImpl(UriNormalizer normalizer, + KnownUriFilter knownUriFilter, UriQueue queue, OutDatedUris outDatedUris) { + this(normalizer, knownUriFilter, queue, null, false, DEFAULT_GENERAL_RECRAWL_TIME, + DEFAULT_TIMER_PERIOD, outDatedUris); + } + /** + * Constructor. + * + * @param normalizer {@link UriNormalizer} used to transform given URIs + * into a normal form + * @param knownUriFilter {@link UriFilter} used to identify URIs that + * already have been crawled. + * @param uriReferences {@link URIReferences} used to manage URI references + * @param queue {@link UriQueue} used to manage the URIs that + * should be crawled. + * @param graphLogger {@link GraphLogger} used to log graphs. + * @param doesRecrawling used to select if URIs should be recrawled. + * @param generalRecrawlTime used to select the general Time after URIs should + * be recrawled. If Value is null the default Time is + * used. + * @param timerPeriod used to select if URIs should be recrawled. 
+ * @param outDatedUris       used to retrieve the outdated URIs, i.e. URIs whose
+ *                           recrawl time has passed.
+ */
+ public FrontierImpl(UriNormalizer normalizer,
+         KnownUriFilter knownUriFilter, URIReferences uriReferences, UriQueue queue, GraphLogger graphLogger,
+         boolean doesRecrawling, long generalRecrawlTime, long timerPeriod, OutDatedUris outDatedUris) {
+     this.normalizer = normalizer;
+     this.knownUriFilter = knownUriFilter;
+     this.uriReferences = uriReferences;
+     this.queue = queue;
+     this.uriProcessor = new UriProcessor();
+     this.graphLogger = graphLogger;
+     this.outDatedUris = outDatedUris;
+
+     this.queue.open();
+     this.doesRecrawling = doesRecrawling;
+     this.timerPeriod = timerPeriod;
+     FrontierImpl.generalRecrawlTime = generalRecrawlTime;
+
+     if (this.doesRecrawling) {
+         timerRecrawling = new Timer();
+         timerRecrawling.schedule(new TimerTask() {
+             @Override
+             public void run() {
+                 List<CrawleableUri> urisToRecrawl = outDatedUris.getUriToRecrawl();
+                 LOGGER.info("Frontier URIs to recrawl: {}", urisToRecrawl);
+                 urisToRecrawl.forEach(uri -> queue.addUri(uriProcessor.recognizeUriType(uri)));
+             }
+         }, this.timerPeriod, this.timerPeriod);
+     }
+ }
+
+ @Override
+ public List<CrawleableUri> getNextUris() {
// if(terminationCheck.shouldFrontierTerminate(this)) {
// LOGGER.error("FRONTIER IS TERMINATING!", new Exception());
// }
-
- return queue.getNextUris();
- }
-
- @Override
- public void addNewUris(List uris) {
- for (CrawleableUri uri : uris) {
- addNewUri(uri);
- }
- }
-
- @Override
- public void addNewUri(CrawleableUri uri) {
- // Normalize the URI
- uri = normalizer.normalize(uri);
- // After knownUriFilter uri should be classified according to
- // UriProcessor
-
- if (knownUriFilter.isUriGood(uri)) {
- LOGGER.debug("addNewUri(" + uri + "): URI is good [" + knownUriFilter + "]");
- if (schemeUriFilter.isUriGood(uri)) {
- LOGGER.trace("addNewUri(" + uri.getUri() + "): URI schemes is OK [" + schemeUriFilter + "]");
- // Make sure that the IP is known
- try {
- uri = this.uriProcessor.recognizeInetAddress(uri);
-
- } catch (UnknownHostException e) {
- LOGGER.error("Could not recognize IP for {}, unknown host", uri.getUri());
- }
- if (uri.getIpAddress() != null) {
- queue.addUri(this.uriProcessor.recognizeUriType(uri));
- } else {
- LOGGER.error("Couldn't determine the Inet address of \"{}\". It will be ignored.", uri.getUri());
- }
- knownUriFilter.add(uri, System.currentTimeMillis());
- } else {
- LOGGER.warn("addNewUri(" + uri + "): " + uri.getUri().getScheme() + " is not supported, only "
- + schemeUriFilter.getSchemes() + ". Will not added!");
- }
-
- } else {
- LOGGER.debug("addNewUri(" + uri + "): URI is not good [" + knownUriFilter + "]. Will not be added!");
- }
- }
-
- @Override
- public void crawlingDone(List uris) {
- LOGGER.info("One worker finished his work and crawled " + uris.size() + " URIs.");
-
+     return queue.getNextUris();
+ }
+
+ @Override
+ public void addNewUris(List<CrawleableUri> uris) {
+     for (CrawleableUri uri : uris) {
+         addNewUri(uri);
+     }
+ }
+
+ @Override
+ public void addNewUri(CrawleableUri uri) {
+     // Normalize the URI
+     uri = normalizer.normalize(uri);
+     // After the knownUriFilter, the URI should be classified according to the
+     // UriProcessor
+     if (knownUriFilter.isUriGood(uri)) {
+         LOGGER.debug("addNewUri(" + uri + "): URI is good [" + knownUriFilter + "]");
+         if (schemeUriFilter.isUriGood(uri)) {
+             LOGGER.trace("addNewUri(" + uri.getUri() + "): URI scheme is OK [" + schemeUriFilter + "]");
+             // Make sure that the IP is known
+             try {
+                 uri = this.uriProcessor.recognizeInetAddress(uri);
+             } catch (UnknownHostException e) {
+                 LOGGER.error("Could not recognize IP for {}, unknown host", uri.getUri());
+             }
+             if (uri.getIpAddress() != null) {
+                 queue.addUri(this.uriProcessor.recognizeUriType(uri));
+             } else {
+                 LOGGER.error("Couldn't determine the Inet address of \"{}\". It will be ignored.", uri.getUri());
+             }
+             knownUriFilter.add(uri, System.currentTimeMillis());
+         } else {
+             LOGGER.warn("addNewUri(" + uri + "): " + uri.getUri().getScheme() + " is not supported, only "
+                     + schemeUriFilter.getSchemes() + ". Will not be added!");
+         }
+     } else {
+         LOGGER.debug("addNewUri(" + uri + "): URI is not good [" + knownUriFilter + "]. Will not be added!");
+     }
+ }
+
+ @Override
+ public void crawlingDone(List<CrawleableUri> uris) {
+     LOGGER.info("One worker finished its work and crawled " + uris.size() + " URIs.");
// List newUris = new ArrayList<>(uriMap.size());
// for (CrawleableUri uri : uriMap.keySet()) {
// newUris.addAll(uriMap.get(uri));
@@ -312,64 +308,64 @@ public void crawlingDone(List uris) {
// if (graphLogger != null) {
// graphLogger.log(new ArrayList<>(uriMap.keySet()), newUris);
// }
- */ - public UriQueue getQueue() { - return queue; - } + // If we should give the crawled IPs to the queue + if (queue instanceof IpAddressBasedQueue) { + Set ips = new HashSet<>(); + InetAddress ip; + for (CrawleableUri uri : uris) { + ip = uri.getIpAddress(); + if (ip != null) { + ips.add(ip); + } + } + ips.forEach(_ip -> ((IpAddressBasedQueue) queue).markIpAddressAsAccessible(_ip)); + } + // send list of crawled URIs to the knownUriFilter + for (CrawleableUri uri : uris) { + Long recrawlOn = (Long) uri.getData(Constants.URI_PREFERRED_RECRAWL_ON); + // If a recrawling is defined, check whether we can directly add it back to the + // queue + if ((recrawlOn != null) && (recrawlOn < System.currentTimeMillis())) { + // Create a new uri object reusing only meta data that is useful + CrawleableUri recrawlUri = new CrawleableUri(uri.getUri(), uri.getIpAddress()); + recrawlUri.addData(Constants.URI_TYPE_KEY, uri.getData(Constants.URI_TYPE_KEY)); + addNewUri(recrawlUri); + } else { + knownUriFilter.add(uri, System.currentTimeMillis()); + } + } + } + + @Override + public int getNumberOfPendingUris() { + if (queue instanceof IpAddressBasedQueue) { + return ((IpAddressBasedQueue) queue).getNumberOfBlockedIps(); + } else { + return 0; + } + } + + @Override + public boolean doesRecrawling() { + return doesRecrawling; + } + + @Override + public void close() { + timerRecrawling.cancel(); + } + + public static long getGeneralRecrawlTime() { + return generalRecrawlTime; + } + + /** + * Getter for the {@link #queue}. + * + * @return The waiting queue for the URIs. + */ + public UriQueue getQueue() { + return queue; + } } diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/FrontierQueryGenerator.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/FrontierQueryGenerator.java index ef986dfe5..1fe72942d 100644 --- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/FrontierQueryGenerator.java +++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/FrontierQueryGenerator.java @@ -10,7 +10,7 @@ import org.apache.jena.query.QueryFactory; public class FrontierQueryGenerator { - /** + /** * The instance of the class QueryGenerator. */ private static final FrontierQueryGenerator instance = new FrontierQueryGenerator(); @@ -24,7 +24,7 @@ private FrontierQueryGenerator() { /** * Getter for {@link #instance}. * - * @return instannce of the class. + * @return instance of the class. */ public static FrontierQueryGenerator getInstance() { return instance; @@ -85,97 +85,46 @@ public String getAddQuery(String graphId, Collection listBufferedTriples * It will return triples with time stamp contained in the default graph. * @return All triples with time stamp in the default graph. 
*/ - + public Query getOutdatedUrisQuery() { - return getOutdatedUrisQuery(null, true); + return getOutdatedUrisQuery(null); } - public Query getOutdatedUrisQuery(String graphID, boolean defaultGraph) { + public Query getOutdatedUrisQuery(String graphUri) { StringBuilder stringBuilder = new StringBuilder(); - stringBuilder.append("PREFIX sq: \n" + - "PREFIX prov: \n" + - "PREFIX xsd: " - + "SELECT ?uri WHERE { \n "); + stringBuilder.append("PREFIX sq: \n" + + "PREFIX prov: \n" + + "PREFIX xsd: " + + "SELECT ?uri WHERE { \n "); // + "SELECT ?uri WHERE { \n "); - if (!defaultGraph) { - stringBuilder.append("GRAPH <"); - stringBuilder.append(graphID); - stringBuilder.append("> { "); - } - stringBuilder.append("{\n" + - "SELECT ?uri ?endtime (NOW() - (?endtime) AS ?diff)\n" + - "WHERE{\n" + - "\n" + - " {\n" + - " SELECT ?uri (MAX(?timestamp) as ?endtime)\n" + - " WHERE\n" + - " { \n" + - " ?s sq:crawled ?uri ;\n" + - " prov:endedAtTime ?timestamp.\n" + - "\n" + - " }\n" + - " GROUP BY ?uri\n" + - " } \n" + - "}\n" + - "}\n" + - "FILTER(?diff > \"18000\"^^xsd:double)\n" + - ""); - if (!defaultGraph) { - stringBuilder.append("}"); - } - - // stringBuilder.append("}GROUP BY ?uri"); - stringBuilder.append("}"); - - Query query = QueryFactory.create(stringBuilder.toString()); - return query; - } - - public Query getSelectQuery() { - return getSelectQuery(null, true); - } - /** - * Return a select query for the given graphID or default graph. - * It will return all triples contained in the graph. - * @return All triples contained in the default graph. - * @param graphID The id of the graph from which you want to select. - * @param defaultGraph Identify if query is for the default graph - * @return All triples contained in the graph. - */ - public Query getSelectQuery(String graphID, boolean defaultGraph) { - StringBuilder stringBuilder = new StringBuilder(); - stringBuilder.append("SELECT ?subject ?predicate ?object WHERE { "); - if (!defaultGraph) { + /* if (!defaultGraph) { stringBuilder.append("GRAPH <"); stringBuilder.append(graphID); stringBuilder.append("> { "); - } - stringBuilder.append("?subject ?predicate ?object "); - if (!defaultGraph) { - stringBuilder.append("} "); - } + }*/ + stringBuilder.append("{\n" + + "SELECT ?uri ?endtime (NOW() - (?endtime) AS ?diff)\n" + + "WHERE{\n" + + "\n" + + " {\n" + + " SELECT ?uri (MAX(?timestamp) as ?endtime)\n" + + " WHERE\n" + + " { \n" + + " ?s sq:crawled ?uri ;\n" + + " prov:endedAtTime ?timestamp.\n" + + "\n" + + " }\n" + + " GROUP BY ?uri\n" + + " } \n" + + "}\n" + + "}\n" + + "FILTER(?diff > \"18000\"^^xsd:double)\n" + + ""); stringBuilder.append("}"); + Query query = QueryFactory.create(stringBuilder.toString()); return query; } - /** - * Return a select query for the given graphID. - * It will return all triples contained in the graph. - * @param graphID The id of the graph from which you want to select. - * @return All triples contained in the graph. - */ - public Query getSelectQuery(String graphID) { - return getSelectQuery(graphID, false); - } - - /** - * Formats the node for a query - * - * @param node The node which should formated - * @return a robust representation of the node - *
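Written out as one query, with the namespace IRIs that the concatenated literal above leaves implicit spelled out, the generator produces roughly the following. The prov: and xsd: prefixes are the standard PROV-O and XML Schema namespaces; the sq: IRI is an assumption here (Squirrel's own vocabulary, not shown in this hunk). The FILTER threshold 18000 mirrors DEFAULT_GENERAL_RECRAWL_TIME in FrontierImpl, and comparing NOW() - ?endtime with an xsd:double relies on the endpoint (Virtuoso in this setup) coercing the duration to a number.

    PREFIX sq:   <http://w3id.org/squirrel/vocab#>   # assumed IRI, not spelled out in the diff
    PREFIX prov: <http://www.w3.org/ns/prov#>
    PREFIX xsd:  <http://www.w3.org/2001/XMLSchema#>

    SELECT ?uri WHERE {
      {
        SELECT ?uri ?endtime (NOW() - ?endtime AS ?diff) WHERE {
          {
            SELECT ?uri (MAX(?timestamp) AS ?endtime) WHERE {
              ?s sq:crawled ?uri ;
                 prov:endedAtTime ?timestamp .
            }
            GROUP BY ?uri
          }
        }
      }
      FILTER(?diff > "18000"^^xsd:double)
    }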

- * Note: Should be updated in relation to the robustness of parsing. - */ public static String formatNodeToString(Node node) { StringBuilder stringBuilder = new StringBuilder(); if (node.isURI()) { diff --git a/squirrel.frontier/src/test/java/org/dice_research/squirrel/frontier/impl/FrontierImplTest.java b/squirrel.frontier/src/test/java/org/dice_research/squirrel/frontier/impl/FrontierImplTest.java index 8c4711dc8..9940421ea 100644 --- a/squirrel.frontier/src/test/java/org/dice_research/squirrel/frontier/impl/FrontierImplTest.java +++ b/squirrel.frontier/src/test/java/org/dice_research/squirrel/frontier/impl/FrontierImplTest.java @@ -8,13 +8,14 @@ import java.net.URI; import java.util.ArrayList; import java.util.List; + import org.dice_research.squirrel.Constants; import org.dice_research.squirrel.MongoDBBasedTest; -import org.dice_research.squirrel.configurator.SparqlConfiguration; +import org.dice_research.squirrel.configurator.SparqlConnector; import org.dice_research.squirrel.data.uri.CrawleableUri; import org.dice_research.squirrel.data.uri.CrawleableUriFactory4Tests; import org.dice_research.squirrel.data.uri.UriType; -import org.dice_research.squirrel.data.uri.filter.KnownOutDatedUriFilter; +import org.dice_research.squirrel.data.uri.filter.OutDatedUris; import org.dice_research.squirrel.data.uri.filter.MongoDBKnowUriFilter; import org.dice_research.squirrel.data.uri.norm.NormalizerImpl; import org.dice_research.squirrel.queue.ipbased.MongoDBIpBasedQueue; @@ -22,189 +23,170 @@ import org.junit.Assert; import org.junit.Before; import org.junit.Test; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; @SuppressWarnings("deprecation") public class FrontierImplTest { - private static FrontierImpl frontier; - private static MongoDBIpBasedQueue queue; - private static MongoDBKnowUriFilter filter; - private static List uris = new ArrayList(); - private static CrawleableUriFactory4Tests cuf = new CrawleableUriFactory4Tests(); - private KnownOutDatedUriFilter knownOutDatedUriFilter = SparqlConfiguration.create("http://localhost:8890/sparql-auth", "dba", "pw123"); - - @Before - public void setUp() throws Exception { - - - MongoDBBasedTest.setUpMDB(); - - filter = new MongoDBKnowUriFilter("localhost", 58027); - queue = new MongoDBIpBasedQueue("localhost", 58027); - filter.open(); - queue.open(); - - frontier = new FrontierImpl(new NormalizerImpl(), filter, queue,true, 18000, 18000, null, knownOutDatedUriFilter); - - uris.add(cuf.create(new URI("http://dbpedia.org/resource/New_York"), InetAddress.getByName("127.0.0.1"), - UriType.DEREFERENCEABLE)); - uris.add(cuf.create(new URI("http://dbpedia.org/resource/Moscow"), InetAddress.getByName("127.0.0.1"), - UriType.DEREFERENCEABLE)); - } - - @Test - public void getNextUris() throws Exception { - queue.addCrawleableUri(uris.get(1)); - - List nextUris = frontier.getNextUris(); - List assertion = new ArrayList(); - assertion.add(uris.get(1)); - - assertEquals("Should be dbr:New_York", assertion, nextUris); - } - - @Test - public void addNewUris() throws Exception { - queue.purge(); - filter.purge(); - frontier.addNewUris(uris); - List nextUris = frontier.getNextUris(); - - List assertion = new ArrayList(); - assertion.add(cuf.create(new URI("http://dbpedia.org/resource/New_York"), - InetAddress.getByName("194.109.129.58"), UriType.DEREFERENCEABLE)); - assertion.add(cuf.create(new URI("http://dbpedia.org/resource/Moscow"), InetAddress.getByName("194.109.129.58"), - UriType.DEREFERENCEABLE)); - - assertEquals("Should be the same as uris array", 
assertion, nextUris); - } - - @Test - public void addNewUri() throws Exception { - CrawleableUri uri_1 = cuf.create(new URI("http://dbpedia.org/resource/Tom_Lazarus"), null, UriType.UNKNOWN); - frontier.addNewUri(uri_1); - List nextUris = frontier.getNextUris(); - List assertion = new ArrayList<>(); - assertion.add(cuf.create(new URI("http://dbpedia.org/resource/Tom_Lazarus"), - InetAddress.getByName("194.109.129.58"), UriType.DEREFERENCEABLE)); - assertEquals(assertion, nextUris); - } - - @Test - public void crawlingDone() throws Exception { - List crawledUris = new ArrayList<>(); - CrawleableUri uri_1 = cuf.create(new URI("http://dbpedia.org/resource/New_York"), - InetAddress.getByName("127.0.0.1"), UriType.DEREFERENCEABLE); - CrawleableUri uri_2 = cuf.create(new URI("http://dbpedia.org/resource/Moscow"), - InetAddress.getByName("127.0.0.1"), UriType.DEREFERENCEABLE); - - crawledUris.add(uri_1); - crawledUris.add(uri_2); - - // frontier.addNewUris(crawledUris); - // filter.add(uri_1, 100); - - frontier.crawlingDone(crawledUris); - assertFalse("uri_1 has been already crawled", frontier.knownUriFilter.isUriGood(uri_1)); - } - - @Test - public void getNumberOfPendingUris() throws Exception { - frontier.addNewUris(uris); - List nextUris = frontier.getNextUris(); - int numberOfPendingUris = frontier.getNumberOfPendingUris(); - assertEquals(1, numberOfPendingUris); - - numberOfPendingUris = frontier.getNumberOfPendingUris(); - assertEquals(2, nextUris.size()); - } - - /* - * see https://github.com/dice-group/Squirrel/issues/47 - */ - //@Test - public void simlpeRecrawling() throws Exception { - // Add the URIs to the frontier - List uris = new ArrayList<>(); - CrawleableUri uri_1 = cuf.create(new URI("http://dbpedia.org/resource/uriThatShouldBeRecrawled"), - InetAddress.getByName("127.0.0.1"), UriType.DEREFERENCEABLE); - CrawleableUri uri_2 = cuf.create(new URI("http://dbpedia.org/resource/normalUri"), - InetAddress.getByName("127.0.0.1"), UriType.DEREFERENCEABLE); - uris.add(uri_1); - uris.add(uri_2); - - frontier.addNewUris(uris); - - List nextUris = frontier.getNextUris(); - for (CrawleableUri uri : nextUris) { - Assert.assertTrue(uris.contains(uri)); - } - for (CrawleableUri uri : uris) { - Assert.assertTrue(nextUris.contains(uri)); - } - - // Set the first URI as recrawlable - for (CrawleableUri uri : nextUris) { - if(uri.getUri().equals(uri_1.getUri())) { - uri.addData(Constants.URI_PREFERRED_RECRAWL_ON, System.currentTimeMillis() - 1); - } - } - - frontier.crawlingDone(uris); - - uris.add(uri_1); - uris.add(uri_2); - - nextUris = frontier.getNextUris(); - Assert.assertNotNull(nextUris); - assertTrue("uri_1 has been expected but couldn't be found", nextUris.contains(uri_1)); - Assert.assertEquals(1, nextUris.size()); - assertFalse("uri_2 has been found but was not expected", nextUris.contains(uri_2)); - } - - @Test - public void RecrawlingTest() throws Exception { - // Add the URIs to the frontier - List uris = new ArrayList<>(); - CrawleableUri uri_1 = cuf.create(new URI("http://dbpedia.org/resource/uriThatShouldBeRecrawled")); - uri_1.addData("endedAtTime", "2019-07-06T17:04:02.864Z"); - CrawleableUri uri_2 = cuf.create(new URI("http://dbpedia.org/resource/normalUri")); - uri_2.addData("endedAtTime", "2019-07-06T19:38:02.864Z"); - uris.add(uri_1); - uris.add(uri_2); - frontier.addNewUris(uris); - List nextUris = frontier.getNextUris(); - for (CrawleableUri uri : nextUris) { - Assert.assertTrue(uris.contains(uri)); - } - for (CrawleableUri uri : uris) { - 
Assert.assertTrue(nextUris.contains(uri)); - } - - // Set the first URI as recrawlable - for (CrawleableUri uri : nextUris) { - if(uri.getUri().equals(uri_1.getUri())) { - uri.addData(Constants.URI_PREFERRED_RECRAWL_ON, System.currentTimeMillis() - 1); - } - } - Assert.assertNotNull(nextUris); - assertTrue("uri_1 has been expected but couldn't be found", nextUris.contains(uri_1)); - Assert.assertEquals(2, nextUris.size()); - } - - @After - public void tearDown() throws Exception { - filter.purge(); - queue.purge(); - String rethinkDockerStopCommand = "docker stop squirrel-test-frontierimpl"; - Process p = Runtime.getRuntime().exec(rethinkDockerStopCommand); - p.waitFor(); - String rethinkDockerRmCommand = "docker rm squirrel-test-frontierimpl"; - p = Runtime.getRuntime().exec(rethinkDockerRmCommand); - p.waitFor(); - } + private static FrontierImpl frontier; + private static MongoDBIpBasedQueue queue; + private static MongoDBKnowUriFilter filter; + private static List uris = new ArrayList(); + private static CrawleableUriFactory4Tests cuf = new CrawleableUriFactory4Tests(); + private OutDatedUris outDatedUris = SparqlConnector.create("http://localhost:8890/sparql-auth", "dba", "pw123"); + + @Before + public void setUp() throws Exception { + MongoDBBasedTest.setUpMDB(); + filter = new MongoDBKnowUriFilter("localhost", 58027); + queue = new MongoDBIpBasedQueue("localhost", 58027); + filter.open(); + queue.open(); + frontier = new FrontierImpl(new NormalizerImpl(), filter, queue, true, 18000, 18000, null, outDatedUris); + uris.add(cuf.create(new URI("http://dbpedia.org/resource/New_York"), InetAddress.getByName("127.0.0.1"), + UriType.DEREFERENCEABLE)); + uris.add(cuf.create(new URI("http://dbpedia.org/resource/Moscow"), InetAddress.getByName("127.0.0.1"), + UriType.DEREFERENCEABLE)); + } + + @Test + public void getNextUris() throws Exception { + queue.addCrawleableUri(uris.get(1)); + List nextUris = frontier.getNextUris(); + List assertion = new ArrayList(); + assertion.add(uris.get(1)); + assertEquals("Should be dbr:New_York", assertion, nextUris); + } + + @Test + public void addNewUris() throws Exception { + queue.purge(); + filter.purge(); + frontier.addNewUris(uris); + List nextUris = frontier.getNextUris(); + List assertion = new ArrayList(); + assertion.add(cuf.create(new URI("http://dbpedia.org/resource/New_York"), + InetAddress.getByName("194.109.129.58"), UriType.DEREFERENCEABLE)); + assertion.add(cuf.create(new URI("http://dbpedia.org/resource/Moscow"), InetAddress.getByName("194.109.129.58"), + UriType.DEREFERENCEABLE)); + assertEquals("Should be the same as uris array", assertion, nextUris); + } + + @Test + public void addNewUri() throws Exception { + CrawleableUri uri_1 = cuf.create(new URI("http://dbpedia.org/resource/Tom_Lazarus"), null, UriType.UNKNOWN); + frontier.addNewUri(uri_1); + List nextUris = frontier.getNextUris(); + List assertion = new ArrayList<>(); + assertion.add(cuf.create(new URI("http://dbpedia.org/resource/Tom_Lazarus"), + InetAddress.getByName("194.109.129.58"), UriType.DEREFERENCEABLE)); + assertEquals(assertion, nextUris); + } + + @Test + public void crawlingDone() throws Exception { + List crawledUris = new ArrayList<>(); + CrawleableUri uri_1 = cuf.create(new URI("http://dbpedia.org/resource/New_York"), + InetAddress.getByName("127.0.0.1"), UriType.DEREFERENCEABLE); + CrawleableUri uri_2 = cuf.create(new URI("http://dbpedia.org/resource/Moscow"), + InetAddress.getByName("127.0.0.1"), UriType.DEREFERENCEABLE); + crawledUris.add(uri_1); + 
crawledUris.add(uri_2); + + // frontier.addNewUris(crawledUris); + // filter.add(uri_1, 100); + + frontier.crawlingDone(crawledUris); + assertFalse("uri_1 has been already crawled", frontier.knownUriFilter.isUriGood(uri_1)); + } + + @Test + public void getNumberOfPendingUris() throws Exception { + frontier.addNewUris(uris); + List nextUris = frontier.getNextUris(); + int numberOfPendingUris = frontier.getNumberOfPendingUris(); + assertEquals(1, numberOfPendingUris); + numberOfPendingUris = frontier.getNumberOfPendingUris(); + assertEquals(2, nextUris.size()); + } + + /* + * see https://github.com/dice-group/Squirrel/issues/47 + */ + //@Test + public void simlpeRecrawling() throws Exception { + // Add the URIs to the frontier + List uris = new ArrayList<>(); + CrawleableUri uri_1 = cuf.create(new URI("http://dbpedia.org/resource/uriThatShouldBeRecrawled"), + InetAddress.getByName("127.0.0.1"), UriType.DEREFERENCEABLE); + CrawleableUri uri_2 = cuf.create(new URI("http://dbpedia.org/resource/normalUri"), + InetAddress.getByName("127.0.0.1"), UriType.DEREFERENCEABLE); + uris.add(uri_1); + uris.add(uri_2); + frontier.addNewUris(uris); + List nextUris = frontier.getNextUris(); + for (CrawleableUri uri : nextUris) { + Assert.assertTrue(uris.contains(uri)); + } + for (CrawleableUri uri : uris) { + Assert.assertTrue(nextUris.contains(uri)); + } + // Set the first URI as recrawlable + for (CrawleableUri uri : nextUris) { + if (uri.getUri().equals(uri_1.getUri())) { + uri.addData(Constants.URI_PREFERRED_RECRAWL_ON, System.currentTimeMillis() - 1); + } + } + frontier.crawlingDone(uris); + uris.add(uri_1); + uris.add(uri_2); + nextUris = frontier.getNextUris(); + Assert.assertNotNull(nextUris); + assertTrue("uri_1 has been expected but couldn't be found", nextUris.contains(uri_1)); + Assert.assertEquals(1, nextUris.size()); + assertFalse("uri_2 has been found but was not expected", nextUris.contains(uri_2)); + } + + @Test + public void RecrawlingTest() throws Exception { + // Add the URIs to the frontier + List uris = new ArrayList<>(); + CrawleableUri uri_1 = cuf.create(new URI("http://dbpedia.org/resource/uriThatShouldBeRecrawled")); + uri_1.addData("endedAtTime", "2019-07-06T17:04:02.864Z"); + CrawleableUri uri_2 = cuf.create(new URI("http://dbpedia.org/resource/normalUri")); + uri_2.addData("endedAtTime", "2019-07-06T19:38:02.864Z"); + uris.add(uri_1); + uris.add(uri_2); + frontier.addNewUris(uris); + List nextUris = frontier.getNextUris(); + for (CrawleableUri uri : nextUris) { + Assert.assertTrue(uris.contains(uri)); + } + for (CrawleableUri uri : uris) { + Assert.assertTrue(nextUris.contains(uri)); + } + + // Set the first URI as recrawlable + for (CrawleableUri uri : nextUris) { + if (uri.getUri().equals(uri_1.getUri())) { + uri.addData(Constants.URI_PREFERRED_RECRAWL_ON, System.currentTimeMillis() - 1); + } + } + Assert.assertNotNull(nextUris); + assertTrue("uri_1 has been expected but couldn't be found", nextUris.contains(uri_1)); + Assert.assertEquals(2, nextUris.size()); + } + + @After + public void tearDown() throws Exception { + filter.purge(); + queue.purge(); + String rethinkDockerStopCommand = "docker stop squirrel-test-frontierimpl"; + Process p = Runtime.getRuntime().exec(rethinkDockerStopCommand); + p.waitFor(); + String rethinkDockerRmCommand = "docker rm squirrel-test-frontierimpl"; + p = Runtime.getRuntime().exec(rethinkDockerRmCommand); + p.waitFor(); + } } diff --git a/squirrel.frontier/src/test/java/org/dice_research/squirrel/seed/generator/impl/CkanSeedGeneratorImplTest.java 
b/squirrel.frontier/src/test/java/org/dice_research/squirrel/seed/generator/impl/CkanSeedGeneratorImplTest.java index 0f98da824..073f40843 100644 --- a/squirrel.frontier/src/test/java/org/dice_research/squirrel/seed/generator/impl/CkanSeedGeneratorImplTest.java +++ b/squirrel.frontier/src/test/java/org/dice_research/squirrel/seed/generator/impl/CkanSeedGeneratorImplTest.java @@ -1,8 +1,8 @@ package org.dice_research.squirrel.seed.generator.impl; -import org.dice_research.squirrel.configurator.SparqlConfiguration; +import org.dice_research.squirrel.configurator.SparqlConnector; import org.dice_research.squirrel.data.uri.filter.InMemoryKnownUriFilter; -import org.dice_research.squirrel.data.uri.filter.KnownOutDatedUriFilter; +import org.dice_research.squirrel.data.uri.filter.OutDatedUris; import org.dice_research.squirrel.data.uri.norm.NormalizerImpl; import org.dice_research.squirrel.frontier.Frontier; import org.dice_research.squirrel.frontier.impl.FrontierImpl; @@ -22,11 +22,11 @@ public class CkanSeedGeneratorImplTest extends TestCase { private CkanSeedGeneratorImpl ckanSeedGenerator; private IpAddressBasedQueue queue; private Frontier frontier; - private KnownOutDatedUriFilter knownOutDatedUriFilter = SparqlConfiguration.create("http://localhost:8890/sparql-auth", "dba", "pw123"); + private OutDatedUris outDatedUris = SparqlConnector.create("http://localhost:8890/sparql-auth", "dba", "pw123"); public void setUp() { queue = new InMemoryQueue(); - frontier = new FrontierImpl(new NormalizerImpl() , new InMemoryKnownUriFilter(false, -1), queue,knownOutDatedUriFilter); + frontier = new FrontierImpl(new NormalizerImpl() , new InMemoryKnownUriFilter(false, -1), queue, outDatedUris); ckanSeedGenerator = new CkanSeedGeneratorImpl(frontier); } diff --git a/squirrel.worker/src/main/java/org/dice_research/squirrel/components/WorkerComponent.java b/squirrel.worker/src/main/java/org/dice_research/squirrel/components/WorkerComponent.java index a8b438ed5..c413f7961 100644 --- a/squirrel.worker/src/main/java/org/dice_research/squirrel/components/WorkerComponent.java +++ b/squirrel.worker/src/main/java/org/dice_research/squirrel/components/WorkerComponent.java @@ -64,21 +64,17 @@ public void init() throws Exception { super.init(); UriSetRequest uriSetReq = new UriSetRequest(worker.getUri(), false); - uriSetRequest = serializer.serialize(uriSetReq); - deduplicationActive = EnvVariables.getBoolean(Constants.DEDUPLICATION_ACTIVE_KEY, - Constants.DEFAULT_DEDUPLICATION_ACTIVE, LOGGER); - + Constants.DEFAULT_DEDUPLICATION_ACTIVE, LOGGER); senderFrontier = DataSenderImpl.builder().queue(outgoingDataQueuefactory, Constants.FRONTIER_QUEUE_NAME) - .build(); - + .build(); if (deduplicationActive) { senderDeduplicator = DataSenderImpl.builder() - .queue(outgoingDataQueuefactory, Constants.DEDUPLICATOR_QUEUE_NAME).build(); + .queue(outgoingDataQueuefactory, Constants.DEDUPLICATOR_QUEUE_NAME).build(); } clientFrontier = RabbitRpcClient.create(outgoingDataQueuefactory.getConnection(), - Constants.FRONTIER_QUEUE_NAME); + Constants.FRONTIER_QUEUE_NAME); if (worker.sendsAliveMessages()) { timerAliveMessages.schedule(new TimerTask() { @@ -214,4 +210,4 @@ public boolean doesRecrawling() { return false; } -} \ No newline at end of file +} diff --git a/squirrel.worker/src/main/java/org/dice_research/squirrel/sink/impl/sparql/TripleBuffer.java b/squirrel.worker/src/main/java/org/dice_research/squirrel/sink/impl/sparql/TripleBuffer.java index 287d6491d..8c17d8bb4 100644 --- 
a/squirrel.worker/src/main/java/org/dice_research/squirrel/sink/impl/sparql/TripleBuffer.java +++ b/squirrel.worker/src/main/java/org/dice_research/squirrel/sink/impl/sparql/TripleBuffer.java @@ -11,7 +11,7 @@ public class TripleBuffer { protected final List buffer; protected final int bufferSize; protected long numberOfTriples = 0; - + public TripleBuffer() { this(AbstractBufferingTripleBasedSink.DEFAULT_BUFFER_SIZE); } @@ -41,4 +41,4 @@ public void sendTriples(AbstractBufferingTripleBasedSink sink, CrawleableUri uri public long getNumberOfTriples() { return numberOfTriples; } -} \ No newline at end of file +} From bb00ad4b8e0cfdb89738f43afc01ce482e016bd6 Mon Sep 17 00:00:00 2001 From: param-jot Date: Wed, 16 Oct 2019 16:43:54 +0200 Subject: [PATCH 016/102] changes related to PR --- spring-config/frontier-context.xml | 2 +- .../squirrel/components/FrontierComponent.java | 2 +- ...qlConnector.java => SparqlhostConnector.java} | 16 +++++++--------- .../frontier/impl/FrontierQueryGenerator.java | 13 +++++++------ .../squirrel/frontier/impl/FrontierImplTest.java | 4 ++-- .../impl/CkanSeedGeneratorImplTest.java | 4 ++-- 6 files changed, 20 insertions(+), 21 deletions(-) rename squirrel.frontier/src/main/java/org/dice_research/squirrel/configurator/{SparqlConnector.java => SparqlhostConnector.java} (87%) diff --git a/spring-config/frontier-context.xml b/spring-config/frontier-context.xml index bf1ad39f4..61566db56 100644 --- a/spring-config/frontier-context.xml +++ b/spring-config/frontier-context.xml @@ -54,7 +54,7 @@ - + diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/components/FrontierComponent.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/components/FrontierComponent.java index 4c904f9a4..b6196a871 100644 --- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/components/FrontierComponent.java +++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/components/FrontierComponent.java @@ -77,7 +77,7 @@ public void init() throws Exception { serializer = new GzipJavaUriSerializer(); MongoConfiguration mongoConfiguration = MongoConfiguration.getMDBConfiguration(); WebConfiguration webConfiguration = WebConfiguration.getWebConfiguration(); - SparqlConnector sp = SparqlConnector.create("http://localhost:8890/sparql-auth", "dba", "pw123"); + SparqlhostConnector sp = SparqlhostConnector.create("http://localhost:8890/sparql-auth", "dba", "pw123"); hasUrisToCrawl = new HashMap(); if (mongoConfiguration != null) { diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/configurator/SparqlConnector.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/configurator/SparqlhostConnector.java similarity index 87% rename from squirrel.frontier/src/main/java/org/dice_research/squirrel/configurator/SparqlConnector.java rename to squirrel.frontier/src/main/java/org/dice_research/squirrel/configurator/SparqlhostConnector.java index 02afa0b07..723645296 100644 --- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/configurator/SparqlConnector.java +++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/configurator/SparqlhostConnector.java @@ -16,7 +16,6 @@ import org.apache.http.impl.client.AbstractHttpClient; import org.apache.http.protocol.HttpContext; import org.apache.jena.atlas.web.auth.HttpAuthenticator; -import org.apache.jena.graph.Triple; import org.apache.jena.query.Query; import org.apache.jena.query.QueryExecution; import org.apache.jena.query.QuerySolution; @@ -25,15 +24,14 @@ import 
org.apache.jena.sparql.core.DatasetDescription; import org.dice_research.squirrel.data.uri.CrawleableUri; import org.dice_research.squirrel.data.uri.filter.OutDatedUris; -import org.dice_research.squirrel.data.uri.filter.KnownUriFilter; import org.dice_research.squirrel.frontier.impl.FrontierQueryGenerator; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @SuppressWarnings("deprecation") -public class SparqlConnector implements OutDatedUris { +public class SparqlhostConnector implements OutDatedUris { - private static final Logger LOGGER = LoggerFactory.getLogger(SparqlConnector.class); + private static final Logger LOGGER = LoggerFactory.getLogger(SparqlhostConnector.class); /** * The Query factory used to query the SPARQL endpoint. @@ -42,17 +40,17 @@ public class SparqlConnector implements OutDatedUris { protected UpdateExecutionFactory updateExecFactory = null; List urisToRecrawl = new ArrayList<>(); - public SparqlConnector(QueryExecutionFactory queryExecFactory, UpdateExecutionFactory updateExecFactory) { + public SparqlhostConnector(QueryExecutionFactory queryExecFactory, UpdateExecutionFactory updateExecFactory) { this.queryExecFactory = queryExecFactory; this.updateExecFactory = updateExecFactory; LOGGER.info("Connected"); } - public static SparqlConnector create(String sparqlEndpointUrl) { + public static SparqlhostConnector create(String sparqlEndpointUrl) { return create(sparqlEndpointUrl, null, null); } - public static SparqlConnector create(String sparqlEndpointUrl, String username, String password) { + public static SparqlhostConnector create(String sparqlEndpointUrl, String username, String password) { QueryExecutionFactory queryExecFactory = null; UpdateExecutionFactory updateExecFactory = null; if (username != null && password != null) { @@ -89,13 +87,13 @@ public void setCredentials(AuthScope arg0, Credentials arg1) { queryExecFactory = new QueryExecutionFactoryHttp(sparqlEndpointUrl); updateExecFactory = new UpdateExecutionFactoryHttp(sparqlEndpointUrl); } - return new SparqlConnector(queryExecFactory, updateExecFactory); + return new SparqlhostConnector(queryExecFactory, updateExecFactory); } @Override public List getUriToRecrawl() { - SparqlConnector.create("http://localhost:8890/sparql-auth", "dba", "pw123"); + SparqlhostConnector.create("http://localhost:8890/sparql-auth", "dba", "pw123"); Query getOutdatedUrisQuery = FrontierQueryGenerator.getInstance().getOutdatedUrisQuery(); System.out.println(getOutdatedUrisQuery); QueryExecution qe = queryExecFactory.createQueryExecution(getOutdatedUrisQuery); diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/FrontierQueryGenerator.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/FrontierQueryGenerator.java index 1fe72942d..fe52dde03 100644 --- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/FrontierQueryGenerator.java +++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/FrontierQueryGenerator.java @@ -84,23 +84,24 @@ public String getAddQuery(String graphId, Collection listBufferedTriples * Return a time stamp query for the default graph. * It will return triples with time stamp contained in the default graph. * @return All triples with time stamp in the default graph. 
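As written above, getUriToRecrawl opens a brand-new authenticated connection on every call (the result of SparqlhostConnector.create is even discarded) and prints the query via System.out. A sketch of the same method that reuses the factory the connector was constructed with; it assumes CrawleableUri's single-argument URI constructor and leaves handling of malformed result URIs aside:

    import java.net.URI;
    import java.util.ArrayList;
    import java.util.List;
    import org.apache.jena.query.Query;
    import org.apache.jena.query.QueryExecution;
    import org.apache.jena.query.QuerySolution;
    import org.apache.jena.query.ResultSet;
    import org.dice_research.squirrel.data.uri.CrawleableUri;

    // Sketch: use the injected queryExecFactory instead of re-creating a connector
    // with hardcoded credentials, and close the QueryExecution deterministically.
    @Override
    public List<CrawleableUri> getUriToRecrawl() {
        Query query = FrontierQueryGenerator.getInstance().getOutdatedUrisQuery();
        LOGGER.debug("Outdated-URI query: {}", query);
        List<CrawleableUri> urisToRecrawl = new ArrayList<>();
        try (QueryExecution qe = queryExecFactory.createQueryExecution(query)) {
            ResultSet rs = qe.execSelect();
            while (rs.hasNext()) {
                QuerySolution sol = rs.nextSolution();
                // "uri" is the variable projected by the generator's SELECT
                urisToRecrawl.add(new CrawleableUri(URI.create(sol.getResource("uri").getURI())));
            }
        }
        return urisToRecrawl;
    }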
 */
 public Query getOutdatedUrisQuery() {
- return getOutdatedUrisQuery(null);
+ return getOutdatedUrisQuery(null, true);
 }
- public Query getOutdatedUrisQuery(String graphUri) {
+ public Query getOutdatedUrisQuery(String graphUri, boolean defaultGraph) {
 StringBuilder stringBuilder = new StringBuilder();
 stringBuilder.append("PREFIX sq: \n" +
     "PREFIX prov: \n" +
     "PREFIX xsd: "
     + "SELECT ?uri WHERE { \n ");
// + "SELECT ?uri WHERE { \n ");
- /* if (!defaultGraph) {
+ if (!defaultGraph) {
 stringBuilder.append("GRAPH <");
- stringBuilder.append(graphID);
+ stringBuilder.append(graphUri);
 stringBuilder.append("> { ");
- }*/
+ }
 stringBuilder.append("{\n" +
     "SELECT ?uri ?endtime (NOW() - (?endtime) AS ?diff)\n" +
     "WHERE{\n" +
     "\n" +
     " {\n" +
     " SELECT ?uri (MAX(?timestamp) as ?endtime)\n" +
     " WHERE\n" +
     " { \n" +
     " ?s sq:crawled ?uri ;\n" +
     " prov:endedAtTime ?timestamp.\n" +
     "\n" +
     " }\n" +
     " GROUP BY ?uri\n" +
     " } \n" +
     "}\n" +
     "}\n" +
     "FILTER(?diff > \"18000\"^^xsd:double)\n" +
     "");
+ if (!defaultGraph) {
+ stringBuilder.append("} ");
+ }
 stringBuilder.append("}");
+
 Query query = QueryFactory.create(stringBuilder.toString());
 return query;
 }
-
+ @Deprecated
 public static String formatNodeToString(Node node) {
 StringBuilder stringBuilder = new StringBuilder();
 if (node.isURI()) {
diff --git a/squirrel.frontier/src/test/java/org/dice_research/squirrel/frontier/impl/FrontierImplTest.java b/squirrel.frontier/src/test/java/org/dice_research/squirrel/frontier/impl/FrontierImplTest.java
index 9940421ea..ecddc7e39 100644
--- a/squirrel.frontier/src/test/java/org/dice_research/squirrel/frontier/impl/FrontierImplTest.java
+++ b/squirrel.frontier/src/test/java/org/dice_research/squirrel/frontier/impl/FrontierImplTest.java
@@ -11,7 +11,7 @@
 import org.dice_research.squirrel.Constants;
 import org.dice_research.squirrel.MongoDBBasedTest;
-import org.dice_research.squirrel.configurator.SparqlConnector;
+import org.dice_research.squirrel.configurator.SparqlhostConnector;
 import org.dice_research.squirrel.data.uri.CrawleableUri;
 import org.dice_research.squirrel.data.uri.CrawleableUriFactory4Tests;
 import org.dice_research.squirrel.data.uri.UriType;
@@ -34,7 +34,7 @@ public class FrontierImplTest {
 private static MongoDBKnowUriFilter filter;
 private static List uris = new ArrayList();
 private static CrawleableUriFactory4Tests cuf = new CrawleableUriFactory4Tests();
- private OutDatedUris outDatedUris = SparqlConnector.create("http://localhost:8890/sparql-auth", "dba", "pw123");
+ private OutDatedUris outDatedUris = SparqlhostConnector.create("http://localhost:8890/sparql-auth", "dba", "pw123");
 
 @Before
 public void setUp() throws Exception {
diff --git a/squirrel.frontier/src/test/java/org/dice_research/squirrel/seed/generator/impl/CkanSeedGeneratorImplTest.java b/squirrel.frontier/src/test/java/org/dice_research/squirrel/seed/generator/impl/CkanSeedGeneratorImplTest.java
index 073f40843..66035e9f0 100644
--- a/squirrel.frontier/src/test/java/org/dice_research/squirrel/seed/generator/impl/CkanSeedGeneratorImplTest.java
+++ b/squirrel.frontier/src/test/java/org/dice_research/squirrel/seed/generator/impl/CkanSeedGeneratorImplTest.java
@@ -1,6 +1,6 @@
 package org.dice_research.squirrel.seed.generator.impl;
-import org.dice_research.squirrel.configurator.SparqlConnector;
+import org.dice_research.squirrel.configurator.SparqlhostConnector;
 import org.dice_research.squirrel.data.uri.filter.InMemoryKnownUriFilter;
 import org.dice_research.squirrel.data.uri.filter.OutDatedUris;
 import org.dice_research.squirrel.data.uri.norm.NormalizerImpl;
 import org.dice_research.squirrel.frontier.Frontier;
 import org.dice_research.squirrel.frontier.impl.FrontierImpl;
@@ -22,7 +22,7 @@ public class CkanSeedGeneratorImplTest extends TestCase {
 private CkanSeedGeneratorImpl ckanSeedGenerator;
 private IpAddressBasedQueue queue;
 private Frontier frontier;
- private OutDatedUris outDatedUris = 
SparqlConnector.create("http://localhost:8890/sparql-auth", "dba", "pw123"); + private OutDatedUris outDatedUris = SparqlhostConnector.create("http://localhost:8890/sparql-auth", "dba", "pw123"); public void setUp() { queue = new InMemoryQueue(); From 21814084fd2157843741ca5e2efa3703f720d3ae Mon Sep 17 00:00:00 2001 From: param-jot Date: Mon, 28 Oct 2019 08:10:59 +0100 Subject: [PATCH 017/102] Rearrange file packages and reformat the code --- docker-compose-sparql.yml | 36 +- spring-config/frontier-context.xml | 67 +- spring-config/worker-context.xml | 208 +++--- .../uri/filter/KnownOutDatedUriFilter.java | 17 - ...tedUris.java => OutDatedUriRetreiver.java} | 3 +- .../components/FrontierComponent.java | 334 +-------- .../configurator/SparqlConfiguration.java | 120 ---- .../frontier/impl/ExtendedFrontierImpl.java | 31 +- .../squirrel/frontier/impl/FrontierImpl.java | 663 ++---------------- .../frontier/impl/FrontierQueryGenerator.java | 265 ------- .../recrawling/FrontierQueryGenerator.java | 78 +++ .../recrawling}/SparqlhostConnector.java | 24 +- .../frontier/impl/FrontierImplTest.java | 210 +----- .../impl/CkanSeedGeneratorImplTest.java | 23 +- 14 files changed, 349 insertions(+), 1730 deletions(-) delete mode 100644 squirrel.api/src/main/java/org/dice_research/squirrel/data/uri/filter/KnownOutDatedUriFilter.java rename squirrel.api/src/main/java/org/dice_research/squirrel/data/uri/filter/{OutDatedUris.java => OutDatedUriRetreiver.java} (82%) delete mode 100644 squirrel.frontier/src/main/java/org/dice_research/squirrel/configurator/SparqlConfiguration.java delete mode 100644 squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/FrontierQueryGenerator.java create mode 100644 squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/FrontierQueryGenerator.java rename squirrel.frontier/src/main/java/org/dice_research/squirrel/{configurator => frontier/recrawling}/SparqlhostConnector.java (91%) diff --git a/docker-compose-sparql.yml b/docker-compose-sparql.yml index 5b52e6da7..6c01e48c8 100644 --- a/docker-compose-sparql.yml +++ b/docker-compose-sparql.yml @@ -44,14 +44,14 @@ services: - Driver=/usr/local/lib/virtodbc_32.so - DBA_PASSWORD=pw123 -# sparqlhost: -# image: stain/jena-fuseki -# container_name: sparqlhost -# ports: -# - "3030:3030" -# environment: -# - ADMIN_PASSWORD=pw123 -# - JVM_ARGS=-Xmx2g + # sparqlhost: + # image: stain/jena-fuseki + # container_name: sparqlhost + # ports: + # - "3030:3030" + # environment: + # - ADMIN_PASSWORD=pw123 + # - JVM_ARGS=-Xmx2g mongodb: image: mongo:4.0.0 @@ -77,7 +77,7 @@ services: - "8081:15672" # Forwarding the port for testing - "5672:5672" - + worker1: image: squirrel.worker:latest container_name: worker1 @@ -92,9 +92,9 @@ services: - DEDUPLICATION_ACTIVE=false - JVM_ARGS=-Xmx8g volumes: - - ./data/worker1:/var/squirrel/data - - ./yaml:/var/squirrel/yaml - - ./spring-config:/var/squirrel/spring-config + - ./data/worker1:/var/squirrel/data + - ./yaml:/var/squirrel/yaml + - ./spring-config:/var/squirrel/spring-config command: java -cp squirrel.jar org.dice_research.squirrel.components.WorkerComponentStarter worker2: @@ -111,9 +111,9 @@ services: - DEDUPLICATION_ACTIVE=false - JVM_ARGS=-Xmx8g volumes: - - ./data/worker2:/var/squirrel/data - - ./yaml:/var/squirrel/yaml - - ./spring-config:/var/squirrel/spring-config + - ./data/worker2:/var/squirrel/data + - ./yaml:/var/squirrel/yaml + - ./spring-config:/var/squirrel/spring-config command: java -cp squirrel.jar 
org.dice_research.squirrel.components.WorkerComponentStarter worker3: @@ -130,9 +130,9 @@ services: - DEDUPLICATION_ACTIVE=true - JVM_ARGS=-Xmx8g volumes: - - ./data/worker3:/var/squirrel/data - - ./yaml:/var/squirrel/yaml - - ./spring-config:/var/squirrel/spring-config + - ./data/worker3:/var/squirrel/data + - ./yaml:/var/squirrel/yaml + - ./spring-config:/var/squirrel/spring-config command: java -cp squirrel.jar org.dice_research.squirrel.components.WorkerComponentStarter # deduplicator: diff --git a/spring-config/frontier-context.xml b/spring-config/frontier-context.xml index 464d8a747..2b1d8dc69 100644 --- a/spring-config/frontier-context.xml +++ b/spring-config/frontier-context.xml @@ -1,22 +1,15 @@ + http://www.springframework.org/schema/context/spring-context.xsd"> + base-package="org.dice_research.squirrel"/> - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + diff --git a/spring-config/worker-context.xml b/spring-config/worker-context.xml index 14aab4d83..0752b6049 100644 --- a/spring-config/worker-context.xml +++ b/spring-config/worker-context.xml @@ -1,11 +1,11 @@ + base-package="org.dice_research.squirrel"/> - - - - - - - - - + class="org.dice_research.squirrel.worker.impl.WorkerImpl"> + + + + + + + + - - - - - - -<<<<<<< HEAD -<<<<<<< HEAD - - - -======= -======= ->>>>>>> bb00ad4b8e0cfdb89738f43afc01ce482e016bd6 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + value="#{systemEnvironment['OUTPUT_FOLDER']}/log"/> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + class="org.dice_research.squirrel.fetcher.manage.SimpleOrderedFetcherManager"> - - - - - - - @@ -117,67 +114,66 @@ + class="org.dice_research.squirrel.fetcher.http.HTTPFetcher"/> + class="org.dice_research.squirrel.fetcher.ftp.FTPFetcher"/> + class="org.dice_research.squirrel.fetcher.sparql.SparqlBasedFetcher"/> + class="org.dice_research.squirrel.fetcher.sparql.SparqlDatasetFetcher"> - + class="org.dice_research.squirrel.fetcher.ckan.java.SimpleCkanFetcher"/> + value="#{systemEnvironment['OUTPUT_FOLDER']}"/> + class="org.dice_research.squirrel.sink.impl.file.FileBasedSink"> - + ref="outputFolderBean"/> + - + + class="org.dice_research.squirrel.data.uri.serialize.java.GzipJavaUriSerializer"/> - - + class="org.dice_research.squirrel.collect.SqlBasedUriCollector"> + + - + class="org.dice_research.squirrel.robots.RobotsManagerImpl"> + - + class="crawlercommons.fetcher.http.SimpleHttpFetcher"> + - - - + class="crawlercommons.fetcher.http.UserAgent"> + + + diff --git a/squirrel.api/src/main/java/org/dice_research/squirrel/data/uri/filter/KnownOutDatedUriFilter.java b/squirrel.api/src/main/java/org/dice_research/squirrel/data/uri/filter/KnownOutDatedUriFilter.java deleted file mode 100644 index 21d7835db..000000000 --- a/squirrel.api/src/main/java/org/dice_research/squirrel/data/uri/filter/KnownOutDatedUriFilter.java +++ /dev/null @@ -1,17 +0,0 @@ -package org.dice_research.squirrel.data.uri.filter; - -import java.util.List; - -import org.dice_research.squirrel.data.uri.CrawleableUri; - -public interface KnownOutDatedUriFilter { - - - /** - * Returns all {@link CrawleableUri}s which have to be recrawled. This means their time to next crawl has passed. - * - * @return The outdated {@link CrawleableUri}s. 
- */ - public List getUriToRecrawl(); - -} \ No newline at end of file diff --git a/squirrel.api/src/main/java/org/dice_research/squirrel/data/uri/filter/OutDatedUris.java b/squirrel.api/src/main/java/org/dice_research/squirrel/data/uri/filter/OutDatedUriRetreiver.java similarity index 82% rename from squirrel.api/src/main/java/org/dice_research/squirrel/data/uri/filter/OutDatedUris.java rename to squirrel.api/src/main/java/org/dice_research/squirrel/data/uri/filter/OutDatedUriRetreiver.java index 7bf589d25..e66754c88 100644 --- a/squirrel.api/src/main/java/org/dice_research/squirrel/data/uri/filter/OutDatedUris.java +++ b/squirrel.api/src/main/java/org/dice_research/squirrel/data/uri/filter/OutDatedUriRetreiver.java @@ -1,10 +1,11 @@ package org.dice_research.squirrel.data.uri.filter; +import java.io.Closeable; import java.util.List; import org.dice_research.squirrel.data.uri.CrawleableUri; -public interface OutDatedUris { +public interface OutDatedUriRetreiver extends Closeable { /** diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/components/FrontierComponent.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/components/FrontierComponent.java index 58a5bd7a6..67016b77b 100644 --- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/components/FrontierComponent.java +++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/components/FrontierComponent.java @@ -1,40 +1,19 @@ package org.dice_research.squirrel.components; -<<<<<<< HEAD -import java.io.Closeable; -import java.io.File; -import java.io.IOException; -import java.nio.charset.StandardCharsets; -import java.util.List; -import java.util.Map; -import java.util.Map.Entry; -import java.util.Timer; -import java.util.TimerTask; -import java.util.concurrent.Semaphore; - -======= ->>>>>>> bb00ad4b8e0cfdb89738f43afc01ce482e016bd6 import org.aksw.jena_sparql_api.core.QueryExecutionFactory; import org.aksw.jena_sparql_api.core.UpdateExecutionFactory; import org.apache.commons.io.FileUtils; -import org.apache.jena.base.Sys; import org.dice_research.squirrel.Constants; -<<<<<<< HEAD import org.dice_research.squirrel.configurator.MongoConfiguration; import org.dice_research.squirrel.configurator.SeedConfiguration; -import org.dice_research.squirrel.configurator.SparqlConfiguration; import org.dice_research.squirrel.configurator.WebConfiguration; import org.dice_research.squirrel.configurator.WhiteListConfiguration; -======= -import org.dice_research.squirrel.configurator.*; ->>>>>>> bb00ad4b8e0cfdb89738f43afc01ce482e016bd6 import org.dice_research.squirrel.data.uri.CrawleableUri; import org.dice_research.squirrel.data.uri.UriSeedReader; import org.dice_research.squirrel.data.uri.UriUtils; import org.dice_research.squirrel.data.uri.filter.InMemoryKnownUriFilter; -import org.dice_research.squirrel.data.uri.filter.KnownOutDatedUriFilter; import org.dice_research.squirrel.data.uri.filter.KnownUriFilter; -import org.dice_research.squirrel.data.uri.filter.OutDatedUris; +import org.dice_research.squirrel.data.uri.filter.OutDatedUriRetreiver; import org.dice_research.squirrel.data.uri.filter.RegexBasedWhiteListFilter; import org.dice_research.squirrel.data.uri.info.URIReferences; import org.dice_research.squirrel.data.uri.norm.NormalizerImpl; @@ -61,28 +40,26 @@ import org.springframework.beans.factory.annotation.Autowired; import org.springframework.beans.factory.annotation.Qualifier; import org.springframework.stereotype.Component; + import java.io.Closeable; import java.io.File; import 
java.io.IOException; import java.nio.charset.StandardCharsets; import java.util.*; +import java.util.Map.Entry; import java.util.concurrent.Semaphore; @Component @Qualifier("frontierComponent") public class FrontierComponent extends AbstractComponent implements RespondingDataHandler { -<<<<<<< HEAD -<<<<<<< HEAD -======= public static final boolean RECRAWLING_ACTIVE = true; ->>>>>>> bb00ad4b8e0cfdb89738f43afc01ce482e016bd6 private static final Logger LOGGER = LoggerFactory.getLogger(FrontierComponent.class); protected static QueryExecutionFactory queryExecFactory = null; private final Semaphore terminationMutex = new Semaphore(0); private final WorkerGuard workerGuard = new WorkerGuard(this); private final boolean doRecrawling = true; - @Qualifier("queueBean") + @Qualifier("sparqlBean") @Autowired protected UriQueue queue; protected String dataSetQuery = "select ?s ?p ?o where {?s ?p ?o} LIMIT 100 "; @@ -90,7 +67,7 @@ public class FrontierComponent extends AbstractComponent implements RespondingDa @Qualifier("knowUriFilterBean") @Autowired private KnownUriFilter knownUriFilter; - private OutDatedUris outDatedUris; + OutDatedUriRetreiver outDatedUriRetreiver; private URIReferences uriReferences = null; private Frontier frontier; private RabbitQueue rabbitQueue; @@ -99,33 +76,24 @@ public class FrontierComponent extends AbstractComponent implements RespondingDa @Autowired private Serializer serializer; private long recrawlingTime = 1000L * 60L * 60L * 24L * 30; -<<<<<<< HEAD - public static final boolean RECRAWLING_ACTIVE = true; -======= + // public static final boolean RECRAWLING_ACTIVE = true; + private Map hasUrisToCrawl; ->>>>>>> bb00ad4b8e0cfdb89738f43afc01ce482e016bd6 @Override public void init() throws Exception { super.init(); serializer = new GzipJavaUriSerializer(); MongoConfiguration mongoConfiguration = MongoConfiguration.getMDBConfiguration(); WebConfiguration webConfiguration = WebConfiguration.getWebConfiguration(); -<<<<<<< HEAD -======= - SparqlhostConnector sp = SparqlhostConnector.create("http://localhost:8890/sparql-auth", "dba", "pw123"); + // SparqlhostConnector sp = SparqlhostConnector.create("http://localhost:8890/sparql-auth", "dba", "pw123"); hasUrisToCrawl = new HashMap(); ->>>>>>> bb00ad4b8e0cfdb89738f43afc01ce482e016bd6 if (mongoConfiguration != null) { queue.open(); knownUriFilter.open(); -<<<<<<< HEAD - -======= ->>>>>>> bb00ad4b8e0cfdb89738f43afc01ce482e016bd6 WhiteListConfiguration whiteListConfiguration = WhiteListConfiguration.getWhiteListConfiguration(); if (whiteListConfiguration != null) { File whitelistFile = new File(whiteListConfiguration.getWhiteListURI()); @@ -142,7 +110,7 @@ public void init() throws Exception { knownUriFilter = new InMemoryKnownUriFilter(doRecrawling, recrawlingTime); } // Build frontier - frontier = new ExtendedFrontierImpl(new NormalizerImpl(), knownUriFilter, uriReferences, queue, doRecrawling, outDatedUris); + frontier = new ExtendedFrontierImpl(new NormalizerImpl(), knownUriFilter, uriReferences, queue, doRecrawling, outDatedUriRetreiver); rabbitQueue = this.incomingDataQueueFactory.createDefaultRabbitQueue(Constants.FRONTIER_QUEUE_NAME); receiver = (new RPCServer.Builder()).responseQueueFactory(outgoingDataQueuefactory).dataHandler(this) .maxParallelProcessedMsgs(100).queue(rabbitQueue).build(); @@ -189,11 +157,13 @@ public void close() throws IOException { if (receiver != null) // Force the receiver to close receiver.close(); +/* <<<<<<< HEAD // receiver.closeWhenFinished(); ======= // receiver.closeWhenFinished(); >>>>>>> 
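The TerminatorTask above decides when the frontier may shut down: each worker's last URI request is recorded in hasUrisToCrawl, and only when no worker received URIs and the queue-based termination check agrees is the semaphore released. A compact sketch of that predicate (the TerminationCheck call is elided, since its type is not shown in this hunk):

    import java.util.Map;
    import java.util.TimerTask;
    import java.util.concurrent.Semaphore;

    // Sketch of the shutdown predicate polled by the Timer in FrontierComponent.run().
    class TerminatorTaskSketch extends TimerTask {
        private final Map<String, Boolean> hasUrisToCrawl; // workerId -> got URIs on last request
        private final Semaphore terminationMutex;

        TerminatorTaskSketch(Map<String, Boolean> hasUrisToCrawl, Semaphore terminationMutex) {
            this.hasUrisToCrawl = hasUrisToCrawl;
            this.terminationMutex = terminationMutex;
        }

        @Override
        public void run() {
            // a worker that last received an empty URI set counts as idle
            boolean stillHasUris = hasUrisToCrawl.values().stream().anyMatch(Boolean::booleanValue);
            // the real task additionally requires terminationCheck.shouldFrontierTerminate(queue)
            if (!stillHasUris) {
                terminationMutex.release(); // unblocks FrontierComponent.run(), ending the component
            }
        }
    }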
bb00ad4b8e0cfdb89738f43afc01ce482e016bd6 +*/ if (queue != null) queue.close(); @@ -266,16 +236,15 @@ private void responseToUriSetRequest(ResponseHandler handler, String responseQue uris == null ? "null" : Integer.toString(uris.size())); handler.sendResponse(serializer.serialize(new UriSet(uris)), responseQueueName, correlId); if (uris != null && uris.size() > 0) { -<<<<<<< HEAD + workerGuard.putUrisForWorker(uriSetRequest.getWorkerId(), uriSetRequest.workerSendsAliveMessages(), - uris); -======= + uris); + hasUrisToCrawl.put(uriSetRequest.getWorkerId(), true); workerGuard.putUrisForWorker(uriSetRequest.getWorkerId(), uriSetRequest.workerSendsAliveMessages(), uris); } else { hasUrisToCrawl.put(uriSetRequest.getWorkerId(), false); ->>>>>>> bb00ad4b8e0cfdb89738f43afc01ce482e016bd6 } } catch (IOException e) { LOGGER.error("Couldn't serialize new URI set.", e); @@ -311,7 +280,6 @@ public void setFrontier(FrontierImpl frontier) { public WorkerGuard getWorkerGuard() { return workerGuard; } -<<<<<<< HEAD private static class TerminatorTask extends TimerTask { @@ -339,278 +307,7 @@ public void run() { } } - if(!stillHasUris && terminationCheck.shouldFrontierTerminate(queue)) { - LOGGER.info(" << FRONTIER IS TERMINATING! >> "); - terminationMutex.release(); - } - } - - } -} -======= - private static final Logger LOGGER = LoggerFactory.getLogger(FrontierComponent.class); - - @Qualifier("queueBean") - @Autowired - protected UriQueue queue; - @Qualifier("knowUriFilterBean") - @Autowired - private KnownUriFilter knownUriFilter; - private KnownOutDatedUriFilter knownOutDatedUriFilter; - private URIReferences uriReferences = null; - private Frontier frontier; - private RabbitQueue rabbitQueue; - private DataReceiver receiver; - @Qualifier("serializerBean") - @Autowired - private Serializer serializer; - private final Semaphore terminationMutex = new Semaphore(0); - private final WorkerGuard workerGuard = new WorkerGuard(this); - private final boolean doRecrawling = true; - private long recrawlingTime = 1000L * 60L * 60L * 24L * 30; - - private Map hasUrisToCrawl; - - public static final boolean RECRAWLING_ACTIVE = true; - protected String dataSetQuery = "select ?s ?p ?o where {?s ?p ?o} LIMIT 100 "; - - protected static QueryExecutionFactory queryExecFactory = null; - - protected UpdateExecutionFactory updateExecFactory = null; - - - - - @Override - public void init() throws Exception { - super.init(); - serializer = new GzipJavaUriSerializer(); - MongoConfiguration mongoConfiguration = MongoConfiguration.getMDBConfiguration(); - WebConfiguration webConfiguration = WebConfiguration.getWebConfiguration(); - SparqlConfiguration sp = SparqlConfiguration.create("http://localhost:8890/sparql-auth","dba","pw123"); - hasUrisToCrawl = new HashMap(); - - - if (mongoConfiguration != null) { - - queue.open(); - knownUriFilter.open(); - - - WhiteListConfiguration whiteListConfiguration = WhiteListConfiguration.getWhiteListConfiguration(); - if (whiteListConfiguration != null) { - File whitelistFile = new File(whiteListConfiguration.getWhiteListURI()); - knownUriFilter = RegexBasedWhiteListFilter.create(knownUriFilter, whitelistFile); - } - - // TODO Reactivate me but with a different configuration - // if (webConfiguration.isVisualizationOfCrawledGraphEnabled()) { - // uriReferences = new RDBURIReferences(rdbHostName, rdbPort); - // uriReferences.open(); - // } - } else { - LOGGER.warn("Couldn't get MDBConfiguration. 
An in-memory queue will be used."); - queue = new InMemoryQueue(); - knownUriFilter = new InMemoryKnownUriFilter(doRecrawling, recrawlingTime); - } - - // Build frontier - frontier = new ExtendedFrontierImpl(new NormalizerImpl(),knownUriFilter, uriReferences, queue, doRecrawling, knownOutDatedUriFilter); - rabbitQueue = this.incomingDataQueueFactory.createDefaultRabbitQueue(Constants.FRONTIER_QUEUE_NAME); - receiver = (new RPCServer.Builder()).responseQueueFactory(outgoingDataQueuefactory).dataHandler(this) - .maxParallelProcessedMsgs(100).queue(rabbitQueue).build(); - - SeedConfiguration seedConfiguration = SeedConfiguration.getSeedConfiguration(); - if (seedConfiguration != null) { - processSeedFile(seedConfiguration.getSeedFile()); - } - - LOGGER.info("Frontier initialized."); - - if (webConfiguration.isCommunicationWithWebserviceEnabled()) { - final FrontierSenderToWebservice sender = new FrontierSenderToWebservice(outgoingDataQueuefactory, - workerGuard, queue, knownUriFilter, uriReferences); - LOGGER.trace("FrontierSenderToWebservice -> sendCrawledGraph is set to " - + webConfiguration.isVisualizationOfCrawledGraphEnabled()); - Thread senderThread = new Thread(sender); - senderThread.setName("Sender to the Webservice via RabbitMQ (current information from the Frontier)"); - senderThread.start(); - LOGGER.info("Started thread [" + senderThread.getName() + "] "); - } else { - LOGGER.info("webConfiguration.isCommunicationWithWebserviceEnabled is set to " - + webConfiguration.isCommunicationWithWebserviceEnabled() + "/" - + webConfiguration.isVisualizationOfCrawledGraphEnabled() - + ". No WebServiceSenderThread will be started!"); - } - - } - - - - @Override - public void run() throws Exception { - TimerTask terminatorTask = new TerminatorTask(queue, terminationMutex); - Timer timer = new Timer(); - timer.schedule(terminatorTask, 5000,5000); - terminationMutex.acquire(); - timer.cancel(); - } - - @Override - public void close() throws IOException { - LOGGER.info("Closing Frontier Component."); - if (receiver != null) - // Force the receiver to close - receiver.close(); - // receiver.closeWhenFinished(); - - if (queue != null) - queue.close(); - if (uriReferences != null) - uriReferences.close(); - if (knownUriFilter instanceof Closeable) { - ((Closeable) knownUriFilter).close(); - } - workerGuard.shutdown(); - if (frontier != null) - frontier.close(); - super.close(); - LOGGER.info("Frontier Component Closed."); - } - - @Override - public void handleData(byte[] data) { - handleData(data, null, null, null); - } - - @Override - public void handleData(byte[] data, ResponseHandler handler, String responseQueueName, String correlId) { - Object deserializedData; - try { - deserializedData = serializer.deserialize(data); - } catch (IOException e) { - // try to convert the string into a single URI, that maybe comes from the - // WebService - // CrawleableUri uri = new CrawleableUriFactoryImpl().create(new String(data)); - // if (uri != null) { - // LOGGER.warn("Received a single URI " + uri.getUri() + " without a wrapping of - // \"org.aksw.simba.squirrel.rabbit.frontier\". We converted it into a - // UriSet."); - // deserializedData = new UriSet(Collections.singletonList(uri)); - // } else { - LOGGER.error("Error while trying to deserialize incoming data. 
It will be ignored.", e); - return; - // } - } - - if (deserializedData != null) { - if (deserializedData instanceof UriSetRequest) { - responseToUriSetRequest(handler, responseQueueName, correlId, (UriSetRequest) deserializedData); - } else if (deserializedData instanceof UriSet) { - // LOGGER.warn("Received a set of URIs (size={}).", ((UriSet) deserializedData).uris.size()); - frontier.addNewUris(((UriSet) deserializedData).uris); - } else if (deserializedData instanceof CrawlingResult) { - CrawlingResult crawlingResult = (CrawlingResult) deserializedData; - LOGGER.warn("Received the message that the crawling for {} URIs is done.", crawlingResult.uris.size()); - frontier.crawlingDone(crawlingResult.uris); - workerGuard.removeUrisForWorker(crawlingResult.idOfWorker, crawlingResult.uris); - } else if (deserializedData instanceof AliveMessage) { - AliveMessage message = (AliveMessage) deserializedData; - String idReceived = message.getWorkerId(); - LOGGER.warn("Received alive message from worker with id " + idReceived); - workerGuard.putNewTimestamp(idReceived); - } else { - LOGGER.warn("Received an unknown object {}. It will be ignored.", deserializedData.toString()); - } - } - } - - private void responseToUriSetRequest(ResponseHandler handler, String responseQueueName, String correlId, - UriSetRequest uriSetRequest) { - if (handler != null) { - // get next UriSet - try { - List uris = frontier.getNextUris(); - LOGGER.trace("Responding with a list of {} uris.", - uris == null ? "null" : Integer.toString(uris.size())); - handler.sendResponse(serializer.serialize(new UriSet(uris)), responseQueueName, correlId); - if (uris != null && uris.size() > 0) { - hasUrisToCrawl .put(uriSetRequest.getWorkerId(), true); - workerGuard.putUrisForWorker(uriSetRequest.getWorkerId(), - uriSetRequest.workerSendsAliveMessages(), uris); - }else { - hasUrisToCrawl .put(uriSetRequest.getWorkerId(), false); - } - } catch (IOException e) { - LOGGER.error("Couldn't serialize new URI set.", e); - } - } else { - LOGGER.warn("Got a UriSetRequest object without a ResponseHandler. No response will be sent."); - } - } - - protected void processSeedFile(String seedFile) { - try { - List lines = FileUtils.readLines(new File(seedFile), StandardCharsets.UTF_8); - frontier.addNewUris(UriUtils.createCrawleableUriList(lines)); - } catch (Exception e) { - LOGGER.error("Couldn't process seed file. It will be ignored.", e); - } - } - - public void informFrontierAboutDeadWorker(String idOfWorker, List lstUrisToReassign) { - if (frontier instanceof ExtendedFrontier) { - ((ExtendedFrontier) frontier).informAboutDeadWorker(idOfWorker, lstUrisToReassign); - } - } - - public void setFrontier(FrontierImpl frontier) { - this.frontier = frontier; - } - - public WorkerGuard getWorkerGuard() { - return workerGuard; - } - - private class TerminatorTask extends TimerTask{ - - private UriQueue queue; - private TerminationCheck terminationCheck = new QueueBasedTerminationCheck(); - private Semaphore terminationMutex; - - public TerminatorTask(UriQueue queue, Semaphore terminationMutex) { - this.queue = queue; - this.terminationMutex = terminationMutex; - } - - @Override - public void run() { - if(!hasUrisToCrawl.values().contains(true) && terminationCheck.shouldFrontierTerminate(queue)) { - LOGGER.info(" << FRONTIER IS TERMINATING! 
>> "); - terminationMutex.release(); - } - } - - } -} ->>>>>>> 98250b4cbed5c441b6c05b561ee7c07c76aadbfa -======= - - private class TerminatorTask extends TimerTask { - - private UriQueue queue; - private TerminationCheck terminationCheck = new QueueBasedTerminationCheck(); - private Semaphore terminationMutex; - - public TerminatorTask(UriQueue queue, Semaphore terminationMutex) { - this.queue = queue; - this.terminationMutex = terminationMutex; - } - - @Override - public void run() { - if (!hasUrisToCrawl.values().contains(true) && terminationCheck.shouldFrontierTerminate(queue)) { + if (!stillHasUris && terminationCheck.shouldFrontierTerminate(queue)) { LOGGER.info(" << FRONTIER IS TERMINATING! >> "); terminationMutex.release(); } @@ -618,4 +315,3 @@ public void run() { } } ->>>>>>> bb00ad4b8e0cfdb89738f43afc01ce482e016bd6 diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/configurator/SparqlConfiguration.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/configurator/SparqlConfiguration.java deleted file mode 100644 index 8139c33db..000000000 --- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/configurator/SparqlConfiguration.java +++ /dev/null @@ -1,120 +0,0 @@ -package org.dice_research.squirrel.configurator; - -import java.net.URI; -import java.net.URISyntaxException; -import java.util.ArrayList; -import java.util.List; - -import org.aksw.jena_sparql_api.core.QueryExecutionFactory; -import org.aksw.jena_sparql_api.core.UpdateExecutionFactory; -import org.aksw.jena_sparql_api.core.UpdateExecutionFactoryHttp; -import org.aksw.jena_sparql_api.http.QueryExecutionFactoryHttp; -import org.apache.http.auth.AuthScope; -import org.apache.http.auth.Credentials; -import org.apache.http.auth.UsernamePasswordCredentials; -import org.apache.http.client.CredentialsProvider; -import org.apache.http.impl.client.AbstractHttpClient; -import org.apache.http.protocol.HttpContext; -import org.apache.jena.atlas.web.auth.HttpAuthenticator; -import org.apache.jena.graph.Triple; -import org.apache.jena.query.Query; -import org.apache.jena.query.QueryExecution; -import org.apache.jena.query.QuerySolution; -import org.apache.jena.query.ResultSet; -import org.apache.jena.rdf.model.RDFNode; -import org.apache.jena.sparql.core.DatasetDescription; -import org.dice_research.squirrel.data.uri.CrawleableUri; -import org.dice_research.squirrel.data.uri.filter.KnownOutDatedUriFilter; -import org.dice_research.squirrel.data.uri.filter.KnownUriFilter; -import org.dice_research.squirrel.frontier.impl.FrontierQueryGenerator; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -@SuppressWarnings("deprecation") -public class SparqlConfiguration implements KnownOutDatedUriFilter { - - private static final Logger LOGGER = LoggerFactory.getLogger(SparqlConfiguration.class); - - /** - * The Query factory used to query the SPARQL endpoint. 
diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/configurator/SparqlConfiguration.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/configurator/SparqlConfiguration.java
deleted file mode 100644
index 8139c33db..000000000
--- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/configurator/SparqlConfiguration.java
+++ /dev/null
@@ -1,120 +0,0 @@
-package org.dice_research.squirrel.configurator;
-
-import java.net.URI;
-import java.net.URISyntaxException;
-import java.util.ArrayList;
-import java.util.List;
-
-import org.aksw.jena_sparql_api.core.QueryExecutionFactory;
-import org.aksw.jena_sparql_api.core.UpdateExecutionFactory;
-import org.aksw.jena_sparql_api.core.UpdateExecutionFactoryHttp;
-import org.aksw.jena_sparql_api.http.QueryExecutionFactoryHttp;
-import org.apache.http.auth.AuthScope;
-import org.apache.http.auth.Credentials;
-import org.apache.http.auth.UsernamePasswordCredentials;
-import org.apache.http.client.CredentialsProvider;
-import org.apache.http.impl.client.AbstractHttpClient;
-import org.apache.http.protocol.HttpContext;
-import org.apache.jena.atlas.web.auth.HttpAuthenticator;
-import org.apache.jena.graph.Triple;
-import org.apache.jena.query.Query;
-import org.apache.jena.query.QueryExecution;
-import org.apache.jena.query.QuerySolution;
-import org.apache.jena.query.ResultSet;
-import org.apache.jena.rdf.model.RDFNode;
-import org.apache.jena.sparql.core.DatasetDescription;
-import org.dice_research.squirrel.data.uri.CrawleableUri;
-import org.dice_research.squirrel.data.uri.filter.KnownOutDatedUriFilter;
-import org.dice_research.squirrel.data.uri.filter.KnownUriFilter;
-import org.dice_research.squirrel.frontier.impl.FrontierQueryGenerator;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-@SuppressWarnings("deprecation")
-public class SparqlConfiguration implements KnownOutDatedUriFilter {
-
-    private static final Logger LOGGER = LoggerFactory.getLogger(SparqlConfiguration.class);
-
-    /**
-     * The Query factory used to query the SPARQL endpoint.
-     */
-    protected static QueryExecutionFactory queryExecFactory = null;
-    protected UpdateExecutionFactory updateExecFactory = null;
-    List<CrawleableUri> urisToRecrawl = new ArrayList<>();
-
-    public SparqlConfiguration(QueryExecutionFactory queryExecFactory, UpdateExecutionFactory updateExecFactory) {
-        this.queryExecFactory = queryExecFactory;
-        this.updateExecFactory = updateExecFactory;
-        LOGGER.info("Connected");
-    }
-
-    public static SparqlConfiguration create(String sparqlEndpointUrl) {
-
-        return create(sparqlEndpointUrl, null, null);
-    }
-
-    public static SparqlConfiguration create(String sparqlEndpointUrl, String username, String password) {
-        QueryExecutionFactory queryExecFactory = null;
-        UpdateExecutionFactory updateExecFactory = null;
-        if (username != null && password != null) {
-            // Create the factory with the credentials
-            final Credentials credentials = new UsernamePasswordCredentials(username, password);
-            HttpAuthenticator authenticator = new HttpAuthenticator() {
-                @Override
-                public void invalidate() {
-                }
-
-                @Override
-                public void apply(AbstractHttpClient client, HttpContext httpContext, URI target) {
-                    client.setCredentialsProvider(new CredentialsProvider() {
-                        @Override
-                        public void clear() {
-                        }
-
-                        @Override
-                        public Credentials getCredentials(AuthScope scope) {
-                            return credentials;
-                        }
-
-                        @Override
-                        public void setCredentials(AuthScope arg0, Credentials arg1) {
-                            LOGGER.error("I am a read-only credential provider but got a call to set credentials.");
-                        }
-                    });
-                }
-            };
-            queryExecFactory = new QueryExecutionFactoryHttp(sparqlEndpointUrl, new DatasetDescription(),
-                    authenticator);
-            updateExecFactory = new UpdateExecutionFactoryHttp(sparqlEndpointUrl, authenticator);
-        } else {
-            queryExecFactory = new QueryExecutionFactoryHttp(sparqlEndpointUrl);
-            updateExecFactory = new UpdateExecutionFactoryHttp(sparqlEndpointUrl);
-        }
-        return new SparqlConfiguration(queryExecFactory, updateExecFactory);
-    }
-
-
-    @Override
-    public List<CrawleableUri> getUriToRecrawl() {
-        SparqlConfiguration.create("http://localhost:8890/sparql-auth", "dba", "pw123");
-        Query getOutdatedUrisQuery = FrontierQueryGenerator.getInstance().getOutdatedUrisQuery();
-        System.out.println(getOutdatedUrisQuery);
-        QueryExecution qe = queryExecFactory.createQueryExecution(getOutdatedUrisQuery);
-        ResultSet rs = qe.execSelect();
-        while (rs.hasNext()) {
-            QuerySolution sol = rs.nextSolution();
-            RDFNode outdatedUri = sol.get("uri");
-            try {
-                urisToRecrawl.add(new CrawleableUri(new URI((outdatedUri.toString()))));
-            } catch (URISyntaxException e) {
-                e.printStackTrace();
-            }
-        }qe.close();
-        return urisToRecrawl;
-}
-
-
-
-}
-
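The deleted SparqlConfiguration above existed only to hand out query/update factories for a (possibly authenticated) SPARQL endpoint. For the unauthenticated case, the same round trip can be sketched with plain Apache Jena instead of the deprecated jena-sparql-api authenticator; the endpoint URL and the ?s ?p ?o query are the placeholders used elsewhere in this patch, not a tested configuration:

import org.apache.jena.query.QueryExecution;
import org.apache.jena.query.QueryExecutionFactory;
import org.apache.jena.query.QuerySolution;
import org.apache.jena.query.ResultSet;

public class EndpointQuerySketch {
    public static void main(String[] args) {
        // Placeholder endpoint taken from the diff; no authentication here.
        String endpoint = "http://localhost:8890/sparql";
        String query = "SELECT ?s ?p ?o WHERE { ?s ?p ?o } LIMIT 100";
        try (QueryExecution qe = QueryExecutionFactory.sparqlService(endpoint, query)) {
            ResultSet rs = qe.execSelect();
            while (rs.hasNext()) {
                QuerySolution sol = rs.nextSolution();
                System.out.println(sol.get("s") + " " + sol.get("p") + " " + sol.get("o"));
            }
        }
    }
}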
diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/ExtendedFrontierImpl.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/ExtendedFrontierImpl.java
index f551a6a04..491132120 100644
--- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/ExtendedFrontierImpl.java
+++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/ExtendedFrontierImpl.java
@@ -6,9 +6,8 @@
 import java.util.Set;
 
 import org.dice_research.squirrel.data.uri.CrawleableUri;
-import org.dice_research.squirrel.data.uri.filter.KnownOutDatedUriFilter;
 import org.dice_research.squirrel.data.uri.filter.KnownUriFilter;
-import org.dice_research.squirrel.data.uri.filter.OutDatedUris;
+import org.dice_research.squirrel.data.uri.filter.OutDatedUriRetreiver;
 import org.dice_research.squirrel.data.uri.filter.UriFilter;
 import org.dice_research.squirrel.data.uri.info.URIReferences;
 import org.dice_research.squirrel.data.uri.norm.UriNormalizer;
@@ -34,13 +33,9 @@ public class ExtendedFrontierImpl extends FrontierImpl implements ExtendedFronti
      */
    @SuppressWarnings("unused")
    public ExtendedFrontierImpl(UriNormalizer normalizer, KnownUriFilter knownUriFilter, UriQueue queue, boolean doesRecrawling,
-<<<<<<< HEAD
-            long generalRecrawlTime, long timerPeriod, UriHashCustodian uriHashCustodian, KnownOutDatedUriFilter knownOutDatedUriFilter) {
-        super(normalizer, knownUriFilter, queue, doesRecrawling, generalRecrawlTime, timerPeriod, uriHashCustodian, knownOutDatedUriFilter);
-=======
-            long generalRecrawlTime, long timerPeriod, UriHashCustodian uriHashCustodian, OutDatedUris outDatedUris) {
-        super(normalizer, knownUriFilter, queue, doesRecrawling, generalRecrawlTime, timerPeriod, uriHashCustodian, outDatedUris);
->>>>>>> bb00ad4b8e0cfdb89738f43afc01ce482e016bd6
+
+            long generalRecrawlTime, long timerPeriod, UriHashCustodian uriHashCustodian, OutDatedUriRetreiver outDatedUriRetreiver) {
+        super(normalizer, knownUriFilter, queue, doesRecrawling, generalRecrawlTime, timerPeriod, uriHashCustodian, outDatedUriRetreiver);
     }
 
     /**
@@ -53,12 +48,9 @@ public ExtendedFrontierImpl(UriNormalizer normalizer, KnownUriFilter knownUriFil
      *            crawled.
      * @param doesRecrawling used to select if URIs should be recrawled.
      */
-<<<<<<< HEAD
-    public ExtendedFrontierImpl(UriNormalizer normalizer, KnownUriFilter knownUriFilter, IpAddressBasedQueue queue, boolean doesRecrawling, KnownOutDatedUriFilter knownOutDatedUriFilter) {
-=======
-    public ExtendedFrontierImpl(UriNormalizer normalizer, KnownUriFilter knownUriFilter, IpAddressBasedQueue queue, boolean doesRecrawling, OutDatedUris knownOutDatedUriFilter) {
->>>>>>> bb00ad4b8e0cfdb89738f43afc01ce482e016bd6
-        super(normalizer, knownUriFilter, queue, doesRecrawling, knownOutDatedUriFilter);
+
+    public ExtendedFrontierImpl(UriNormalizer normalizer, KnownUriFilter knownUriFilter, IpAddressBasedQueue queue, boolean doesRecrawling, OutDatedUriRetreiver outDatedUriRetreiver) {
+        super(normalizer, knownUriFilter, queue, doesRecrawling, outDatedUriRetreiver);
     }
 
     /**
@@ -72,12 +64,9 @@ public ExtendedFrontierImpl(UriNormalizer normalizer, KnownUriFilter knownUriFil
      *            crawled.
      * @param doesRecrawling used to select if URIs should be recrawled.
      */
-<<<<<<< HEAD
-    public ExtendedFrontierImpl(UriNormalizer normalizer, KnownUriFilter knownUriFilter, URIReferences uriReferences, UriQueue queue, boolean doesRecrawling, KnownOutDatedUriFilter knownOutDatedUriFilter) {
-=======
-    public ExtendedFrontierImpl(UriNormalizer normalizer, KnownUriFilter knownUriFilter, URIReferences uriReferences, UriQueue queue, boolean doesRecrawling, OutDatedUris knownOutDatedUriFilter) {
->>>>>>> bb00ad4b8e0cfdb89738f43afc01ce482e016bd6
-        super(normalizer, knownUriFilter, uriReferences, queue, doesRecrawling,knownOutDatedUriFilter);
+
+    public ExtendedFrontierImpl(UriNormalizer normalizer, KnownUriFilter knownUriFilter, URIReferences uriReferences, UriQueue queue, boolean doesRecrawling, OutDatedUriRetreiver outDatedUriRetreiver) {
+        super(normalizer, knownUriFilter, uriReferences, queue, doesRecrawling, outDatedUriRetreiver);
     }
 
     @Override
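All ExtendedFrontierImpl constructors now funnel an OutDatedUriRetreiver through to FrontierImpl. The interface itself is not shown in this patch; judging from the call sites (getUriToRecrawl() feeding CrawleableUri objects back into the queue), a do-nothing implementation for wiring tests could look like the sketch below. The generic return type is inferred from usage, not confirmed against the repository:

import java.util.Collections;
import java.util.List;

import org.dice_research.squirrel.data.uri.CrawleableUri;
import org.dice_research.squirrel.data.uri.filter.OutDatedUriRetreiver;

// Hypothetical stand-in: reports that nothing is due for recrawling,
// so the frontier's recrawl timer never enqueues anything.
public class NoOpOutDatedUriRetreiver implements OutDatedUriRetreiver {

    @Override
    public List<CrawleableUri> getUriToRecrawl() {
        return Collections.emptyList();
    }
}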
diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/FrontierImpl.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/FrontierImpl.java
index 11baa3c0b..d67c2e574 100644
--- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/FrontierImpl.java
+++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/FrontierImpl.java
@@ -1,31 +1,9 @@
 package org.dice_research.squirrel.frontier.impl;
 
-<<<<<<< HEAD
-<<<<<<< HEAD
-import java.net.UnknownHostException;
-import java.util.List;
-=======
-=======
->>>>>>> bb00ad4b8e0cfdb89738f43afc01ce482e016bd6
-import java.net.InetAddress;
-import java.net.UnknownHostException;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Set;
-<<<<<<< HEAD
->>>>>>> 98250b4cbed5c441b6c05b561ee7c07c76aadbfa
-import java.util.Timer;
-import java.util.TimerTask;
-
-=======
-import java.util.Timer;
-import java.util.TimerTask;
->>>>>>> bb00ad4b8e0cfdb89738f43afc01ce482e016bd6
 import org.dice_research.squirrel.Constants;
 import org.dice_research.squirrel.data.uri.CrawleableUri;
-import org.dice_research.squirrel.data.uri.filter.KnownOutDatedUriFilter;
 import org.dice_research.squirrel.data.uri.filter.KnownUriFilter;
-import org.dice_research.squirrel.data.uri.filter.OutDatedUris;
+import org.dice_research.squirrel.data.uri.filter.OutDatedUriRetreiver;
 import org.dice_research.squirrel.data.uri.filter.SchemeBasedUriFilter;
 import org.dice_research.squirrel.data.uri.filter.UriFilter;
 import org.dice_research.squirrel.data.uri.info.URIReferences;
@@ -33,12 +11,16 @@
 import org.dice_research.squirrel.deduplication.hashing.UriHashCustodian;
 import org.dice_research.squirrel.frontier.Frontier;
 import org.dice_research.squirrel.graph.GraphLogger;
-import org.dice_research.squirrel.queue.BlockingQueue;
+import org.dice_research.squirrel.queue.IpAddressBasedQueue;
 import org.dice_research.squirrel.queue.UriQueue;
 import org.dice_research.squirrel.uri.processing.UriProcessor;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import java.net.InetAddress;
+import java.net.UnknownHostException;
+import java.util.*;
+
 /**
  * Standard implementation of the {@link Frontier} interface containing a
  * {@link #queue} and a {@link #knownUriFilter}.
@@ -47,27 +29,34 @@
  */
 public class FrontierImpl implements Frontier {
 
-<<<<<<< HEAD
+    /**
+     * Default value for {@link #generalRecrawlTime} (one week).
+     */
+    public static final long DEFAULT_GENERAL_RECRAWL_TIME = 18000;
     private static final Logger LOGGER = LoggerFactory.getLogger(FrontierImpl.class);
-
+    /**
+     * Default value for {@link #timerPeriod}.
+     */
+    private static final long DEFAULT_TIMER_PERIOD = 18000;
+    /**
+     * Time (in milliseconds) after which uris will be recrawled (only used if no
+     * specific time is configured for a URI).
+     */
+    private static long generalRecrawlTime;
     /**
      * {@link UriNormalizer} used to transform given URIs into a normal form.
      */
     protected UriNormalizer normalizer;
-
     /**
      * {@link KnownUriFilter} used to identify URIs that already have been crawled.
      */
     protected KnownUriFilter knownUriFilter;
-
-    protected OutDatedUris outDatedUris;
-
+    protected OutDatedUriRetreiver outDatedUriRetreiver;
     /**
      * {@link org.dice_research.squirrel.data.uri.info.URIReferences} used to
      * identify URIs that already have been crawled.
      */
     protected URIReferences uriReferences = null;
-
     /**
      * {@link SchemeBasedUriFilter} used to identify URIs with known protocol.
      */
@@ -85,68 +74,23 @@ public class FrontierImpl implements Frontier {
      * {@link GraphLogger} that can be added to log the crawled graph.
      */
     protected GraphLogger graphLogger;
-
     /**
      * Indicates whether recrawling is active.
      */
     private boolean doesRecrawling;
-
     /**
      * The timer that schedules the recrawling.
      */
     private Timer timerRecrawling;
-
-    /**
-     * Time (in milliseconds) after which uris will be recrawled (only used if no
-     * specific time is configured for a URI).
-     */
-    private static long generalRecrawlTime;
-
     /**
      * Time interval(in milliseconds) at which the check for outdated uris is
      * performed.
      */
     private long timerPeriod;
-    /**
-     * Default value for {@link #generalRecrawlTime} (one week).
-     */
-    public static final long DEFAULT_GENERAL_RECRAWL_TIME =18000 ;
-    /**
-     * Default value for {@link #timerPeriod}.
-     */
-<<<<<<< HEAD
-    private static final long DEFAULT_TIMER_PERIOD = 1000 * 60 * 60;
-=======
-    private static final long DEFAULT_TIMER_PERIOD = 18000;
->>>>>>> bb00ad4b8e0cfdb89738f43afc01ce482e016bd6
-
     /**
      * Constructor.
      *
-<<<<<<< HEAD
-     * @param normalizer
-     *            {@link UriNormalizer} used to transform given URIs into a normal
-     *            form
-     * @param knownUriFilter
-     *            {@link UriFilter} used to identify URIs that already have been
-     *            crawled.
-     * @param queue
-     *            {@link UriQueue} used to manage the URIs that should be crawled.
-     * @param graphLogger
-     *            {@link GraphLogger} used to log graphs.
-     * @param doesRecrawling
-     *            used to select if URIs should be recrawled.
-     * @param generalRecrawlTime
-     *            used to select the general Time after URIs should be recrawled. If
-     *            Value is null the default Time is used.
-     * @param timerPeriod
-     *            used to select if URIs should be recrawled.
-     */
-    public FrontierImpl(UriNormalizer normalizer, KnownUriFilter knownUriFilter, UriQueue queue,
-            GraphLogger graphLogger, boolean doesRecrawling, long generalRecrawlTime, long timerPeriod) {
-        this(normalizer, knownUriFilter, null, queue, graphLogger, doesRecrawling, generalRecrawlTime, timerPeriod);
-=======
      * @param normalizer {@link UriNormalizer} used to transform given URIs
      *                   into a normal form
      * @param knownUriFilter {@link UriFilter} used to identify URIs that
      *                   already have been crawled.
     * @param queue {@link UriQueue} used to manage the URIs that
     *                   should be crawled.
     * @param graphLogger {@link GraphLogger} used to log graphs.
     * @param doesRecrawling used to select if URIs should be recrawled.
     * @param generalRecrawlTime used to select the general Time after URIs should
     *                   be recrawled. If Value is null the default Time is
     *                   used.
     * @param timerPeriod used to select if URIs should be recrawled.
@@ -161,36 +105,14 @@
      */
     public FrontierImpl(UriNormalizer normalizer, KnownUriFilter knownUriFilter, UriQueue queue, GraphLogger graphLogger, boolean doesRecrawling,
-            long generalRecrawlTime, long timerPeriod,OutDatedUris outDatedUris) {
+            long generalRecrawlTime, long timerPeriod, OutDatedUriRetreiver outDatedUriRetreiver) {
         this(normalizer, knownUriFilter, null, queue, graphLogger, doesRecrawling,
-                generalRecrawlTime, timerPeriod,outDatedUris);
->>>>>>> bb00ad4b8e0cfdb89738f43afc01ce482e016bd6
+                generalRecrawlTime, timerPeriod, outDatedUriRetreiver);
     }
 
     /**
      * Constructor.
     *
-<<<<<<< HEAD
-     * @param normalizer
-     *            {@link UriNormalizer} used to transform given URIs into a normal
-     *            form
-     * @param knownUriFilter
-     *            {@link UriFilter} used to identify URIs that already have been
-     *            crawled.
-     * @param queue
-     *            {@link UriQueue} used to manage the URIs that should be crawled.
-     * @param doesRecrawling
-     *            used to select if URIs should be recrawled.
-     * @param generalRecrawlTime
-     *            used to select the general Time after URIs should be recrawled. If
-     *            Value is null the default Time is used.
-     * @param timerPeriod
-     *            used to select if URIs should be recrawled.
-     */
-    public FrontierImpl(UriNormalizer normalizer, KnownUriFilter knownUriFilter, UriQueue queue, boolean doesRecrawling,
-            long generalRecrawlTime, long timerPeriod, UriHashCustodian uriHashCustodian) {
-        this(normalizer, knownUriFilter, queue, null, doesRecrawling, generalRecrawlTime, timerPeriod);
-=======
     * @param normalizer {@link UriNormalizer} used to transform given URIs
     *                   into a normal form
     * @param knownUriFilter {@link UriFilter} used to identify URIs that
     *                   already have been crawled.
     * @param queue {@link UriQueue} used to manage the URIs that
     *                   should be crawled.
     * @param doesRecrawling used to select if URIs should be recrawled.
     * @param generalRecrawlTime used to select the general Time after URIs should
     *                   be recrawled. If Value is null the default Time is
     *                   used.
     * @param timerPeriod used to select if URIs should be recrawled.
@@ -205,34 +127,14 @@
      */
     public FrontierImpl(UriNormalizer normalizer, KnownUriFilter knownUriFilter, UriQueue queue,
             boolean doesRecrawling, long generalRecrawlTime,
-            long timerPeriod, UriHashCustodian uriHashCustodian, OutDatedUris knownOutDatedUriFilter) {
-        this(normalizer, knownUriFilter, queue, null, doesRecrawling, generalRecrawlTime,
-                timerPeriod, knownOutDatedUriFilter);
->>>>>>> bb00ad4b8e0cfdb89738f43afc01ce482e016bd6
+            long timerPeriod, UriHashCustodian uriHashCustodian, OutDatedUriRetreiver outDatedUriRetreiver) {
+        this(normalizer, knownUriFilter, queue, null, doesRecrawling, generalRecrawlTime,
+                timerPeriod, outDatedUriRetreiver);
     }
 
     /**
     * Constructor.
     *
-<<<<<<< HEAD
-     * @param normalizer
-     *            {@link UriNormalizer} used to transform given URIs into a normal
-     *            form
-     * @param knownUriFilter
-     *            {@link UriFilter} used to identify URIs that already have been
-     *            crawled.
-     * @param uriReferences
-     *            {@link URIReferences} used to manage URI references
-     * @param queue
-     *            {@link UriQueue} used to manage the URIs that should be crawled.
-     * @param doesRecrawling
-     *            Value for {@link #doesRecrawling}.
-     */
-    public FrontierImpl(UriNormalizer normalizer, KnownUriFilter knownUriFilter, URIReferences uriReferences,
-            UriQueue queue, boolean doesRecrawling) {
-        this(normalizer, knownUriFilter, uriReferences, queue, null, doesRecrawling, DEFAULT_GENERAL_RECRAWL_TIME,
-                DEFAULT_TIMER_PERIOD);
-=======
     * @param normalizer {@link UriNormalizer} used to transform given URIs into
     *                   a normal form
     * @param knownUriFilter {@link UriFilter} used to identify URIs that already
     *                   have been crawled.
     * @param uriReferences {@link URIReferences} used to manage URI references
     * @param queue {@link UriQueue} used to manage the URIs that should be
     *                   crawled.
     * @param doesRecrawling Value for {@link #doesRecrawling}.
@@ -243,32 +145,14 @@
      */
     public FrontierImpl(UriNormalizer normalizer,
-            KnownUriFilter knownUriFilter, URIReferences uriReferences, UriQueue queue, boolean doesRecrawling,OutDatedUris outDatedUris) {
-        this(normalizer, knownUriFilter, uriReferences, queue, null, doesRecrawling,
-                DEFAULT_GENERAL_RECRAWL_TIME, DEFAULT_TIMER_PERIOD, outDatedUris);
->>>>>>> bb00ad4b8e0cfdb89738f43afc01ce482e016bd6
+            KnownUriFilter knownUriFilter, URIReferences uriReferences, UriQueue queue, boolean doesRecrawling, OutDatedUriRetreiver outDatedUriRetreiver) {
+        this(normalizer, knownUriFilter, uriReferences, queue, null, doesRecrawling,
+                DEFAULT_GENERAL_RECRAWL_TIME, DEFAULT_TIMER_PERIOD, outDatedUriRetreiver);
     }
 
     /**
     * Constructor.
     *
-<<<<<<< HEAD
-     * @param normalizer
-     *            {@link UriNormalizer} used to transform given URIs into a normal
-     *            form
-     * @param knownUriFilter
-     *            {@link UriFilter} used to identify URIs that already have been
-     *            crawled.
-     * @param queue
-     *            {@link UriQueue} used to manage the URIs that should be crawled.
-     * @param doesRecrawling
-     *            Value for {@link #doesRecrawling}.
-     */
-    public FrontierImpl(UriNormalizer normalizer, KnownUriFilter knownUriFilter, UriQueue queue,
-            boolean doesRecrawling) {
-        this(normalizer, knownUriFilter, queue, null, doesRecrawling, DEFAULT_GENERAL_RECRAWL_TIME,
-                DEFAULT_TIMER_PERIOD);
-=======
     * @param normalizer {@link UriNormalizer} used to transform given URIs into
     *                   a normal form
     * @param knownUriFilter {@link UriFilter} used to identify URIs that already
     *                   have been crawled.
     * @param queue {@link UriQueue} used to manage the URIs that should be
     *                   crawled.
     * @param doesRecrawling Value for {@link #doesRecrawling}.
     */
     public FrontierImpl(UriNormalizer normalizer,
-            KnownUriFilter knownUriFilter, UriQueue queue, boolean doesRecrawling, OutDatedUris outDatedUris) {
-        this(normalizer, knownUriFilter, queue, null, doesRecrawling,
-                DEFAULT_GENERAL_RECRAWL_TIME, DEFAULT_TIMER_PERIOD, outDatedUris);
->>>>>>> bb00ad4b8e0cfdb89738f43afc01ce482e016bd6
+            KnownUriFilter knownUriFilter, UriQueue queue, boolean doesRecrawling, OutDatedUriRetreiver outDatedUriRetreiver) {
+        this(normalizer, knownUriFilter, queue, null, doesRecrawling,
+                DEFAULT_GENERAL_RECRAWL_TIME, DEFAULT_TIMER_PERIOD, outDatedUriRetreiver);
     }
 
     /**
     * Constructor.
     *
-<<<<<<< HEAD
-     * @param normalizer
-     *            {@link UriNormalizer} used to transform given URIs into a normal
-     *            form
-     * @param knownUriFilter
-     *            {@link UriFilter} used to identify URIs that already have been
-     *            crawled.
-     * @param queue
-     *            {@link UriQueue} used to manage the URIs that should be crawled.
-=======
     * @param normalizer {@link UriNormalizer} used to transform given URIs into
     *                   a normal form
     * @param knownUriFilter {@link UriFilter} used to identify URIs that already
     *                   have been crawled.
     * @param queue {@link UriQueue} used to manage the URIs that should be
     *                   crawled.
->>>>>>> bb00ad4b8e0cfdb89738f43afc01ce482e016bd6
     */
     public FrontierImpl(UriNormalizer normalizer,
-            KnownUriFilter knownUriFilter, UriQueue queue, OutDatedUris outDatedUris) {
-        this(normalizer, knownUriFilter, queue, null, false, DEFAULT_GENERAL_RECRAWL_TIME,
-                DEFAULT_TIMER_PERIOD, outDatedUris);
+            KnownUriFilter knownUriFilter, UriQueue queue, OutDatedUriRetreiver outDatedUriRetreiver) {
+        this(normalizer, knownUriFilter, queue, null, false, DEFAULT_GENERAL_RECRAWL_TIME,
+                DEFAULT_TIMER_PERIOD, outDatedUriRetreiver);
     }
 
     /**
     * Constructor.
     *
-<<<<<<< HEAD
-     * @param normalizer
-     *            {@link UriNormalizer} used to transform given URIs into a normal
-     *            form
-     * @param knownUriFilter
-     *            {@link UriFilter} used to identify URIs that already have been
-     *            crawled.
-     * @param uriReferences
-     *            {@link URIReferences} used to manage URI references
-     * @param queue
-     *            {@link UriQueue} used to manage the URIs that should be crawled.
-     * @param graphLogger
-     *            {@link GraphLogger} used to log graphs.
-     * @param doesRecrawling
-     *            used to select if URIs should be recrawled.
-     * @param generalRecrawlTime
-     *            used to select the general Time after URIs should be recrawled. If
-     *            Value is null the default Time is used.
-     * @param timerPeriod
-     *            used to select if URIs should be recrawled.
-     */
-    public FrontierImpl(UriNormalizer normalizer, KnownUriFilter knownUriFilter, URIReferences uriReferences,
-            UriQueue queue, GraphLogger graphLogger, boolean doesRecrawling, long generalRecrawlTime,
-            long timerPeriod) {
-=======
-    * @param normalizer {@link UriNormalizer} used to transform given URIs
-    *                   into a normal form
-    * @param knownUriFilter {@link UriFilter} used to identify URIs that
-    *                   already have been crawled.
-    * @param uriReferences {@link URIReferences} used to manage URI references
-    * @param queue {@link UriQueue} used to manage the URIs that
-    *                   should be crawled.
-    * @param graphLogger {@link GraphLogger} used to log graphs.
-    * @param doesRecrawling used to select if URIs should be recrawled.
-    * @param generalRecrawlTime used to select the general Time after URIs should
-    *                   be recrawled. If Value is null the default Time is
-    *                   used.
-    * @param timerPeriod used to select if URIs should be recrawled.
-    * @param outDatedUris
+    * @param normalizer {@link UriNormalizer} used to transform given URIs
+    *                   into a normal form
+    * @param knownUriFilter {@link UriFilter} used to identify URIs that
+    *                   already have been crawled.
+    * @param uriReferences {@link URIReferences} used to manage URI references
+    * @param queue {@link UriQueue} used to manage the URIs that
+    *                   should be crawled.
+    * @param graphLogger {@link GraphLogger} used to log graphs.
+    * @param doesRecrawling used to select if URIs should be recrawled.
+    * @param generalRecrawlTime used to select the general Time after URIs should
+    *                   be recrawled. If Value is null the default Time is
+    *                   used.
+    * @param timerPeriod used to select if URIs should be recrawled.
+    * @param outDatedUriRetreiver
     */
     public FrontierImpl(UriNormalizer normalizer,
             KnownUriFilter knownUriFilter, URIReferences uriReferences, UriQueue queue, GraphLogger graphLogger,
-            boolean doesRecrawling, long generalRecrawlTime, long timerPeriod, OutDatedUris outDatedUris) {
->>>>>>> bb00ad4b8e0cfdb89738f43afc01ce482e016bd6
+            boolean doesRecrawling, long generalRecrawlTime, long timerPeriod, OutDatedUriRetreiver outDatedUriRetreiver) {
         this.normalizer = normalizer;
         this.knownUriFilter = knownUriFilter;
         this.uriReferences = uriReferences;
         this.queue = queue;
         this.uriProcessor = new UriProcessor();
         this.graphLogger = graphLogger;
-<<<<<<< HEAD
-=======
-        this.outDatedUris = outDatedUris;
->>>>>>> bb00ad4b8e0cfdb89738f43afc01ce482e016bd6
+        this.outDatedUriRetreiver = outDatedUriRetreiver;
 
         this.queue.open();
         this.doesRecrawling = doesRecrawling;
@@ -379,27 +222,23 @@
             timerRecrawling.schedule(new TimerTask() {
                 @Override
                 public void run() {
-                    List<CrawleableUri> urisToRecrawl = outDatedUris.getUriToRecrawl();
-                    System.out.println("Frontier uri to recrawl: " +urisToRecrawl);
+                    List<CrawleableUri> urisToRecrawl = outDatedUriRetreiver.getUriToRecrawl();
+                    System.out.println("Frontier uri to recrawl: " + urisToRecrawl);
                     urisToRecrawl.forEach(uri -> queue.addUri(uriProcessor.recognizeUriType(uri)));
                 }
             }, this.timerPeriod, this.timerPeriod);
         }
     }
 
+    public static long getGeneralRecrawlTime() {
+        return generalRecrawlTime;
+    }
+
     @Override
     public List<CrawleableUri> getNextUris() {
-<<<<<<< HEAD
-
-        // if(terminationCheck.shouldFrontierTerminate(this)) {
-        // LOGGER.error("FRONTIER IS TERMINATING!", new Exception());
-        // }
-
-=======
         // if(terminationCheck.shouldFrontierTerminate(this)) {
         // LOGGER.error("FRONTIER IS TERMINATING!", new Exception());
         // }
->>>>>>> bb00ad4b8e0cfdb89738f43afc01ce482e016bd6
         return queue.getNextUris();
     }
 
@@ -436,11 +275,7 @@ public void addNewUri(CrawleableUri uri) {
             knownUriFilter.add(uri, System.currentTimeMillis());
         } else {
             LOGGER.warn("addNewUri(" + uri + "): " + uri.getUri().getScheme() + " is not supported, only "
-<<<<<<< HEAD
-                    + schemeUriFilter.getSchemes() + ". Will not added!");
-=======
                     + schemeUriFilter.getSchemes() + ". Will not added!");
->>>>>>> bb00ad4b8e0cfdb89738f43afc01ce482e016bd6
         }
 
     } else {
@@ -448,27 +283,9 @@ public void addNewUri(CrawleableUri uri) {
         }
     }
 
-    @Override
     public void crawlingDone(List<CrawleableUri> uris) {
         LOGGER.info("One worker finished his work and crawled " + uris.size() + " URIs.");
-<<<<<<< HEAD
-
-        // List<CrawleableUri> newUris = new ArrayList<>(uriMap.size());
-        // for (CrawleableUri uri : uriMap.keySet()) {
-        // newUris.addAll(uriMap.get(uri));
-        // knownUriFilter.add(uri, System.currentTimeMillis(),
-        // uri.getTimestampNextCrawl());
-        // if (uriReferences != null) {
-        // uriReferences.add(uri, uriMap.get(uri));
-        // }
-        // }
-
-        // // If there is a graph logger, log the data
-        // if (graphLogger != null) {
-        // graphLogger.log(new ArrayList<>(uriMap.keySet()), newUris);
-        // }
-=======
         // List<CrawleableUri> newUris = new ArrayList<>(uriMap.size());
         // for (CrawleableUri uri : uriMap.keySet()) {
         // newUris.addAll(uriMap.get(uri));
@@ -482,10 +299,17 @@ public void crawlingDone(List<CrawleableUri> uris) {
        // if (graphLogger != null) {
        // graphLogger.log(new ArrayList<>(uriMap.keySet()), newUris);
        // }
->>>>>>> bb00ad4b8e0cfdb89738f43afc01ce482e016bd6
         // If we should give the crawled IPs to the queue
-        if (queue instanceof BlockingQueue) {
-            ((BlockingQueue) queue).markUrisAsAccessible(uris);
+        if (queue instanceof IpAddressBasedQueue) {
+            Set<InetAddress> ips = new HashSet<>();
+            InetAddress ip;
+            for (CrawleableUri uri : uris) {
+                ip = uri.getIpAddress();
+                if (ip != null) {
+                    ips.add(ip);
+                }
+            }
+            ips.forEach(_ip -> ((IpAddressBasedQueue) queue).markIpAddressAsAccessible(_ip));
         }
         // send list of crawled URIs to the knownUriFilter
         for (CrawleableUri uri : uris) {
@@ -505,10 +329,8 @@ public void crawlingDone(List<CrawleableUri> uris) {
 
     @Override
     public int getNumberOfPendingUris() {
-        // TODO this implementation does not fit to the semantics of the method name
-        // since it returns the number of URI groups instead of the number of URIs
-        if (queue instanceof BlockingQueue) {
-            return ((BlockingQueue) queue).getNumberOfBlockedKeys();
+        if (queue instanceof IpAddressBasedQueue) {
+            return ((IpAddressBasedQueue) queue).getNumberOfBlockedIps();
         } else {
             return 0;
         }
@@ -524,10 +346,6 @@ public void close() {
         timerRecrawling.cancel();
     }
 
-    public static long getGeneralRecrawlTime() {
-        return generalRecrawlTime;
-    }
-
     /**
     * Getter for the {@link #queue}.
     *
@@ -538,348 +356,3 @@ public UriQueue getQueue() {
     }
 
 }
-<<<<<<< HEAD
-=======
-    private static final Logger LOGGER = LoggerFactory.getLogger(FrontierImpl.class);
-
-    /**
-     * {@link UriNormalizer} used to transform given URIs into a normal form.
-     */
-    protected UriNormalizer normalizer;
-
-    /**
-     * {@link KnownUriFilter} used to identify URIs that already have been crawled.
-     */
-    protected KnownUriFilter knownUriFilter;
-
-    protected KnownOutDatedUriFilter knownOutDatedUriFilter;
-
-    /**
-     * {@link org.dice_research.squirrel.data.uri.info.URIReferences} used to
-     * identify URIs that already have been crawled.
-     */
-    protected URIReferences uriReferences = null;
-
-    /**
-     * {@link SchemeBasedUriFilter} used to identify URIs with known protocol.
-     */
-    protected SchemeBasedUriFilter schemeUriFilter = new SchemeBasedUriFilter();
-    /**
-     * {@link UriQueue} used to manage the URIs that should be crawled.
-     */
-    protected UriQueue queue;
-    /**
-     * {@link UriProcessor} used to identify the type of incoming URIs: DUMP,
-     * SPARQL, DEREFERENCEABLE or UNKNOWN
-     */
-    protected UriProcessor uriProcessor;
-    /**
-     * {@link GraphLogger} that can be added to log the crawled graph.
-     */
-    protected GraphLogger graphLogger;
-
-    /**
-     * Indicates whether recrawling is active.
-     */
-    private boolean doesRecrawling;
-
-    /**
-     * The timer that schedules the recrawling.
-     */
-    private Timer timerRecrawling;
-
-    /**
-     * Time (in milliseconds) after which uris will be recrawled (only used if no
-     * specific time is configured for a URI).
-     */
-    private static long generalRecrawlTime;
-
-    /**
-     * Time interval(in milliseconds) at which the check for outdated uris is
-     * performed.
-     */
-    private long timerPeriod;
-
-    /**
-     * Default value for {@link #generalRecrawlTime} (one week).
-     */
-    public static final long DEFAULT_GENERAL_RECRAWL_TIME =18000 ;
-    /**
-     * Default value for {@link #timerPeriod}.
-     */
-    private static final long DEFAULT_TIMER_PERIOD = 18000;
-
-    /**
-     * Constructor.
-     *
-     * @param normalizer {@link UriNormalizer} used to transform given URIs
-     *                   into a normal form
-     * @param knownUriFilter {@link UriFilter} used to identify URIs that
-     *                   already have been crawled.
-     * @param queue {@link UriQueue} used to manage the URIs that
-     *                   should be crawled.
-     * @param graphLogger {@link GraphLogger} used to log graphs.
-     * @param doesRecrawling used to select if URIs should be recrawled.
-     * @param generalRecrawlTime used to select the general Time after URIs should
-     *                   be recrawled. If Value is null the default Time is
-     *                   used.
-     * @param timerPeriod used to select if URIs should be recrawled.
-     */
-    public FrontierImpl(UriNormalizer normalizer, KnownUriFilter knownUriFilter, UriQueue queue, GraphLogger graphLogger, boolean doesRecrawling,
-            long generalRecrawlTime, long timerPeriod,KnownOutDatedUriFilter knownOutDatedUriFilter) {
-        this(normalizer, knownUriFilter, null, queue, graphLogger, doesRecrawling,
-                generalRecrawlTime, timerPeriod,knownOutDatedUriFilter);
-    }
-
-    /**
-     * Constructor.
-     *
-     * @param normalizer {@link UriNormalizer} used to transform given URIs
-     *                   into a normal form
-     * @param knownUriFilter {@link UriFilter} used to identify URIs that
-     *                   already have been crawled.
-     * @param queue {@link UriQueue} used to manage the URIs that
-     *                   should be crawled.
-     * @param doesRecrawling used to select if URIs should be recrawled.
-     * @param generalRecrawlTime used to select the general Time after URIs should
-     *                   be recrawled. If Value is null the default Time is
-     *                   used.
-     * @param timerPeriod used to select if URIs should be recrawled.
-     */
-    public FrontierImpl(UriNormalizer normalizer,
-            KnownUriFilter knownUriFilter, UriQueue queue, boolean doesRecrawling, long generalRecrawlTime,
-            long timerPeriod, UriHashCustodian uriHashCustodian, KnownOutDatedUriFilter knownOutDatedUriFilter) {
-        this(normalizer, knownUriFilter, queue, null, doesRecrawling, generalRecrawlTime,
-                timerPeriod, knownOutDatedUriFilter);
-    }
-
-    /**
-     * Constructor.
-     *
-     * @param normalizer {@link UriNormalizer} used to transform given URIs into
-     *                   a normal form
-     * @param knownUriFilter {@link UriFilter} used to identify URIs that already
-     *                   have been crawled.
-     * @param uriReferences {@link URIReferences} used to manage URI references
-     * @param queue {@link UriQueue} used to manage the URIs that should be
-     *                   crawled.
-     * @param doesRecrawling Value for {@link #doesRecrawling}.
-     */
-    public FrontierImpl(UriNormalizer normalizer,
-            KnownUriFilter knownUriFilter, URIReferences uriReferences, UriQueue queue, boolean doesRecrawling,KnownOutDatedUriFilter knownOutDatedUriFilter) {
-        this(normalizer, knownUriFilter, uriReferences, queue, null, doesRecrawling,
-                DEFAULT_GENERAL_RECRAWL_TIME, DEFAULT_TIMER_PERIOD, knownOutDatedUriFilter);
-    }
-
-    /**
-     * Constructor.
-     *
-     * @param normalizer {@link UriNormalizer} used to transform given URIs into
-     *                   a normal form
-     * @param knownUriFilter {@link UriFilter} used to identify URIs that already
-     *                   have been crawled.
-     * @param queue {@link UriQueue} used to manage the URIs that should be
-     *                   crawled.
-     * @param doesRecrawling Value for {@link #doesRecrawling}.
-     */
-    public FrontierImpl(UriNormalizer normalizer,
-            KnownUriFilter knownUriFilter, UriQueue queue, boolean doesRecrawling, KnownOutDatedUriFilter knownOutDatedUriFilter) {
-        this(normalizer, knownUriFilter, queue, null, doesRecrawling,
-                DEFAULT_GENERAL_RECRAWL_TIME, DEFAULT_TIMER_PERIOD, knownOutDatedUriFilter);
-    }
-
-    /**
-     * Constructor.
-     *
-     * @param normalizer {@link UriNormalizer} used to transform given URIs into
-     *                   a normal form
-     * @param knownUriFilter {@link UriFilter} used to identify URIs that already
-     *                   have been crawled.
-     * @param queue {@link UriQueue} used to manage the URIs that should be
-     *                   crawled.
-     */
-    public FrontierImpl(UriNormalizer normalizer,
-            KnownUriFilter knownUriFilter, UriQueue queue, KnownOutDatedUriFilter knownOutDatedUriFilter) {
-        this(normalizer, knownUriFilter, queue, null, false, DEFAULT_GENERAL_RECRAWL_TIME,
-                DEFAULT_TIMER_PERIOD, knownOutDatedUriFilter);
-    }
-
-    /**
-     * Constructor.
-     *
-     * @param normalizer {@link UriNormalizer} used to transform given URIs
-     *                   into a normal form
-     * @param knownUriFilter {@link UriFilter} used to identify URIs that
-     *                   already have been crawled.
-     * @param uriReferences {@link URIReferences} used to manage URI references
-     * @param queue {@link UriQueue} used to manage the URIs that
-     *                   should be crawled.
-     * @param graphLogger {@link GraphLogger} used to log graphs.
-     * @param doesRecrawling used to select if URIs should be recrawled.
-     * @param generalRecrawlTime used to select the general Time after URIs should
-     *                   be recrawled. If Value is null the default Time is
-     *                   used.
-     * @param timerPeriod used to select if URIs should be recrawled.
-     * @param knownOutDatedUriFilter
-     */
-    public FrontierImpl(UriNormalizer normalizer,
-            KnownUriFilter knownUriFilter, URIReferences uriReferences, UriQueue queue, GraphLogger graphLogger,
-            boolean doesRecrawling, long generalRecrawlTime, long timerPeriod, KnownOutDatedUriFilter knownOutDatedUriFilter) {
-        this.normalizer = normalizer;
-        this.knownUriFilter = knownUriFilter;
-        this.uriReferences = uriReferences;
-        this.queue = queue;
-        this.uriProcessor = new UriProcessor();
-        this.graphLogger = graphLogger;
-        this.knownOutDatedUriFilter = knownOutDatedUriFilter;
-
-        this.queue.open();
-        this.doesRecrawling = doesRecrawling;
-        this.timerPeriod = timerPeriod;
-        FrontierImpl.generalRecrawlTime = generalRecrawlTime;
-
-        if (this.doesRecrawling) {
-            timerRecrawling = new Timer();
-            timerRecrawling.schedule(new TimerTask() {
-                @Override
-                public void run() {
-                    List<CrawleableUri> urisToRecrawl = knownOutDatedUriFilter.getUriToRecrawl();
-                    System.out.println("Frontier uri to recrawl: " +urisToRecrawl);
-                    urisToRecrawl.forEach(uri -> queue.addUri(uriProcessor.recognizeUriType(uri)));
-                }
-            }, this.timerPeriod, this.timerPeriod);
-        }
-    }
-
-    @Override
-    public List<CrawleableUri> getNextUris() {
-
-// if(terminationCheck.shouldFrontierTerminate(this)) {
-// LOGGER.error("FRONTIER IS TERMINATING!", new Exception());
-// }
-
-        return queue.getNextUris();
-    }
-
-    @Override
-    public void addNewUris(List<CrawleableUri> uris) {
-        for (CrawleableUri uri : uris) {
-            addNewUri(uri);
-        }
-    }
-
-    @Override
-    public void addNewUri(CrawleableUri uri) {
-        // Normalize the URI
-        uri = normalizer.normalize(uri);
-        // After knownUriFilter uri should be classified according to
-        // UriProcessor
-
-        if (knownUriFilter.isUriGood(uri)) {
-            LOGGER.debug("addNewUri(" + uri + "): URI is good [" + knownUriFilter + "]");
-            if (schemeUriFilter.isUriGood(uri)) {
-                LOGGER.trace("addNewUri(" + uri.getUri() + "): URI schemes is OK [" + schemeUriFilter + "]");
-                // Make sure that the IP is known
-                try {
-                    uri = this.uriProcessor.recognizeInetAddress(uri);
-
-                } catch (UnknownHostException e) {
-                    LOGGER.error("Could not recognize IP for {}, unknown host", uri.getUri());
-                }
-                if (uri.getIpAddress() != null) {
-                    queue.addUri(this.uriProcessor.recognizeUriType(uri));
-                } else {
-                    LOGGER.error("Couldn't determine the Inet address of \"{}\". It will be ignored.", uri.getUri());
-                }
-                knownUriFilter.add(uri, System.currentTimeMillis());
-            } else {
-                LOGGER.warn("addNewUri(" + uri + "): " + uri.getUri().getScheme() + " is not supported, only "
-                        + schemeUriFilter.getSchemes() + ". Will not added!");
-            }
-
-        } else {
-            LOGGER.debug("addNewUri(" + uri + "): URI is not good [" + knownUriFilter + "]. Will not be added!");
-        }
-    }
-
-    @Override
-    public void crawlingDone(List<CrawleableUri> uris) {
-        LOGGER.info("One worker finished his work and crawled " + uris.size() + " URIs.");
-
-// List<CrawleableUri> newUris = new ArrayList<>(uriMap.size());
-// for (CrawleableUri uri : uriMap.keySet()) {
-// newUris.addAll(uriMap.get(uri));
-// knownUriFilter.add(uri, System.currentTimeMillis(), uri.getTimestampNextCrawl());
-// if (uriReferences != null) {
-// uriReferences.add(uri, uriMap.get(uri));
-// }
-// }
-
-// // If there is a graph logger, log the data
-// if (graphLogger != null) {
-// graphLogger.log(new ArrayList<>(uriMap.keySet()), newUris);
-// }
-        // If we should give the crawled IPs to the queue
-        if (queue instanceof IpAddressBasedQueue) {
-            Set<InetAddress> ips = new HashSet<>();
-            InetAddress ip;
-            for (CrawleableUri uri : uris) {
-                ip = uri.getIpAddress();
-                if (ip != null) {
-                    ips.add(ip);
-                }
-            }
-            ips.forEach(_ip -> ((IpAddressBasedQueue) queue).markIpAddressAsAccessible(_ip));
-        }
-        // send list of crawled URIs to the knownUriFilter
-        for (CrawleableUri uri : uris) {
-            Long recrawlOn = (Long) uri.getData(Constants.URI_PREFERRED_RECRAWL_ON);
-            // If a recrawling is defined, check whether we can directly add it back to the
-            // queue
-            if ((recrawlOn != null) && (recrawlOn < System.currentTimeMillis())) {
-                // Create a new uri object reusing only meta data that is useful
-                CrawleableUri recrawlUri = new CrawleableUri(uri.getUri(), uri.getIpAddress());
-                recrawlUri.addData(Constants.URI_TYPE_KEY, uri.getData(Constants.URI_TYPE_KEY));
-                addNewUri(recrawlUri);
-            } else {
-                knownUriFilter.add(uri, System.currentTimeMillis());
-            }
-        }
-    }
-
-    @Override
-    public int getNumberOfPendingUris() {
-        if (queue instanceof IpAddressBasedQueue) {
-            return ((IpAddressBasedQueue) queue).getNumberOfBlockedIps();
-        } else {
-            return 0;
-        }
-    }
-
-    @Override
-    public boolean doesRecrawling() {
-        return doesRecrawling;
-    }
-
-    @Override
-    public void close() {
-        timerRecrawling.cancel();
-    }
-
-    public static long getGeneralRecrawlTime() {
-        return generalRecrawlTime;
-    }
-
-    /**
-     * Getter for the {@link #queue}.
-     *
-     * @return The waiting queue for the URIs.
-     */
-    public UriQueue getQueue() {
-        return queue;
-    }
-
-}
->>>>>>> 98250b4cbed5c441b6c05b561ee7c07c76aadbfa
-=======
->>>>>>> bb00ad4b8e0cfdb89738f43afc01ce482e016bd6
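Besides the rename from OutDatedUris to OutDatedUriRetreiver, FrontierImpl.crawlingDone() now unblocks the queue per distinct IP address (IpAddressBasedQueue) rather than per URI group (BlockingQueue). Extracted from the diff into a standalone sketch, with the surrounding class hypothetical:

import java.net.InetAddress;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import org.dice_research.squirrel.data.uri.CrawleableUri;
import org.dice_research.squirrel.queue.IpAddressBasedQueue;

public class CrawlingDoneSketch {

    // Mark each distinct IP among the finished URIs as accessible again,
    // so the queue may hand out further URIs for those hosts.
    static void markAccessible(List<CrawleableUri> uris, IpAddressBasedQueue queue) {
        Set<InetAddress> ips = new HashSet<>();
        for (CrawleableUri uri : uris) {
            InetAddress ip = uri.getIpAddress();
            if (ip != null) {
                ips.add(ip);
            }
        }
        ips.forEach(queue::markIpAddressAsAccessible);
    }
}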
- * -<<<<<<< HEAD - * @return instannce of the class. -======= - * @return instance of the class. ->>>>>>> bb00ad4b8e0cfdb89738f43afc01ce482e016bd6 - */ - public static FrontierQueryGenerator getInstance() { - return instance; - } - - /** - * Return an Add Query for the default uri and its triples. - * - * @param listBufferedTriples the given list of triples. - * @return The generated query. - */ - public String getAddQuery(Collection listBufferedTriples) { - return getAddQuery(null, listBufferedTriples, true); - } - - /** - * Return an Add Query for the given uri and its triples. - * - * @param graphId the graph id where the triples are stored. - * @param listBufferedTriples the given list of triples. - * @return The generated query. - */ - public String getAddQuery(String graphId, Collection listBufferedTriples) { - return getAddQuery(graphId, listBufferedTriples, false); - } - - /** - * Return an Add Query for the given uri or default graph and its triples. - * - * @param graphId the graph id where the triples are stored. - * @param listBufferedTriples the given list of triples. - * @param defaultGraph Identify if query is for the default graph. - * @return The generated query. - */ - public String getAddQuery(String graphId, Collection listBufferedTriples, boolean defaultGraph) { - StringBuilder stringBuilder = new StringBuilder(); - stringBuilder.append("INSERT DATA { "); - if (!defaultGraph) { - stringBuilder.append("Graph <"); - stringBuilder.append(graphId); - stringBuilder.append("> { "); - } - for (Triple triple : listBufferedTriples) { - stringBuilder.append(formatNodeToString(triple.getSubject())); - stringBuilder.append(formatNodeToString(triple.getPredicate())); - stringBuilder.append(formatNodeToString(triple.getObject())); - stringBuilder.append(". "); - } - if (!defaultGraph) { - stringBuilder.append("} "); - } - stringBuilder.append("}"); - return stringBuilder.toString(); - } - - /** - * Return a time stamp query for the default graph. - * It will return triples with time stamp contained in the default graph. - * @return All triples with time stamp in the default graph. -<<<<<<< HEAD - */ - - public Query getOutdatedUrisQuery() { - return getOutdatedUrisQuery(null, true); - } - public Query getOutdatedUrisQuery(String graphID, boolean defaultGraph) { - StringBuilder stringBuilder = new StringBuilder(); - stringBuilder.append("PREFIX sq: \n" + - "PREFIX prov: \n" + - "PREFIX xsd: " - + "SELECT ?uri WHERE { \n "); - // + "SELECT ?uri WHERE { \n "); - if (!defaultGraph) { - stringBuilder.append("GRAPH <"); - stringBuilder.append(graphID); - stringBuilder.append("> { "); - } - stringBuilder.append("{\n" + - "SELECT ?uri ?endtime (NOW() - (?endtime) AS ?diff)\n" + - "WHERE{\n" + - "\n" + - " {\n" + - " SELECT ?uri (MAX(?timestamp) as ?endtime)\n" + - " WHERE\n" + - " { \n" + - " ?s sq:crawled ?uri ;\n" + - " prov:endedAtTime ?timestamp.\n" + - "\n" + - " }\n" + - " GROUP BY ?uri\n" + - " } \n" + - "}\n" + - "}\n" + - "FILTER(?diff > \"18000\"^^xsd:double)\n" + - ""); - if (!defaultGraph) { - stringBuilder.append("}"); - } - - // stringBuilder.append("}GROUP BY ?uri"); - stringBuilder.append("}"); - - Query query = QueryFactory.create(stringBuilder.toString()); - return query; - } - - public Query getSelectQuery() { - return getSelectQuery(null, true); - } - /** - * Return a select query for the given graphID or default graph. - * It will return all triples contained in the graph. - * @return All triples contained in the default graph. 
- * @param graphID The id of the graph from which you want to select. - * @param defaultGraph Identify if query is for the default graph - * @return All triples contained in the graph. - */ - public Query getSelectQuery(String graphID, boolean defaultGraph) { - StringBuilder stringBuilder = new StringBuilder(); - stringBuilder.append("SELECT ?subject ?predicate ?object WHERE { "); - if (!defaultGraph) { - stringBuilder.append("GRAPH <"); - stringBuilder.append(graphID); - stringBuilder.append("> { "); - } - stringBuilder.append("?subject ?predicate ?object "); - if (!defaultGraph) { - stringBuilder.append("} "); - } - stringBuilder.append("}"); - Query query = QueryFactory.create(stringBuilder.toString()); - return query; - } - - /** - * Return a select query for the given graphID. - * It will return all triples contained in the graph. - * @param graphID The id of the graph from which you want to select. - * @return All triples contained in the graph. - */ - public Query getSelectQuery(String graphID) { - return getSelectQuery(graphID, false); - } - - /** - * Formats the node for a query - * - * @param node The node which should formated - * @return a robust representation of the node - *
<p>
- * Note: Should be updated in relation to the robustness of parsing. - */ -======= - * @param - */ - - public Query getOutdatedUrisQuery() { - return getOutdatedUrisQuery(null,true); - } - public Query getOutdatedUrisQuery(String graphUri, boolean defaultGraph) { - StringBuilder stringBuilder = new StringBuilder(); - stringBuilder.append("PREFIX sq: \n" + - "PREFIX prov: \n" + - "PREFIX xsd: " - + "SELECT ?uri WHERE { \n "); - // + "SELECT ?uri WHERE { \n "); - if (graphUri == null) { - stringBuilder.append("GRAPH <"); - stringBuilder.append(defaultGraph); - stringBuilder.append("> { "); - } - stringBuilder.append("{\n" + - "SELECT ?uri ?endtime (NOW() - (?endtime) AS ?diff)\n" + - "WHERE{\n" + - "\n" + - " {\n" + - " SELECT ?uri (MAX(?timestamp) as ?endtime)\n" + - " WHERE\n" + - " { \n" + - " ?s sq:crawled ?uri ;\n" + - " prov:endedAtTime ?timestamp.\n" + - "\n" + - " }\n" + - " GROUP BY ?uri\n" + - " } \n" + - "}\n" + - "}\n" + - "FILTER(?diff > \"18000\"^^xsd:double)\n" + - ""); - stringBuilder.append("}"); - - Query query = QueryFactory.create(stringBuilder.toString()); - return query; - } - @Deprecated ->>>>>>> bb00ad4b8e0cfdb89738f43afc01ce482e016bd6 - public static String formatNodeToString(Node node) { - StringBuilder stringBuilder = new StringBuilder(); - if (node.isURI()) { - stringBuilder.append("<"); - //Should possibly be further improved - stringBuilder.append(node.getURI().replace(" ","")); - stringBuilder.append(">"); - } else if (node.isBlank()) { - stringBuilder.append("_:"); - //Should possibly be further improved - String label = node.getBlankNodeId().getLabelString().replace(":", ""); - if (label.startsWith("-")) { - label = label.substring(1); - } - stringBuilder.append(label); - } else if (node.isLiteral()) { - stringBuilder.append("\""); - //Should possibly be further improved - stringBuilder.append(node.getLiteral().getLexicalForm().replace("\n", "").replace("\"", "'").replace("\r", "")); - stringBuilder.append("\""); - if (node.getLiteralLanguage() != null && !node.getLiteralLanguage().isEmpty()) { - stringBuilder.append("@"); - stringBuilder.append(node.getLiteralLanguage()); - } else if (node.getLiteralDatatype() != null) { - stringBuilder.append("^^"); - stringBuilder.append("<"); - stringBuilder.append(node.getLiteralDatatype().getURI()); - stringBuilder.append(">"); - } - } - stringBuilder.append(" "); - return stringBuilder.toString(); - } -} diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/FrontierQueryGenerator.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/FrontierQueryGenerator.java new file mode 100644 index 000000000..4ee2fbe12 --- /dev/null +++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/FrontierQueryGenerator.java @@ -0,0 +1,78 @@ +package org.dice_research.squirrel.frontier.recrawling; + +import org.apache.jena.query.Query; +import org.apache.jena.query.QueryFactory; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class FrontierQueryGenerator { + + private static final FrontierQueryGenerator instance = new FrontierQueryGenerator(); + + private static final Logger LOGGER = LoggerFactory.getLogger(FrontierQueryGenerator.class); + + private FrontierQueryGenerator() { + } + + /** + * Getter for {@link #instance}. + */ + public static FrontierQueryGenerator getInstance() { + return instance; + } + + + /** + * Return a time stamp query for the default graph. 
+ * It will return triples with time stamp contained in the default graph. + * + * @return All triples with time stamp in the default graph. + */ + + public Query getOutdatedUrisQuery() { + return getOutdatedUrisQuery(null, true); + } + + public Query getOutdatedUrisQuery(String graphID, boolean defaultGraph) { + StringBuilder stringBuilder = new StringBuilder(); + stringBuilder.append("PREFIX sq: \n" + + "PREFIX prov: \n" + + "PREFIX xsd: " + + "SELECT ?uri WHERE { \n "); + // + "SELECT ?uri WHERE { \n "); + if (!defaultGraph) { + stringBuilder.append("GRAPH <"); + stringBuilder.append(graphID); + stringBuilder.append("> { "); + } + stringBuilder.append("{\n" + + "SELECT ?uri ?endtime (NOW() - (?endtime) AS ?diff)\n" + + "WHERE{\n" + + "\n" + + " {\n" + + " SELECT ?uri (MAX(?timestamp) as ?endtime)\n" + + " WHERE\n" + + " { \n" + + " ?s sq:crawled ?uri ;\n" + + " prov:endedAtTime ?timestamp.\n" + + "\n" + + " }\n" + + " GROUP BY ?uri\n" + + " } \n" + + "}\n" + + "}\n" + + "FILTER(?diff > \"18000\"^^xsd:double)\n" + + ""); + if (!defaultGraph) { + stringBuilder.append("}"); + } + + // stringBuilder.append("}GROUP BY ?uri"); + stringBuilder.append("}"); + + Query query = QueryFactory.create(stringBuilder.toString()); + return query; + } + + +} diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/configurator/SparqlhostConnector.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/SparqlhostConnector.java similarity index 91% rename from squirrel.frontier/src/main/java/org/dice_research/squirrel/configurator/SparqlhostConnector.java rename to squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/SparqlhostConnector.java index 723645296..c97b5201b 100644 --- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/configurator/SparqlhostConnector.java +++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/SparqlhostConnector.java @@ -1,9 +1,4 @@ -package org.dice_research.squirrel.configurator; - -import java.net.URI; -import java.net.URISyntaxException; -import java.util.ArrayList; -import java.util.List; +package org.dice_research.squirrel.frontier.recrawling; import org.aksw.jena_sparql_api.core.QueryExecutionFactory; import org.aksw.jena_sparql_api.core.UpdateExecutionFactory; @@ -23,13 +18,18 @@ import org.apache.jena.rdf.model.RDFNode; import org.apache.jena.sparql.core.DatasetDescription; import org.dice_research.squirrel.data.uri.CrawleableUri; -import org.dice_research.squirrel.data.uri.filter.OutDatedUris; -import org.dice_research.squirrel.frontier.impl.FrontierQueryGenerator; +import org.dice_research.squirrel.data.uri.filter.OutDatedUriRetreiver; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.io.IOException; +import java.net.URI; +import java.net.URISyntaxException; +import java.util.ArrayList; +import java.util.List; + @SuppressWarnings("deprecation") -public class SparqlhostConnector implements OutDatedUris { +public class SparqlhostConnector implements OutDatedUriRetreiver { private static final Logger LOGGER = LoggerFactory.getLogger(SparqlhostConnector.class); @@ -93,7 +93,7 @@ public void setCredentials(AuthScope arg0, Credentials arg1) { @Override public List getUriToRecrawl() { - SparqlhostConnector.create("http://localhost:8890/sparql-auth", "dba", "pw123"); + // SparqlhostConnector.create("http://localhost:8890/sparql-auth", "dba", "pw123"); Query getOutdatedUrisQuery = FrontierQueryGenerator.getInstance().getOutdatedUrisQuery(); 
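        // For reference, the full query built above, assuming the standard W3C
        // IRIs for prov: and xsd:; the sq: IRI is an assumption here and must
        // match the vocabulary under which the crawl metadata was written:
        //
        //   PREFIX sq:   <http://w3id.org/squirrel/vocabulary#>   (assumed)
        //   PREFIX prov: <http://www.w3.org/ns/prov#>
        //   PREFIX xsd:  <http://www.w3.org/2001/XMLSchema#>
        //   SELECT ?uri WHERE {
        //     { SELECT ?uri ?endtime (NOW() - (?endtime) AS ?diff)
        //       WHERE {
        //         { SELECT ?uri (MAX(?timestamp) AS ?endtime)
        //           WHERE { ?s sq:crawled ?uri ; prov:endedAtTime ?timestamp . }
        //           GROUP BY ?uri }
        //       } }
        //     FILTER(?diff > "18000"^^xsd:double)
        //   }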
System.out.println(getOutdatedUrisQuery); QueryExecution qe = queryExecFactory.createQueryExecution(getOutdatedUrisQuery); @@ -112,5 +112,9 @@ public List getUriToRecrawl() { } + @Override + public void close() throws IOException { + getUriToRecrawl(); + } } diff --git a/squirrel.frontier/src/test/java/org/dice_research/squirrel/frontier/impl/FrontierImplTest.java b/squirrel.frontier/src/test/java/org/dice_research/squirrel/frontier/impl/FrontierImplTest.java index afa29685d..7dadca235 100644 --- a/squirrel.frontier/src/test/java/org/dice_research/squirrel/frontier/impl/FrontierImplTest.java +++ b/squirrel.frontier/src/test/java/org/dice_research/squirrel/frontier/impl/FrontierImplTest.java @@ -10,19 +10,14 @@ import java.util.List; import org.dice_research.squirrel.Constants; import org.dice_research.squirrel.MongoDBBasedTest; -<<<<<<< HEAD -import org.dice_research.squirrel.configurator.SparqlConfiguration; -import org.dice_research.squirrel.data.uri.CrawleableUri; -import org.dice_research.squirrel.data.uri.CrawleableUriFactory4Tests; -import org.dice_research.squirrel.data.uri.UriType; -import org.dice_research.squirrel.data.uri.filter.KnownOutDatedUriFilter; -======= -import org.dice_research.squirrel.configurator.SparqlhostConnector; + import org.dice_research.squirrel.data.uri.CrawleableUri; import org.dice_research.squirrel.data.uri.CrawleableUriFactory4Tests; import org.dice_research.squirrel.data.uri.UriType; -import org.dice_research.squirrel.data.uri.filter.OutDatedUris; ->>>>>>> bb00ad4b8e0cfdb89738f43afc01ce482e016bd6 + +import org.dice_research.squirrel.data.uri.filter.OutDatedUriRetreiver; +import org.dice_research.squirrel.frontier.recrawling.SparqlhostConnector; + import org.dice_research.squirrel.data.uri.filter.MongoDBKnowUriFilter; import org.dice_research.squirrel.data.uri.norm.NormalizerImpl; import org.dice_research.squirrel.queue.ipbased.MongoDBIpBasedQueue; @@ -30,21 +25,19 @@ import org.junit.Assert; import org.junit.Before; import org.junit.Test; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; @SuppressWarnings("deprecation") public class FrontierImplTest { -<<<<<<< HEAD + private static FrontierImpl frontier; private static MongoDBIpBasedQueue queue; private static MongoDBKnowUriFilter filter; private static List uris = new ArrayList(); private static CrawleableUriFactory4Tests cuf = new CrawleableUriFactory4Tests(); - private OutDatedUris outDatedUris = SparqlhostConnector.create("http://localhost:8890/sparql-auth", "dba", "pw123"); + private OutDatedUriRetreiver outDatedUriRetreiver = SparqlhostConnector.create("http://localhost:8890/sparql-auth", "dba", "pw123"); @Before public void setUp() throws Exception { @@ -53,7 +46,7 @@ public void setUp() throws Exception { queue = new MongoDBIpBasedQueue("localhost", 58027); filter.open(); queue.open(); - frontier = new FrontierImpl(new NormalizerImpl(), filter, queue, true, 18000, 18000, null, outDatedUris); + frontier = new FrontierImpl(new NormalizerImpl(), filter, queue, true, 18000, 18000, null, outDatedUriRetreiver); uris.add(cuf.create(new URI("http://dbpedia.org/resource/New_York"), InetAddress.getByName("127.0.0.1"), UriType.DEREFERENCEABLE)); uris.add(cuf.create(new URI("http://dbpedia.org/resource/Moscow"), InetAddress.getByName("127.0.0.1"), @@ -62,12 +55,10 @@ public void setUp() throws Exception { @Test public void getNextUris() throws Exception { -<<<<<<< HEAD + queue.addUri(uris.get(1)); -======= - queue.addCrawleableUri(uris.get(1)); ->>>>>>> bb00ad4b8e0cfdb89738f43afc01ce482e016bd6 + // 
queue.addCrawleableUri(uris.get(1)); List nextUris = frontier.getNextUris(); List assertion = new ArrayList(); assertion.add(uris.get(1)); @@ -205,186 +196,5 @@ public void tearDown() throws Exception { p.waitFor(); } } -<<<<<<< HEAD -======= - private static FrontierImpl frontier; - private static MongoDBIpBasedQueue queue; - private static MongoDBKnowUriFilter filter; - private static List uris = new ArrayList(); - private static CrawleableUriFactory4Tests cuf = new CrawleableUriFactory4Tests(); - private KnownOutDatedUriFilter knownOutDatedUriFilter = SparqlConfiguration.create("http://localhost:8890/sparql-auth", "dba", "pw123"); - - @Before - public void setUp() throws Exception { - - - MongoDBBasedTest.setUpMDB(); - - filter = new MongoDBKnowUriFilter("localhost", 58027); - queue = new MongoDBIpBasedQueue("localhost", 58027); - filter.open(); - queue.open(); - - frontier = new FrontierImpl(new NormalizerImpl(), filter, queue,true, 18000, 18000, null, knownOutDatedUriFilter); - - uris.add(cuf.create(new URI("http://dbpedia.org/resource/New_York"), InetAddress.getByName("127.0.0.1"), - UriType.DEREFERENCEABLE)); - uris.add(cuf.create(new URI("http://dbpedia.org/resource/Moscow"), InetAddress.getByName("127.0.0.1"), - UriType.DEREFERENCEABLE)); - } - - @Test - public void getNextUris() throws Exception { - queue.addCrawleableUri(uris.get(1)); - - List nextUris = frontier.getNextUris(); - List assertion = new ArrayList(); - assertion.add(uris.get(1)); - - assertEquals("Should be dbr:New_York", assertion, nextUris); - } - - @Test - public void addNewUris() throws Exception { - queue.purge(); - filter.purge(); - frontier.addNewUris(uris); - List nextUris = frontier.getNextUris(); - List assertion = new ArrayList(); - assertion.add(cuf.create(new URI("http://dbpedia.org/resource/New_York"), - InetAddress.getByName("194.109.129.58"), UriType.DEREFERENCEABLE)); - assertion.add(cuf.create(new URI("http://dbpedia.org/resource/Moscow"), InetAddress.getByName("194.109.129.58"), - UriType.DEREFERENCEABLE)); - assertEquals("Should be the same as uris array", assertion, nextUris); - } - - @Test - public void addNewUri() throws Exception { - CrawleableUri uri_1 = cuf.create(new URI("http://dbpedia.org/resource/Tom_Lazarus"), null, UriType.UNKNOWN); - frontier.addNewUri(uri_1); - List nextUris = frontier.getNextUris(); - List assertion = new ArrayList<>(); - assertion.add(cuf.create(new URI("http://dbpedia.org/resource/Tom_Lazarus"), - InetAddress.getByName("194.109.129.58"), UriType.DEREFERENCEABLE)); - assertEquals(assertion, nextUris); - } - - @Test - public void crawlingDone() throws Exception { - List crawledUris = new ArrayList<>(); - CrawleableUri uri_1 = cuf.create(new URI("http://dbpedia.org/resource/New_York"), - InetAddress.getByName("127.0.0.1"), UriType.DEREFERENCEABLE); - CrawleableUri uri_2 = cuf.create(new URI("http://dbpedia.org/resource/Moscow"), - InetAddress.getByName("127.0.0.1"), UriType.DEREFERENCEABLE); - - crawledUris.add(uri_1); - crawledUris.add(uri_2); - - // frontier.addNewUris(crawledUris); - // filter.add(uri_1, 100); - - frontier.crawlingDone(crawledUris); - assertFalse("uri_1 has been already crawled", frontier.knownUriFilter.isUriGood(uri_1)); - } - - @Test - public void getNumberOfPendingUris() throws Exception { - frontier.addNewUris(uris); - List nextUris = frontier.getNextUris(); - int numberOfPendingUris = frontier.getNumberOfPendingUris(); - assertEquals(1, numberOfPendingUris); - - numberOfPendingUris = frontier.getNumberOfPendingUris(); - assertEquals(2, 
nextUris.size()); - } - - /* - * see https://github.com/dice-group/Squirrel/issues/47 - */ - //@Test - public void simlpeRecrawling() throws Exception { - // Add the URIs to the frontier - List uris = new ArrayList<>(); - CrawleableUri uri_1 = cuf.create(new URI("http://dbpedia.org/resource/uriThatShouldBeRecrawled"), - InetAddress.getByName("127.0.0.1"), UriType.DEREFERENCEABLE); - CrawleableUri uri_2 = cuf.create(new URI("http://dbpedia.org/resource/normalUri"), - InetAddress.getByName("127.0.0.1"), UriType.DEREFERENCEABLE); - uris.add(uri_1); - uris.add(uri_2); - - frontier.addNewUris(uris); - - List nextUris = frontier.getNextUris(); - for (CrawleableUri uri : nextUris) { - Assert.assertTrue(uris.contains(uri)); - } - for (CrawleableUri uri : uris) { - Assert.assertTrue(nextUris.contains(uri)); - } - - // Set the first URI as recrawlable - for (CrawleableUri uri : nextUris) { - if(uri.getUri().equals(uri_1.getUri())) { - uri.addData(Constants.URI_PREFERRED_RECRAWL_ON, System.currentTimeMillis() - 1); - } - } - - frontier.crawlingDone(uris); - - uris.add(uri_1); - uris.add(uri_2); - - nextUris = frontier.getNextUris(); - Assert.assertNotNull(nextUris); - assertTrue("uri_1 has been expected but couldn't be found", nextUris.contains(uri_1)); - Assert.assertEquals(1, nextUris.size()); - assertFalse("uri_2 has been found but was not expected", nextUris.contains(uri_2)); - } - - @Test - public void RecrawlingTest() throws Exception { - // Add the URIs to the frontier - List uris = new ArrayList<>(); - CrawleableUri uri_1 = cuf.create(new URI("http://dbpedia.org/resource/uriThatShouldBeRecrawled")); - uri_1.addData("endedAtTime", "2019-07-06T17:04:02.864Z"); - CrawleableUri uri_2 = cuf.create(new URI("http://dbpedia.org/resource/normalUri")); - uri_2.addData("endedAtTime", "2019-07-06T19:38:02.864Z"); - uris.add(uri_1); - uris.add(uri_2); - frontier.addNewUris(uris); - List nextUris = frontier.getNextUris(); - for (CrawleableUri uri : nextUris) { - Assert.assertTrue(uris.contains(uri)); - } - for (CrawleableUri uri : uris) { - Assert.assertTrue(nextUris.contains(uri)); - } - - // Set the first URI as recrawlable - for (CrawleableUri uri : nextUris) { - if(uri.getUri().equals(uri_1.getUri())) { - uri.addData(Constants.URI_PREFERRED_RECRAWL_ON, System.currentTimeMillis() - 1); - } - } - Assert.assertNotNull(nextUris); - assertTrue("uri_1 has been expected but couldn't be found", nextUris.contains(uri_1)); - Assert.assertEquals(2, nextUris.size()); - } - - @After - public void tearDown() throws Exception { - filter.purge(); - queue.purge(); - String rethinkDockerStopCommand = "docker stop squirrel-test-frontierimpl"; - Process p = Runtime.getRuntime().exec(rethinkDockerStopCommand); - p.waitFor(); - String rethinkDockerRmCommand = "docker rm squirrel-test-frontierimpl"; - p = Runtime.getRuntime().exec(rethinkDockerRmCommand); - p.waitFor(); - } -} ->>>>>>> 98250b4cbed5c441b6c05b561ee7c07c76aadbfa -======= ->>>>>>> bb00ad4b8e0cfdb89738f43afc01ce482e016bd6 diff --git a/squirrel.frontier/src/test/java/org/dice_research/squirrel/seed/generator/impl/CkanSeedGeneratorImplTest.java b/squirrel.frontier/src/test/java/org/dice_research/squirrel/seed/generator/impl/CkanSeedGeneratorImplTest.java index e5d7bbee1..94e3e73d8 100644 --- a/squirrel.frontier/src/test/java/org/dice_research/squirrel/seed/generator/impl/CkanSeedGeneratorImplTest.java +++ b/squirrel.frontier/src/test/java/org/dice_research/squirrel/seed/generator/impl/CkanSeedGeneratorImplTest.java @@ -1,14 +1,8 @@ package 
org.dice_research.squirrel.seed.generator.impl; -<<<<<<< HEAD -import org.dice_research.squirrel.configurator.SparqlConfiguration; -import org.dice_research.squirrel.data.uri.filter.InMemoryKnownUriFilter; -import org.dice_research.squirrel.data.uri.filter.KnownOutDatedUriFilter; -======= -import org.dice_research.squirrel.configurator.SparqlhostConnector; + import org.dice_research.squirrel.data.uri.filter.InMemoryKnownUriFilter; -import org.dice_research.squirrel.data.uri.filter.OutDatedUris; ->>>>>>> bb00ad4b8e0cfdb89738f43afc01ce482e016bd6 + import org.dice_research.squirrel.data.uri.norm.NormalizerImpl; import org.dice_research.squirrel.frontier.Frontier; import org.dice_research.squirrel.frontier.impl.FrontierImpl; @@ -28,19 +22,14 @@ public class CkanSeedGeneratorImplTest extends TestCase { private CkanSeedGeneratorImpl ckanSeedGenerator; private IpAddressBasedQueue queue; private Frontier frontier; -<<<<<<< HEAD - private KnownOutDatedUriFilter knownOutDatedUriFilter = SparqlConfiguration.create("http://localhost:8890/sparql-auth", "dba", "pw123"); - public void setUp() { - queue = new InMemoryQueue(); - frontier = new FrontierImpl(new NormalizerImpl() , new InMemoryKnownUriFilter(false, -1), queue,knownOutDatedUriFilter); -======= - private OutDatedUris outDatedUris = SparqlhostConnector.create("http://localhost:8890/sparql-auth", "dba", "pw123"); + + // private OutDatedUriRetreiver outDatedUriRetreiver = SparqlhostConnector.create("http://localhost:8890/sparql-auth", "dba", "pw123"); public void setUp() { queue = new InMemoryQueue(); - frontier = new FrontierImpl(new NormalizerImpl() , new InMemoryKnownUriFilter(false, -1), queue, outDatedUris); ->>>>>>> bb00ad4b8e0cfdb89738f43afc01ce482e016bd6 + frontier = new FrontierImpl(new NormalizerImpl() , new InMemoryKnownUriFilter(false, -1), queue,null); + ckanSeedGenerator = new CkanSeedGeneratorImpl(frontier); } From d4130598c24ca33c56e12b9cb79cf6bea25fe025 Mon Sep 17 00:00:00 2001 From: param-jot Date: Sun, 3 Nov 2019 11:30:22 +0100 Subject: [PATCH 018/102] solving merge errors and bean references error --- bin/docker-compose-sparql.yml | 200 ---- docker-compose.yml | 30 +- pom.xml | 924 +++++++++--------- spring-config/frontier-context.xml | 58 +- spring-config/worker-context-sparql.xml | 161 ++- spring-config/worker-context.xml | 118 ++- .../components/FrontierComponent.java | 57 +- .../components/FrontierComponentStarter.java | 6 +- .../frontier/impl/ExtendedFrontierImpl.java | 9 +- .../squirrel/frontier/impl/FrontierImpl.java | 8 +- .../recrawling/FrontierQueryGenerator.java | 17 +- .../recrawling}/OutDatedUriRetreiver.java | 2 +- .../recrawling/SparqlhostConnector.java | 5 +- .../frontier/impl/FrontierImplTest.java | 2 +- .../squirrel/components/WorkerComponent.java | 4 + .../components/WorkerComponentConfig.java | 2 +- .../components/WorkerComponentStarter.java | 2 +- .../squirrel/fetcher/http/HTTPFetcher.java | 4 +- 18 files changed, 681 insertions(+), 928 deletions(-) delete mode 100644 bin/docker-compose-sparql.yml rename {squirrel.api/src/main/java/org/dice_research/squirrel/data/uri/filter => squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling}/OutDatedUriRetreiver.java (88%) diff --git a/bin/docker-compose-sparql.yml b/bin/docker-compose-sparql.yml deleted file mode 100644 index 1cebd57d6..000000000 --- a/bin/docker-compose-sparql.yml +++ /dev/null @@ -1,200 +0,0 @@ -version: "2" - -services: - #debugger: - # image: sjourdan/toolbox - # container_name: debugger - # networks: - # - 
squirrel:latest - # dns: 8.8.8.8 - # command: nc -l 50070 - - frontier: - image: squirrel.frontier:latest - container_name: frontier - environment: - - HOBBIT_RABBIT_HOST=rabbit - - URI_WHITELIST_FILE=/var/squirrel/whitelist.txt -<<<<<<< HEAD - # - CONTEXT_CONFIG_FILE=/var/squirrel/spring-config/context-sparql.xml -======= - - FRONTIER_CONTEXT_CONFIG_FILE=/var/squirrel/spring-config/frontier-context.xml ->>>>>>> 26bb54a0debdb8e8695329a0c0659fb85a0a1858 - - SEED_FILE=/var/squirrel/seeds.txt - - SPARQL_URL=http://sparqlhost:3030/Metadata/query - - SPARQL_HOST_USER=admin - - SPARQL_HOST_PASSWD=pw123 - - MDB_HOST_NAME=mongodb - - MDB_PORT=27017 - - MDB_CONNECTION_TIME_OUT=5000 - - MDB_SOCKET_TIME_OUT=10000 - - MDB_SERVER_TIME_OUT=10000 - - QUEUE_FILTER_PERSIST=true - - COMMUNICATION_WITH_WEBSERVICE=false - - VISUALIZATION_OF_CRAWLED_GRAPH=false - - JVM_ARGS=-Xmx8g - volumes: - - ./data/frontier:/var/squirrel/data - - ./seed/seeds.txt:/var/squirrel/seeds.txt:ro - - ./whitelist/whitelist.txt:/var/squirrel/whitelist.txt:ro - command: java -cp squirrel.jar org.dice_research.squirrel.components.FrontierComponentStarter - -# virtuosohost: -# image: openlink/virtuoso-opensource-7:latest -# container_name: virtuosohost -# ports: -# - "8890:8890" -# volumes: -# - ./data/sparqlhost/sparqlhost_data:/virtuoso -# environment: -# - Driver=/usr/local/lib/virtodbc_32.so -# - DBA_PASSWORD=123pwd - - sparqlhost: - image: stain/jena-fuseki - container_name: sparqlhost - ports: - - "3030:3030" - volumes: - - ./data/sparqlhost/sparqlhost_data:/fuseki - environment: -<<<<<<< HEAD - - ADMIN_PASSWORD=pw123 - - JVM_ARGS=-Xmx2g -======= - - Driver=/usr/local/lib/virtodbc_32.so - - DBA_PASSWORD=pw123 - -# sparqlhost: -# image: stain/jena-fuseki -# container_name: sparqlhost -# ports: -# - "3030:3030" -# volumes: -# - ./data/sparqlhost/sparqlhost_data:/fuseki -# environment: -# - ADMIN_PASSWORD=pw123 -# - JVM_ARGS=-Xmx2g ->>>>>>> 26bb54a0debdb8e8695329a0c0659fb85a0a1858 - - mongodb: - image: mongo:4.0.0 - volumes: - - ./data/mongodb:/data - ports: - - "27017:27017" - - rethinkdb: - image: rethinkdb:2.3.5 - volumes: - - ./data/rethinkdb:/data - ports: - - "8080:8080" - command: rethinkdb --bind all - - # message bus - rabbit: - image: rabbitmq:management - container_name: rabbit - hostname: rabbit - ports: - - "8081:15672" - # Forwarding the port for testing - - "5672:5672" - - worker1: - image: squirrel.worker:latest - container_name: worker1 - environment: - - HOBBIT_RABBIT_HOST=rabbit - - OUTPUT_FOLDER=/var/squirrel/data - - HTML_SCRAPER_YAML_PATH=/var/squirrel/yaml -<<<<<<< HEAD - - CONTEXT_CONFIG_FILE=/var/squirrel/spring-config/context-sparql.xml - - SPARQL_URL=http://sparqlhost:3030/Metadata/update - - SPARQL_HOST_USER=admin -======= - - CONTEXT_CONFIG_FILE=/var/squirrel/spring-config/worker-context-sparql.xml - - SPARQL_URL=http://virtuosohost:8890/sparql-auth/ - - SPARQL_HOST_USER=dba ->>>>>>> 26bb54a0debdb8e8695329a0c0659fb85a0a1858 - - SPARQL_HOST_PASSWD=pw123 - - DEDUPLICATION_ACTIVE=false - - MDB_HOST_NAME=mongodb - - MDB_PORT=27017 - - JVM_ARGS=-Xmx8g - volumes: - - ./data/worker1:/var/squirrel/data - - ./yaml:/var/squirrel/yaml - - ./spring-config:/var/squirrel/spring-config - command: java -cp squirrel.jar org.dice_research.squirrel.components.WorkerComponentStarter - - worker2: - image: squirrel.worker:latest - container_name: worker2 - environment: - - HOBBIT_RABBIT_HOST=rabbit - - OUTPUT_FOLDER=/var/squirrel/data - - HTML_SCRAPER_YAML_PATH=/var/squirrel/yaml -<<<<<<< HEAD - - 
CONTEXT_CONFIG_FILE=/var/squirrel/spring-config/context-sparql.xml - - SPARQL_URL=http://sparqlhost:3030/Metadata/update - - SPARQL_HOST_USER=admin -======= - - CONTEXT_CONFIG_FILE=/var/squirrel/spring-config/worker-context-sparql.xml - - SPARQL_URL=http://virtuosohost:8890/sparql-auth/ - - SPARQL_HOST_USER=dba ->>>>>>> 26bb54a0debdb8e8695329a0c0659fb85a0a1858 - - SPARQL_HOST_PASSWD=pw123 - - DEDUPLICATION_ACTIVE=false - - MDB_HOST_NAME=mongodb - - MDB_PORT=27017 - - JVM_ARGS=-Xmx8g - volumes: - - ./data/worker2:/var/squirrel/data - - ./yaml:/var/squirrel/yaml - - ./spring-config:/var/squirrel/spring-config - command: java -cp squirrel.jar org.dice_research.squirrel.components.WorkerComponentStarter - - worker3: - image: squirrel.worker:latest - container_name: worker3 - environment: - - HOBBIT_RABBIT_HOST=rabbit - - OUTPUT_FOLDER=/var/squirrel/data - - HTML_SCRAPER_YAML_PATH=/var/squirrel/yaml -<<<<<<< HEAD - - CONTEXT_CONFIG_FILE=/var/squirrel/spring-config/context-sparql.xml - - SPARQL_URL=http://sparqlhost:3030/Metadata/update - - SPARQL_HOST_USER=admin -======= - - CONTEXT_CONFIG_FILE=/var/squirrel/spring-config/worker-context-sparql.xml - - SPARQL_URL=http://virtuosohost:8890/sparql-auth/ - - SPARQL_HOST_USER=dba ->>>>>>> 26bb54a0debdb8e8695329a0c0659fb85a0a1858 - - SPARQL_HOST_PASSWD=pw123 - - DEDUPLICATION_ACTIVE=true - - MDB_HOST_NAME=mongodb - - MDB_PORT=27017 - - JVM_ARGS=-Xmx8g - volumes: - - ./data/worker3:/var/squirrel/data - - ./yaml:/var/squirrel/yaml - - ./spring-config:/var/squirrel/spring-config - command: java -cp squirrel.jar org.dice_research.squirrel.components.WorkerComponentStarter - -# deduplicator: -# image: squirrel.deduplication -# container_name: deduplicator -# environment: -# DEDUPLICATION_ACTIVE: "true" -# HOBBIT_RABBIT_HOST: rabbit -# OUTPUT_FOLDER: /var/squirrel/data -# CONTEXT_CONFIG_FILE: /var/squirrel/spring-config/context-deduplicator.xml -# MDB_HOST_NAME: mongodb -# MDB_PORT: 27017 -# SPARQL_HOST_NAME: sparqlhost -# SPARQL_HOST_PORT: 3030 -# SERVICE_PRECONDITION: "mongodb:27017 rabbit:5672" -#volumes: -# - ./data/deduplicator:/var/squirrel/data \ No newline at end of file diff --git a/docker-compose.yml b/docker-compose.yml index 1475bb58c..2213ed4f2 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -107,18 +107,18 @@ services: - ./spring-config:/var/squirrel/spring-config command: java -cp squirrel.jar org.dice_research.squirrel.components.WorkerComponentStarter - deduplicator: - image: squirrel - container_name: deduplicator - environment: - DEDUPLICATION_ACTIVE: "true" - HOBBIT_RABBIT_HOST: rabbit - OUTPUT_FOLDER: /var/squirrel/data - MDB_HOST_NAME: mongodb - MDB_PORT: 27017 - SPARQL_HOST_NAME: sparqlhost - SPARQL_HOST_PORT: 3030 - SERVICE_PRECONDITION: "rethinkdb:28015 rabbit:5672" - volumes: - - ./data/deduplicator:/var/squirrel/data - command: java -cp squirrel.jar org.hobbit.core.run.ComponentStarter org.aksw.simba.squirrel.components.DeduplicatorComponent +# deduplicator: +# image: squirrel +# container_name: deduplicator +# environment: +# DEDUPLICATION_ACTIVE: "true" +# HOBBIT_RABBIT_HOST: rabbit +# OUTPUT_FOLDER: /var/squirrel/data +# MDB_HOST_NAME: mongodb +# MDB_PORT: 27017 +# SPARQL_HOST_NAME: sparqlhost +# SPARQL_HOST_PORT: 3030 +# SERVICE_PRECONDITION: "rethinkdb:28015 rabbit:5672" +# volumes: +# - ./data/deduplicator:/var/squirrel/data +# command: java -cp squirrel.jar org.hobbit.core.run.ComponentStarter org.aksw.simba.squirrel.components.DeduplicatorComponent diff --git a/pom.xml b/pom.xml index b9eb5b85f..057e6fae9 
100644 --- a/pom.xml +++ b/pom.xml @@ -1,462 +1,462 @@ - - - 4.0.0 - org.dice-research - squirrel - 0.4.0 - pom - 2017 - Squirrel - - - - - AGPL 3.0 - http://www.gnu.org/licenses/agpl-3.0.txt - - - - - - - m.roeder - Michael Röder - michael.roeder@uni-paderborn.de - - - g.souza - Geraldo Souza - gsjunior@mail.uni-paderborn.de - - - - - - squirrel.web-api - squirrel.web - squirrel.api - squirrel.deduplication - squirrel.frontier - squirrel.mockup - squirrel.worker - - - - UTF-8 - 1.8 - 1.8 - 1.7.10 - 0.4.2 - 4.4.11 - 4.5.7 - - - - - - maven.aksw.internal - AKSW Internal Release Repository - http://maven.aksw.org/repository/internal/ - - - maven.aksw.snapshots - University Leipzig, AKSW Maven2 Repository - http://maven.aksw.org/repository/snapshots - - - spring-releases - https://repo.spring.io/libs-release - - - - - - spring-releases - https://repo.spring.io/libs-release - - - - - - - org.dice-research - squirrel.api - ${project.version} - - - org.dice-research - squirrel.web-api - ${project.version} - - - org.dice-research - squirrel.mockup - ${project.version} - - - - org.hobbit - core - 1.0.12 - - - org.apache.httpcomponents - httpclient - - - org.apache.httpcomponents - httpclient-cache - - - - - - - com.fasterxml.jackson.dataformat - jackson-dataformat-yaml - 2.6.0 - - - com.fasterxml.jackson.core - jackson-databind - 2.6.0 - - - - - org.apache.tika - tika-core - 1.17 - - - - - - org.jsoup - jsoup - 1.11.2 - - - - - - - - - commons-cli - commons-cli - 1.2 - - - - - org.apache.commons - commons-csv - 1.7 - - - - org.apache.commons - commons-lang3 - 3.4 - - - - org.apache.commons - commons-compress - 1.15 - - - - commons-net - commons-net - 3.6 - - - - commons-io - commons-io - 2.6 - - - - - - - org.springframework - spring-context - 5.0.6.RELEASE - - - - org.springframework - spring-jdbc - 5.0.6.RELEASE - - - - org.springframework - spring-test - 5.0.6.RELEASE - - - - - - com.github.crawler-commons - crawler-commons - 0.7 - - - - - org.aksw.jena-sparql-api - jena-sparql-api-core - 3.1.0-2-SNAPSHOT - - - org.apache.httpcomponents - httpclient - - - - - - - - org.rdfhdt - hdt-java-package - 1.1 - pom - - - junit - junit-dep - - - - - org.rdfhdt - hdt-jena - 1.1 - - - org.rdfhdt - hdt-api - 1.1 - - - - - - - net.lingala.zip4j - zip4j - 1.3.2 - - - - - - com.carrotsearch - hppc - 0.5.3 - - - - - org.json - json - 20140107 - - - - - - org.mongodb - mongodb-driver - 3.6.4 - - - - - - com.rethinkdb - rethinkdb-driver - 2.3.3 - - - - - - org.hsqldb - hsqldb - 2.3.2 - - - - - org.simpleframework - simple - 5.1.6 - - - - org.xerial.snappy - snappy-java - 1.1.4 - - - - - - junit - junit - 4.12 - test - - - - - - - org.slf4j - slf4j-api - ${slf4j-version} - - - - org.slf4j - slf4j-log4j12 - ${slf4j-version} - - - log4j - apache-log4j-extras - 1.2.17 - - - - org.slf4j - jul-to-slf4j - ${slf4j-version} - - - - - - - - - - org.apache.maven.plugins - maven-compiler-plugin - 3.5.1 - - ${maven.compiler.source} - ${maven.compiler.target} - - - - - org.apache.maven.plugins - maven-javadoc-plugin - 2.10.1 - - private - true - false - - - - package - - jar - - - - - - - org.apache.maven.plugins - maven-source-plugin - 2.2.1 - - - package - - jar - - - - - - - com.mycila - license-maven-plugin - 2.11 - -

<header>com/mycila/maven/plugin/license/templates/GPL-3.txt</header>
- - The Data Science Group (DICE, UPB) - axel.ngonga@upb.de - - - **/README - **/LICENSE - Makefile - Dockerfile - *.sh - **/logstash.conf - src/test/resources/** - src/main/resources/** - - - - - - - - org.apache.maven.plugins - maven-shade-plugin - 2.4.3 - - - - - *:* - - META-INF/*.SF - META-INF/*.DSA - META-INF/*.RSA - - - - - - - ${maven.compile.source} - ${maven.compile.target} - - - - - - - - package - - shade - - - - - - - - \ No newline at end of file + + + 4.0.0 + org.dice-research + squirrel + 0.4.0 + pom + 2017 + Squirrel + + + + + AGPL 3.0 + http://www.gnu.org/licenses/agpl-3.0.txt + + + + + + + m.roeder + Michael Röder + michael.roeder@uni-paderborn.de + + + g.souza + Geraldo Souza + gsjunior@mail.uni-paderborn.de + + + + + + squirrel.web-api + squirrel.web + squirrel.api + squirrel.deduplication + squirrel.frontier + squirrel.mockup + squirrel.worker + + + + UTF-8 + 1.8 + 1.8 + 1.7.10 + 0.4.2 + 4.4.11 + 4.5.7 + + + + + + maven.aksw.internal + AKSW Internal Release Repository + http://maven.aksw.org/repository/internal/ + + + maven.aksw.snapshots + University Leipzig, AKSW Maven2 Repository + http://maven.aksw.org/repository/snapshots + + + spring-releases + https://repo.spring.io/libs-release + + + + + + spring-releases + https://repo.spring.io/libs-release + + + + + + + org.dice-research + squirrel.api + ${project.version} + + + org.dice-research + squirrel.web-api + ${project.version} + + + org.dice-research + squirrel.mockup + ${project.version} + + + + org.hobbit + core + 1.0.12 + + + org.apache.httpcomponents + httpclient + + + org.apache.httpcomponents + httpclient-cache + + + + + + + com.fasterxml.jackson.dataformat + jackson-dataformat-yaml + 2.6.0 + + + com.fasterxml.jackson.core + jackson-databind + 2.6.0 + + + + + org.apache.tika + tika-core + 1.17 + + + + + + org.jsoup + jsoup + 1.11.2 + + + + + + + + + commons-cli + commons-cli + 1.2 + + + + + org.apache.commons + commons-csv + 1.7 + + + + org.apache.commons + commons-lang3 + 3.4 + + + + org.apache.commons + commons-compress + 1.15 + + + + commons-net + commons-net + 3.6 + + + + commons-io + commons-io + 2.6 + + + + + + + org.springframework + spring-context + 5.0.6.RELEASE + + + + org.springframework + spring-jdbc + 5.0.6.RELEASE + + + + org.springframework + spring-test + 5.0.6.RELEASE + + + + + + com.github.crawler-commons + crawler-commons + 0.7 + + + + + org.aksw.jena-sparql-api + jena-sparql-api-core + 3.1.0-2-SNAPSHOT + + + org.apache.httpcomponents + httpclient + + + + + + + + org.rdfhdt + hdt-java-package + 1.1 + pom + + + junit + junit-dep + + + + + org.rdfhdt + hdt-jena + 1.1 + + + org.rdfhdt + hdt-api + 1.1 + + + + + + + net.lingala.zip4j + zip4j + 1.3.2 + + + + + + com.carrotsearch + hppc + 0.5.3 + + + + + org.json + json + 20140107 + + + + + + org.mongodb + mongodb-driver + 3.6.4 + + + + + + com.rethinkdb + rethinkdb-driver + 2.3.3 + + + + + + org.hsqldb + hsqldb + 2.3.2 + + + + + org.simpleframework + simple + 5.1.6 + + + + org.xerial.snappy + snappy-java + 1.1.4 + + + + + + junit + junit + 4.12 + test + + + + + + + org.slf4j + slf4j-api + ${slf4j-version} + + + + org.slf4j + slf4j-log4j12 + ${slf4j-version} + + + log4j + apache-log4j-extras + 1.2.17 + + + + org.slf4j + jul-to-slf4j + ${slf4j-version} + + + + + + + + + + org.apache.maven.plugins + maven-compiler-plugin + 3.5.1 + + ${maven.compiler.source} + ${maven.compiler.target} + + + + + org.apache.maven.plugins + maven-javadoc-plugin + 2.10.1 + + private + true + false + + + + package + + jar + + + + + + + org.apache.maven.plugins + 
maven-source-plugin + 2.2.1 + + + package + + jar + + + + + + + com.mycila + license-maven-plugin + 2.11 + +
<header>com/mycila/maven/plugin/license/templates/GPL-3.txt</header>
+ + The Data Science Group (DICE, UPB) + axel.ngonga@upb.de + + + **/README + **/LICENSE + Makefile + Dockerfile + *.sh + **/logstash.conf + src/test/resources/** + src/main/resources/** + +
+
+
+ + + + org.apache.maven.plugins + maven-shade-plugin + 2.4.3 + + + + + *:* + + META-INF/*.SF + META-INF/*.DSA + META-INF/*.RSA + + + + + + + ${maven.compile.source} + ${maven.compile.target} + + + + + + + + package + + shade + + + + + + +
+
diff --git a/spring-config/frontier-context.xml b/spring-config/frontier-context.xml index 2b1d8dc69..b2eec1793 100644 --- a/spring-config/frontier-context.xml +++ b/spring-config/frontier-context.xml @@ -1,53 +1,57 @@ + http://www.springframework.org/schema/context/spring-context.xsd + http://www.springframework.org/schema/tx + http://www.springframework.org/schema/tx/spring-tx.xsd"> + base-package="org.dice_research.squirrel" /> - - + + --> + class="org.dice_research.squirrel.data.uri.serialize.java.GzipJavaUriSerializer" /> - - - - + + + + - - - + + + - - - - - - + + diff --git a/spring-config/worker-context-sparql.xml b/spring-config/worker-context-sparql.xml index 393e266ab..849697b9d 100644 --- a/spring-config/worker-context-sparql.xml +++ b/spring-config/worker-context-sparql.xml @@ -1,11 +1,11 @@ - @@ -28,7 +28,7 @@ + class="org.dice_research.squirrel.worker.impl.WorkerImpl"> @@ -38,138 +38,137 @@ + value="#{systemEnvironment['OUTPUT_FOLDER']}/log" /> - - - - - - - - - - + + + + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + class="org.dice_research.squirrel.fetcher.manage.SimpleOrderedFetcherManager"> - - - - - - - - + + class="org.dice_research.squirrel.fetcher.http.HTTPFetcher" /> + class="org.dice_research.squirrel.fetcher.ftp.FTPFetcher" /> + class="org.dice_research.squirrel.fetcher.sparql.SparqlBasedFetcher" /> + class="org.dice_research.squirrel.fetcher.sparql.SparqlDatasetFetcher"> - + + class="org.dice_research.squirrel.fetcher.ckan.java.SimpleCkanFetcher" /> + value="#{systemEnvironment['OUTPUT_FOLDER']}" /> - + --> - - + - - - + class="org.dice_research.squirrel.data.uri.serialize.java.GzipJavaUriSerializer" /> + class="org.dice_research.squirrel.collect.SqlBasedUriCollector"> + class="org.dice_research.squirrel.robots.RobotsManagerImpl"> + class="crawlercommons.fetcher.http.SimpleHttpFetcher"> + class="crawlercommons.fetcher.http.UserAgent"> diff --git a/spring-config/worker-context.xml b/spring-config/worker-context.xml index 0752b6049..fffd02a77 100644 --- a/spring-config/worker-context.xml +++ b/spring-config/worker-context.xml @@ -16,82 +16,77 @@ + base-package="org.dice_research.squirrel" /> - + - - - - - - - - - + + + + + + + + + - + value="#{systemEnvironment['OUTPUT_FOLDER']}/log" /> + - + - - - - - - - + + + + + --> - - + + - + - + - + - + - + - - + + - + - - - - - - - + + + + + + class="org.dice_research.squirrel.fetcher.http.HTTPFetcher" /> + class="org.dice_research.squirrel.fetcher.ftp.FTPFetcher" /> + class="org.dice_research.squirrel.fetcher.sparql.SparqlBasedFetcher" /> @@ -126,54 +117,55 @@ + class="org.dice_research.squirrel.fetcher.ckan.java.SimpleCkanFetcher" /> + + value="#{systemEnvironment['OUTPUT_FOLDER']}" /> - + ref="outputFolderBean" /> + - - + class="org.dice_research.squirrel.data.uri.serialize.java.GzipJavaUriSerializer" /> - - + + - + - + - - - + + + diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/components/FrontierComponent.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/components/FrontierComponent.java index 67016b77b..fd35e4c7c 100644 --- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/components/FrontierComponent.java +++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/components/FrontierComponent.java @@ -4,16 +4,12 @@ import org.aksw.jena_sparql_api.core.UpdateExecutionFactory; import org.apache.commons.io.FileUtils; import org.dice_research.squirrel.Constants; -import 
org.dice_research.squirrel.configurator.MongoConfiguration; -import org.dice_research.squirrel.configurator.SeedConfiguration; -import org.dice_research.squirrel.configurator.WebConfiguration; -import org.dice_research.squirrel.configurator.WhiteListConfiguration; +import org.dice_research.squirrel.configurator.*; import org.dice_research.squirrel.data.uri.CrawleableUri; -import org.dice_research.squirrel.data.uri.UriSeedReader; import org.dice_research.squirrel.data.uri.UriUtils; import org.dice_research.squirrel.data.uri.filter.InMemoryKnownUriFilter; import org.dice_research.squirrel.data.uri.filter.KnownUriFilter; -import org.dice_research.squirrel.data.uri.filter.OutDatedUriRetreiver; +import org.dice_research.squirrel.frontier.recrawling.OutDatedUriRetreiver; import org.dice_research.squirrel.data.uri.filter.RegexBasedWhiteListFilter; import org.dice_research.squirrel.data.uri.info.URIReferences; import org.dice_research.squirrel.data.uri.norm.NormalizerImpl; @@ -22,6 +18,7 @@ import org.dice_research.squirrel.frontier.ExtendedFrontier; import org.dice_research.squirrel.frontier.Frontier; import org.dice_research.squirrel.frontier.impl.*; +import org.dice_research.squirrel.frontier.recrawling.SparqlhostConnector; import org.dice_research.squirrel.queue.InMemoryQueue; import org.dice_research.squirrel.queue.UriQueue; import org.dice_research.squirrel.rabbit.RPCServer; @@ -31,7 +28,6 @@ import org.dice_research.squirrel.rabbit.msgs.UriSet; import org.dice_research.squirrel.rabbit.msgs.UriSetRequest; import org.dice_research.squirrel.worker.AliveMessage; -import org.dice_research.squirrel.worker.WorkerInfo; import org.hobbit.core.components.AbstractComponent; import org.hobbit.core.data.RabbitQueue; import org.hobbit.core.rabbit.DataReceiver; @@ -40,13 +36,11 @@ import org.springframework.beans.factory.annotation.Autowired; import org.springframework.beans.factory.annotation.Qualifier; import org.springframework.stereotype.Component; - import java.io.Closeable; import java.io.File; import java.io.IOException; import java.nio.charset.StandardCharsets; import java.util.*; -import java.util.Map.Entry; import java.util.concurrent.Semaphore; @Component @@ -59,7 +53,7 @@ public class FrontierComponent extends AbstractComponent implements RespondingDa private final Semaphore terminationMutex = new Semaphore(0); private final WorkerGuard workerGuard = new WorkerGuard(this); private final boolean doRecrawling = true; - @Qualifier("sparqlBean") + @Qualifier("queueBean") @Autowired protected UriQueue queue; protected String dataSetQuery = "select ?s ?p ?o where {?s ?p ?o} LIMIT 100 "; @@ -67,7 +61,7 @@ public class FrontierComponent extends AbstractComponent implements RespondingDa @Qualifier("knowUriFilterBean") @Autowired private KnownUriFilter knownUriFilter; - OutDatedUriRetreiver outDatedUriRetreiver; + private OutDatedUriRetreiver outDatedUriRetreiver; private URIReferences uriReferences = null; private Frontier frontier; private RabbitQueue rabbitQueue; @@ -76,10 +70,6 @@ public class FrontierComponent extends AbstractComponent implements RespondingDa @Autowired private Serializer serializer; private long recrawlingTime = 1000L * 60L * 60L * 24L * 30; - - - // public static final boolean RECRAWLING_ACTIVE = true; - private Map hasUrisToCrawl; @Override @@ -88,7 +78,7 @@ public void init() throws Exception { serializer = new GzipJavaUriSerializer(); MongoConfiguration mongoConfiguration = MongoConfiguration.getMDBConfiguration(); WebConfiguration webConfiguration = 
WebConfiguration.getWebConfiguration(); - // SparqlhostConnector sp = SparqlhostConnector.create("http://localhost:8890/sparql-auth", "dba", "pw123"); + SparqlhostConnector sp = SparqlhostConnector.create("http://localhost:8890/sparql-auth", "dba", "pw123"); hasUrisToCrawl = new HashMap(); if (mongoConfiguration != null) { @@ -144,7 +134,7 @@ public void init() throws Exception { @Override public void run() throws Exception { - TimerTask terminatorTask = new TerminatorTask(queue, terminationMutex, this.workerGuard); + TimerTask terminatorTask = new TerminatorTask(queue, terminationMutex); Timer timer = new Timer(); timer.schedule(terminatorTask, 5000, 5000); terminationMutex.acquire(); @@ -157,13 +147,7 @@ public void close() throws IOException { if (receiver != null) // Force the receiver to close receiver.close(); -/* -<<<<<<< HEAD -// receiver.closeWhenFinished(); -======= // receiver.closeWhenFinished(); ->>>>>>> bb00ad4b8e0cfdb89738f43afc01ce482e016bd6 -*/ if (queue != null) queue.close(); @@ -236,10 +220,6 @@ private void responseToUriSetRequest(ResponseHandler handler, String responseQue uris == null ? "null" : Integer.toString(uris.size())); handler.sendResponse(serializer.serialize(new UriSet(uris)), responseQueueName, correlId); if (uris != null && uris.size() > 0) { - - workerGuard.putUrisForWorker(uriSetRequest.getWorkerId(), uriSetRequest.workerSendsAliveMessages(), - uris); - hasUrisToCrawl.put(uriSetRequest.getWorkerId(), true); workerGuard.putUrisForWorker(uriSetRequest.getWorkerId(), uriSetRequest.workerSendsAliveMessages(), uris); @@ -256,10 +236,6 @@ private void responseToUriSetRequest(ResponseHandler handler, String responseQue protected void processSeedFile(String seedFile) { try { - List listSeeds = new UriSeedReader(seedFile).getUris(); - if (!listSeeds.isEmpty()) - frontier.addNewUris(listSeeds); - List lines = FileUtils.readLines(new File(seedFile), StandardCharsets.UTF_8); frontier.addNewUris(UriUtils.createCrawleableUriList(lines)); } catch (Exception e) { @@ -281,33 +257,20 @@ public WorkerGuard getWorkerGuard() { return workerGuard; } - private static class TerminatorTask extends TimerTask { + private class TerminatorTask extends TimerTask { private UriQueue queue; private TerminationCheck terminationCheck = new QueueBasedTerminationCheck(); private Semaphore terminationMutex; - private WorkerGuard workerGuard; - public TerminatorTask(UriQueue queue, Semaphore terminationMutex, WorkerGuard workerGuard) { + public TerminatorTask(UriQueue queue, Semaphore terminationMutex) { this.queue = queue; this.terminationMutex = terminationMutex; - this.workerGuard = workerGuard; } @Override public void run() { - - Map mapWorkers = this.workerGuard.getMapWorkerInfo(); - - boolean stillHasUris = false; - for (Entry entry : mapWorkers.entrySet()) { - if (entry.getValue().getUrisCrawling().size() > 0) { - stillHasUris = true; - break; - } - } - - if (!stillHasUris && terminationCheck.shouldFrontierTerminate(queue)) { + if (!hasUrisToCrawl.values().contains(true) && terminationCheck.shouldFrontierTerminate(queue)) { LOGGER.info(" << FRONTIER IS TERMINATING! 
>> "); terminationMutex.release(); } diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/components/FrontierComponentStarter.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/components/FrontierComponentStarter.java index f769cd7db..e0a47c073 100644 --- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/components/FrontierComponentStarter.java +++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/components/FrontierComponentStarter.java @@ -12,6 +12,7 @@ /** * This is the main method creating and starting an instance of a * {@link Component} with the given class name. + * */ public class FrontierComponentStarter { @@ -26,6 +27,8 @@ public class FrontierComponentStarter { private static final Logger LOGGER = LoggerFactory.getLogger(FrontierComponentStarter.class); + + public static void main(String[] args) { addShutdownHook(); boolean success = true; @@ -41,6 +44,7 @@ public static void main(String[] args) { } finally { closeComponent(); } + if (!success) { System.exit(ERROR_EXIT_CODE); } @@ -51,7 +55,7 @@ private static synchronized void closeComponent() { if (!closed) { Closer.close(component, LOGGER); closed = true; - context.close(); + Closer.close(context, LOGGER); } } diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/ExtendedFrontierImpl.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/ExtendedFrontierImpl.java index 491132120..f0cb488ba 100644 --- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/ExtendedFrontierImpl.java +++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/ExtendedFrontierImpl.java @@ -7,7 +7,7 @@ import org.dice_research.squirrel.data.uri.CrawleableUri; import org.dice_research.squirrel.data.uri.filter.KnownUriFilter; -import org.dice_research.squirrel.data.uri.filter.OutDatedUriRetreiver; +import org.dice_research.squirrel.frontier.recrawling.OutDatedUriRetreiver; import org.dice_research.squirrel.data.uri.filter.UriFilter; import org.dice_research.squirrel.data.uri.info.URIReferences; import org.dice_research.squirrel.data.uri.norm.UriNormalizer; @@ -33,8 +33,7 @@ public class ExtendedFrontierImpl extends FrontierImpl implements ExtendedFronti */ @SuppressWarnings("unused") public ExtendedFrontierImpl(UriNormalizer normalizer, KnownUriFilter knownUriFilter, UriQueue queue, boolean doesRecrawling, - - long generalRecrawlTime, long timerPeriod, UriHashCustodian uriHashCustodian, OutDatedUriRetreiver outDatedUriRetreiver) { + long generalRecrawlTime, long timerPeriod, UriHashCustodian uriHashCustodian, OutDatedUriRetreiver outDatedUriRetreiver) { super(normalizer, knownUriFilter, queue, doesRecrawling, generalRecrawlTime, timerPeriod, uriHashCustodian, outDatedUriRetreiver); } @@ -48,7 +47,6 @@ public ExtendedFrontierImpl(UriNormalizer normalizer, KnownUriFilter knownUriFil * crawled. * @param doesRecrawling used to select if URIs should be recrawled. */ - public ExtendedFrontierImpl(UriNormalizer normalizer, KnownUriFilter knownUriFilter, IpAddressBasedQueue queue, boolean doesRecrawling, OutDatedUriRetreiver outDatedUriRetreiver) { super(normalizer, knownUriFilter, queue, doesRecrawling, outDatedUriRetreiver); } @@ -64,9 +62,8 @@ public ExtendedFrontierImpl(UriNormalizer normalizer, KnownUriFilter knownUriFil * crawled. * @param doesRecrawling used to select if URIs should be recrawled. 
*/ - public ExtendedFrontierImpl(UriNormalizer normalizer, KnownUriFilter knownUriFilter, URIReferences uriReferences, UriQueue queue, boolean doesRecrawling, OutDatedUriRetreiver outDatedUriRetreiver) { - super(normalizer, knownUriFilter, uriReferences, queue, doesRecrawling, outDatedUriRetreiver); + super(normalizer, knownUriFilter, uriReferences, queue, doesRecrawling,outDatedUriRetreiver); } @Override diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/FrontierImpl.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/FrontierImpl.java index d67c2e574..fb2074163 100644 --- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/FrontierImpl.java +++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/FrontierImpl.java @@ -3,7 +3,7 @@ import org.dice_research.squirrel.Constants; import org.dice_research.squirrel.data.uri.CrawleableUri; import org.dice_research.squirrel.data.uri.filter.KnownUriFilter; -import org.dice_research.squirrel.data.uri.filter.OutDatedUriRetreiver; +import org.dice_research.squirrel.frontier.recrawling.OutDatedUriRetreiver; import org.dice_research.squirrel.data.uri.filter.SchemeBasedUriFilter; import org.dice_research.squirrel.data.uri.filter.UriFilter; import org.dice_research.squirrel.data.uri.info.URIReferences; @@ -222,9 +222,9 @@ public FrontierImpl(UriNormalizer normalizer, timerRecrawling.schedule(new TimerTask() { @Override public void run() { - List urisToRecrawl = outDatedUriRetreiver.getUriToRecrawl(); - System.out.println("Frontier uri to recrawl: " + urisToRecrawl); - urisToRecrawl.forEach(uri -> queue.addUri(uriProcessor.recognizeUriType(uri))); +// List urisToRecrawl = outDatedUriRetreiver.getUriToRecrawl(); +// System.out.println("Frontier uri to recrawl: " + urisToRecrawl); +// urisToRecrawl.forEach(uri -> queue.addUri(uriProcessor.recognizeUriType(uri))); } }, this.timerPeriod, this.timerPeriod); } diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/FrontierQueryGenerator.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/FrontierQueryGenerator.java index 4ee2fbe12..f059a1ad6 100644 --- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/FrontierQueryGenerator.java +++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/FrontierQueryGenerator.java @@ -7,21 +7,11 @@ public class FrontierQueryGenerator { - private static final FrontierQueryGenerator instance = new FrontierQueryGenerator(); - - private static final Logger LOGGER = LoggerFactory.getLogger(FrontierQueryGenerator.class); + private static final Logger LOGGER = LoggerFactory.getLogger(FrontierQueryGenerator.class); private FrontierQueryGenerator() { } - /** - * Getter for {@link #instance}. - */ - public static FrontierQueryGenerator getInstance() { - return instance; - } - - /** * Return a time stamp query for the default graph. * It will return triples with time stamp contained in the default graph. @@ -29,11 +19,11 @@ public static FrontierQueryGenerator getInstance() { * @return All triples with time stamp in the default graph. 
*/ - public Query getOutdatedUrisQuery() { + public static Query getOutdatedUrisQuery() { return getOutdatedUrisQuery(null, true); } - public Query getOutdatedUrisQuery(String graphID, boolean defaultGraph) { + public static Query getOutdatedUrisQuery(String graphID, boolean defaultGraph) { StringBuilder stringBuilder = new StringBuilder(); stringBuilder.append("PREFIX sq: \n" + "PREFIX prov: \n" + @@ -71,6 +61,7 @@ public Query getOutdatedUrisQuery(String graphID, boolean defaultGraph) { stringBuilder.append("}"); Query query = QueryFactory.create(stringBuilder.toString()); + LOGGER.info("Outdated uri query:" + query); return query; } diff --git a/squirrel.api/src/main/java/org/dice_research/squirrel/data/uri/filter/OutDatedUriRetreiver.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/OutDatedUriRetreiver.java similarity index 88% rename from squirrel.api/src/main/java/org/dice_research/squirrel/data/uri/filter/OutDatedUriRetreiver.java rename to squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/OutDatedUriRetreiver.java index e66754c88..0beb8d1c5 100644 --- a/squirrel.api/src/main/java/org/dice_research/squirrel/data/uri/filter/OutDatedUriRetreiver.java +++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/OutDatedUriRetreiver.java @@ -1,4 +1,4 @@ -package org.dice_research.squirrel.data.uri.filter; +package org.dice_research.squirrel.frontier.recrawling; import java.io.Closeable; import java.util.List; diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/SparqlhostConnector.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/SparqlhostConnector.java index c97b5201b..82ae83b2e 100644 --- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/SparqlhostConnector.java +++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/SparqlhostConnector.java @@ -18,7 +18,6 @@ import org.apache.jena.rdf.model.RDFNode; import org.apache.jena.sparql.core.DatasetDescription; import org.dice_research.squirrel.data.uri.CrawleableUri; -import org.dice_research.squirrel.data.uri.filter.OutDatedUriRetreiver; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -93,8 +92,8 @@ public void setCredentials(AuthScope arg0, Credentials arg1) { @Override public List getUriToRecrawl() { - // SparqlhostConnector.create("http://localhost:8890/sparql-auth", "dba", "pw123"); - Query getOutdatedUrisQuery = FrontierQueryGenerator.getInstance().getOutdatedUrisQuery(); + SparqlhostConnector.create("http://localhost:8890/sparql-auth", "dba", "pw123"); + Query getOutdatedUrisQuery = FrontierQueryGenerator.getOutdatedUrisQuery(); System.out.println(getOutdatedUrisQuery); QueryExecution qe = queryExecFactory.createQueryExecution(getOutdatedUrisQuery); ResultSet rs = qe.execSelect(); diff --git a/squirrel.frontier/src/test/java/org/dice_research/squirrel/frontier/impl/FrontierImplTest.java b/squirrel.frontier/src/test/java/org/dice_research/squirrel/frontier/impl/FrontierImplTest.java index 7dadca235..900bd8e7d 100644 --- a/squirrel.frontier/src/test/java/org/dice_research/squirrel/frontier/impl/FrontierImplTest.java +++ b/squirrel.frontier/src/test/java/org/dice_research/squirrel/frontier/impl/FrontierImplTest.java @@ -15,7 +15,7 @@ import org.dice_research.squirrel.data.uri.CrawleableUriFactory4Tests; import org.dice_research.squirrel.data.uri.UriType; -import 
org.dice_research.squirrel.data.uri.filter.OutDatedUriRetreiver; +import org.dice_research.squirrel.frontier.recrawling.OutDatedUriRetreiver; import org.dice_research.squirrel.frontier.recrawling.SparqlhostConnector; import org.dice_research.squirrel.data.uri.filter.MongoDBKnowUriFilter; diff --git a/squirrel.worker/src/main/java/org/dice_research/squirrel/components/WorkerComponent.java b/squirrel.worker/src/main/java/org/dice_research/squirrel/components/WorkerComponent.java index c413f7961..8b912b2d9 100644 --- a/squirrel.worker/src/main/java/org/dice_research/squirrel/components/WorkerComponent.java +++ b/squirrel.worker/src/main/java/org/dice_research/squirrel/components/WorkerComponent.java @@ -64,11 +64,15 @@ public void init() throws Exception { super.init(); UriSetRequest uriSetReq = new UriSetRequest(worker.getUri(), false); + uriSetRequest = serializer.serialize(uriSetReq); + deduplicationActive = EnvVariables.getBoolean(Constants.DEDUPLICATION_ACTIVE_KEY, Constants.DEFAULT_DEDUPLICATION_ACTIVE, LOGGER); + senderFrontier = DataSenderImpl.builder().queue(outgoingDataQueuefactory, Constants.FRONTIER_QUEUE_NAME) .build(); + if (deduplicationActive) { senderDeduplicator = DataSenderImpl.builder() .queue(outgoingDataQueuefactory, Constants.DEDUPLICATOR_QUEUE_NAME).build(); diff --git a/squirrel.worker/src/main/java/org/dice_research/squirrel/components/WorkerComponentConfig.java b/squirrel.worker/src/main/java/org/dice_research/squirrel/components/WorkerComponentConfig.java index fe47a6fd2..95d04f58b 100644 --- a/squirrel.worker/src/main/java/org/dice_research/squirrel/components/WorkerComponentConfig.java +++ b/squirrel.worker/src/main/java/org/dice_research/squirrel/components/WorkerComponentConfig.java @@ -29,7 +29,7 @@ public DataSender sender() throws IllegalStateException, IOException { @Bean(name = "client") public RabbitRpcClient client() throws IOException { RabbitRpcClient client = RabbitRpcClient.create(outgoingDataQueuefactory.getConnection(), - Constants.FRONTIER_QUEUE_NAME); + Constants.FRONTIER_QUEUE_NAME); return client; } diff --git a/squirrel.worker/src/main/java/org/dice_research/squirrel/components/WorkerComponentStarter.java b/squirrel.worker/src/main/java/org/dice_research/squirrel/components/WorkerComponentStarter.java index e3c73d0c3..07638a691 100644 --- a/squirrel.worker/src/main/java/org/dice_research/squirrel/components/WorkerComponentStarter.java +++ b/squirrel.worker/src/main/java/org/dice_research/squirrel/components/WorkerComponentStarter.java @@ -57,7 +57,7 @@ private static synchronized void closeComponent() { if (closed == false) { Closer.close(component, LOGGER); closed = true; - context.close(); + Closer.close(context, LOGGER); } } diff --git a/squirrel.worker/src/main/java/org/dice_research/squirrel/fetcher/http/HTTPFetcher.java b/squirrel.worker/src/main/java/org/dice_research/squirrel/fetcher/http/HTTPFetcher.java index 5c1dcb0a2..fb8fb9ffd 100644 --- a/squirrel.worker/src/main/java/org/dice_research/squirrel/fetcher/http/HTTPFetcher.java +++ b/squirrel.worker/src/main/java/org/dice_research/squirrel/fetcher/http/HTTPFetcher.java @@ -99,7 +99,7 @@ public File fetch(CrawleableUri uri) { ActivityUtil.addStep(uri, getClass()); return dataFile; } - + protected File requestData(CrawleableUri uri, File outputFile) throws ClientProtocolException, FileNotFoundException, IOException { HttpGet request = null; @@ -165,6 +165,6 @@ protected File requestData(CrawleableUri uri, File outputFile) public void close() throws IOException { client.close(); } - 
+//setter } From 40599ccb9bc53ee01cb94a509c2e7b26524c8867 Mon Sep 17 00:00:00 2001 From: param-jot Date: Sun, 3 Nov 2019 12:48:47 +0100 Subject: [PATCH 019/102] Modify beans references in context files --- spring-config/frontier-context.xml | 26 +++++----- spring-config/worker-context-sparql.xml | 41 ++++++---------- spring-config/worker-context.xml | 49 ++++++++----------- .../components/FrontierComponent.java | 37 +++++++++----- .../components/FrontierComponentStarter.java | 14 ++---- .../recrawling/FrontierQueryGenerator.java | 2 +- .../recrawling/SparqlhostConnector.java | 2 +- 7 files changed, 81 insertions(+), 90 deletions(-) diff --git a/spring-config/frontier-context.xml b/spring-config/frontier-context.xml index b2eec1793..35b72c73b 100644 --- a/spring-config/frontier-context.xml +++ b/spring-config/frontier-context.xml @@ -1,18 +1,12 @@ - + http://www.springframework.org/schema/context/spring-context.xsd"> + --> + class="org.dice_research.squirrel.data.uri.serialize.java.GzipJavaUriSerializer"/> @@ -51,7 +45,13 @@ - + + diff --git a/spring-config/worker-context-sparql.xml b/spring-config/worker-context-sparql.xml index 849697b9d..b49282ba7 100644 --- a/spring-config/worker-context-sparql.xml +++ b/spring-config/worker-context-sparql.xml @@ -1,18 +1,11 @@ - + http://www.springframework.org/schema/context/spring-context.xsd"> + - + + + + + @@ -92,14 +84,11 @@ class="org.dice_research.squirrel.fetcher.manage.SimpleOrderedFetcherManager"> - - - - - + @@ -114,10 +103,10 @@ class="org.dice_research.squirrel.fetcher.sparql.SparqlDatasetFetcher"> - + + class="org.dice_research.squirrel.fetcher.ckan.java.PaginatedCkanFetcher" /> diff --git a/spring-config/worker-context.xml b/spring-config/worker-context.xml index fffd02a77..9c91646ec 100644 --- a/spring-config/worker-context.xml +++ b/spring-config/worker-context.xml @@ -1,10 +1,7 @@ - --> - - - - - - - - - + + + + + + + + + @@ -45,17 +43,12 @@ + + + + + - @@ -94,11 +87,11 @@ - - - + + + + - diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/components/FrontierComponent.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/components/FrontierComponent.java index fd35e4c7c..793a5c4f3 100644 --- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/components/FrontierComponent.java +++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/components/FrontierComponent.java @@ -4,12 +4,14 @@ import org.aksw.jena_sparql_api.core.UpdateExecutionFactory; import org.apache.commons.io.FileUtils; import org.dice_research.squirrel.Constants; -import org.dice_research.squirrel.configurator.*; +import org.dice_research.squirrel.configurator.MongoConfiguration; +import org.dice_research.squirrel.configurator.SeedConfiguration; +import org.dice_research.squirrel.configurator.WebConfiguration; +import org.dice_research.squirrel.configurator.WhiteListConfiguration; import org.dice_research.squirrel.data.uri.CrawleableUri; import org.dice_research.squirrel.data.uri.UriUtils; import org.dice_research.squirrel.data.uri.filter.InMemoryKnownUriFilter; import org.dice_research.squirrel.data.uri.filter.KnownUriFilter; -import org.dice_research.squirrel.frontier.recrawling.OutDatedUriRetreiver; import org.dice_research.squirrel.data.uri.filter.RegexBasedWhiteListFilter; import org.dice_research.squirrel.data.uri.info.URIReferences; import org.dice_research.squirrel.data.uri.norm.NormalizerImpl; @@ -18,6 +20,7 @@ import org.dice_research.squirrel.frontier.ExtendedFrontier; import 
org.dice_research.squirrel.frontier.Frontier; import org.dice_research.squirrel.frontier.impl.*; +import org.dice_research.squirrel.frontier.recrawling.OutDatedUriRetreiver; import org.dice_research.squirrel.frontier.recrawling.SparqlhostConnector; import org.dice_research.squirrel.queue.InMemoryQueue; import org.dice_research.squirrel.queue.UriQueue; @@ -28,6 +31,7 @@ import org.dice_research.squirrel.rabbit.msgs.UriSet; import org.dice_research.squirrel.rabbit.msgs.UriSetRequest; import org.dice_research.squirrel.worker.AliveMessage; +import org.dice_research.squirrel.worker.WorkerInfo; import org.hobbit.core.components.AbstractComponent; import org.hobbit.core.data.RabbitQueue; import org.hobbit.core.rabbit.DataReceiver; @@ -36,6 +40,7 @@ import org.springframework.beans.factory.annotation.Autowired; import org.springframework.beans.factory.annotation.Qualifier; import org.springframework.stereotype.Component; + import java.io.Closeable; import java.io.File; import java.io.IOException; @@ -134,7 +139,7 @@ public void init() throws Exception { @Override public void run() throws Exception { - TimerTask terminatorTask = new TerminatorTask(queue, terminationMutex); + TimerTask terminatorTask = new TerminatorTask(queue, terminationMutex, this.workerGuard); Timer timer = new Timer(); timer.schedule(terminatorTask, 5000, 5000); terminationMutex.acquire(); @@ -146,8 +151,8 @@ public void close() throws IOException { LOGGER.info("Closing Frontier Component."); if (receiver != null) // Force the receiver to close - receiver.close(); - // receiver.closeWhenFinished(); + // receiver.close(); + receiver.closeWhenFinished(); if (queue != null) queue.close(); @@ -262,19 +267,29 @@ private class TerminatorTask extends TimerTask { private UriQueue queue; private TerminationCheck terminationCheck = new QueueBasedTerminationCheck(); private Semaphore terminationMutex; + private WorkerGuard workerGuard; - public TerminatorTask(UriQueue queue, Semaphore terminationMutex) { + public TerminatorTask(UriQueue queue, Semaphore terminationMutex, WorkerGuard workerGuard) { this.queue = queue; this.terminationMutex = terminationMutex; + this.workerGuard = workerGuard; } @Override public void run() { - if (!hasUrisToCrawl.values().contains(true) && terminationCheck.shouldFrontierTerminate(queue)) { + + Map mapWorkers = this.workerGuard.getMapWorkerInfo(); + + boolean stillHasUris = false; + for (Map.Entry entry : mapWorkers.entrySet()) { + if (entry.getValue().getUrisCrawling().size() > 0) { + stillHasUris = true; + break; + } + } + if (!stillHasUris && terminationCheck.shouldFrontierTerminate(queue)) { LOGGER.info(" << FRONTIER IS TERMINATING! 
>> "); - terminationMutex.release(); } + terminationMutex.release(); } - - } -} + }} diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/components/FrontierComponentStarter.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/components/FrontierComponentStarter.java index e0a47c073..18f76d5e7 100644 --- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/components/FrontierComponentStarter.java +++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/components/FrontierComponentStarter.java @@ -1,13 +1,13 @@ package org.dice_research.squirrel.components; -import java.io.File; - import org.dice_research.squirrel.utils.Closer; import org.hobbit.core.components.Component; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.springframework.context.support.FileSystemXmlApplicationContext; +import java.io.File; + /** * This is the main method creating and starting an instance of a @@ -18,17 +18,11 @@ public class FrontierComponentStarter { private static final int ERROR_EXIT_CODE = 1; - + private static final Logger LOGGER = LoggerFactory.getLogger(FrontierComponentStarter.class); private static FileSystemXmlApplicationContext context; - private static Component component; - private static boolean closed = false; - private static final Logger LOGGER = LoggerFactory.getLogger(FrontierComponentStarter.class); - - - public static void main(String[] args) { addShutdownHook(); boolean success = true; @@ -55,7 +49,7 @@ private static synchronized void closeComponent() { if (!closed) { Closer.close(component, LOGGER); closed = true; - Closer.close(context, LOGGER); + context.close(); } } diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/FrontierQueryGenerator.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/FrontierQueryGenerator.java index f059a1ad6..980facdd6 100644 --- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/FrontierQueryGenerator.java +++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/FrontierQueryGenerator.java @@ -7,7 +7,7 @@ public class FrontierQueryGenerator { - private static final Logger LOGGER = LoggerFactory.getLogger(FrontierQueryGenerator.class); + private static final Logger LOGGER = LoggerFactory.getLogger(FrontierQueryGenerator.class); private FrontierQueryGenerator() { } diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/SparqlhostConnector.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/SparqlhostConnector.java index 82ae83b2e..7161fa1a3 100644 --- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/SparqlhostConnector.java +++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/SparqlhostConnector.java @@ -92,7 +92,7 @@ public void setCredentials(AuthScope arg0, Credentials arg1) { @Override public List getUriToRecrawl() { - SparqlhostConnector.create("http://localhost:8890/sparql-auth", "dba", "pw123"); + //SparqlhostConnector.create("http://localhost:8890/sparql-auth", "dba", "pw123"); Query getOutdatedUrisQuery = FrontierQueryGenerator.getOutdatedUrisQuery(); System.out.println(getOutdatedUrisQuery); QueryExecution qe = queryExecFactory.createQueryExecution(getOutdatedUrisQuery); From 54bed1479243a12a88c76797a2cdbcae786bcbf6 Mon Sep 17 00:00:00 2001 From: param-jot Date: Sun, 3 Nov 2019 19:50:09 +0100 Subject: [PATCH 020/102] 
minor changes in FrontierQueryGenerator and FrontierImpl.java

---
 spring-config/frontier-context.xml            |  3 +--
 .../components/FrontierComponent.java         |  4 ++--
 .../squirrel/frontier/impl/FrontierImpl.java  |  7 +++---
 .../recrawling/FrontierQueryGenerator.java    | 24 +++++++++----------
 .../recrawling/SparqlhostConnector.java       |  2 --
 5 files changed, 17 insertions(+), 23 deletions(-)

diff --git a/spring-config/frontier-context.xml b/spring-config/frontier-context.xml
index 35b72c73b..fea79b3ff 100644
--- a/spring-config/frontier-context.xml
+++ b/spring-config/frontier-context.xml
@@ -46,12 +46,11 @@
-
diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/components/FrontierComponent.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/components/FrontierComponent.java
index 793a5c4f3..36c888840 100644
--- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/components/FrontierComponent.java
+++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/components/FrontierComponent.java
@@ -58,7 +58,8 @@ public class FrontierComponent extends AbstractComponent implements RespondingDa
     private final Semaphore terminationMutex = new Semaphore(0);
     private final WorkerGuard workerGuard = new WorkerGuard(this);
     private final boolean doRecrawling = true;
-    @Qualifier("queueBean")
+    @Qualifier("sparqlBean")
+    //@Qualifier("queueBean")
     @Autowired
     protected UriQueue queue;
     protected String dataSetQuery = "select ?s ?p ?o where {?s ?p ?o} LIMIT 100 ";
@@ -83,7 +84,6 @@ public void init() throws Exception {
         serializer = new GzipJavaUriSerializer();
         MongoConfiguration mongoConfiguration = MongoConfiguration.getMDBConfiguration();
         WebConfiguration webConfiguration = WebConfiguration.getWebConfiguration();
-        SparqlhostConnector sp = SparqlhostConnector.create("http://localhost:8890/sparql-auth", "dba", "pw123");
         hasUrisToCrawl = new HashMap();
         if (mongoConfiguration != null) {
diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/FrontierImpl.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/FrontierImpl.java
index fb2074163..079edb987 100644
--- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/FrontierImpl.java
+++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/FrontierImpl.java
@@ -16,7 +16,6 @@
 import org.dice_research.squirrel.uri.processing.UriProcessor;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
-
 import java.net.InetAddress;
 import java.net.UnknownHostException;
 import java.util.*;
@@ -222,9 +221,9 @@ public FrontierImpl(UriNormalizer normalizer,
             timerRecrawling.schedule(new TimerTask() {
                 @Override
                 public void run() {
-//                    List urisToRecrawl = outDatedUriRetreiver.getUriToRecrawl();
-//                    System.out.println("Frontier uri to recrawl: " + urisToRecrawl);
-//                    urisToRecrawl.forEach(uri -> queue.addUri(uriProcessor.recognizeUriType(uri)));
+                    List urisToRecrawl = outDatedUriRetreiver.getUriToRecrawl();
+                    LOGGER.info("URI to recrawl" + urisToRecrawl);
+                    urisToRecrawl.forEach(uri -> queue.addUri(uriProcessor.recognizeUriType(uri)));
                 }
             }, this.timerPeriod, this.timerPeriod);
         }
diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/FrontierQueryGenerator.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/FrontierQueryGenerator.java
index 980facdd6..86ebe0b47 100644
--- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/FrontierQueryGenerator.java
+++
b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/FrontierQueryGenerator.java @@ -19,22 +19,20 @@ private FrontierQueryGenerator() { * @return All triples with time stamp in the default graph. */ - public static Query getOutdatedUrisQuery() { - return getOutdatedUrisQuery(null, true); - } - public static Query getOutdatedUrisQuery(String graphID, boolean defaultGraph) { + + public static Query getOutdatedUrisQuery() { StringBuilder stringBuilder = new StringBuilder(); stringBuilder.append("PREFIX sq: \n" + "PREFIX prov: \n" + "PREFIX xsd: " + "SELECT ?uri WHERE { \n "); // + "SELECT ?uri WHERE { \n "); - if (!defaultGraph) { - stringBuilder.append("GRAPH <"); - stringBuilder.append(graphID); - stringBuilder.append("> { "); - } + //if (!defaultGraph) { + // stringBuilder.append("GRAPH <"); + // stringBuilder.append(graphID); + // stringBuilder.append("> { "); + //} stringBuilder.append("{\n" + "SELECT ?uri ?endtime (NOW() - (?endtime) AS ?diff)\n" + "WHERE{\n" + @@ -51,11 +49,11 @@ public static Query getOutdatedUrisQuery(String graphID, boolean defaultGraph) { " } \n" + "}\n" + "}\n" + - "FILTER(?diff > \"18000\"^^xsd:double)\n" + + "FILTER(?diff > \"60\"^^xsd:double)\n" + ""); - if (!defaultGraph) { - stringBuilder.append("}"); - } + //if (!defaultGraph) { + // stringBuilder.append("}"); + //} // stringBuilder.append("}GROUP BY ?uri"); stringBuilder.append("}"); diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/SparqlhostConnector.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/SparqlhostConnector.java index 7161fa1a3..07fc9126e 100644 --- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/SparqlhostConnector.java +++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/SparqlhostConnector.java @@ -27,7 +27,6 @@ import java.util.ArrayList; import java.util.List; -@SuppressWarnings("deprecation") public class SparqlhostConnector implements OutDatedUriRetreiver { private static final Logger LOGGER = LoggerFactory.getLogger(SparqlhostConnector.class); @@ -94,7 +93,6 @@ public void setCredentials(AuthScope arg0, Credentials arg1) { public List getUriToRecrawl() { //SparqlhostConnector.create("http://localhost:8890/sparql-auth", "dba", "pw123"); Query getOutdatedUrisQuery = FrontierQueryGenerator.getOutdatedUrisQuery(); - System.out.println(getOutdatedUrisQuery); QueryExecution qe = queryExecFactory.createQueryExecution(getOutdatedUrisQuery); ResultSet rs = qe.execSelect(); while (rs.hasNext()) { From 5dd2c03dfc7d898af36d3816d82fed99709dde40 Mon Sep 17 00:00:00 2001 From: param-jot Date: Tue, 5 Nov 2019 15:38:56 +0100 Subject: [PATCH 021/102] change recrawling time, add javadoc comments, delete outdated code --- .../data/uri/filter/KnownUriFilter.java | 2 +- .../components/FrontierComponent.java | 20 ++-- .../components/FrontierComponentStarter.java | 2 +- .../data/uri/filter/MongoDBKnowUriFilter.java | 3 +- .../data/uri/filter/RDBKnownUriFilter.java | 14 +-- .../frontier/impl/ExtendedFrontierImpl.java | 14 +-- .../squirrel/frontier/impl/FrontierImpl.java | 98 ++++++++++--------- .../recrawling/FrontierQueryGenerator.java | 23 +---- ...treiver.java => OutDatedUriRetriever.java} | 5 +- .../recrawling/SparqlhostConnector.java | 8 +- .../frontier/impl/FrontierImplTest.java | 6 +- .../impl/CkanSeedGeneratorImplTest.java | 2 - .../squirrel/fetcher/http/HTTPFetcher.java | 2 - 13 files changed, 88 insertions(+), 111 deletions(-) 
rename squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/{OutDatedUriRetreiver.java => OutDatedUriRetriever.java} (61%) diff --git a/squirrel.api/src/main/java/org/dice_research/squirrel/data/uri/filter/KnownUriFilter.java b/squirrel.api/src/main/java/org/dice_research/squirrel/data/uri/filter/KnownUriFilter.java index dc4aaadd8..92f00e23c 100644 --- a/squirrel.api/src/main/java/org/dice_research/squirrel/data/uri/filter/KnownUriFilter.java +++ b/squirrel.api/src/main/java/org/dice_research/squirrel/data/uri/filter/KnownUriFilter.java @@ -46,4 +46,4 @@ public interface KnownUriFilter extends UriFilter { * Opens the queue and allocates necessary resources. */ public void open(); -} \ No newline at end of file +} diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/components/FrontierComponent.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/components/FrontierComponent.java index 36c888840..e8603e51c 100644 --- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/components/FrontierComponent.java +++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/components/FrontierComponent.java @@ -20,8 +20,7 @@ import org.dice_research.squirrel.frontier.ExtendedFrontier; import org.dice_research.squirrel.frontier.Frontier; import org.dice_research.squirrel.frontier.impl.*; -import org.dice_research.squirrel.frontier.recrawling.OutDatedUriRetreiver; -import org.dice_research.squirrel.frontier.recrawling.SparqlhostConnector; +import org.dice_research.squirrel.frontier.recrawling.OutDatedUriRetriever; import org.dice_research.squirrel.queue.InMemoryQueue; import org.dice_research.squirrel.queue.UriQueue; import org.dice_research.squirrel.rabbit.RPCServer; @@ -58,8 +57,7 @@ public class FrontierComponent extends AbstractComponent implements RespondingDa private final Semaphore terminationMutex = new Semaphore(0); private final WorkerGuard workerGuard = new WorkerGuard(this); private final boolean doRecrawling = true; - @Qualifier("sparqlBean") - //@Qualifier("queueBean") + @Qualifier("queueBean") @Autowired protected UriQueue queue; protected String dataSetQuery = "select ?s ?p ?o where {?s ?p ?o} LIMIT 100 "; @@ -67,7 +65,7 @@ public class FrontierComponent extends AbstractComponent implements RespondingDa @Qualifier("knowUriFilterBean") @Autowired private KnownUriFilter knownUriFilter; - private OutDatedUriRetreiver outDatedUriRetreiver; + private OutDatedUriRetriever outDatedUriRetriever; private URIReferences uriReferences = null; private Frontier frontier; private RabbitQueue rabbitQueue; @@ -105,7 +103,7 @@ public void init() throws Exception { knownUriFilter = new InMemoryKnownUriFilter(doRecrawling, recrawlingTime); } // Build frontier - frontier = new ExtendedFrontierImpl(new NormalizerImpl(), knownUriFilter, uriReferences, queue, doRecrawling, outDatedUriRetreiver); + frontier = new ExtendedFrontierImpl(new NormalizerImpl(), knownUriFilter, uriReferences, queue, doRecrawling, outDatedUriRetriever); rabbitQueue = this.incomingDataQueueFactory.createDefaultRabbitQueue(Constants.FRONTIER_QUEUE_NAME); receiver = (new RPCServer.Builder()).responseQueueFactory(outgoingDataQueuefactory).dataHandler(this) .maxParallelProcessedMsgs(100).queue(rabbitQueue).build(); @@ -197,7 +195,7 @@ public void handleData(byte[] data, ResponseHandler handler, String responseQueu if (deserializedData instanceof UriSetRequest) { responseToUriSetRequest(handler, responseQueueName, correlId, (UriSetRequest) deserializedData); } else if 
(deserializedData instanceof UriSet) { - // LOGGER.warn("Received a set of URIs (size={}).", ((UriSet) deserializedData).uris.size()); + // LOGGER.warn("Received a set of URIs (size={}).", ((UriSet) deserializedData).uris.size()); frontier.addNewUris(((UriSet) deserializedData).uris); } else if (deserializedData instanceof CrawlingResult) { CrawlingResult crawlingResult = (CrawlingResult) deserializedData; @@ -225,11 +223,8 @@ private void responseToUriSetRequest(ResponseHandler handler, String responseQue uris == null ? "null" : Integer.toString(uris.size())); handler.sendResponse(serializer.serialize(new UriSet(uris)), responseQueueName, correlId); if (uris != null && uris.size() > 0) { - hasUrisToCrawl.put(uriSetRequest.getWorkerId(), true); - workerGuard.putUrisForWorker(uriSetRequest.getWorkerId(), - uriSetRequest.workerSendsAliveMessages(), uris); - } else { - hasUrisToCrawl.put(uriSetRequest.getWorkerId(), false); + workerGuard.putUrisForWorker(uriSetRequest.getWorkerId(), uriSetRequest.workerSendsAliveMessages(), + uris); } } catch (IOException e) { LOGGER.error("Couldn't serialize new URI set.", e); @@ -239,6 +234,7 @@ private void responseToUriSetRequest(ResponseHandler handler, String responseQue } } + protected void processSeedFile(String seedFile) { try { List lines = FileUtils.readLines(new File(seedFile), StandardCharsets.UTF_8); diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/components/FrontierComponentStarter.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/components/FrontierComponentStarter.java index 18f76d5e7..eed7b7f71 100644 --- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/components/FrontierComponentStarter.java +++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/components/FrontierComponentStarter.java @@ -49,7 +49,7 @@ private static synchronized void closeComponent() { if (!closed) { Closer.close(component, LOGGER); closed = true; - context.close(); + Closer.close(context, LOGGER); } } diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/data/uri/filter/MongoDBKnowUriFilter.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/data/uri/filter/MongoDBKnowUriFilter.java index 4891e4400..61b9df9ac 100644 --- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/data/uri/filter/MongoDBKnowUriFilter.java +++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/data/uri/filter/MongoDBKnowUriFilter.java @@ -41,6 +41,7 @@ public class MongoDBKnowUriFilter implements KnownUriFilter, Cloneable, Closeable, UriHashCustodian { private static final Logger LOGGER = LoggerFactory.getLogger(MongoDBKnowUriFilter.class); + FrontierImpl frontierImpl; private MongoClient client; private MongoDatabase mongoDB; @@ -163,7 +164,7 @@ public List getOutdatedUris() { // get all uris with the following property: // (nextCrawlTimestamp has passed) AND (crawlingInProcess==false OR lastCrawlTimestamp is 3 times older than generalRecrawlTime) - long generalRecrawlTime = Math.max(FrontierImpl.DEFAULT_GENERAL_RECRAWL_TIME, FrontierImpl.getGeneralRecrawlTime()); + long generalRecrawlTime = Math.max(frontierImpl.DEFAULT_GENERAL_RECRAWL_TIME, frontierImpl.getGeneralRecrawlTime()); Bson filter = Filters.and(Filters.eq("COLUMN_TIMESTAMP_NEXT_CRAWL", System.currentTimeMillis()), Filters.or( diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/data/uri/filter/RDBKnownUriFilter.java 
b/squirrel.frontier/src/main/java/org/dice_research/squirrel/data/uri/filter/RDBKnownUriFilter.java index 095bfc0b0..f1ba4a090 100644 --- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/data/uri/filter/RDBKnownUriFilter.java +++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/data/uri/filter/RDBKnownUriFilter.java @@ -32,7 +32,7 @@ @SuppressWarnings("deprecation") public class RDBKnownUriFilter implements KnownUriFilter, Closeable, UriHashCustodian { private static final Logger LOGGER = LoggerFactory.getLogger(RDBKnownUriFilter.class); - + FrontierImpl frontierImpl; private RDBConnector connector = null; private RethinkDB r; @@ -125,7 +125,7 @@ public List getOutdatedUris() { // get all uris with the following property: // (nextCrawlTimestamp has passed) AND (crawlingInProcess==false OR lastCrawlTimestamp is 3 times older than generalRecrawlTime) - long generalRecrawlTime = Math.max(FrontierImpl.DEFAULT_GENERAL_RECRAWL_TIME, FrontierImpl.getGeneralRecrawlTime()); + long generalRecrawlTime = Math.max(frontierImpl.DEFAULT_GENERAL_RECRAWL_TIME, frontierImpl.getGeneralRecrawlTime()); Cursor cursor = r.db(DATABASE_NAME) .table(TABLE_NAME) @@ -177,10 +177,10 @@ public void add(CrawleableUri uri, long lastCrawlTimestamp, long nextCrawlTimest try { // FIXME Fix this implementation // if (r.db(DATABASE_NAME).table(TABLE_NAME).filter(doc -> doc.getField(COLUMN_URI).eq(uri.getUri().toString())).isEmpty().run(connector.connection)) { - r.db(DATABASE_NAME) - .table(TABLE_NAME) - .insert(convertURITimestampToRDB(uri, lastCrawlTimestamp, nextCrawlTimestamp, false, DUMMY_HASH_VALUE)) - .run(connector.connection); + r.db(DATABASE_NAME) + .table(TABLE_NAME) + .insert(convertURITimestampToRDB(uri, lastCrawlTimestamp, nextCrawlTimestamp, false, DUMMY_HASH_VALUE)) + .run(connector.connection); // } else { // ReqlExpr row = r.db(DATABASE_NAME).table(TABLE_NAME).filter(doc -> doc.getField(COLUMN_URI).eq(uri.getUri().toString())); // row.update(r.hashMap(COLUMN_CRAWLING_IN_PROCESS, false)); @@ -289,4 +289,4 @@ public void purge() { public long count() { return r.db(DATABASE_NAME).table(TABLE_NAME).count().run(connector.connection); } -} \ No newline at end of file +} diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/ExtendedFrontierImpl.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/ExtendedFrontierImpl.java index f0cb488ba..fad1e8c29 100644 --- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/ExtendedFrontierImpl.java +++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/ExtendedFrontierImpl.java @@ -7,7 +7,7 @@ import org.dice_research.squirrel.data.uri.CrawleableUri; import org.dice_research.squirrel.data.uri.filter.KnownUriFilter; -import org.dice_research.squirrel.frontier.recrawling.OutDatedUriRetreiver; +import org.dice_research.squirrel.frontier.recrawling.OutDatedUriRetriever; import org.dice_research.squirrel.data.uri.filter.UriFilter; import org.dice_research.squirrel.data.uri.info.URIReferences; import org.dice_research.squirrel.data.uri.norm.UriNormalizer; @@ -33,8 +33,8 @@ public class ExtendedFrontierImpl extends FrontierImpl implements ExtendedFronti */ @SuppressWarnings("unused") public ExtendedFrontierImpl(UriNormalizer normalizer, KnownUriFilter knownUriFilter, UriQueue queue, boolean doesRecrawling, - long generalRecrawlTime, long timerPeriod, UriHashCustodian uriHashCustodian, OutDatedUriRetreiver outDatedUriRetreiver) { - super(normalizer, 
knownUriFilter, queue, doesRecrawling, generalRecrawlTime, timerPeriod, uriHashCustodian, outDatedUriRetreiver); + long generalRecrawlTime, long timerPeriod, UriHashCustodian uriHashCustodian, OutDatedUriRetriever outDatedUriRetriever) { + super(normalizer, knownUriFilter, queue, doesRecrawling, generalRecrawlTime, timerPeriod, uriHashCustodian, outDatedUriRetriever); } /** @@ -47,8 +47,8 @@ public ExtendedFrontierImpl(UriNormalizer normalizer, KnownUriFilter knownUriFil * crawled. * @param doesRecrawling used to select if URIs should be recrawled. */ - public ExtendedFrontierImpl(UriNormalizer normalizer, KnownUriFilter knownUriFilter, IpAddressBasedQueue queue, boolean doesRecrawling, OutDatedUriRetreiver outDatedUriRetreiver) { - super(normalizer, knownUriFilter, queue, doesRecrawling, outDatedUriRetreiver); + public ExtendedFrontierImpl(UriNormalizer normalizer, KnownUriFilter knownUriFilter, IpAddressBasedQueue queue, boolean doesRecrawling, OutDatedUriRetriever outDatedUriRetriever) { + super(normalizer, knownUriFilter, queue, doesRecrawling, outDatedUriRetriever); } /** @@ -62,8 +62,8 @@ public ExtendedFrontierImpl(UriNormalizer normalizer, KnownUriFilter knownUriFil * crawled. * @param doesRecrawling used to select if URIs should be recrawled. */ - public ExtendedFrontierImpl(UriNormalizer normalizer, KnownUriFilter knownUriFilter, URIReferences uriReferences, UriQueue queue, boolean doesRecrawling, OutDatedUriRetreiver outDatedUriRetreiver) { - super(normalizer, knownUriFilter, uriReferences, queue, doesRecrawling,outDatedUriRetreiver); + public ExtendedFrontierImpl(UriNormalizer normalizer, KnownUriFilter knownUriFilter, URIReferences uriReferences, UriQueue queue, boolean doesRecrawling, OutDatedUriRetriever outDatedUriRetriever) { + super(normalizer, knownUriFilter, uriReferences, queue, doesRecrawling, outDatedUriRetriever); } @Override diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/FrontierImpl.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/FrontierImpl.java index 079edb987..ce4578d71 100644 --- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/FrontierImpl.java +++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/FrontierImpl.java @@ -3,22 +3,24 @@ import org.dice_research.squirrel.Constants; import org.dice_research.squirrel.data.uri.CrawleableUri; import org.dice_research.squirrel.data.uri.filter.KnownUriFilter; -import org.dice_research.squirrel.frontier.recrawling.OutDatedUriRetreiver; import org.dice_research.squirrel.data.uri.filter.SchemeBasedUriFilter; import org.dice_research.squirrel.data.uri.filter.UriFilter; import org.dice_research.squirrel.data.uri.info.URIReferences; import org.dice_research.squirrel.data.uri.norm.UriNormalizer; import org.dice_research.squirrel.deduplication.hashing.UriHashCustodian; import org.dice_research.squirrel.frontier.Frontier; +import org.dice_research.squirrel.frontier.recrawling.OutDatedUriRetriever; import org.dice_research.squirrel.graph.GraphLogger; -import org.dice_research.squirrel.queue.IpAddressBasedQueue; +import org.dice_research.squirrel.queue.BlockingQueue; import org.dice_research.squirrel.queue.UriQueue; import org.dice_research.squirrel.uri.processing.UriProcessor; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.net.InetAddress; + import java.net.UnknownHostException; -import java.util.*; +import java.util.List; +import java.util.Timer; +import java.util.TimerTask; /** * 
Standard implementation of the {@link Frontier} interface containing a @@ -31,17 +33,18 @@ public class FrontierImpl implements Frontier { /** * Default value for {@link #generalRecrawlTime} (one week). */ - public static final long DEFAULT_GENERAL_RECRAWL_TIME = 18000; + public static final long DEFAULT_GENERAL_RECRAWL_TIME = 1000 * 60 * 60 * 24 * 7; private static final Logger LOGGER = LoggerFactory.getLogger(FrontierImpl.class); /** * Default value for {@link #timerPeriod}. */ - private static final long DEFAULT_TIMER_PERIOD = 18000; + private static final long DEFAULT_TIMER_PERIOD = 1000 * 60 * 60; + public long generalRecrawlTime; /** * Time (in milliseconds) after which uris will be recrawled (only used if no * specific time is configured for a URI). */ - private static long generalRecrawlTime; + /** * {@link UriNormalizer} used to transform given URIs into a normal form. */ @@ -50,7 +53,10 @@ public class FrontierImpl implements Frontier { * {@link KnownUriFilter} used to identify URIs that already have been crawled. */ protected KnownUriFilter knownUriFilter; - protected OutDatedUriRetreiver outDatedUriRetreiver; + /** + * {@link OutDatedUriRetriever} used to collect all the outdated URIs (URIs crawled a week ago) to recrawl. + */ + protected OutDatedUriRetriever outDatedUriRetriever; /** * {@link org.dice_research.squirrel.data.uri.info.URIReferences} used to * identify URIs that already have been crawled. @@ -104,9 +110,9 @@ public class FrontierImpl implements Frontier { * @param timerPeriod used to select if URIs should be recrawled. */ public FrontierImpl(UriNormalizer normalizer, KnownUriFilter knownUriFilter, UriQueue queue, GraphLogger graphLogger, boolean doesRecrawling, - long generalRecrawlTime, long timerPeriod, OutDatedUriRetreiver outDatedUriRetreiver) { + long generalRecrawlTime, long timerPeriod, OutDatedUriRetriever outDatedUriRetriever) { this(normalizer, knownUriFilter, null, queue, graphLogger, doesRecrawling, - generalRecrawlTime, timerPeriod, outDatedUriRetreiver); + generalRecrawlTime, timerPeriod, outDatedUriRetriever); } /** @@ -126,9 +132,9 @@ public FrontierImpl(UriNormalizer normalizer, KnownUriFilter knownUriFilter, Uri */ public FrontierImpl(UriNormalizer normalizer, KnownUriFilter knownUriFilter, UriQueue queue, boolean doesRecrawling, long generalRecrawlTime, - long timerPeriod, UriHashCustodian uriHashCustodian, OutDatedUriRetreiver outDatedUriRetreiver) { + long timerPeriod, UriHashCustodian uriHashCustodian, OutDatedUriRetriever outDatedUriRetriever) { this(normalizer, knownUriFilter, queue, null, doesRecrawling, generalRecrawlTime, - timerPeriod, outDatedUriRetreiver); + timerPeriod, outDatedUriRetriever); } /** @@ -144,9 +150,9 @@ public FrontierImpl(UriNormalizer normalizer, * @param doesRecrawling Value for {@link #doesRecrawling}. */ public FrontierImpl(UriNormalizer normalizer, - KnownUriFilter knownUriFilter, URIReferences uriReferences, UriQueue queue, boolean doesRecrawling, OutDatedUriRetreiver outDatedUriRetreiver) { + KnownUriFilter knownUriFilter, URIReferences uriReferences, UriQueue queue, boolean doesRecrawling, OutDatedUriRetriever outDatedUriRetriever) { this(normalizer, knownUriFilter, uriReferences, queue, null, doesRecrawling, - DEFAULT_GENERAL_RECRAWL_TIME, DEFAULT_TIMER_PERIOD, outDatedUriRetreiver); + DEFAULT_GENERAL_RECRAWL_TIME, DEFAULT_TIMER_PERIOD, outDatedUriRetriever); } /** @@ -161,9 +167,9 @@ public FrontierImpl(UriNormalizer normalizer, * @param doesRecrawling Value for {@link #doesRecrawling}. 
*/ public FrontierImpl(UriNormalizer normalizer, - KnownUriFilter knownUriFilter, UriQueue queue, boolean doesRecrawling, OutDatedUriRetreiver outDatedUriRetreiver) { + KnownUriFilter knownUriFilter, UriQueue queue, boolean doesRecrawling, OutDatedUriRetriever outDatedUriRetriever) { this(normalizer, knownUriFilter, queue, null, doesRecrawling, - DEFAULT_GENERAL_RECRAWL_TIME, DEFAULT_TIMER_PERIOD, outDatedUriRetreiver); + DEFAULT_GENERAL_RECRAWL_TIME, DEFAULT_TIMER_PERIOD, outDatedUriRetriever); } /** @@ -177,9 +183,9 @@ public FrontierImpl(UriNormalizer normalizer, * crawled. */ public FrontierImpl(UriNormalizer normalizer, - KnownUriFilter knownUriFilter, UriQueue queue, OutDatedUriRetreiver outDatedUriRetreiver) { + KnownUriFilter knownUriFilter, UriQueue queue, OutDatedUriRetriever outDatedUriRetriever) { this(normalizer, knownUriFilter, queue, null, false, DEFAULT_GENERAL_RECRAWL_TIME, - DEFAULT_TIMER_PERIOD, outDatedUriRetreiver); + DEFAULT_TIMER_PERIOD, outDatedUriRetriever); } /** @@ -198,30 +204,30 @@ public FrontierImpl(UriNormalizer normalizer, * be recrawled. If Value is null the default Time is * used. * @param timerPeriod used to select if URIs should be recrawled. - * @param outDatedUriRetreiver + * @param outDatedUriRetriever */ public FrontierImpl(UriNormalizer normalizer, KnownUriFilter knownUriFilter, URIReferences uriReferences, UriQueue queue, GraphLogger graphLogger, - boolean doesRecrawling, long generalRecrawlTime, long timerPeriod, OutDatedUriRetreiver outDatedUriRetreiver) { + boolean doesRecrawling, long generalRecrawlTime, long timerPeriod, OutDatedUriRetriever outDatedUriRetriever) { this.normalizer = normalizer; this.knownUriFilter = knownUriFilter; this.uriReferences = uriReferences; this.queue = queue; this.uriProcessor = new UriProcessor(); this.graphLogger = graphLogger; - this.outDatedUriRetreiver = outDatedUriRetreiver; + this.outDatedUriRetriever = outDatedUriRetriever; this.queue.open(); this.doesRecrawling = doesRecrawling; this.timerPeriod = timerPeriod; - FrontierImpl.generalRecrawlTime = generalRecrawlTime; + this.generalRecrawlTime = generalRecrawlTime; if (this.doesRecrawling) { timerRecrawling = new Timer(); timerRecrawling.schedule(new TimerTask() { @Override public void run() { - List urisToRecrawl = outDatedUriRetreiver.getUriToRecrawl(); + List urisToRecrawl = outDatedUriRetriever.getUriToRecrawl(); LOGGER.info("URI to recrawl" + urisToRecrawl); urisToRecrawl.forEach(uri -> queue.addUri(uriProcessor.recognizeUriType(uri))); } @@ -229,7 +235,7 @@ public void run() { } } - public static long getGeneralRecrawlTime() { + public long getGeneralRecrawlTime() { return generalRecrawlTime; } @@ -285,30 +291,24 @@ public void addNewUri(CrawleableUri uri) { @Override public void crawlingDone(List uris) { LOGGER.info("One worker finished his work and crawled " + uris.size() + " URIs."); -// List newUris = new ArrayList<>(uriMap.size()); -// for (CrawleableUri uri : uriMap.keySet()) { -// newUris.addAll(uriMap.get(uri)); -// knownUriFilter.add(uri, System.currentTimeMillis(), uri.getTimestampNextCrawl()); -// if (uriReferences != null) { -// uriReferences.add(uri, uriMap.get(uri)); -// } -// } -// // If there is a graph logger, log the data -// if (graphLogger != null) { -// graphLogger.log(new ArrayList<>(uriMap.keySet()), newUris); -// } + // List newUris = new ArrayList<>(uriMap.size()); + // for (CrawleableUri uri : uriMap.keySet()) { + // newUris.addAll(uriMap.get(uri)); + // knownUriFilter.add(uri, System.currentTimeMillis(), + // 
uri.getTimestampNextCrawl()); + // if (uriReferences != null) { + // uriReferences.add(uri, uriMap.get(uri)); + // } + // } + + // // If there is a graph logger, log the data + // if (graphLogger != null) { + // graphLogger.log(new ArrayList<>(uriMap.keySet()), newUris); + // } // If we should give the crawled IPs to the queue - if (queue instanceof IpAddressBasedQueue) { - Set ips = new HashSet<>(); - InetAddress ip; - for (CrawleableUri uri : uris) { - ip = uri.getIpAddress(); - if (ip != null) { - ips.add(ip); - } - } - ips.forEach(_ip -> ((IpAddressBasedQueue) queue).markIpAddressAsAccessible(_ip)); + if (queue instanceof BlockingQueue) { + ((BlockingQueue) queue).markUrisAsAccessible(uris); } // send list of crawled URIs to the knownUriFilter for (CrawleableUri uri : uris) { @@ -328,8 +328,10 @@ public void crawlingDone(List uris) { @Override public int getNumberOfPendingUris() { - if (queue instanceof IpAddressBasedQueue) { - return ((IpAddressBasedQueue) queue).getNumberOfBlockedIps(); + // TODO this implementation does not fit to the semantics of the method name + // since it returns the number of URI groups instead of the number of URIs + if (queue instanceof BlockingQueue) { + return ((BlockingQueue) queue).getNumberOfBlockedKeys(); } else { return 0; } diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/FrontierQueryGenerator.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/FrontierQueryGenerator.java index 86ebe0b47..0247e94c3 100644 --- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/FrontierQueryGenerator.java +++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/FrontierQueryGenerator.java @@ -13,9 +13,7 @@ private FrontierQueryGenerator() { } /** - * Return a time stamp query for the default graph. - * It will return triples with time stamp contained in the default graph. - * + * Return outdated uris by comparing their endtime stamps. * @return All triples with time stamp in the default graph. 
 */
@@ -26,14 +24,8 @@ public static Query getOutdatedUrisQuery() {
         stringBuilder.append("PREFIX sq: \n" +
             "PREFIX prov: \n" +
             "PREFIX xsd: "
-            + "SELECT ?uri WHERE { \n ");
-        // + "SELECT ?uri WHERE { \n ");
-        //if (!defaultGraph) {
-        //    stringBuilder.append("GRAPH <");
-        //    stringBuilder.append(graphID);
-        //    stringBuilder.append("> { ");
-        //}
-        stringBuilder.append("{\n" +
+            + "SELECT ?uri WHERE { \n "+
+            "{\n" +
             "SELECT ?uri ?endtime (NOW() - (?endtime) AS ?diff)\n" +
             "WHERE{\n" +
             "\n" +
             " } \n" +
             "}\n" +
             "}\n" +
-            "FILTER(?diff > \"60\"^^xsd:double)\n" +
+            "FILTER(?diff > \"60\"^^xsd:double)}\n" +
             "");
-        //if (!defaultGraph) {
-        //    stringBuilder.append("}");
-        //}
-
-        // stringBuilder.append("}GROUP BY ?uri");
-        stringBuilder.append("}");
         Query query = QueryFactory.create(stringBuilder.toString());
-        LOGGER.info("Outdated uri query:" + query);
         return query;
     }
diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/OutDatedUriRetreiver.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/OutDatedUriRetriever.java
similarity index 61%
rename from squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/OutDatedUriRetreiver.java
rename to squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/OutDatedUriRetriever.java
index 0beb8d1c5..acfa54435 100644
--- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/OutDatedUriRetreiver.java
+++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/OutDatedUriRetriever.java
@@ -1,15 +1,16 @@
 package org.dice_research.squirrel.frontier.recrawling;

 import java.io.Closeable;
+import java.io.IOException;
 import java.util.List;

 import org.dice_research.squirrel.data.uri.CrawleableUri;

-public interface OutDatedUriRetreiver extends Closeable {
+public interface OutDatedUriRetriever extends Closeable {

     /**
-     * Returns all {@link CrawleableUri}s which have to be recrawled. This means their time to next crawl has passed.
+     * Returns all {@link CrawleableUri}s which were crawled a week ago and have to be recrawled.
      *
      * @return The outdated {@link CrawleableUri}s.
      */
diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/SparqlhostConnector.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/SparqlhostConnector.java
index 07fc9126e..dac1a5f3c 100644
--- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/SparqlhostConnector.java
+++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/SparqlhostConnector.java
@@ -27,20 +27,18 @@
 import java.util.ArrayList;
 import java.util.List;

-public class SparqlhostConnector implements OutDatedUriRetreiver {
+public class SparqlhostConnector implements OutDatedUriRetriever {
     private static final Logger LOGGER = LoggerFactory.getLogger(SparqlhostConnector.class);
     /**
      * The Query factory used to query the SPARQL endpoint.
*/ - protected static QueryExecutionFactory queryExecFactory = null; - protected UpdateExecutionFactory updateExecFactory = null; + protected QueryExecutionFactory queryExecFactory = null; List urisToRecrawl = new ArrayList<>(); public SparqlhostConnector(QueryExecutionFactory queryExecFactory, UpdateExecutionFactory updateExecFactory) { this.queryExecFactory = queryExecFactory; - this.updateExecFactory = updateExecFactory; LOGGER.info("Connected"); } @@ -91,7 +89,6 @@ public void setCredentials(AuthScope arg0, Credentials arg1) { @Override public List getUriToRecrawl() { - //SparqlhostConnector.create("http://localhost:8890/sparql-auth", "dba", "pw123"); Query getOutdatedUrisQuery = FrontierQueryGenerator.getOutdatedUrisQuery(); QueryExecution qe = queryExecFactory.createQueryExecution(getOutdatedUrisQuery); ResultSet rs = qe.execSelect(); @@ -108,7 +105,6 @@ public List getUriToRecrawl() { return urisToRecrawl; } - @Override public void close() throws IOException { getUriToRecrawl(); diff --git a/squirrel.frontier/src/test/java/org/dice_research/squirrel/frontier/impl/FrontierImplTest.java b/squirrel.frontier/src/test/java/org/dice_research/squirrel/frontier/impl/FrontierImplTest.java index 900bd8e7d..e6adbc8bb 100644 --- a/squirrel.frontier/src/test/java/org/dice_research/squirrel/frontier/impl/FrontierImplTest.java +++ b/squirrel.frontier/src/test/java/org/dice_research/squirrel/frontier/impl/FrontierImplTest.java @@ -15,7 +15,7 @@ import org.dice_research.squirrel.data.uri.CrawleableUriFactory4Tests; import org.dice_research.squirrel.data.uri.UriType; -import org.dice_research.squirrel.frontier.recrawling.OutDatedUriRetreiver; +import org.dice_research.squirrel.frontier.recrawling.OutDatedUriRetriever; import org.dice_research.squirrel.frontier.recrawling.SparqlhostConnector; import org.dice_research.squirrel.data.uri.filter.MongoDBKnowUriFilter; @@ -37,7 +37,7 @@ public class FrontierImplTest { private static MongoDBKnowUriFilter filter; private static List uris = new ArrayList(); private static CrawleableUriFactory4Tests cuf = new CrawleableUriFactory4Tests(); - private OutDatedUriRetreiver outDatedUriRetreiver = SparqlhostConnector.create("http://localhost:8890/sparql-auth", "dba", "pw123"); + private OutDatedUriRetriever outDatedUriRetriever = SparqlhostConnector.create("http://localhost:8890/sparql-auth", "dba", "pw123"); @Before public void setUp() throws Exception { @@ -46,7 +46,7 @@ public void setUp() throws Exception { queue = new MongoDBIpBasedQueue("localhost", 58027); filter.open(); queue.open(); - frontier = new FrontierImpl(new NormalizerImpl(), filter, queue, true, 18000, 18000, null, outDatedUriRetreiver); + frontier = new FrontierImpl(new NormalizerImpl(), filter, queue, true, 18000, 18000, null, outDatedUriRetriever); uris.add(cuf.create(new URI("http://dbpedia.org/resource/New_York"), InetAddress.getByName("127.0.0.1"), UriType.DEREFERENCEABLE)); uris.add(cuf.create(new URI("http://dbpedia.org/resource/Moscow"), InetAddress.getByName("127.0.0.1"), diff --git a/squirrel.frontier/src/test/java/org/dice_research/squirrel/seed/generator/impl/CkanSeedGeneratorImplTest.java b/squirrel.frontier/src/test/java/org/dice_research/squirrel/seed/generator/impl/CkanSeedGeneratorImplTest.java index 94e3e73d8..3c7214edc 100644 --- a/squirrel.frontier/src/test/java/org/dice_research/squirrel/seed/generator/impl/CkanSeedGeneratorImplTest.java +++ b/squirrel.frontier/src/test/java/org/dice_research/squirrel/seed/generator/impl/CkanSeedGeneratorImplTest.java @@ -24,8 +24,6 @@ public 
class CkanSeedGeneratorImplTest extends TestCase { private Frontier frontier; - // private OutDatedUriRetreiver outDatedUriRetreiver = SparqlhostConnector.create("http://localhost:8890/sparql-auth", "dba", "pw123"); - public void setUp() { queue = new InMemoryQueue(); frontier = new FrontierImpl(new NormalizerImpl() , new InMemoryKnownUriFilter(false, -1), queue,null); diff --git a/squirrel.worker/src/main/java/org/dice_research/squirrel/fetcher/http/HTTPFetcher.java b/squirrel.worker/src/main/java/org/dice_research/squirrel/fetcher/http/HTTPFetcher.java index fb8fb9ffd..1e2e1ffe2 100644 --- a/squirrel.worker/src/main/java/org/dice_research/squirrel/fetcher/http/HTTPFetcher.java +++ b/squirrel.worker/src/main/java/org/dice_research/squirrel/fetcher/http/HTTPFetcher.java @@ -165,6 +165,4 @@ protected File requestData(CrawleableUri uri, File outputFile) public void close() throws IOException { client.close(); } -//setter - } From 126f79b539b42f9e734dea7bc98651101022e42c Mon Sep 17 00:00:00 2001 From: param-jot Date: Wed, 6 Nov 2019 18:56:40 +0100 Subject: [PATCH 022/102] added frontier-context-sparql.xml --- docker-compose-sparql.yml | 2 +- spring-config/frontier-context-sparql.xml | 56 +++++++++++++++++++ spring-config/frontier-context.xml | 7 --- .../components/FrontierComponent.java | 4 +- .../frontier/impl/FrontierImplTest.java | 3 +- 5 files changed, 60 insertions(+), 12 deletions(-) create mode 100644 spring-config/frontier-context-sparql.xml diff --git a/docker-compose-sparql.yml b/docker-compose-sparql.yml index 6c01e48c8..ec3a685f2 100644 --- a/docker-compose-sparql.yml +++ b/docker-compose-sparql.yml @@ -15,7 +15,7 @@ services: environment: - HOBBIT_RABBIT_HOST=rabbit - URI_WHITELIST_FILE=/var/squirrel/whitelist.txt - - FRONTIER_CONTEXT_CONFIG_FILE=/var/squirrel/spring-config/frontier-context.xml + - FRONTIER_CONTEXT_CONFIG_FILE=/var/squirrel/spring-config/frontier-context-sparql.xml - SEED_FILE=/var/squirrel/seeds.txt - SPARQL_URL=http://virtuosohost:8890/sparql-auth/ - SPARQL_HOST_USER=dba diff --git a/spring-config/frontier-context-sparql.xml b/spring-config/frontier-context-sparql.xml new file mode 100644 index 000000000..ba9c902ae --- /dev/null +++ b/spring-config/frontier-context-sparql.xml @@ -0,0 +1,56 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/spring-config/frontier-context.xml b/spring-config/frontier-context.xml index fea79b3ff..185044fed 100644 --- a/spring-config/frontier-context.xml +++ b/spring-config/frontier-context.xml @@ -45,12 +45,5 @@ - - - - - - - diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/components/FrontierComponent.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/components/FrontierComponent.java index e8603e51c..ebae7888f 100644 --- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/components/FrontierComponent.java +++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/components/FrontierComponent.java @@ -1,7 +1,6 @@ package org.dice_research.squirrel.components; import org.aksw.jena_sparql_api.core.QueryExecutionFactory; -import org.aksw.jena_sparql_api.core.UpdateExecutionFactory; import org.apache.commons.io.FileUtils; import org.dice_research.squirrel.Constants; import org.dice_research.squirrel.configurator.MongoConfiguration; @@ -61,7 +60,6 @@ public class FrontierComponent extends AbstractComponent implements RespondingDa @Autowired protected UriQueue queue; protected String dataSetQuery = "select ?s ?p ?o where {?s ?p ?o} LIMIT 100 "; - 
protected UpdateExecutionFactory updateExecFactory = null; @Qualifier("knowUriFilterBean") @Autowired private KnownUriFilter knownUriFilter; @@ -74,6 +72,8 @@ public class FrontierComponent extends AbstractComponent implements RespondingDa @Autowired private Serializer serializer; private long recrawlingTime = 1000L * 60L * 60L * 24L * 30; + @Qualifier("sparqlBean") + @Autowired private Map hasUrisToCrawl; @Override diff --git a/squirrel.frontier/src/test/java/org/dice_research/squirrel/frontier/impl/FrontierImplTest.java b/squirrel.frontier/src/test/java/org/dice_research/squirrel/frontier/impl/FrontierImplTest.java index e6adbc8bb..5c71af885 100644 --- a/squirrel.frontier/src/test/java/org/dice_research/squirrel/frontier/impl/FrontierImplTest.java +++ b/squirrel.frontier/src/test/java/org/dice_research/squirrel/frontier/impl/FrontierImplTest.java @@ -37,7 +37,6 @@ public class FrontierImplTest { private static MongoDBKnowUriFilter filter; private static List uris = new ArrayList(); private static CrawleableUriFactory4Tests cuf = new CrawleableUriFactory4Tests(); - private OutDatedUriRetriever outDatedUriRetriever = SparqlhostConnector.create("http://localhost:8890/sparql-auth", "dba", "pw123"); @Before public void setUp() throws Exception { @@ -46,7 +45,7 @@ public void setUp() throws Exception { queue = new MongoDBIpBasedQueue("localhost", 58027); filter.open(); queue.open(); - frontier = new FrontierImpl(new NormalizerImpl(), filter, queue, true, 18000, 18000, null, outDatedUriRetriever); + frontier = new FrontierImpl(new NormalizerImpl(), filter, queue, true, 18000, 18000, null,null); uris.add(cuf.create(new URI("http://dbpedia.org/resource/New_York"), InetAddress.getByName("127.0.0.1"), UriType.DEREFERENCEABLE)); uris.add(cuf.create(new URI("http://dbpedia.org/resource/Moscow"), InetAddress.getByName("127.0.0.1"), From 8d6f653ecc5a97317f047923de375ba9eea9fc32 Mon Sep 17 00:00:00 2001 From: param-jot Date: Sun, 10 Nov 2019 21:46:18 +0100 Subject: [PATCH 023/102] added spark dependency in worker --- squirrel.worker/pom.xml | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/squirrel.worker/pom.xml b/squirrel.worker/pom.xml index c5b557f9d..85522f560 100644 --- a/squirrel.worker/pom.xml +++ b/squirrel.worker/pom.xml @@ -131,6 +131,13 @@ 1.19.1 + + org.apache.spark + spark-core_2.12 + 2.4.4 + + + @@ -177,4 +184,4 @@ - \ No newline at end of file + From 7ba6f9d38fc12289f2a56d6d96f0414d85ea6952 Mon Sep 17 00:00:00 2001 From: param-jot Date: Mon, 25 Nov 2019 00:34:00 +0100 Subject: [PATCH 024/102] Added squirrel.crawled.graph module, context file for the module and spark dependencies --- pom.xml | 1 + spring-config/crawled-graph-context.xml | 23 +++++++++++ squirrel.crawled.graph/pom.xml | 39 +++++++++++++++++++ .../src/main/java/CrawledDataRetriever.java | 2 + squirrel.worker/pom.xml | 7 ---- 5 files changed, 65 insertions(+), 7 deletions(-) create mode 100644 spring-config/crawled-graph-context.xml create mode 100644 squirrel.crawled.graph/pom.xml create mode 100644 squirrel.crawled.graph/src/main/java/CrawledDataRetriever.java diff --git a/pom.xml b/pom.xml index 057e6fae9..39c704514 100644 --- a/pom.xml +++ b/pom.xml @@ -41,6 +41,7 @@ squirrel.frontier squirrel.mockup squirrel.worker + squirrel.crawled.graph diff --git a/spring-config/crawled-graph-context.xml b/spring-config/crawled-graph-context.xml new file mode 100644 index 000000000..adf387ac0 --- /dev/null +++ b/spring-config/crawled-graph-context.xml @@ -0,0 +1,23 @@ + + + + + + + + + + + + + + + + diff 
--git a/squirrel.crawled.graph/pom.xml b/squirrel.crawled.graph/pom.xml new file mode 100644 index 000000000..b6f96ef0f --- /dev/null +++ b/squirrel.crawled.graph/pom.xml @@ -0,0 +1,39 @@ + + + + squirrel + org.dice-research + 0.4.0 + + 4.0.0 + + squirrel.crawled.graph + + + + + org.apache.spark + spark-core_2.12 + 2.4.4 + + + + + org.apache.spark + spark-graphx_2.12 + 2.4.4 + + + org.dice-research + squirrel.frontier + 0.4.0 + compile + + + + + + + diff --git a/squirrel.crawled.graph/src/main/java/CrawledDataRetriever.java b/squirrel.crawled.graph/src/main/java/CrawledDataRetriever.java new file mode 100644 index 000000000..0231896d7 --- /dev/null +++ b/squirrel.crawled.graph/src/main/java/CrawledDataRetriever.java @@ -0,0 +1,2 @@ +public class CrawledDataRetriever { +} diff --git a/squirrel.worker/pom.xml b/squirrel.worker/pom.xml index 85522f560..5e144f337 100644 --- a/squirrel.worker/pom.xml +++ b/squirrel.worker/pom.xml @@ -131,13 +131,6 @@ 1.19.1 - - org.apache.spark - spark-core_2.12 - 2.4.4 - - - From cf4f52b67c269dff21cf080ef6ebe677ca28fada Mon Sep 17 00:00:00 2001 From: param-jot Date: Mon, 25 Nov 2019 01:31:55 +0100 Subject: [PATCH 025/102] An example program to create graph using Spark GraphX --- squirrel.crawled.graph/pom.xml | 6 + .../src/main/java/CrawledDataRetriever.java | 2 - .../squirrel/CrawledDataRetriever.java | 136 ++++++++++++++++++ 3 files changed, 142 insertions(+), 2 deletions(-) delete mode 100644 squirrel.crawled.graph/src/main/java/CrawledDataRetriever.java create mode 100644 squirrel.crawled.graph/src/main/java/org/dice_research/squirrel/CrawledDataRetriever.java diff --git a/squirrel.crawled.graph/pom.xml b/squirrel.crawled.graph/pom.xml index b6f96ef0f..a13c0bdd1 100644 --- a/squirrel.crawled.graph/pom.xml +++ b/squirrel.crawled.graph/pom.xml @@ -32,6 +32,12 @@ compile + + org.apache.spark + spark-sql_2.12 + 2.4.4 + provided + diff --git a/squirrel.crawled.graph/src/main/java/CrawledDataRetriever.java b/squirrel.crawled.graph/src/main/java/CrawledDataRetriever.java deleted file mode 100644 index 0231896d7..000000000 --- a/squirrel.crawled.graph/src/main/java/CrawledDataRetriever.java +++ /dev/null @@ -1,2 +0,0 @@ -public class CrawledDataRetriever { -} diff --git a/squirrel.crawled.graph/src/main/java/org/dice_research/squirrel/CrawledDataRetriever.java b/squirrel.crawled.graph/src/main/java/org/dice_research/squirrel/CrawledDataRetriever.java new file mode 100644 index 000000000..fe991723a --- /dev/null +++ b/squirrel.crawled.graph/src/main/java/org/dice_research/squirrel/CrawledDataRetriever.java @@ -0,0 +1,136 @@ +package org.dice_research.squirrel; + +import java.io.Serializable; +import java.util.Arrays; +import java.util.List; + +import scala.Tuple2; +import scala.reflect.ClassTag; +import scala.reflect.ClassTag$; +import scala.runtime.AbstractFunction1; +import scala.runtime.AbstractFunction2; +import scala.runtime.BoxedUnit; + +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.*; +import org.apache.spark.api.java.function.Function; +import org.apache.spark.api.java.function.Function2; +import org.apache.spark.graphx.*; +import org.apache.spark.rdd.RDD; +import org.apache.spark.storage.StorageLevel; + +public class CrawledDataRetriever +{ + // sendMsg and mergeMsg supplied to aggregateMessages()need to be + // both Scala (for GraphX API) and Serializable (for Spark) + static abstract class SerializableFunction1 + extends AbstractFunction1 implements Serializable {} + + static abstract class SerializableFunction2 + extends 
AbstractFunction2 implements Serializable {} + + public static void main(String[] args) { + JavaSparkContext sc = new JavaSparkContext( + new SparkConf().setMaster("local").setAppName("EdgeCount")); + + JavaRDD> myVertices = + sc.parallelize(Arrays.asList(new Tuple2(1L, "Ann"), + new Tuple2(2L, "Bill"), + + new Tuple2(3L, "Charles"), + new Tuple2(4L, "Diane"), + new Tuple2(5L, "Went to gym this morning"))); + + JavaRDD> myEdges = sc.parallelize(Arrays.asList( + new Edge(1L, 2L, "is-friends-with"), + new Edge(2L, 3L, "is-friends-with"), + new Edge(3L, 4L, "is-friends-with"), + new Edge(4L, 5L, "Likes-status"), + new Edge(3L, 5L, "Wrote-status"))); + + Graph myGraph = Graph.apply(myVertices.rdd(), + myEdges.rdd(), "", StorageLevel.MEMORY_ONLY(), + StorageLevel.MEMORY_ONLY(), tagString, tagString); + + Graph initialGraph = myGraph.mapVertices( + new SerializableFunction2() { + public Integer apply(Object o, String s) { return 0; } + }, + tagInteger, null); + + List> ls = toJavaPairRDD( + propagateEdgeCount(initialGraph).vertices(), tagInteger).collect(); + + for (Tuple2 t : ls) + System.out.print(t + " ** "); + + System.out.println(); + + sc.stop(); + } + + // Must explicitly provide for implicit Scala parameters in various + // function calls + private static final ClassTag tagInteger = + ClassTag$.MODULE$.apply(Integer.class); + private static final ClassTag tagString = + ClassTag$.MODULE$.apply(String.class); + private static final ClassTag tagObject = + ClassTag$.MODULE$.apply(Object.class); + + // sendMsg + private static final SerializableFunction1< + EdgeContext, BoxedUnit> sendMsg = + new SerializableFunction1< + EdgeContext, BoxedUnit>() { + public BoxedUnit apply(EdgeContext ec) { + ec.sendToDst(ec.srcAttr()+1); + return BoxedUnit.UNIT; + } + }; + + // mergeMsg + private static final SerializableFunction2 + mergeMsg = new SerializableFunction2() { + public Integer apply(Integer a, Integer b) { + return Math.max(a,b); + } + }; + + + private static JavaPairRDD + toJavaPairRDD(VertexRDD v, ClassTag tagT) { + return new JavaPairRDD((RDD>)v, + tagObject, tagT); + } + + private static Graph propagateEdgeCount( + Graph g) { + VertexRDD verts = g.aggregateMessages( + sendMsg, mergeMsg, TripletFields.All, tagInteger); + Graph g2 = Graph.apply(verts, g.edges(), 0, + StorageLevel.MEMORY_ONLY(), StorageLevel.MEMORY_ONLY(), + tagInteger, tagString); + int check = toJavaPairRDD(g2.vertices(), tagInteger) + .join(toJavaPairRDD(g.vertices(), tagInteger)) + .map(new Function>, + Integer>() { + public Integer call(Tuple2> t) { + return t._2._1 - t._2._2; + } + }) + .reduce(new Function2() { + public Integer call(Integer a, Integer b) {return a+b;} + }); + if (check > 0) + return propagateEdgeCount(g2); + else + return g; + } +} + /* int check = toJavaPairRDD(g2.vertices(), tagInteger) + .join(toJavaPairRDD(g.vertices(), tagInteger)) + .map(t -> t._2._1 - t._2._2) + .reduce((a,b) -> a+b); */ + + From ed41aa0e89da4194685758c29a98ddbce4ba3ea8 Mon Sep 17 00:00:00 2001 From: param-jot Date: Wed, 27 Nov 2019 11:11:02 +0100 Subject: [PATCH 026/102] changes in FrontierImplTest.java to add in-memory SPARQL endpoint(not finished yet) --- .../frontier/impl/FrontierImplTest.java | 38 +++++++++++-------- 1 file changed, 22 insertions(+), 16 deletions(-) diff --git a/squirrel.frontier/src/test/java/org/dice_research/squirrel/frontier/impl/FrontierImplTest.java b/squirrel.frontier/src/test/java/org/dice_research/squirrel/frontier/impl/FrontierImplTest.java index 5c71af885..814be652e 100644 --- 
a/squirrel.frontier/src/test/java/org/dice_research/squirrel/frontier/impl/FrontierImplTest.java +++ b/squirrel.frontier/src/test/java/org/dice_research/squirrel/frontier/impl/FrontierImplTest.java @@ -8,18 +8,22 @@ import java.net.URI; import java.util.ArrayList; import java.util.List; + +import org.aksw.jena_sparql_api.core.QueryExecutionFactory; +import org.aksw.jena_sparql_api.core.QueryExecutionFactoryDataset; +import org.aksw.jena_sparql_api.core.UpdateExecutionFactory; +import org.aksw.jena_sparql_api.core.UpdateExecutionFactoryDataset; +import org.apache.jena.query.Dataset; +import org.apache.jena.query.DatasetFactory; +import org.apache.jena.rdf.model.ModelFactory; import org.dice_research.squirrel.Constants; import org.dice_research.squirrel.MongoDBBasedTest; - import org.dice_research.squirrel.data.uri.CrawleableUri; import org.dice_research.squirrel.data.uri.CrawleableUriFactory4Tests; import org.dice_research.squirrel.data.uri.UriType; - -import org.dice_research.squirrel.frontier.recrawling.OutDatedUriRetriever; -import org.dice_research.squirrel.frontier.recrawling.SparqlhostConnector; - import org.dice_research.squirrel.data.uri.filter.MongoDBKnowUriFilter; import org.dice_research.squirrel.data.uri.norm.NormalizerImpl; +import org.dice_research.squirrel.metadata.CrawlingActivity; import org.dice_research.squirrel.queue.ipbased.MongoDBIpBasedQueue; import org.junit.After; import org.junit.Assert; @@ -155,21 +159,21 @@ public void simlpeRecrawling() throws Exception { @Test public void RecrawlingTest() throws Exception { - // Add the URIs to the frontier - List uris = new ArrayList<>(); + Dataset dataset = DatasetFactory.create(); + dataset.setDefaultModel(ModelFactory.createDefaultModel()); + QueryExecutionFactory queryExecFactory = new QueryExecutionFactoryDataset(dataset); + UpdateExecutionFactory updateExecFactory = new UpdateExecutionFactoryDataset(dataset); + + CrawleableUri uri = new CrawleableUri(new URI("http://example.org/dataset")); + uri.addData(Constants.UUID_KEY, "123"); + CrawlingActivity activity = new CrawlingActivity(uri, "http://example.org/testRecrawling"); + uri.addData(Constants.URI_CRAWLING_ACTIVITY, activity); + /* List uris = new ArrayList<>(); CrawleableUri uri_1 = cuf.create(new URI("http://dbpedia.org/resource/uriThatShouldBeRecrawled")); uri_1.addData("endedAtTime", "2019-07-06T17:04:02.864Z"); CrawleableUri uri_2 = cuf.create(new URI("http://dbpedia.org/resource/normalUri")); uri_2.addData("endedAtTime", "2019-07-06T19:38:02.864Z"); - uris.add(uri_1); - uris.add(uri_2); - frontier.addNewUris(uris); - List nextUris = frontier.getNextUris(); - for (CrawleableUri uri : nextUris) { - Assert.assertTrue(uris.contains(uri)); - } - for (CrawleableUri uri : uris) { - Assert.assertTrue(nextUris.contains(uri)); + uris.add(uri_1);Build); } // Set the first URI as recrawlable @@ -181,6 +185,8 @@ public void RecrawlingTest() throws Exception { Assert.assertNotNull(nextUris); assertTrue("uri_1 has been expected but couldn't be found", nextUris.contains(uri_1)); Assert.assertEquals(2, nextUris.size()); + + */ } @After From 9aa4494f595aacf73282635fdb93a0734b2639bb Mon Sep 17 00:00:00 2001 From: param-jot Date: Wed, 27 Nov 2019 11:41:38 +0100 Subject: [PATCH 027/102] removing crawled.graph module --- spring-config/crawled-graph-context.xml | 23 --- squirrel.crawled.graph/pom.xml | 45 ------ .../squirrel/CrawledDataRetriever.java | 136 ------------------ 3 files changed, 204 deletions(-) delete mode 100644 spring-config/crawled-graph-context.xml delete mode 100644 
squirrel.crawled.graph/pom.xml delete mode 100644 squirrel.crawled.graph/src/main/java/org/dice_research/squirrel/CrawledDataRetriever.java diff --git a/spring-config/crawled-graph-context.xml b/spring-config/crawled-graph-context.xml deleted file mode 100644 index adf387ac0..000000000 --- a/spring-config/crawled-graph-context.xml +++ /dev/null @@ -1,23 +0,0 @@ - - - - - - - - - - - - - - - - diff --git a/squirrel.crawled.graph/pom.xml b/squirrel.crawled.graph/pom.xml deleted file mode 100644 index a13c0bdd1..000000000 --- a/squirrel.crawled.graph/pom.xml +++ /dev/null @@ -1,45 +0,0 @@ - - - - squirrel - org.dice-research - 0.4.0 - - 4.0.0 - - squirrel.crawled.graph - - - - - org.apache.spark - spark-core_2.12 - 2.4.4 - - - - - org.apache.spark - spark-graphx_2.12 - 2.4.4 - - - org.dice-research - squirrel.frontier - 0.4.0 - compile - - - - org.apache.spark - spark-sql_2.12 - 2.4.4 - provided - - - - - - diff --git a/squirrel.crawled.graph/src/main/java/org/dice_research/squirrel/CrawledDataRetriever.java b/squirrel.crawled.graph/src/main/java/org/dice_research/squirrel/CrawledDataRetriever.java deleted file mode 100644 index fe991723a..000000000 --- a/squirrel.crawled.graph/src/main/java/org/dice_research/squirrel/CrawledDataRetriever.java +++ /dev/null @@ -1,136 +0,0 @@ -package org.dice_research.squirrel; - -import java.io.Serializable; -import java.util.Arrays; -import java.util.List; - -import scala.Tuple2; -import scala.reflect.ClassTag; -import scala.reflect.ClassTag$; -import scala.runtime.AbstractFunction1; -import scala.runtime.AbstractFunction2; -import scala.runtime.BoxedUnit; - -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.*; -import org.apache.spark.api.java.function.Function; -import org.apache.spark.api.java.function.Function2; -import org.apache.spark.graphx.*; -import org.apache.spark.rdd.RDD; -import org.apache.spark.storage.StorageLevel; - -public class CrawledDataRetriever -{ - // sendMsg and mergeMsg supplied to aggregateMessages()need to be - // both Scala (for GraphX API) and Serializable (for Spark) - static abstract class SerializableFunction1 - extends AbstractFunction1 implements Serializable {} - - static abstract class SerializableFunction2 - extends AbstractFunction2 implements Serializable {} - - public static void main(String[] args) { - JavaSparkContext sc = new JavaSparkContext( - new SparkConf().setMaster("local").setAppName("EdgeCount")); - - JavaRDD> myVertices = - sc.parallelize(Arrays.asList(new Tuple2(1L, "Ann"), - new Tuple2(2L, "Bill"), - - new Tuple2(3L, "Charles"), - new Tuple2(4L, "Diane"), - new Tuple2(5L, "Went to gym this morning"))); - - JavaRDD> myEdges = sc.parallelize(Arrays.asList( - new Edge(1L, 2L, "is-friends-with"), - new Edge(2L, 3L, "is-friends-with"), - new Edge(3L, 4L, "is-friends-with"), - new Edge(4L, 5L, "Likes-status"), - new Edge(3L, 5L, "Wrote-status"))); - - Graph myGraph = Graph.apply(myVertices.rdd(), - myEdges.rdd(), "", StorageLevel.MEMORY_ONLY(), - StorageLevel.MEMORY_ONLY(), tagString, tagString); - - Graph initialGraph = myGraph.mapVertices( - new SerializableFunction2() { - public Integer apply(Object o, String s) { return 0; } - }, - tagInteger, null); - - List> ls = toJavaPairRDD( - propagateEdgeCount(initialGraph).vertices(), tagInteger).collect(); - - for (Tuple2 t : ls) - System.out.print(t + " ** "); - - System.out.println(); - - sc.stop(); - } - - // Must explicitly provide for implicit Scala parameters in various - // function calls - private static final ClassTag tagInteger = - 
ClassTag$.MODULE$.apply(Integer.class); - private static final ClassTag tagString = - ClassTag$.MODULE$.apply(String.class); - private static final ClassTag tagObject = - ClassTag$.MODULE$.apply(Object.class); - - // sendMsg - private static final SerializableFunction1< - EdgeContext, BoxedUnit> sendMsg = - new SerializableFunction1< - EdgeContext, BoxedUnit>() { - public BoxedUnit apply(EdgeContext ec) { - ec.sendToDst(ec.srcAttr()+1); - return BoxedUnit.UNIT; - } - }; - - // mergeMsg - private static final SerializableFunction2 - mergeMsg = new SerializableFunction2() { - public Integer apply(Integer a, Integer b) { - return Math.max(a,b); - } - }; - - - private static JavaPairRDD - toJavaPairRDD(VertexRDD v, ClassTag tagT) { - return new JavaPairRDD((RDD>)v, - tagObject, tagT); - } - - private static Graph propagateEdgeCount( - Graph g) { - VertexRDD verts = g.aggregateMessages( - sendMsg, mergeMsg, TripletFields.All, tagInteger); - Graph g2 = Graph.apply(verts, g.edges(), 0, - StorageLevel.MEMORY_ONLY(), StorageLevel.MEMORY_ONLY(), - tagInteger, tagString); - int check = toJavaPairRDD(g2.vertices(), tagInteger) - .join(toJavaPairRDD(g.vertices(), tagInteger)) - .map(new Function>, - Integer>() { - public Integer call(Tuple2> t) { - return t._2._1 - t._2._2; - } - }) - .reduce(new Function2() { - public Integer call(Integer a, Integer b) {return a+b;} - }); - if (check > 0) - return propagateEdgeCount(g2); - else - return g; - } -} - /* int check = toJavaPairRDD(g2.vertices(), tagInteger) - .join(toJavaPairRDD(g.vertices(), tagInteger)) - .map(t -> t._2._1 - t._2._2) - .reduce((a,b) -> a+b); */ - - From 48a3187c50956312c6f768e3792bf432cece0de4 Mon Sep 17 00:00:00 2001 From: param-jot Date: Sun, 15 Dec 2019 21:34:14 +0100 Subject: [PATCH 028/102] Modify outDatedUri query and implement junit test --- .../data/uri/DefaultCrawleableUriFactory.java | 80 ++++---- .../uri/CrawleableUriSerializationTest.java | 188 +++++++++--------- .../squirrel/frontier/impl/FrontierImpl.java | 7 +- .../recrawling/FrontierQueryGenerator.java | 5 +- .../recrawling/OutDatedUriRetriever.java | 5 +- .../frontier/impl/FrontierImplTest.java | 56 +----- .../frontier/impl/RecrawlingTest.java | 55 +++++ 7 files changed, 209 insertions(+), 187 deletions(-) create mode 100644 squirrel.frontier/src/test/java/org/dice_research/squirrel/frontier/impl/RecrawlingTest.java diff --git a/squirrel.api/src/main/java/org/dice_research/squirrel/data/uri/DefaultCrawleableUriFactory.java b/squirrel.api/src/main/java/org/dice_research/squirrel/data/uri/DefaultCrawleableUriFactory.java index dbd9a39a3..7fed14ebc 100644 --- a/squirrel.api/src/main/java/org/dice_research/squirrel/data/uri/DefaultCrawleableUriFactory.java +++ b/squirrel.api/src/main/java/org/dice_research/squirrel/data/uri/DefaultCrawleableUriFactory.java @@ -1,40 +1,40 @@ -package org.dice_research.squirrel.data.uri; - -import java.net.InetAddress; -import java.net.URI; -import java.net.UnknownHostException; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -public class DefaultCrawleableUriFactory implements CrawleableUriFactory { - - private static final Logger LOGGER = LoggerFactory.getLogger(DefaultCrawleableUriFactory.class); - - @Override - public CrawleableUri create(String uri) { - try { - return create(new URI(uri)); - } catch (Exception e) { - LOGGER.info("The given URI \"" + uri + "\" couldn't be parsed. 
Returning null.", e); - return null; - } - } - - @Override - public CrawleableUri create(URI uri) { - return create(uri, UriType.UNKNOWN); - } - - @Override - public CrawleableUri create(URI uri, UriType type) { - try { - InetAddress ip = InetAddress.getByName(uri.getHost()); - return new CrawleableUri(uri, ip); - } catch (UnknownHostException e) { - LOGGER.info("Couldn't get the IP address for \"" + uri + "\". Returning null.", e); - } - return null; - } - -} +package org.dice_research.squirrel.data.uri; + +import java.net.InetAddress; +import java.net.URI; +import java.net.UnknownHostException; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class DefaultCrawleableUriFactory implements CrawleableUriFactory { + + private static final Logger LOGGER = LoggerFactory.getLogger(DefaultCrawleableUriFactory.class); + + @Override + public CrawleableUri create(String uri) { + try { + return create(new URI(uri)); + } catch (Exception e) { + LOGGER.info("The given URI \"" + uri + "\" couldn't be parsed. Returning null.", e); + return null; + } + } + + @Override + public CrawleableUri create(URI uri) { + return create(uri, UriType.UNKNOWN); + } + + @Override + public CrawleableUri create(URI uri, UriType type) { + try { + InetAddress ip = InetAddress.getByName(uri.getHost()); + return new CrawleableUri(uri, ip); + } catch (UnknownHostException e) { + LOGGER.info("Couldn't get the IP address for \"" + uri + "\". Returning null.", e); + } + return null; + } + +} diff --git a/squirrel.api/src/test/java/org/aksw/simba/squirrel/data/uri/CrawleableUriSerializationTest.java b/squirrel.api/src/test/java/org/aksw/simba/squirrel/data/uri/CrawleableUriSerializationTest.java index 9474e4e5b..fc0f8437d 100644 --- a/squirrel.api/src/test/java/org/aksw/simba/squirrel/data/uri/CrawleableUriSerializationTest.java +++ b/squirrel.api/src/test/java/org/aksw/simba/squirrel/data/uri/CrawleableUriSerializationTest.java @@ -1,94 +1,94 @@ -package org.aksw.simba.squirrel.data.uri; - -import java.io.ByteArrayInputStream; -import java.io.ByteArrayOutputStream; -import java.io.IOException; -import java.io.ObjectInputStream; -import java.io.ObjectOutputStream; -import java.io.Serializable; -import java.net.InetAddress; -import java.net.URI; -import java.net.URISyntaxException; -import java.util.Arrays; -import java.util.Base64; -import java.util.Collection; -import java.util.zip.GZIPInputStream; -import java.util.zip.GZIPOutputStream; - -import org.dice_research.squirrel.Constants; -import org.dice_research.squirrel.data.uri.CrawleableUri; -import org.dice_research.squirrel.data.uri.UriType; -import org.dice_research.squirrel.data.uri.serialize.Serializer; -import org.dice_research.squirrel.data.uri.serialize.gson.GsonUriSerializer; -import org.dice_research.squirrel.data.uri.serialize.java.GzipJavaUriSerializer; -import org.dice_research.squirrel.data.uri.serialize.java.SnappyJavaUriSerializer; -import org.junit.Assert; -import org.junit.Test; -import org.junit.runner.RunWith; -import org.junit.runners.Parameterized; -import org.junit.runners.Parameterized.Parameters; -import org.xerial.snappy.Snappy; - -import com.google.gson.Gson; - -@RunWith(Parameterized.class) -public class CrawleableUriSerializationTest { - - @Parameters - public static Collection data() throws Exception { - CrawleableUri temp = new CrawleableUri(new URI("http://localhost/test")); - temp.addData(Constants.URI_TYPE_KEY, Constants.URI_TYPE_VALUE_DEREF); - temp.addData(Constants.URI_HTTP_MIME_TYPE_KEY, "application/json-ld"); - 
temp.addData(Constants.URI_HTTP_CHARSET_KEY, "utf-8"); - temp.addData(Constants.URI_PREFERRED_RECRAWL_ON, System.currentTimeMillis() + 100000L); - - return Arrays.asList(new Object[][] { { new CrawleableUri(new URI("http://localhost/test")) }, - { new CrawleableUri(new URI("http://google.de")) }, - { new CrawleableUri(new URI("http://google.de"), InetAddress.getByName("192.168.100.1"), - UriType.DEREFERENCEABLE) }, - { new CrawleableUri(new URI("http://google.de"), InetAddress.getByName("192.168.100.1"), - UriType.DUMP) }, - { new CrawleableUri(new URI("http://google.de"), InetAddress.getByName("192.168.100.1"), - UriType.SPARQL) }, - { new CrawleableUri(new URI("http://google.de"), InetAddress.getByName("192.168.100.1"), - UriType.UNKNOWN) }, - { new CrawleableUri(new URI("http://dbpedia.org"), null, UriType.SPARQL) }, - { new CrawleableUri(new URI("http://google.de"), InetAddress.getByName("255.255.255.255")) }, - { temp } }); - } - - private CrawleableUri uri; - - public CrawleableUriSerializationTest(CrawleableUri uri) { - this.uri = uri; - } - - @Test - public void testGSON() throws URISyntaxException, ClassNotFoundException, IOException { - executeTest(new GsonUriSerializer(), "Gson"); - } - - @Test - public void testJavaWithSnappy() throws URISyntaxException, ClassNotFoundException, IOException { - executeTest(new SnappyJavaUriSerializer(), "Snappy"); - } - - @Test - public void testJavaWithGzip() throws URISyntaxException, ClassNotFoundException, IOException { - executeTest(new GzipJavaUriSerializer(), "Gzip"); - } - - public void executeTest(Serializer serializer, String name) throws IOException { - CrawleableUri parsedUri; - byte[] data = serializer.serialize(uri); - System.out.println(String.format("%6s: data.length=%d", name, data.length)); - parsedUri = serializer.deserialize(data); - Assert.assertEquals(uri.getIpAddress(), parsedUri.getIpAddress()); - Assert.assertEquals(uri.getType(), parsedUri.getType()); - Assert.assertEquals(uri.getUri(), parsedUri.getUri()); - for (String key : uri.getData().keySet()) { - Assert.assertEquals(uri.getData(key), parsedUri.getData(key)); - } - Assert.assertEquals(uri.getData().size(), parsedUri.getData().size()); - } -} +package org.aksw.simba.squirrel.data.uri; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.ObjectInputStream; +import java.io.ObjectOutputStream; +import java.io.Serializable; +import java.net.InetAddress; +import java.net.URI; +import java.net.URISyntaxException; +import java.util.Arrays; +import java.util.Base64; +import java.util.Collection; +import java.util.zip.GZIPInputStream; +import java.util.zip.GZIPOutputStream; + +import org.dice_research.squirrel.Constants; +import org.dice_research.squirrel.data.uri.CrawleableUri; +import org.dice_research.squirrel.data.uri.UriType; +import org.dice_research.squirrel.data.uri.serialize.Serializer; +import org.dice_research.squirrel.data.uri.serialize.gson.GsonUriSerializer; +import org.dice_research.squirrel.data.uri.serialize.java.GzipJavaUriSerializer; +import org.dice_research.squirrel.data.uri.serialize.java.SnappyJavaUriSerializer; +import org.junit.Assert; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; +import org.junit.runners.Parameterized.Parameters; +import org.xerial.snappy.Snappy; + +import com.google.gson.Gson; + +@RunWith(Parameterized.class) +public class CrawleableUriSerializationTest { + + @Parameters + public static Collection data() 
throws Exception { + CrawleableUri temp = new CrawleableUri(new URI("http://localhost/test")); + temp.addData(Constants.URI_TYPE_KEY, Constants.URI_TYPE_VALUE_DEREF); + temp.addData(Constants.URI_HTTP_MIME_TYPE_KEY, "application/json-ld"); + temp.addData(Constants.URI_HTTP_CHARSET_KEY, "utf-8"); + temp.addData(Constants.URI_PREFERRED_RECRAWL_ON, System.currentTimeMillis() + 100000L); + + return Arrays.asList(new Object[][] { { new CrawleableUri(new URI("http://localhost/test")) }, + { new CrawleableUri(new URI("http://google.de")) }, + { new CrawleableUri(new URI("http://google.de"), InetAddress.getByName("192.168.100.1"), + UriType.DEREFERENCEABLE) }, + { new CrawleableUri(new URI("http://google.de"), InetAddress.getByName("192.168.100.1"), + UriType.DUMP) }, + { new CrawleableUri(new URI("http://google.de"), InetAddress.getByName("192.168.100.1"), + UriType.SPARQL) }, + { new CrawleableUri(new URI("http://google.de"), InetAddress.getByName("192.168.100.1"), + UriType.UNKNOWN) }, + { new CrawleableUri(new URI("http://dbpedia.org"), null, UriType.SPARQL) }, + { new CrawleableUri(new URI("http://google.de"), InetAddress.getByName("255.255.255.255")) }, + { temp } }); + } + + private CrawleableUri uri; + + public CrawleableUriSerializationTest(CrawleableUri uri) { + this.uri = uri; + } + + @Test + public void testGSON() throws URISyntaxException, ClassNotFoundException, IOException { + executeTest(new GsonUriSerializer(), "Gson"); + } + + @Test + public void testJavaWithSnappy() throws URISyntaxException, ClassNotFoundException, IOException { + executeTest(new SnappyJavaUriSerializer(), "Snappy"); + } + + @Test + public void testJavaWithGzip() throws URISyntaxException, ClassNotFoundException, IOException { + executeTest(new GzipJavaUriSerializer(), "Gzip"); + } + + public void executeTest(Serializer serializer, String name) throws IOException { + CrawleableUri parsedUri; + byte[] data = serializer.serialize(uri); + System.out.println(String.format("%6s: data.length=%d", name, data.length)); + parsedUri = serializer.deserialize(data); + Assert.assertEquals(uri.getIpAddress(), parsedUri.getIpAddress()); + Assert.assertEquals(uri.getType(), parsedUri.getType()); + Assert.assertEquals(uri.getUri(), parsedUri.getUri()); + for (String key : uri.getData().keySet()) { + Assert.assertEquals(uri.getData(key), parsedUri.getData(key)); + } + Assert.assertEquals(uri.getData().size(), parsedUri.getData().size()); + } +} diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/FrontierImpl.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/FrontierImpl.java index ce4578d71..0e1498f85 100644 --- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/FrontierImpl.java +++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/FrontierImpl.java @@ -10,6 +10,7 @@ import org.dice_research.squirrel.deduplication.hashing.UriHashCustodian; import org.dice_research.squirrel.frontier.Frontier; import org.dice_research.squirrel.frontier.recrawling.OutDatedUriRetriever; +import org.dice_research.squirrel.frontier.recrawling.SparqlhostConnector; import org.dice_research.squirrel.graph.GraphLogger; import org.dice_research.squirrel.queue.BlockingQueue; import org.dice_research.squirrel.queue.UriQueue; @@ -17,11 +18,14 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.net.URI; import java.net.UnknownHostException; import java.util.List; import java.util.Timer; import java.util.TimerTask; +import 
static com.sun.xml.internal.ws.policy.sourcemodel.wspolicy.XmlToken.Uri; + /** * Standard implementation of the {@link Frontier} interface containing a * {@link #queue} and a {@link #knownUriFilter}. @@ -57,6 +61,7 @@ public class FrontierImpl implements Frontier { * {@link OutDatedUriRetriever} used to collect all the outdated URIs (URIs crawled a week ago) to recrawl. */ protected OutDatedUriRetriever outDatedUriRetriever; + protected SparqlhostConnector sparqlhostConnector; /** * {@link org.dice_research.squirrel.data.uri.info.URIReferences} used to * identify URIs that already have been crawled. @@ -227,7 +232,7 @@ public FrontierImpl(UriNormalizer normalizer, timerRecrawling.schedule(new TimerTask() { @Override public void run() { - List urisToRecrawl = outDatedUriRetriever.getUriToRecrawl(); + List urisToRecrawl = sparqlhostConnector.getUriToRecrawl(); LOGGER.info("URI to recrawl" + urisToRecrawl); urisToRecrawl.forEach(uri -> queue.addUri(uriProcessor.recognizeUriType(uri))); } diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/FrontierQueryGenerator.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/FrontierQueryGenerator.java index 0247e94c3..4f4be1349 100644 --- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/FrontierQueryGenerator.java +++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/FrontierQueryGenerator.java @@ -8,6 +8,8 @@ public class FrontierQueryGenerator { private static final Logger LOGGER = LoggerFactory.getLogger(FrontierQueryGenerator.class); + public static final long DEFAULT_GENERAL_RECRAWL_TIME = 1000 * 60 * 60 * 24 * 7; + private FrontierQueryGenerator() { } @@ -41,7 +43,8 @@ public static Query getOutdatedUrisQuery() { " } \n" + "}\n" + "}\n" + - "FILTER(?diff > \"60\"^^xsd:double)}\n" + + "FILTER(?diff >"+DEFAULT_GENERAL_RECRAWL_TIME + + ")}\n" + ""); Query query = QueryFactory.create(stringBuilder.toString()); diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/OutDatedUriRetriever.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/OutDatedUriRetriever.java index acfa54435..b5a392944 100644 --- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/OutDatedUriRetriever.java +++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/OutDatedUriRetriever.java @@ -2,8 +2,10 @@ import java.io.Closeable; import java.io.IOException; +import java.net.URI; import java.util.List; +import com.sun.jndi.toolkit.url.Uri; import org.dice_research.squirrel.data.uri.CrawleableUri; public interface OutDatedUriRetriever extends Closeable { @@ -14,6 +16,5 @@ public interface OutDatedUriRetriever extends Closeable { * * @return The outdated {@link CrawleableUri}s. 
*/ - public List getUriToRecrawl(); - + List getUriToRecrawl(); } diff --git a/squirrel.frontier/src/test/java/org/dice_research/squirrel/frontier/impl/FrontierImplTest.java b/squirrel.frontier/src/test/java/org/dice_research/squirrel/frontier/impl/FrontierImplTest.java index 814be652e..85761f036 100644 --- a/squirrel.frontier/src/test/java/org/dice_research/squirrel/frontier/impl/FrontierImplTest.java +++ b/squirrel.frontier/src/test/java/org/dice_research/squirrel/frontier/impl/FrontierImplTest.java @@ -1,21 +1,5 @@ package org.dice_research.squirrel.frontier.impl; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertFalse; -import static org.junit.Assert.assertTrue; - -import java.net.InetAddress; -import java.net.URI; -import java.util.ArrayList; -import java.util.List; - -import org.aksw.jena_sparql_api.core.QueryExecutionFactory; -import org.aksw.jena_sparql_api.core.QueryExecutionFactoryDataset; -import org.aksw.jena_sparql_api.core.UpdateExecutionFactory; -import org.aksw.jena_sparql_api.core.UpdateExecutionFactoryDataset; -import org.apache.jena.query.Dataset; -import org.apache.jena.query.DatasetFactory; -import org.apache.jena.rdf.model.ModelFactory; import org.dice_research.squirrel.Constants; import org.dice_research.squirrel.MongoDBBasedTest; import org.dice_research.squirrel.data.uri.CrawleableUri; @@ -23,13 +7,19 @@ import org.dice_research.squirrel.data.uri.UriType; import org.dice_research.squirrel.data.uri.filter.MongoDBKnowUriFilter; import org.dice_research.squirrel.data.uri.norm.NormalizerImpl; -import org.dice_research.squirrel.metadata.CrawlingActivity; import org.dice_research.squirrel.queue.ipbased.MongoDBIpBasedQueue; import org.junit.After; import org.junit.Assert; import org.junit.Before; import org.junit.Test; +import java.net.InetAddress; +import java.net.URI; +import java.util.ArrayList; +import java.util.List; + +import static org.junit.Assert.*; + @SuppressWarnings("deprecation") public class FrontierImplTest { @@ -157,38 +147,6 @@ public void simlpeRecrawling() throws Exception { assertFalse("uri_2 has been found but was not expected", nextUris.contains(uri_2)); } - @Test - public void RecrawlingTest() throws Exception { - Dataset dataset = DatasetFactory.create(); - dataset.setDefaultModel(ModelFactory.createDefaultModel()); - QueryExecutionFactory queryExecFactory = new QueryExecutionFactoryDataset(dataset); - UpdateExecutionFactory updateExecFactory = new UpdateExecutionFactoryDataset(dataset); - - CrawleableUri uri = new CrawleableUri(new URI("http://example.org/dataset")); - uri.addData(Constants.UUID_KEY, "123"); - CrawlingActivity activity = new CrawlingActivity(uri, "http://example.org/testRecrawling"); - uri.addData(Constants.URI_CRAWLING_ACTIVITY, activity); - /* List uris = new ArrayList<>(); - CrawleableUri uri_1 = cuf.create(new URI("http://dbpedia.org/resource/uriThatShouldBeRecrawled")); - uri_1.addData("endedAtTime", "2019-07-06T17:04:02.864Z"); - CrawleableUri uri_2 = cuf.create(new URI("http://dbpedia.org/resource/normalUri")); - uri_2.addData("endedAtTime", "2019-07-06T19:38:02.864Z"); - uris.add(uri_1);Build); - } - - // Set the first URI as recrawlable - for (CrawleableUri uri : nextUris) { - if (uri.getUri().equals(uri_1.getUri())) { - uri.addData(Constants.URI_PREFERRED_RECRAWL_ON, System.currentTimeMillis() - 1); - } - } - Assert.assertNotNull(nextUris); - assertTrue("uri_1 has been expected but couldn't be found", nextUris.contains(uri_1)); - Assert.assertEquals(2, nextUris.size()); - - */ - } - 
@After public void tearDown() throws Exception { filter.purge(); diff --git a/squirrel.frontier/src/test/java/org/dice_research/squirrel/frontier/impl/RecrawlingTest.java b/squirrel.frontier/src/test/java/org/dice_research/squirrel/frontier/impl/RecrawlingTest.java new file mode 100644 index 000000000..532a80adb --- /dev/null +++ b/squirrel.frontier/src/test/java/org/dice_research/squirrel/frontier/impl/RecrawlingTest.java @@ -0,0 +1,55 @@ +package org.dice_research.squirrel.frontier.impl; + +import org.aksw.jena_sparql_api.core.QueryExecutionFactory; +import org.aksw.jena_sparql_api.core.QueryExecutionFactoryDataset; +import org.aksw.jena_sparql_api.core.UpdateExecutionFactory; +import org.aksw.jena_sparql_api.core.UpdateExecutionFactoryDataset; +import org.apache.jena.query.*; +import org.apache.jena.rdf.model.ModelFactory; +import org.apache.jena.rdf.model.RDFNode; +import org.dice_research.squirrel.data.uri.CrawleableUri; +import org.dice_research.squirrel.frontier.recrawling.FrontierQueryGenerator; +import org.dice_research.squirrel.frontier.recrawling.SparqlhostConnector; +import org.dice_research.squirrel.vocab.PROV_O; +import org.junit.Test; + +import java.net.URI; +import java.net.URISyntaxException; +import java.util.ArrayList; +import java.util.List; + +public class RecrawlingTest { + + @Test + public void RecrawlingTest() throws Exception { + List urisToRecrawl = new ArrayList<>(); + Dataset dataset = DatasetFactory.create(); + dataset.setDefaultModel(ModelFactory.createDefaultModel()); + QueryExecutionFactory queryExecFactory = new QueryExecutionFactoryDataset(dataset); + UpdateExecutionFactory updateExecFactory = new UpdateExecutionFactoryDataset(dataset); + + CrawleableUri uri = new CrawleableUri(new URI("http://example.org/dataset")); + uri.addData(String.valueOf(PROV_O.endedAtTime), "2019-12-15T21:40:11.173Z"); + CrawleableUri uri1 = new CrawleableUri(new URI("http://example.org/resource")); + uri1.addData(String.valueOf(PROV_O.endedAtTime), "2019-11-17T21:40:11.173Z"); + + try (SparqlhostConnector sink = new SparqlhostConnector(queryExecFactory, updateExecFactory)) { + Query getOutdatedUrisQuery = FrontierQueryGenerator.getOutdatedUrisQuery(); + QueryExecution qe = queryExecFactory.createQueryExecution(getOutdatedUrisQuery); + ResultSet rs = qe.execSelect(); + while (rs.hasNext()) { + QuerySolution sol = rs.nextSolution(); + RDFNode outdatedUri = sol.get("uri"); + try { + urisToRecrawl.add(new CrawleableUri(new URI((outdatedUri.toString())))); + } catch (URISyntaxException e) { + e.printStackTrace(); + } + } + qe.close(); + + + } + } + +} From 81f408654cfd83bf0491e62c84c40d3ebd115066 Mon Sep 17 00:00:00 2001 From: param-jot Date: Sun, 5 Jan 2020 16:54:37 +0100 Subject: [PATCH 029/102] Changed recrawling test and corresponding changes in CrawleableUri.java --- .../squirrel/data/uri/CrawleableUri.java | 20 +++++++++ .../recrawling/FrontierQueryGenerator.java | 7 +--- .../frontier/impl/FrontierImplTest.java | 9 +++- .../frontier/impl/RecrawlingTest.java | 41 +++++-------------- 4 files changed, 40 insertions(+), 37 deletions(-) diff --git a/squirrel.api/src/main/java/org/dice_research/squirrel/data/uri/CrawleableUri.java b/squirrel.api/src/main/java/org/dice_research/squirrel/data/uri/CrawleableUri.java index 87f1f0ebf..e3c4c9e2b 100644 --- a/squirrel.api/src/main/java/org/dice_research/squirrel/data/uri/CrawleableUri.java +++ b/squirrel.api/src/main/java/org/dice_research/squirrel/data/uri/CrawleableUri.java @@ -1,5 +1,6 @@ package org.dice_research.squirrel.data.uri; 
+import org.apache.jena.rdf.model.Property; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -101,6 +102,7 @@ public static CrawleableUri fromByteBuffer(ByteBuffer buffer) { private UriType type = UriType.UNKNOWN; private Map data = new TreeMap<>(); + private Map prop = new TreeMap<>(); private long timestampNextCrawl; public CrawleableUri(URI uri) { @@ -144,6 +146,9 @@ public URI getUri() { public void addData(String key, Object data) { this.data.put(key, data); } + public void addProperty(Property key, Object prop) { + this.prop.put(key, prop); + } public Object getData(String key) { if (data.containsKey(key)) { @@ -152,14 +157,27 @@ public Object getData(String key) { return null; } } + public Object getProperty(Property key) { + if (prop.containsKey(key)) { + return prop.get(key); + } else { + return null; + } + } public Map getData() { return data; } + public Map getProperty() { + return prop; + } public void setData(Map data) { this.data = data; } + public void setProperty(Map prop) { + this.prop = prop; + } @Override public int hashCode() { @@ -245,4 +263,6 @@ public long getTimestampNextCrawl() { public void setTimestampNextCrawl(long timestampNextCrawl) { this.timestampNextCrawl = timestampNextCrawl; } + + } diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/FrontierQueryGenerator.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/FrontierQueryGenerator.java index 4f4be1349..e9c41850f 100644 --- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/FrontierQueryGenerator.java +++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/FrontierQueryGenerator.java @@ -7,12 +7,7 @@ public class FrontierQueryGenerator { - private static final Logger LOGGER = LoggerFactory.getLogger(FrontierQueryGenerator.class); - public static final long DEFAULT_GENERAL_RECRAWL_TIME = 1000 * 60 * 60 * 24 * 7; - - - private FrontierQueryGenerator() { - } + public static final long DEFAULT_GENERAL_RECRAWL_TIME = 1000 * 60 * 60 * 24 * 7; /** * Return outdated uris by comparing their endtime stamps. 
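A note on the threshold kept above: DEFAULT_GENERAL_RECRAWL_TIME is computed with plain int arithmetic, which happens to be safe for one week in milliseconds (604,800,000) but would overflow silently for longer windows; the 30-day constant in FrontierComponent (1000L * 60L * 60L * 24L * 30) sidesteps this with long literals. A minimal sketch of the overflow-proof form, assuming the constant keeps its millisecond semantics — this variant is illustrative and not part of any commit:

    // Long literals keep the whole multiplication in 64-bit arithmetic.
    public static final long DEFAULT_GENERAL_RECRAWL_TIME = 1000L * 60L * 60L * 24L * 7L; // one week in ms

    // For comparison: 1000 * 60 * 60 * 24 * 30 is evaluated as int and wraps
    // around (2,592,000,000 > Integer.MAX_VALUE) before being widened to long.
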
diff --git a/squirrel.frontier/src/test/java/org/dice_research/squirrel/frontier/impl/FrontierImplTest.java b/squirrel.frontier/src/test/java/org/dice_research/squirrel/frontier/impl/FrontierImplTest.java index 85761f036..989a8d3f8 100644 --- a/squirrel.frontier/src/test/java/org/dice_research/squirrel/frontier/impl/FrontierImplTest.java +++ b/squirrel.frontier/src/test/java/org/dice_research/squirrel/frontier/impl/FrontierImplTest.java @@ -1,5 +1,10 @@ package org.dice_research.squirrel.frontier.impl; +import org.aksw.jena_sparql_api.core.QueryExecutionFactory; +import org.aksw.jena_sparql_api.core.QueryExecutionFactoryDataset; +import org.apache.jena.query.*; +import org.apache.jena.rdf.model.ModelFactory; +import org.apache.jena.rdf.model.RDFNode; import org.dice_research.squirrel.Constants; import org.dice_research.squirrel.MongoDBBasedTest; import org.dice_research.squirrel.data.uri.CrawleableUri; @@ -7,7 +12,9 @@ import org.dice_research.squirrel.data.uri.UriType; import org.dice_research.squirrel.data.uri.filter.MongoDBKnowUriFilter; import org.dice_research.squirrel.data.uri.norm.NormalizerImpl; +import org.dice_research.squirrel.frontier.recrawling.FrontierQueryGenerator; import org.dice_research.squirrel.queue.ipbased.MongoDBIpBasedQueue; +import org.dice_research.squirrel.vocab.PROV_O; import org.junit.After; import org.junit.Assert; import org.junit.Before; @@ -51,7 +58,7 @@ public void getNextUris() throws Exception { queue.addUri(uris.get(1)); - // queue.addCrawleableUri(uris.get(1)); + // queue.addCrawleableUri(uris.get(1)); List nextUris = frontier.getNextUris(); List assertion = new ArrayList(); assertion.add(uris.get(1)); diff --git a/squirrel.frontier/src/test/java/org/dice_research/squirrel/frontier/impl/RecrawlingTest.java b/squirrel.frontier/src/test/java/org/dice_research/squirrel/frontier/impl/RecrawlingTest.java index 532a80adb..67bce42ae 100644 --- a/squirrel.frontier/src/test/java/org/dice_research/squirrel/frontier/impl/RecrawlingTest.java +++ b/squirrel.frontier/src/test/java/org/dice_research/squirrel/frontier/impl/RecrawlingTest.java @@ -2,54 +2,35 @@ import org.aksw.jena_sparql_api.core.QueryExecutionFactory; import org.aksw.jena_sparql_api.core.QueryExecutionFactoryDataset; -import org.aksw.jena_sparql_api.core.UpdateExecutionFactory; -import org.aksw.jena_sparql_api.core.UpdateExecutionFactoryDataset; import org.apache.jena.query.*; import org.apache.jena.rdf.model.ModelFactory; import org.apache.jena.rdf.model.RDFNode; import org.dice_research.squirrel.data.uri.CrawleableUri; import org.dice_research.squirrel.frontier.recrawling.FrontierQueryGenerator; -import org.dice_research.squirrel.frontier.recrawling.SparqlhostConnector; import org.dice_research.squirrel.vocab.PROV_O; import org.junit.Test; import java.net.URI; -import java.net.URISyntaxException; -import java.util.ArrayList; -import java.util.List; -public class RecrawlingTest { +import static org.junit.Assert.assertEquals; +public class RecrawlingTest { @Test public void RecrawlingTest() throws Exception { - List urisToRecrawl = new ArrayList<>(); Dataset dataset = DatasetFactory.create(); dataset.setDefaultModel(ModelFactory.createDefaultModel()); QueryExecutionFactory queryExecFactory = new QueryExecutionFactoryDataset(dataset); - UpdateExecutionFactory updateExecFactory = new UpdateExecutionFactoryDataset(dataset); - CrawleableUri uri = new CrawleableUri(new URI("http://example.org/dataset")); - uri.addData(String.valueOf(PROV_O.endedAtTime), "2019-12-15T21:40:11.173Z"); + 
uri.addProperty((PROV_O.endedAtTime), "2020-01-05T21:40:11.173Z"); CrawleableUri uri1 = new CrawleableUri(new URI("http://example.org/resource")); - uri1.addData(String.valueOf(PROV_O.endedAtTime), "2019-11-17T21:40:11.173Z"); - - try (SparqlhostConnector sink = new SparqlhostConnector(queryExecFactory, updateExecFactory)) { - Query getOutdatedUrisQuery = FrontierQueryGenerator.getOutdatedUrisQuery(); - QueryExecution qe = queryExecFactory.createQueryExecution(getOutdatedUrisQuery); - ResultSet rs = qe.execSelect(); - while (rs.hasNext()) { - QuerySolution sol = rs.nextSolution(); - RDFNode outdatedUri = sol.get("uri"); - try { - urisToRecrawl.add(new CrawleableUri(new URI((outdatedUri.toString())))); - } catch (URISyntaxException e) { - e.printStackTrace(); - } - } - qe.close(); - - + uri1.addProperty((PROV_O.endedAtTime), "2019-12-25T21:40:11.173Z"); + Query getOutdatedUrisQuery = FrontierQueryGenerator.getOutdatedUrisQuery(); + QueryExecution qe = queryExecFactory.createQueryExecution(getOutdatedUrisQuery); + ResultSet rs = qe.execSelect(); + while (rs.hasNext()) { + QuerySolution sol = rs.nextSolution(); + RDFNode outdatedUri = sol.get("uri"); } + assertEquals(1,uri1); } - } From 4de8040b4cbe907ce16238ed592f3289a8e127a7 Mon Sep 17 00:00:00 2001 From: param-jot Date: Mon, 6 Jan 2020 15:43:09 +0100 Subject: [PATCH 030/102] minor changes --- .../squirrel/frontier/impl/FrontierImpl.java | 3 --- .../recrawling/FrontierQueryGenerator.java | 10 ++------- .../recrawling/OutDatedUriRetriever.java | 3 +-- .../recrawling/SparqlhostConnector.java | 22 +++++++++---------- .../frontier/impl/RecrawlingTest.java | 17 ++++++++------ 5 files changed, 23 insertions(+), 32 deletions(-) diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/FrontierImpl.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/FrontierImpl.java index 0e1498f85..d38cd3935 100644 --- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/FrontierImpl.java +++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/FrontierImpl.java @@ -18,14 +18,11 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.net.URI; import java.net.UnknownHostException; import java.util.List; import java.util.Timer; import java.util.TimerTask; -import static com.sun.xml.internal.ws.policy.sourcemodel.wspolicy.XmlToken.Uri; - /** * Standard implementation of the {@link Frontier} interface containing a * {@link #queue} and a {@link #knownUriFilter}. diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/FrontierQueryGenerator.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/FrontierQueryGenerator.java index e9c41850f..94618a71e 100644 --- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/FrontierQueryGenerator.java +++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/FrontierQueryGenerator.java @@ -2,23 +2,18 @@ import org.apache.jena.query.Query; import org.apache.jena.query.QueryFactory; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; public class FrontierQueryGenerator { - public static final long DEFAULT_GENERAL_RECRAWL_TIME = 1000 * 60 * 60 * 24 * 7; + public static final long DEFAULT_GENERAL_RECRAWL_TIME = 1000 * 60 * 60 * 24 * 7; /** * Return outdated uris by comparing their endtime stamps. * @return All triples with time stamp in the default graph. 
*/ - - public static Query getOutdatedUrisQuery() { - StringBuilder stringBuilder = new StringBuilder(); - stringBuilder.append("PREFIX sq: \n" + + Query query = QueryFactory.create("PREFIX sq: \n" + "PREFIX prov: \n" + "PREFIX xsd: " + "SELECT ?uri WHERE { \n "+ @@ -42,7 +37,6 @@ public static Query getOutdatedUrisQuery() { ")}\n" + ""); - Query query = QueryFactory.create(stringBuilder.toString()); return query; } diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/OutDatedUriRetriever.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/OutDatedUriRetriever.java index b5a392944..3ccdcbb90 100644 --- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/OutDatedUriRetriever.java +++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/OutDatedUriRetriever.java @@ -8,8 +8,7 @@ import com.sun.jndi.toolkit.url.Uri; import org.dice_research.squirrel.data.uri.CrawleableUri; -public interface OutDatedUriRetriever extends Closeable { - +public interface OutDatedUriRetriever{ /** * Returns all {@link CrawleableUri}s which are crawled a week ago and have to be recrawled. diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/SparqlhostConnector.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/SparqlhostConnector.java index dac1a5f3c..5f893e540 100644 --- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/SparqlhostConnector.java +++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/SparqlhostConnector.java @@ -21,20 +21,19 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.io.IOException; import java.net.URI; import java.net.URISyntaxException; import java.util.ArrayList; import java.util.List; -public class SparqlhostConnector implements OutDatedUriRetriever { +public class SparqlhostConnector implements OutDatedUriRetriever{ private static final Logger LOGGER = LoggerFactory.getLogger(SparqlhostConnector.class); /** - * The Query factory used to query the SPARQL endpoint. + * SparqlhostConnector creates a connection to the SPARQL endpoint and Query factory used to generate a query. 
*/ - protected QueryExecutionFactory queryExecFactory = null; + QueryExecutionFactory queryExecFactory; List urisToRecrawl = new ArrayList<>(); public SparqlhostConnector(QueryExecutionFactory queryExecFactory, UpdateExecutionFactory updateExecFactory) { @@ -42,13 +41,13 @@ public SparqlhostConnector(QueryExecutionFactory queryExecFactory, UpdateExecuti LOGGER.info("Connected"); } - public static SparqlhostConnector create(String sparqlEndpointUrl) { + public SparqlhostConnector create(String sparqlEndpointUrl) { return create(sparqlEndpointUrl, null, null); } - public static SparqlhostConnector create(String sparqlEndpointUrl, String username, String password) { - QueryExecutionFactory queryExecFactory = null; - UpdateExecutionFactory updateExecFactory = null; + public SparqlhostConnector create(String sparqlEndpointUrl, String username, String password) { + QueryExecutionFactory queryExecFactory; + UpdateExecutionFactory updateExecFactory; if (username != null && password != null) { // Create the factory with the credentials final Credentials credentials = new UsernamePasswordCredentials(username, password); @@ -87,6 +86,9 @@ public void setCredentials(AuthScope arg0, Credentials arg1) { } + /** + * @return list of outdated URIs + */ @Override public List getUriToRecrawl() { Query getOutdatedUrisQuery = FrontierQueryGenerator.getOutdatedUrisQuery(); @@ -105,9 +107,5 @@ public List getUriToRecrawl() { return urisToRecrawl; } - @Override - public void close() throws IOException { - getUriToRecrawl(); - } } diff --git a/squirrel.frontier/src/test/java/org/dice_research/squirrel/frontier/impl/RecrawlingTest.java b/squirrel.frontier/src/test/java/org/dice_research/squirrel/frontier/impl/RecrawlingTest.java index 67bce42ae..3e00a0b88 100644 --- a/squirrel.frontier/src/test/java/org/dice_research/squirrel/frontier/impl/RecrawlingTest.java +++ b/squirrel.frontier/src/test/java/org/dice_research/squirrel/frontier/impl/RecrawlingTest.java @@ -5,6 +5,7 @@ import org.apache.jena.query.*; import org.apache.jena.rdf.model.ModelFactory; import org.apache.jena.rdf.model.RDFNode; +import org.apache.jena.sparql.resultset.ResultSetCompare; import org.dice_research.squirrel.data.uri.CrawleableUri; import org.dice_research.squirrel.frontier.recrawling.FrontierQueryGenerator; import org.dice_research.squirrel.vocab.PROV_O; @@ -13,6 +14,7 @@ import java.net.URI; import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; public class RecrawlingTest { @Test @@ -22,15 +24,16 @@ public void RecrawlingTest() throws Exception { QueryExecutionFactory queryExecFactory = new QueryExecutionFactoryDataset(dataset); CrawleableUri uri = new CrawleableUri(new URI("http://example.org/dataset")); uri.addProperty((PROV_O.endedAtTime), "2020-01-05T21:40:11.173Z"); + CrawleableUri uri1 = new CrawleableUri(new URI("http://example.org/resource")); uri1.addProperty((PROV_O.endedAtTime), "2019-12-25T21:40:11.173Z"); - Query getOutdatedUrisQuery = FrontierQueryGenerator.getOutdatedUrisQuery(); - QueryExecution qe = queryExecFactory.createQueryExecution(getOutdatedUrisQuery); - ResultSet rs = qe.execSelect(); - while (rs.hasNext()) { - QuerySolution sol = rs.nextSolution(); - RDFNode outdatedUri = sol.get("uri"); + + Query query = QueryFactory.create(FrontierQueryGenerator.getOutdatedUrisQuery()); + try { + QueryExecution execution = queryExecFactory.createQueryExecution(query); + ResultSet rs = execution.execSelect(); + assertEquals(true, uri1); + } catch (Exception e) { } - assertEquals(1,uri1); } } 
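The pattern these patches converge on — build the outdated-URI SELECT with FrontierQueryGenerator, run it through a jena-sparql-api QueryExecutionFactory, and turn each ?uri binding into a CrawleableUri — can be summed up in a short, self-contained sketch. It is illustrative only: the in-memory dataset, the method name, and the error handling are assumptions, while the factory type, the query generator, and the ?uri variable come from the patches themselves.

    import java.net.URI;
    import java.util.ArrayList;
    import java.util.List;

    import org.aksw.jena_sparql_api.core.QueryExecutionFactory;
    import org.aksw.jena_sparql_api.core.QueryExecutionFactoryDataset;
    import org.apache.jena.query.Dataset;
    import org.apache.jena.query.DatasetFactory;
    import org.apache.jena.query.QueryExecution;
    import org.apache.jena.query.ResultSet;
    import org.dice_research.squirrel.data.uri.CrawleableUri;
    import org.dice_research.squirrel.frontier.recrawling.FrontierQueryGenerator;

    public class OutdatedUriQuerySketch {

        // Runs the generated SELECT against the given dataset and collects the ?uri bindings.
        public static List<CrawleableUri> findOutdatedUris(Dataset dataset) throws Exception {
            QueryExecutionFactory queryExecFactory = new QueryExecutionFactoryDataset(dataset);
            List<CrawleableUri> urisToRecrawl = new ArrayList<>();
            QueryExecution qe = queryExecFactory.createQueryExecution(FrontierQueryGenerator.getOutdatedUrisQuery());
            try {
                ResultSet rs = qe.execSelect();
                while (rs.hasNext()) {
                    // Each solution binds ?uri to a resource whose last crawl is older than the threshold.
                    urisToRecrawl.add(new CrawleableUri(new URI(rs.nextSolution().get("uri").toString())));
                }
            } finally {
                qe.close();
            }
            return urisToRecrawl;
        }

        public static void main(String[] args) throws Exception {
            // An empty in-memory dataset simply yields no results; the tests below
            // first fill the dataset with prov:endedAtTime metadata (see test.ttl).
            System.out.println(findOutdatedUris(DatasetFactory.create()));
        }
    }
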
From ea36c84ed50417992b143c12c21336eeba73af33 Mon Sep 17 00:00:00 2001 From: param-jot Date: Tue, 7 Jan 2020 13:32:39 +0100 Subject: [PATCH 031/102] Updated test case and remove unused code --- .../squirrel/data/uri/CrawleableUri.java | 20 ++------ .../frontier/impl/FrontierImplTest.java | 8 +--- .../frontier/impl/RecrawlingTest.java | 48 ++++++++++++------- squirrel.frontier/src/test/resources/test.ttl | 37 ++++++++++++++ 4 files changed, 72 insertions(+), 41 deletions(-) create mode 100644 squirrel.frontier/src/test/resources/test.ttl diff --git a/squirrel.api/src/main/java/org/dice_research/squirrel/data/uri/CrawleableUri.java b/squirrel.api/src/main/java/org/dice_research/squirrel/data/uri/CrawleableUri.java index e3c4c9e2b..812618660 100644 --- a/squirrel.api/src/main/java/org/dice_research/squirrel/data/uri/CrawleableUri.java +++ b/squirrel.api/src/main/java/org/dice_research/squirrel/data/uri/CrawleableUri.java @@ -102,7 +102,6 @@ public static CrawleableUri fromByteBuffer(ByteBuffer buffer) { private UriType type = UriType.UNKNOWN; private Map data = new TreeMap<>(); - private Map prop = new TreeMap<>(); private long timestampNextCrawl; public CrawleableUri(URI uri) { @@ -146,9 +145,7 @@ public URI getUri() { public void addData(String key, Object data) { this.data.put(key, data); } - public void addProperty(Property key, Object prop) { - this.prop.put(key, prop); - } + public Object getData(String key) { if (data.containsKey(key)) { @@ -157,27 +154,16 @@ public Object getData(String key) { return null; } } - public Object getProperty(Property key) { - if (prop.containsKey(key)) { - return prop.get(key); - } else { - return null; - } - } + public Map getData() { return data; } - public Map getProperty() { - return prop; - } public void setData(Map data) { this.data = data; } - public void setProperty(Map prop) { - this.prop = prop; - } + @Override public int hashCode() { diff --git a/squirrel.frontier/src/test/java/org/dice_research/squirrel/frontier/impl/FrontierImplTest.java b/squirrel.frontier/src/test/java/org/dice_research/squirrel/frontier/impl/FrontierImplTest.java index 989a8d3f8..7857534b7 100644 --- a/squirrel.frontier/src/test/java/org/dice_research/squirrel/frontier/impl/FrontierImplTest.java +++ b/squirrel.frontier/src/test/java/org/dice_research/squirrel/frontier/impl/FrontierImplTest.java @@ -1,10 +1,6 @@ package org.dice_research.squirrel.frontier.impl; -import org.aksw.jena_sparql_api.core.QueryExecutionFactory; -import org.aksw.jena_sparql_api.core.QueryExecutionFactoryDataset; -import org.apache.jena.query.*; -import org.apache.jena.rdf.model.ModelFactory; -import org.apache.jena.rdf.model.RDFNode; + import org.dice_research.squirrel.Constants; import org.dice_research.squirrel.MongoDBBasedTest; import org.dice_research.squirrel.data.uri.CrawleableUri; @@ -12,9 +8,7 @@ import org.dice_research.squirrel.data.uri.UriType; import org.dice_research.squirrel.data.uri.filter.MongoDBKnowUriFilter; import org.dice_research.squirrel.data.uri.norm.NormalizerImpl; -import org.dice_research.squirrel.frontier.recrawling.FrontierQueryGenerator; import org.dice_research.squirrel.queue.ipbased.MongoDBIpBasedQueue; -import org.dice_research.squirrel.vocab.PROV_O; import org.junit.After; import org.junit.Assert; import org.junit.Before; diff --git a/squirrel.frontier/src/test/java/org/dice_research/squirrel/frontier/impl/RecrawlingTest.java b/squirrel.frontier/src/test/java/org/dice_research/squirrel/frontier/impl/RecrawlingTest.java index 3e00a0b88..a682f7729 100644 --- 
a/squirrel.frontier/src/test/java/org/dice_research/squirrel/frontier/impl/RecrawlingTest.java +++ b/squirrel.frontier/src/test/java/org/dice_research/squirrel/frontier/impl/RecrawlingTest.java @@ -3,37 +3,51 @@ import org.aksw.jena_sparql_api.core.QueryExecutionFactory; import org.aksw.jena_sparql_api.core.QueryExecutionFactoryDataset; import org.apache.jena.query.*; +import org.apache.jena.rdf.model.Model; import org.apache.jena.rdf.model.ModelFactory; import org.apache.jena.rdf.model.RDFNode; +import org.apache.jena.rdf.model.Resource; +import org.apache.jena.riot.Lang; +import org.apache.jena.riot.RDFDataMgr; import org.apache.jena.sparql.resultset.ResultSetCompare; import org.dice_research.squirrel.data.uri.CrawleableUri; import org.dice_research.squirrel.frontier.recrawling.FrontierQueryGenerator; +import org.dice_research.squirrel.frontier.recrawling.SparqlhostConnector; import org.dice_research.squirrel.vocab.PROV_O; +import org.junit.Assert; import org.junit.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import java.io.InputStream; import java.net.URI; +import java.net.URISyntaxException; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; +import static org.junit.Assert.*; public class RecrawlingTest { + private static final Logger LOGGER = LoggerFactory.getLogger(RecrawlingTest.class); + @Test - public void RecrawlingTest() throws Exception { + public void Recrawling() throws Exception { Dataset dataset = DatasetFactory.create(); - dataset.setDefaultModel(ModelFactory.createDefaultModel()); + dataset.setDefaultModel(ModelFactory.createDefaultModel().read("test.ttl")); QueryExecutionFactory queryExecFactory = new QueryExecutionFactoryDataset(dataset); - CrawleableUri uri = new CrawleableUri(new URI("http://example.org/dataset")); - uri.addProperty((PROV_O.endedAtTime), "2020-01-05T21:40:11.173Z"); - - CrawleableUri uri1 = new CrawleableUri(new URI("http://example.org/resource")); - uri1.addProperty((PROV_O.endedAtTime), "2019-12-25T21:40:11.173Z"); - - Query query = QueryFactory.create(FrontierQueryGenerator.getOutdatedUrisQuery()); - try { - QueryExecution execution = queryExecFactory.createQueryExecution(query); - ResultSet rs = execution.execSelect(); - assertEquals(true, uri1); - } catch (Exception e) { + + Query getOutdatedUrisQuery = FrontierQueryGenerator.getOutdatedUrisQuery(); + QueryExecution qe = queryExecFactory.createQueryExecution(getOutdatedUrisQuery); + ResultSet rs = qe.execSelect(); + while(rs.hasNext()) { + QuerySolution sol = rs.nextSolution(); + RDFNode outdatedUri = sol.get("uri"); + LOGGER.info(String.valueOf(outdatedUri)); + assertEquals("Expected URI", outdatedUri.asResource().getURI(), "http://d-nb.info/gnd/4042012-7"); + assertEquals("Expected URI", outdatedUri.asResource().getURI(), "http://eu.dbpedia.org/resource/New_York_(estatua)"); + assertFalse("Not expecting any URI", rs.hasNext()); + } + qe.close(); } + + } -} + diff --git a/squirrel.frontier/src/test/resources/test.ttl b/squirrel.frontier/src/test/resources/test.ttl new file mode 100644 index 000000000..3ea33da4a --- /dev/null +++ b/squirrel.frontier/src/test/resources/test.ttl @@ -0,0 +1,37 @@ +@prefix sq-s: . +@prefix owl: . +@prefix rdf: . +@prefix sq-a: . +@prefix rdfs: . +@prefix sq-m: . +@prefix dcat: . +@prefix prov: . +@prefix sq-g: . +@prefix sq-w: . +@prefix sq: . +@prefix dc: . 
+ + +sq-a:f8a97b22-67a6-4fe5-8ff4-e2ccd1322797 + prov:startedAtTime "2019-12-03T09:42:54.406Z"^^ ; + prov:endedAtTime "2019-12-03T09:43:00.15Z"^^ ; + sq:approxNumberOfTriples "27"^^ ; + prov:qualifiedAssociation sq-a:f8a97b22-67a6-4fe5-8ff4-e2ccd1322797_workerAssoc ; + sq:crawled ; + rdf:type prov:Activity ; + prov:hadPlan sq-a:f8a97b22-67a6-4fe5-8ff4-e2ccd1322797_plan ; + prov:wasAssociatedWith sq-w:104fbe9d-b325-4dbb-97f0-22626d810e9f ; + sq:uriHostedOn ; + sq:status "SUCCESSFUL" . + +sq-a:4b8e6c18-e4dc-4365-b425-88d7db6da2f2 + sq:crawled ; + sq:uriHostedOn ; + rdf:type prov:Activity ; + prov:hadPlan sq-a:4b8e6c18-e4dc-4365-b425-88d7db6da2f2_plan ; + prov:endedAtTime "2020-01-07T09:45:53.318Z"^^ ; + prov:startedAtTime "2019-12-03T09:45:52.864Z"^^ ; + prov:wasAssociatedWith sq-w:25c73c0f-5b82-42fb-b913-3980d7cf9f0e ; + sq:status "SUCCESSFUL" ; + prov:qualifiedAssociation sq-a:4b8e6c18-e4dc-4365-b425-88d7db6da2f2_workerAssoc ; + sq:approxNumberOfTriples "13"^^ . From 6b6b159adad30e34c8a1b480615e72a394dd9a93 Mon Sep 17 00:00:00 2001 From: param-jot Date: Thu, 9 Jan 2020 11:51:11 +0100 Subject: [PATCH 032/102] removing unused code --- .../squirrel/components/FrontierComponent.java | 1 - .../frontier/recrawling/SparqlhostConnector.java | 4 ++-- .../squirrel/frontier/impl/RecrawlingTest.java | 15 +++++++-------- squirrel.frontier/src/test/resources/test.ttl | 1 - 4 files changed, 9 insertions(+), 12 deletions(-) diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/components/FrontierComponent.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/components/FrontierComponent.java index ebae7888f..2bbc224c8 100644 --- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/components/FrontierComponent.java +++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/components/FrontierComponent.java @@ -59,7 +59,6 @@ public class FrontierComponent extends AbstractComponent implements RespondingDa @Qualifier("queueBean") @Autowired protected UriQueue queue; - protected String dataSetQuery = "select ?s ?p ?o where {?s ?p ?o} LIMIT 100 "; @Qualifier("knowUriFilterBean") @Autowired private KnownUriFilter knownUriFilter; diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/SparqlhostConnector.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/SparqlhostConnector.java index 5f893e540..5cc20b2a2 100644 --- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/SparqlhostConnector.java +++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/SparqlhostConnector.java @@ -36,7 +36,7 @@ public class SparqlhostConnector implements OutDatedUriRetriever{ QueryExecutionFactory queryExecFactory; List urisToRecrawl = new ArrayList<>(); - public SparqlhostConnector(QueryExecutionFactory queryExecFactory, UpdateExecutionFactory updateExecFactory) { + public SparqlhostConnector(QueryExecutionFactory queryExecFactory) { this.queryExecFactory = queryExecFactory; LOGGER.info("Connected"); } @@ -82,7 +82,7 @@ public void setCredentials(AuthScope arg0, Credentials arg1) { queryExecFactory = new QueryExecutionFactoryHttp(sparqlEndpointUrl); updateExecFactory = new UpdateExecutionFactoryHttp(sparqlEndpointUrl); } - return new SparqlhostConnector(queryExecFactory, updateExecFactory); + return new SparqlhostConnector(queryExecFactory); } diff --git a/squirrel.frontier/src/test/java/org/dice_research/squirrel/frontier/impl/RecrawlingTest.java 
b/squirrel.frontier/src/test/java/org/dice_research/squirrel/frontier/impl/RecrawlingTest.java index a682f7729..ee1c92185 100644 --- a/squirrel.frontier/src/test/java/org/dice_research/squirrel/frontier/impl/RecrawlingTest.java +++ b/squirrel.frontier/src/test/java/org/dice_research/squirrel/frontier/impl/RecrawlingTest.java @@ -1,7 +1,9 @@ package org.dice_research.squirrel.frontier.impl; +import javafx.beans.property.ReadOnlyDoubleWrapper; import org.aksw.jena_sparql_api.core.QueryExecutionFactory; import org.aksw.jena_sparql_api.core.QueryExecutionFactoryDataset; +import org.apache.jena.graph.Node; import org.apache.jena.query.*; import org.apache.jena.rdf.model.Model; import org.apache.jena.rdf.model.ModelFactory; @@ -9,6 +11,7 @@ import org.apache.jena.rdf.model.Resource; import org.apache.jena.riot.Lang; import org.apache.jena.riot.RDFDataMgr; +import org.apache.jena.sparql.core.QuerySolutionBase; import org.apache.jena.sparql.resultset.ResultSetCompare; import org.dice_research.squirrel.data.uri.CrawleableUri; import org.dice_research.squirrel.frontier.recrawling.FrontierQueryGenerator; @@ -37,17 +40,13 @@ public void Recrawling() throws Exception { Query getOutdatedUrisQuery = FrontierQueryGenerator.getOutdatedUrisQuery(); QueryExecution qe = queryExecFactory.createQueryExecution(getOutdatedUrisQuery); ResultSet rs = qe.execSelect(); - while(rs.hasNext()) { - QuerySolution sol = rs.nextSolution(); - RDFNode outdatedUri = sol.get("uri"); + QuerySolution solu = rs.nextSolution(); + RDFNode outdatedUri = solu.get("uri"); LOGGER.info(String.valueOf(outdatedUri)); assertEquals("Expected URI", outdatedUri.asResource().getURI(), "http://d-nb.info/gnd/4042012-7"); - assertEquals("Expected URI", outdatedUri.asResource().getURI(), "http://eu.dbpedia.org/resource/New_York_(estatua)"); assertFalse("Not expecting any URI", rs.hasNext()); - } + qe.close(); } - - - } +} diff --git a/squirrel.frontier/src/test/resources/test.ttl b/squirrel.frontier/src/test/resources/test.ttl index 3ea33da4a..150002976 100644 --- a/squirrel.frontier/src/test/resources/test.ttl +++ b/squirrel.frontier/src/test/resources/test.ttl @@ -11,7 +11,6 @@ @prefix sq: . @prefix dc: . 
- sq-a:f8a97b22-67a6-4fe5-8ff4-e2ccd1322797 prov:startedAtTime "2019-12-03T09:42:54.406Z"^^ ; prov:endedAtTime "2019-12-03T09:43:00.15Z"^^ ; From 37923dea1ad9cc05fbb9ef262f0662132c1894ec Mon Sep 17 00:00:00 2001 From: param-jot Date: Thu, 9 Jan 2020 12:51:10 +0100 Subject: [PATCH 033/102] removing unused code --- pom.xml | 1 - 1 file changed, 1 deletion(-) diff --git a/pom.xml b/pom.xml index 39c704514..057e6fae9 100644 --- a/pom.xml +++ b/pom.xml @@ -41,7 +41,6 @@ squirrel.frontier squirrel.mockup squirrel.worker - squirrel.crawled.graph From a671171fbacda14a802915c8211864d889df4593 Mon Sep 17 00:00:00 2001 From: param-jot Date: Thu, 9 Jan 2020 13:33:43 +0100 Subject: [PATCH 034/102] removing unused import and update test.ttl and query --- .../recrawling/FrontierQueryGenerator.java | 5 ++--- .../squirrel/frontier/impl/RecrawlingTest.java | 17 +---------------- squirrel.frontier/src/test/resources/test.ttl | 5 +++-- 3 files changed, 6 insertions(+), 21 deletions(-) diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/FrontierQueryGenerator.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/FrontierQueryGenerator.java index 94618a71e..f8e6a9793 100644 --- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/FrontierQueryGenerator.java +++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/FrontierQueryGenerator.java @@ -5,7 +5,7 @@ public class FrontierQueryGenerator { - public static final long DEFAULT_GENERAL_RECRAWL_TIME = 1000 * 60 * 60 * 24 * 7; + public static final long DEFAULT_GENERAL_RECRAWL_TIME = 60 * 60 * 24 * 7; /** * Return outdated uris by comparing their endtime stamps. @@ -15,8 +15,7 @@ public class FrontierQueryGenerator { public static Query getOutdatedUrisQuery() { Query query = QueryFactory.create("PREFIX sq: \n" + "PREFIX prov: \n" + - "PREFIX xsd: " - + "SELECT ?uri WHERE { \n "+ + "SELECT ?uri WHERE { \n "+ "{\n" + "SELECT ?uri ?endtime (NOW() - (?endtime) AS ?diff)\n" + "WHERE{\n" + diff --git a/squirrel.frontier/src/test/java/org/dice_research/squirrel/frontier/impl/RecrawlingTest.java b/squirrel.frontier/src/test/java/org/dice_research/squirrel/frontier/impl/RecrawlingTest.java index ee1c92185..a8125d003 100644 --- a/squirrel.frontier/src/test/java/org/dice_research/squirrel/frontier/impl/RecrawlingTest.java +++ b/squirrel.frontier/src/test/java/org/dice_research/squirrel/frontier/impl/RecrawlingTest.java @@ -1,31 +1,15 @@ package org.dice_research.squirrel.frontier.impl; -import javafx.beans.property.ReadOnlyDoubleWrapper; import org.aksw.jena_sparql_api.core.QueryExecutionFactory; import org.aksw.jena_sparql_api.core.QueryExecutionFactoryDataset; -import org.apache.jena.graph.Node; import org.apache.jena.query.*; -import org.apache.jena.rdf.model.Model; import org.apache.jena.rdf.model.ModelFactory; import org.apache.jena.rdf.model.RDFNode; -import org.apache.jena.rdf.model.Resource; -import org.apache.jena.riot.Lang; -import org.apache.jena.riot.RDFDataMgr; -import org.apache.jena.sparql.core.QuerySolutionBase; -import org.apache.jena.sparql.resultset.ResultSetCompare; -import org.dice_research.squirrel.data.uri.CrawleableUri; import org.dice_research.squirrel.frontier.recrawling.FrontierQueryGenerator; -import org.dice_research.squirrel.frontier.recrawling.SparqlhostConnector; -import org.dice_research.squirrel.vocab.PROV_O; -import org.junit.Assert; import org.junit.Test; import org.slf4j.Logger; import org.slf4j.LoggerFactory; 
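[These test revisions all exercise the same pattern: load a small model, run the latest-endedAtTime-per-URI query, and check the single expected binding. The pattern also works standalone with plain Jena, which can help when debugging the query itself. A self-contained sketch, with the sq: namespace, the IRIs, and the hard-coded cutoff all being placeholders rather than project values:

import java.io.StringReader;

import org.apache.jena.query.QueryExecution;
import org.apache.jena.query.QueryExecutionFactory;
import org.apache.jena.query.QueryFactory;
import org.apache.jena.query.ResultSet;
import org.apache.jena.rdf.model.Model;
import org.apache.jena.rdf.model.ModelFactory;

public class OutdatedUriQuerySketch {

    // Inline data instead of a test.ttl resource, to keep the sketch self-contained.
    private static final String DATA =
            "@prefix prov: <http://www.w3.org/ns/prov#> .\n"
          + "@prefix sq:   <http://example.org/squirrel#> .\n"
          + "@prefix xsd:  <http://www.w3.org/2001/XMLSchema#> .\n"
          + "<urn:activity:a1> sq:crawled <http://example.org/old> ;\n"
          + "    prov:endedAtTime \"2019-11-17T21:40:05Z\"^^xsd:dateTime .\n"
          + "<urn:activity:a2> sq:crawled <http://example.org/fresh> ;\n"
          + "    prov:endedAtTime \"2020-01-09T12:53:24Z\"^^xsd:dateTime .\n";

    public static void main(String[] args) {
        Model model = ModelFactory.createDefaultModel();
        model.read(new StringReader(DATA), null, "TURTLE");

        // Latest endedAtTime per crawled URI; keep only those before the cutoff.
        String queryStr =
                "PREFIX prov: <http://www.w3.org/ns/prov#>\n"
              + "PREFIX sq:   <http://example.org/squirrel#>\n"
              + "PREFIX xsd:  <http://www.w3.org/2001/XMLSchema#>\n"
              + "SELECT ?url WHERE {\n"
              + "  { SELECT ?url (MAX(?t) AS ?endtime)\n"
              + "    WHERE { ?activity sq:crawled ?url ; prov:endedAtTime ?t . }\n"
              + "    GROUP BY ?url }\n"
              + "  FILTER(?endtime < \"2020-01-03T07:00:00Z\"^^xsd:dateTime)\n"
              + "}";

        try (QueryExecution qe = QueryExecutionFactory.create(QueryFactory.create(queryStr), model)) {
            ResultSet rs = qe.execSelect();
            while (rs.hasNext()) {
                // Prints only http://example.org/old
                System.out.println(rs.next().get("url"));
            }
        }
    }
}]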
-import java.io.InputStream; -import java.net.URI; -import java.net.URISyntaxException; - import static org.junit.Assert.*; public class RecrawlingTest { @@ -40,6 +24,7 @@ public void Recrawling() throws Exception { Query getOutdatedUrisQuery = FrontierQueryGenerator.getOutdatedUrisQuery(); QueryExecution qe = queryExecFactory.createQueryExecution(getOutdatedUrisQuery); ResultSet rs = qe.execSelect(); + assertTrue("There should be at least one result", rs.hasNext()); QuerySolution solu = rs.nextSolution(); RDFNode outdatedUri = solu.get("uri"); LOGGER.info(String.valueOf(outdatedUri)); diff --git a/squirrel.frontier/src/test/resources/test.ttl b/squirrel.frontier/src/test/resources/test.ttl index 150002976..536e8c8e1 100644 --- a/squirrel.frontier/src/test/resources/test.ttl +++ b/squirrel.frontier/src/test/resources/test.ttl @@ -11,6 +11,7 @@ @prefix sq: . @prefix dc: . + sq-a:f8a97b22-67a6-4fe5-8ff4-e2ccd1322797 prov:startedAtTime "2019-12-03T09:42:54.406Z"^^ ; prov:endedAtTime "2019-12-03T09:43:00.15Z"^^ ; @@ -28,8 +29,8 @@ sq-a:4b8e6c18-e4dc-4365-b425-88d7db6da2f2 sq:uriHostedOn ; rdf:type prov:Activity ; prov:hadPlan sq-a:4b8e6c18-e4dc-4365-b425-88d7db6da2f2_plan ; - prov:endedAtTime "2020-01-07T09:45:53.318Z"^^ ; - prov:startedAtTime "2019-12-03T09:45:52.864Z"^^ ; + prov:endedAtTime "2999-01-07T09:45:53.318Z"^^ ; + prov:startedAtTime "2998-12-03T09:45:52.864Z"^^ ; prov:wasAssociatedWith sq-w:25c73c0f-5b82-42fb-b913-3980d7cf9f0e ; sq:status "SUCCESSFUL" ; prov:qualifiedAssociation sq-a:4b8e6c18-e4dc-4365-b425-88d7db6da2f2_workerAssoc ; From dcfe1298b4d6920cdfa1bdd43f70bd03b36ae35f Mon Sep 17 00:00:00 2001 From: param-jot Date: Fri, 10 Jan 2020 11:06:32 +0100 Subject: [PATCH 035/102] Updating outDatedUri query and test case --- spring-config/frontier-context-sparql.xml | 2 +- .../components/FrontierComponent.java | 3 -- .../squirrel/frontier/impl/FrontierImpl.java | 6 +-- .../recrawling/FrontierQueryGenerator.java | 27 +++++++------ ...a => SparqlBasedOutDatedUriRetriever.java} | 22 +++++------ .../frontier/impl/RecrawlingTest.java | 21 +++++++--- squirrel.frontier/src/test/resources/test.ttl | 39 ++++--------------- 7 files changed, 53 insertions(+), 67 deletions(-) rename squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/{SparqlhostConnector.java => SparqlBasedOutDatedUriRetriever.java} (82%) diff --git a/spring-config/frontier-context-sparql.xml b/spring-config/frontier-context-sparql.xml index ba9c902ae..d57e75340 100644 --- a/spring-config/frontier-context-sparql.xml +++ b/spring-config/frontier-context-sparql.xml @@ -46,7 +46,7 @@ - + diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/components/FrontierComponent.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/components/FrontierComponent.java index 2bbc224c8..d7d9e778c 100644 --- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/components/FrontierComponent.java +++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/components/FrontierComponent.java @@ -1,6 +1,5 @@ package org.dice_research.squirrel.components; -import org.aksw.jena_sparql_api.core.QueryExecutionFactory; import org.apache.commons.io.FileUtils; import org.dice_research.squirrel.Constants; import org.dice_research.squirrel.configurator.MongoConfiguration; @@ -50,9 +49,7 @@ @Qualifier("frontierComponent") public class FrontierComponent extends AbstractComponent implements RespondingDataHandler { - public static final boolean RECRAWLING_ACTIVE = true; private static final Logger 
LOGGER = LoggerFactory.getLogger(FrontierComponent.class); - protected static QueryExecutionFactory queryExecFactory = null; private final Semaphore terminationMutex = new Semaphore(0); private final WorkerGuard workerGuard = new WorkerGuard(this); private final boolean doRecrawling = true; diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/FrontierImpl.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/FrontierImpl.java index d38cd3935..789d47ae5 100644 --- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/FrontierImpl.java +++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/FrontierImpl.java @@ -10,7 +10,7 @@ import org.dice_research.squirrel.deduplication.hashing.UriHashCustodian; import org.dice_research.squirrel.frontier.Frontier; import org.dice_research.squirrel.frontier.recrawling.OutDatedUriRetriever; -import org.dice_research.squirrel.frontier.recrawling.SparqlhostConnector; +import org.dice_research.squirrel.frontier.recrawling.SparqlBasedOutDatedUriRetriever; import org.dice_research.squirrel.graph.GraphLogger; import org.dice_research.squirrel.queue.BlockingQueue; import org.dice_research.squirrel.queue.UriQueue; @@ -58,7 +58,7 @@ public class FrontierImpl implements Frontier { * {@link OutDatedUriRetriever} used to collect all the outdated URIs (URIs crawled a week ago) to recrawl. */ protected OutDatedUriRetriever outDatedUriRetriever; - protected SparqlhostConnector sparqlhostConnector; + protected SparqlBasedOutDatedUriRetriever sparqlBasedOutDatedUriRetriever; /** * {@link org.dice_research.squirrel.data.uri.info.URIReferences} used to * identify URIs that already have been crawled. @@ -229,7 +229,7 @@ public FrontierImpl(UriNormalizer normalizer, timerRecrawling.schedule(new TimerTask() { @Override public void run() { - List urisToRecrawl = sparqlhostConnector.getUriToRecrawl(); + List urisToRecrawl = sparqlBasedOutDatedUriRetriever.getUriToRecrawl(); LOGGER.info("URI to recrawl" + urisToRecrawl); urisToRecrawl.forEach(uri -> queue.addUri(uriProcessor.recognizeUriType(uri))); } diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/FrontierQueryGenerator.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/FrontierQueryGenerator.java index f8e6a9793..cc5727af1 100644 --- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/FrontierQueryGenerator.java +++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/FrontierQueryGenerator.java @@ -3,38 +3,41 @@ import org.apache.jena.query.Query; import org.apache.jena.query.QueryFactory; -public class FrontierQueryGenerator { - - public static final long DEFAULT_GENERAL_RECRAWL_TIME = 60 * 60 * 24 * 7; +import java.text.SimpleDateFormat; +import java.util.Calendar; +public class FrontierQueryGenerator { /** * Return outdated uris by comparing their endtime stamps. * @return All triples with time stamp in the default graph. 
*/ - public static Query getOutdatedUrisQuery() { + public static Query getOutdatedUrisQuery(Calendar date) { + SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSSXXX"); Query query = QueryFactory.create("PREFIX sq: \n" + "PREFIX prov: \n" + - "SELECT ?uri WHERE { \n "+ + "PREFIX xsd: \n" + + "\n" + + "SELECT ?url \n" + + "WHERE{\n" + "{\n" + - "SELECT ?uri ?endtime (NOW() - (?endtime) AS ?diff)\n" + + "SELECT ?url ?endtime\n" + "WHERE{\n" + "\n" + " {\n" + - " SELECT ?uri (MAX(?timestamp) as ?endtime)\n" + + " SELECT ?url (MAX(?timestamp) as ?endtime)\n" + " WHERE\n" + " { \n" + - " ?s sq:crawled ?uri ;\n" + + " ?s sq:crawled ?url ;\n" + " prov:endedAtTime ?timestamp.\n" + "\n" + " }\n" + - " GROUP BY ?uri\n" + + " GROUP BY ?url\n" + " } \n" + "}\n" + "}\n" + - "FILTER(?diff >"+DEFAULT_GENERAL_RECRAWL_TIME + - ")}\n" + - ""); + "FILTER(?endtime < \"" + dateFormat.format(date.getTime()) + "\"^^xsd:dateTime)\n" + + "}"); return query; } diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/SparqlhostConnector.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/SparqlBasedOutDatedUriRetriever.java similarity index 82% rename from squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/SparqlhostConnector.java rename to squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/SparqlBasedOutDatedUriRetriever.java index 5cc20b2a2..d2678c7de 100644 --- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/SparqlhostConnector.java +++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/SparqlBasedOutDatedUriRetriever.java @@ -2,7 +2,6 @@ import org.aksw.jena_sparql_api.core.QueryExecutionFactory; import org.aksw.jena_sparql_api.core.UpdateExecutionFactory; -import org.aksw.jena_sparql_api.core.UpdateExecutionFactoryHttp; import org.aksw.jena_sparql_api.http.QueryExecutionFactoryHttp; import org.apache.http.auth.AuthScope; import org.apache.http.auth.Credentials; @@ -24,28 +23,29 @@ import java.net.URI; import java.net.URISyntaxException; import java.util.ArrayList; +import java.util.Calendar; import java.util.List; -public class SparqlhostConnector implements OutDatedUriRetriever{ +public class SparqlBasedOutDatedUriRetriever implements OutDatedUriRetriever{ - private static final Logger LOGGER = LoggerFactory.getLogger(SparqlhostConnector.class); + private static final Logger LOGGER = LoggerFactory.getLogger(SparqlBasedOutDatedUriRetriever.class); /** - * SparqlhostConnector creates a connection to the SPARQL endpoint and Query factory used to generate a query. + * SparqlBasedOutDatedUriRetriever creates a connection to the SPARQL endpoint and Query factory used to generate a query. 
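[A note on the query construction above: the cutoff is rendered with SimpleDateFormat and spliced into the query text. Jena's ParameterizedSparqlString can bind the cutoff as a typed literal instead, which takes care of quoting and typing the value. A sketch under the same placeholder sq: namespace as before (the real vocabulary IRI differs):

import java.text.SimpleDateFormat;
import java.util.Calendar;

import org.apache.jena.datatypes.xsd.XSDDatatype;
import org.apache.jena.query.ParameterizedSparqlString;
import org.apache.jena.query.Query;

public class OutdatedUrisQueryBuilder {

    /** Builds the outdated-URIs query with the cutoff bound as a typed literal. */
    public static Query outdatedUris(Calendar cutoff) {
        ParameterizedSparqlString pss = new ParameterizedSparqlString(
                "PREFIX prov: <http://www.w3.org/ns/prov#>\n"
              + "PREFIX sq:   <http://example.org/squirrel#>\n"   // placeholder namespace
              + "SELECT ?url WHERE {\n"
              + "  { SELECT ?url (MAX(?t) AS ?endtime)\n"
              + "    WHERE { ?activity sq:crawled ?url ; prov:endedAtTime ?t . }\n"
              + "    GROUP BY ?url }\n"
              + "  FILTER(?endtime < ?cutoff)\n"
              + "}");
        SimpleDateFormat fmt = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSSXXX");
        pss.setLiteral("cutoff", fmt.format(cutoff.getTime()), XSDDatatype.XSDdateTime);
        return pss.asQuery();
    }
}

Since asQuery() parses the final string, a structurally broken query fails at build time rather than at the endpoint.]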
*/ QueryExecutionFactory queryExecFactory; List<CrawleableUri> urisToRecrawl = new ArrayList<>(); - public SparqlhostConnector(QueryExecutionFactory queryExecFactory) { + public SparqlBasedOutDatedUriRetriever(QueryExecutionFactory queryExecFactory) { this.queryExecFactory = queryExecFactory; LOGGER.info("Connected"); } - public SparqlhostConnector create(String sparqlEndpointUrl) { + public SparqlBasedOutDatedUriRetriever create(String sparqlEndpointUrl) { return create(sparqlEndpointUrl, null, null); } - public SparqlhostConnector create(String sparqlEndpointUrl, String username, String password) { + public SparqlBasedOutDatedUriRetriever create(String sparqlEndpointUrl, String username, String password) { QueryExecutionFactory queryExecFactory; UpdateExecutionFactory updateExecFactory; if (username != null && password != null) { @@ -77,12 +77,10 @@ public void setCredentials(AuthScope arg0, Credentials arg1) { }; queryExecFactory = new QueryExecutionFactoryHttp(sparqlEndpointUrl, new DatasetDescription(), authenticator); - updateExecFactory = new UpdateExecutionFactoryHttp(sparqlEndpointUrl, authenticator); } else { queryExecFactory = new QueryExecutionFactoryHttp(sparqlEndpointUrl); - updateExecFactory = new UpdateExecutionFactoryHttp(sparqlEndpointUrl); } - return new SparqlhostConnector(queryExecFactory); + return new SparqlBasedOutDatedUriRetriever(queryExecFactory); } @@ -91,7 +89,9 @@ public void setCredentials(AuthScope arg0, Credentials arg1) { */ @Override public List<CrawleableUri> getUriToRecrawl() { - Query getOutdatedUrisQuery = FrontierQueryGenerator.getOutdatedUrisQuery(); + Calendar date = Calendar.getInstance(); + date.add(Calendar.DAY_OF_YEAR, -7); + Query getOutdatedUrisQuery = FrontierQueryGenerator.getOutdatedUrisQuery(date); QueryExecution qe = queryExecFactory.createQueryExecution(getOutdatedUrisQuery); ResultSet rs = qe.execSelect(); while (rs.hasNext()) { diff --git a/squirrel.frontier/src/test/java/org/dice_research/squirrel/frontier/impl/RecrawlingTest.java b/squirrel.frontier/src/test/java/org/dice_research/squirrel/frontier/impl/RecrawlingTest.java index a8125d003..1043a1932 100644 --- a/squirrel.frontier/src/test/java/org/dice_research/squirrel/frontier/impl/RecrawlingTest.java +++ b/squirrel.frontier/src/test/java/org/dice_research/squirrel/frontier/impl/RecrawlingTest.java @@ -10,25 +10,34 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.util.Calendar; + import static org.junit.Assert.*; public class RecrawlingTest { private static final Logger LOGGER = LoggerFactory.getLogger(RecrawlingTest.class); @Test - public void Recrawling() throws Exception { + public void Recrawling(){ Dataset dataset = DatasetFactory.create(); dataset.setDefaultModel(ModelFactory.createDefaultModel().read("test.ttl")); QueryExecutionFactory queryExecFactory = new QueryExecutionFactoryDataset(dataset); - - Query getOutdatedUrisQuery = FrontierQueryGenerator.getOutdatedUrisQuery(); + Calendar date = Calendar.getInstance(); + date.set(Calendar.SECOND, 0); + date.set(Calendar.MINUTE, 0); + date.set(Calendar.HOUR, 7); + date.set(Calendar.AM_PM, Calendar.AM); + date.set(Calendar.MONTH, Calendar.JANUARY); + date.set(Calendar.DAY_OF_MONTH, 3); + date.set(Calendar.YEAR, 2020); + Query getOutdatedUrisQuery = FrontierQueryGenerator.getOutdatedUrisQuery(date); QueryExecution qe = queryExecFactory.createQueryExecution(getOutdatedUrisQuery); ResultSet rs = qe.execSelect(); assertTrue("There should be at least one result", rs.hasNext()); QuerySolution solu = rs.nextSolution(); RDFNode outdatedUri =
solu.get("uri"); - LOGGER.info(String.valueOf(outdatedUri)); - assertEquals("Expected URI", outdatedUri.asResource().getURI(), "http://d-nb.info/gnd/4042012-7"); + LOGGER.info("Solution: {}", solu); + RDFNode outdatedUri = solu.get("url"); + assertEquals("Expected URI", "http://dbpedia.org/ontology/language", outdatedUri.asResource().getURI()); assertFalse("Not expecting any URI", rs.hasNext()); qe.close(); diff --git a/squirrel.frontier/src/test/resources/test.ttl b/squirrel.frontier/src/test/resources/test.ttl index 536e8c8e1..bf675e0ef 100644 --- a/squirrel.frontier/src/test/resources/test.ttl +++ b/squirrel.frontier/src/test/resources/test.ttl @@ -1,37 +1,14 @@ -@prefix sq-s: . -@prefix owl: . -@prefix rdf: . @prefix sq-a: . -@prefix rdfs: . -@prefix sq-m: . -@prefix dcat: . @prefix prov: . -@prefix sq-g: . -@prefix sq-w: . @prefix sq: . -@prefix dc: . +@prefix xsd: . +sq-a:95ef9851-46af-42a1-b78a-3d2d73bf844b + prov:endedAtTime "2019-11-17T21:40:05.155Z"^^xsd:dateTime ; + sq:crawled . -sq-a:f8a97b22-67a6-4fe5-8ff4-e2ccd1322797 - prov:startedAtTime "2019-12-03T09:42:54.406Z"^^ ; - prov:endedAtTime "2019-12-03T09:43:00.15Z"^^ ; - sq:approxNumberOfTriples "27"^^ ; - prov:qualifiedAssociation sq-a:f8a97b22-67a6-4fe5-8ff4-e2ccd1322797_workerAssoc ; - sq:crawled ; - rdf:type prov:Activity ; - prov:hadPlan sq-a:f8a97b22-67a6-4fe5-8ff4-e2ccd1322797_plan ; - prov:wasAssociatedWith sq-w:104fbe9d-b325-4dbb-97f0-22626d810e9f ; - sq:uriHostedOn ; - sq:status "SUCCESSFUL" . -sq-a:4b8e6c18-e4dc-4365-b425-88d7db6da2f2 - sq:crawled ; - sq:uriHostedOn ; - rdf:type prov:Activity ; - prov:hadPlan sq-a:4b8e6c18-e4dc-4365-b425-88d7db6da2f2_plan ; - prov:endedAtTime "2999-01-07T09:45:53.318Z"^^ ; - prov:startedAtTime "2998-12-03T09:45:52.864Z"^^ ; - prov:wasAssociatedWith sq-w:25c73c0f-5b82-42fb-b913-3980d7cf9f0e ; - sq:status "SUCCESSFUL" ; - prov:qualifiedAssociation sq-a:4b8e6c18-e4dc-4365-b425-88d7db6da2f2_workerAssoc ; - sq:approxNumberOfTriples "13"^^ . +sq-a:300bae75-5208-4144-8f3d-eeeccf2c90e8 + sq:crawled ; + prov:endedAtTime "2020-01-09T12:53:24.607Z"^^xsd:dateTime. + From 479b14130d9c4b3fcc980ba0a3e79f36315d581a Mon Sep 17 00:00:00 2001 From: param-jot Date: Fri, 10 Jan 2020 11:33:59 +0100 Subject: [PATCH 036/102] Updating FrontierImpl.java --- .../squirrel/frontier/impl/FrontierImpl.java | 244 ++++++++++-------- 1 file changed, 137 insertions(+), 107 deletions(-) diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/FrontierImpl.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/FrontierImpl.java index 789d47ae5..78d24d174 100644 --- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/FrontierImpl.java +++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/FrontierImpl.java @@ -1,5 +1,10 @@ package org.dice_research.squirrel.frontier.impl; +import java.net.UnknownHostException; +import java.util.List; +import java.util.Timer; +import java.util.TimerTask; + import org.dice_research.squirrel.Constants; import org.dice_research.squirrel.data.uri.CrawleableUri; import org.dice_research.squirrel.data.uri.filter.KnownUriFilter; @@ -18,11 +23,6 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.net.UnknownHostException; -import java.util.List; -import java.util.Timer; -import java.util.TimerTask; - /** * Standard implementation of the {@link Frontier} interface containing a * {@link #queue} and a {@link #knownUriFilter}. 
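[The FrontierImpl changes below are mostly javadoc and field reshuffling, so the recrawling control flow is easy to lose in the churn. Boiled down, it is a java.util.Timer that periodically asks a retriever for outdated URIs and feeds them back into a queue. In this sketch only Timer and TimerTask are real; the remaining names are illustrative stand-ins for CrawleableUri, UriQueue and OutDatedUriRetriever.

import java.util.List;
import java.util.Timer;
import java.util.TimerTask;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.function.Supplier;

public class RecrawlTimerSketch {

    private final Timer timer = new Timer("recrawl-timer", true); // daemon thread
    private final ConcurrentLinkedQueue<String> queue = new ConcurrentLinkedQueue<>();

    public void start(Supplier<List<String>> outdatedUriRetriever, long timerPeriodMs) {
        timer.schedule(new TimerTask() {
            @Override
            public void run() {
                // Re-queue everything whose last crawl is older than the threshold.
                queue.addAll(outdatedUriRetriever.get());
            }
        }, timerPeriodMs, timerPeriodMs);
    }

    public void close() {
        timer.cancel();
    }
}

The daemon flag is a choice of this sketch; the patch itself relies on close() calling timer.cancel().]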
@@ -31,39 +31,30 @@ */ public class FrontierImpl implements Frontier { - /** - * Default value for {@link #generalRecrawlTime} (one week). - */ - public static final long DEFAULT_GENERAL_RECRAWL_TIME = 1000 * 60 * 60 * 24 * 7; private static final Logger LOGGER = LoggerFactory.getLogger(FrontierImpl.class); - /** - * Default value for {@link #timerPeriod}. - */ - private static final long DEFAULT_TIMER_PERIOD = 1000 * 60 * 60; - public long generalRecrawlTime; - /** - * Time (in milliseconds) after which uris will be recrawled (only used if no - * specific time is configured for a URI). - */ /** * {@link UriNormalizer} used to transform given URIs into a normal form. */ protected UriNormalizer normalizer; + /** * {@link KnownUriFilter} used to identify URIs that already have been crawled. */ protected KnownUriFilter knownUriFilter; + /** * {@link OutDatedUriRetriever} used to collect all the outdated URIs (URIs crawled a week ago) to recrawl. */ protected OutDatedUriRetriever outDatedUriRetriever; protected SparqlBasedOutDatedUriRetriever sparqlBasedOutDatedUriRetriever; + /** * {@link org.dice_research.squirrel.data.uri.info.URIReferences} used to * identify URIs that already have been crawled. */ protected URIReferences uriReferences = null; + /** * {@link SchemeBasedUriFilter} used to identify URIs with known protocol. */ @@ -81,148 +72,184 @@ public class FrontierImpl implements Frontier { * {@link GraphLogger} that can be added to log the crawled graph. */ protected GraphLogger graphLogger; + /** * Indicates whether recrawling is active. */ private boolean doesRecrawling; + /** * The timer that schedules the recrawling. */ private Timer timerRecrawling; + + /** + * Time (in milliseconds) after which uris will be recrawled (only used if no + * specific time is configured for a URI). + */ + private static long generalRecrawlTime; + /** * Time interval(in milliseconds) at which the check for outdated uris is * performed. */ private long timerPeriod; + /** + * Default value for {@link #generalRecrawlTime} (one week). + */ + public static final long DEFAULT_GENERAL_RECRAWL_TIME = 1000 * 60 * 60 * 24 * 7; + + /** + * Default value for {@link #timerPeriod}. + */ + private static final long DEFAULT_TIMER_PERIOD = 1000 * 60 * 60; + /** * Constructor. * - * @param normalizer {@link UriNormalizer} used to transform given URIs - * into a normal form - * @param knownUriFilter {@link UriFilter} used to identify URIs that - * already have been crawled. - * @param queue {@link UriQueue} used to manage the URIs that - * should be crawled. - * @param graphLogger {@link GraphLogger} used to log graphs. - * @param doesRecrawling used to select if URIs should be recrawled. - * @param generalRecrawlTime used to select the general Time after URIs should - * be recrawled. If Value is null the default Time is - * used. - * @param timerPeriod used to select if URIs should be recrawled. + * @param normalizer + * {@link UriNormalizer} used to transform given URIs into a normal + * form + * @param knownUriFilter + * {@link UriFilter} used to identify URIs that already have been + * crawled. + * @param queue + * {@link UriQueue} used to manage the URIs that should be crawled. + * @param graphLogger + * {@link GraphLogger} used to log graphs. + * @param doesRecrawling + * used to select if URIs should be recrawled. + * @param generalRecrawlTime + * used to select the general Time after URIs should be recrawled. If + * Value is null the default Time is used. 
+ * @param timerPeriod + * used to select if URIs should be recrawled. */ public FrontierImpl(UriNormalizer normalizer, KnownUriFilter knownUriFilter, UriQueue queue, GraphLogger graphLogger, boolean doesRecrawling, long generalRecrawlTime, long timerPeriod, OutDatedUriRetriever outDatedUriRetriever) { - this(normalizer, knownUriFilter, null, queue, graphLogger, doesRecrawling, - generalRecrawlTime, timerPeriod, outDatedUriRetriever); + this(normalizer, knownUriFilter, null, queue, graphLogger, doesRecrawling, generalRecrawlTime, timerPeriod); } /** * Constructor. * - * @param normalizer {@link UriNormalizer} used to transform given URIs - * into a normal form - * @param knownUriFilter {@link UriFilter} used to identify URIs that - * already have been crawled. - * @param queue {@link UriQueue} used to manage the URIs that - * should be crawled. - * @param doesRecrawling used to select if URIs should be recrawled. - * @param generalRecrawlTime used to select the general Time after URIs should - * be recrawled. If Value is null the default Time is - * used. - * @param timerPeriod used to select if URIs should be recrawled. + * @param normalizer + * {@link UriNormalizer} used to transform given URIs into a normal + * form + * @param knownUriFilter + * {@link UriFilter} used to identify URIs that already have been + * crawled. + * @param queue + * {@link UriQueue} used to manage the URIs that should be crawled. + * @param doesRecrawling + * used to select if URIs should be recrawled. + * @param generalRecrawlTime + * used to select the general Time after URIs should be recrawled. If + * Value is null the default Time is used. + * @param timerPeriod + * used to select if URIs should be recrawled. */ - public FrontierImpl(UriNormalizer normalizer, - KnownUriFilter knownUriFilter, UriQueue queue, boolean doesRecrawling, long generalRecrawlTime, - long timerPeriod, UriHashCustodian uriHashCustodian, OutDatedUriRetriever outDatedUriRetriever) { - this(normalizer, knownUriFilter, queue, null, doesRecrawling, generalRecrawlTime, - timerPeriod, outDatedUriRetriever); + public FrontierImpl(UriNormalizer normalizer, KnownUriFilter knownUriFilter, UriQueue queue, boolean doesRecrawling, + long generalRecrawlTime, long timerPeriod, UriHashCustodian uriHashCustodian, OutDatedUriRetriever outDatedUriRetriever) { + this(normalizer, knownUriFilter, queue, null, doesRecrawling, generalRecrawlTime, timerPeriod, outDatedUriRetriever); } /** * Constructor. * - * @param normalizer {@link UriNormalizer} used to transform given URIs into - * a normal form - * @param knownUriFilter {@link UriFilter} used to identify URIs that already - * have been crawled. - * @param uriReferences {@link URIReferences} used to manage URI references - * @param queue {@link UriQueue} used to manage the URIs that should be - * crawled. - * @param doesRecrawling Value for {@link #doesRecrawling}. + * @param normalizer + * {@link UriNormalizer} used to transform given URIs into a normal + * form + * @param knownUriFilter + * {@link UriFilter} used to identify URIs that already have been + * crawled. + * @param uriReferences + * {@link URIReferences} used to manage URI references + * @param queue + * {@link UriQueue} used to manage the URIs that should be crawled. + * @param doesRecrawling + * Value for {@link #doesRecrawling}. 
*/ - public FrontierImpl(UriNormalizer normalizer, - KnownUriFilter knownUriFilter, URIReferences uriReferences, UriQueue queue, boolean doesRecrawling, OutDatedUriRetriever outDatedUriRetriever) { - this(normalizer, knownUriFilter, uriReferences, queue, null, doesRecrawling, - DEFAULT_GENERAL_RECRAWL_TIME, DEFAULT_TIMER_PERIOD, outDatedUriRetriever); + public FrontierImpl(UriNormalizer normalizer, KnownUriFilter knownUriFilter, URIReferences uriReferences, + UriQueue queue, boolean doesRecrawling) { + this(normalizer, knownUriFilter, uriReferences, queue, null, doesRecrawling, DEFAULT_GENERAL_RECRAWL_TIME, + DEFAULT_TIMER_PERIOD); } /** * Constructor. * - * @param normalizer {@link UriNormalizer} used to transform given URIs into - * a normal form - * @param knownUriFilter {@link UriFilter} used to identify URIs that already - * have been crawled. - * @param queue {@link UriQueue} used to manage the URIs that should be - * crawled. - * @param doesRecrawling Value for {@link #doesRecrawling}. + * @param normalizer + * {@link UriNormalizer} used to transform given URIs into a normal + * form + * @param knownUriFilter + * {@link UriFilter} used to identify URIs that already have been + * crawled. + * @param queue + * {@link UriQueue} used to manage the URIs that should be crawled. + * @param doesRecrawling + * Value for {@link #doesRecrawling}. */ - public FrontierImpl(UriNormalizer normalizer, - KnownUriFilter knownUriFilter, UriQueue queue, boolean doesRecrawling, OutDatedUriRetriever outDatedUriRetriever) { - this(normalizer, knownUriFilter, queue, null, doesRecrawling, - DEFAULT_GENERAL_RECRAWL_TIME, DEFAULT_TIMER_PERIOD, outDatedUriRetriever); + public FrontierImpl(UriNormalizer normalizer, KnownUriFilter knownUriFilter, UriQueue queue, + boolean doesRecrawling, OutDatedUriRetriever outDatedUriRetriever) { + this(normalizer, knownUriFilter, queue, null, doesRecrawling, DEFAULT_GENERAL_RECRAWL_TIME, + DEFAULT_TIMER_PERIOD, outDatedUriRetriever); } /** * Constructor. * - * @param normalizer {@link UriNormalizer} used to transform given URIs into - * a normal form - * @param knownUriFilter {@link UriFilter} used to identify URIs that already - * have been crawled. - * @param queue {@link UriQueue} used to manage the URIs that should be - * crawled. + * @param normalizer + * {@link UriNormalizer} used to transform given URIs into a normal + * form + * @param knownUriFilter + * {@link UriFilter} used to identify URIs that already have been + * crawled. + * @param queue + * {@link UriQueue} used to manage the URIs that should be crawled. */ - public FrontierImpl(UriNormalizer normalizer, - KnownUriFilter knownUriFilter, UriQueue queue, OutDatedUriRetriever outDatedUriRetriever) { - this(normalizer, knownUriFilter, queue, null, false, DEFAULT_GENERAL_RECRAWL_TIME, - DEFAULT_TIMER_PERIOD, outDatedUriRetriever); + public FrontierImpl(UriNormalizer normalizer, KnownUriFilter knownUriFilter, UriQueue queue, OutDatedUriRetriever outDatedUriRetriever) { + this(normalizer, knownUriFilter, queue, null, false, DEFAULT_GENERAL_RECRAWL_TIME, DEFAULT_TIMER_PERIOD, outDatedUriRetriever); } /** * Constructor. * - * @param normalizer {@link UriNormalizer} used to transform given URIs - * into a normal form - * @param knownUriFilter {@link UriFilter} used to identify URIs that - * already have been crawled. - * @param uriReferences {@link URIReferences} used to manage URI references - * @param queue {@link UriQueue} used to manage the URIs that - * should be crawled. 
- * @param graphLogger {@link GraphLogger} used to log graphs. - * @param doesRecrawling used to select if URIs should be recrawled. - * @param generalRecrawlTime used to select the general Time after URIs should - * be recrawled. If Value is null the default Time is - * used. - * @param timerPeriod used to select if URIs should be recrawled. - * @param outDatedUriRetriever + * @param normalizer + * {@link UriNormalizer} used to transform given URIs into a normal + * form + * @param knownUriFilter + * {@link UriFilter} used to identify URIs that already have been + * crawled. + * @param uriReferences + * {@link URIReferences} used to manage URI references + * @param queue + * {@link UriQueue} used to manage the URIs that should be crawled. + * @param graphLogger + * {@link GraphLogger} used to log graphs. + * @param doesRecrawling + * used to select if URIs should be recrawled. + * @param generalRecrawlTime + * used to select the general Time after URIs should be recrawled. If + * Value is null the default Time is used. + * @param timerPeriod + * used to select if URIs should be recrawled. */ - public FrontierImpl(UriNormalizer normalizer, - KnownUriFilter knownUriFilter, URIReferences uriReferences, UriQueue queue, GraphLogger graphLogger, - boolean doesRecrawling, long generalRecrawlTime, long timerPeriod, OutDatedUriRetriever outDatedUriRetriever) { + public FrontierImpl(UriNormalizer normalizer, KnownUriFilter knownUriFilter, URIReferences uriReferences, + UriQueue queue, GraphLogger graphLogger, boolean doesRecrawling, long generalRecrawlTime, + long timerPeriod) { this.normalizer = normalizer; this.knownUriFilter = knownUriFilter; this.uriReferences = uriReferences; this.queue = queue; this.uriProcessor = new UriProcessor(); this.graphLogger = graphLogger; - this.outDatedUriRetriever = outDatedUriRetriever; this.queue.open(); this.doesRecrawling = doesRecrawling; this.timerPeriod = timerPeriod; - this.generalRecrawlTime = generalRecrawlTime; + FrontierImpl.generalRecrawlTime = generalRecrawlTime; if (this.doesRecrawling) { timerRecrawling = new Timer(); @@ -237,15 +264,13 @@ public void run() { } } - public long getGeneralRecrawlTime() { - return generalRecrawlTime; - } - @Override public List getNextUris() { -// if(terminationCheck.shouldFrontierTerminate(this)) { -// LOGGER.error("FRONTIER IS TERMINATING!", new Exception()); -// } + + // if(terminationCheck.shouldFrontierTerminate(this)) { + // LOGGER.error("FRONTIER IS TERMINATING!", new Exception()); + // } + return queue.getNextUris(); } @@ -290,6 +315,7 @@ public void addNewUri(CrawleableUri uri) { } } + @Override public void crawlingDone(List uris) { LOGGER.info("One worker finished his work and crawled " + uris.size() + " URIs."); @@ -349,6 +375,10 @@ public void close() { timerRecrawling.cancel(); } + public static long getGeneralRecrawlTime() { + return generalRecrawlTime; + } + /** * Getter for the {@link #queue}. 
* From c6f6dd1ca1e4a35689b780ce79519b7ac079c296 Mon Sep 17 00:00:00 2001 From: param-jot Date: Fri, 10 Jan 2020 11:39:32 +0100 Subject: [PATCH 037/102] Updating RDBKnownUriFilter.java --- .../squirrel/data/uri/filter/RDBKnownUriFilter.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/data/uri/filter/RDBKnownUriFilter.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/data/uri/filter/RDBKnownUriFilter.java index f1ba4a090..3b08ae054 100644 --- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/data/uri/filter/RDBKnownUriFilter.java +++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/data/uri/filter/RDBKnownUriFilter.java @@ -32,7 +32,7 @@ @SuppressWarnings("deprecation") public class RDBKnownUriFilter implements KnownUriFilter, Closeable, UriHashCustodian { private static final Logger LOGGER = LoggerFactory.getLogger(RDBKnownUriFilter.class); - FrontierImpl frontierImpl; + private RDBConnector connector = null; private RethinkDB r; @@ -125,7 +125,7 @@ public List<CrawleableUri> getOutdatedUris() { // get all uris with the following property: // (nextCrawlTimestamp has passed) AND (crawlingInProcess==false OR lastCrawlTimestamp is 3 times older than generalRecrawlTime) - long generalRecrawlTime = Math.max(frontierImpl.DEFAULT_GENERAL_RECRAWL_TIME, frontierImpl.getGeneralRecrawlTime()); + long generalRecrawlTime = Math.max(FrontierImpl.DEFAULT_GENERAL_RECRAWL_TIME, FrontierImpl.getGeneralRecrawlTime()); Cursor cursor = r.db(DATABASE_NAME) .table(TABLE_NAME) From 514f00e0e32e0c908f7c17d61cb69af3c2d2014b Mon Sep 17 00:00:00 2001 From: param-jot Date: Fri, 10 Jan 2020 11:51:34 +0100 Subject: [PATCH 038/102] Removing unused code --- .../dice_research/squirrel/components/FrontierComponent.java | 2 +- .../squirrel/frontier/impl/ExtendedFrontierImpl.java | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/components/FrontierComponent.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/components/FrontierComponent.java index d7d9e778c..7032b8a66 100644 --- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/components/FrontierComponent.java +++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/components/FrontierComponent.java @@ -99,7 +99,7 @@ public void init() throws Exception { knownUriFilter = new InMemoryKnownUriFilter(doRecrawling, recrawlingTime); } // Build frontier - frontier = new ExtendedFrontierImpl(new NormalizerImpl(), knownUriFilter, uriReferences, queue, doRecrawling, outDatedUriRetriever); + frontier = new ExtendedFrontierImpl(new NormalizerImpl(), knownUriFilter, uriReferences, queue, doRecrawling); rabbitQueue = this.incomingDataQueueFactory.createDefaultRabbitQueue(Constants.FRONTIER_QUEUE_NAME); receiver = (new RPCServer.Builder()).responseQueueFactory(outgoingDataQueuefactory).dataHandler(this) .maxParallelProcessedMsgs(100).queue(rabbitQueue).build(); diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/ExtendedFrontierImpl.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/ExtendedFrontierImpl.java index fad1e8c29..25c954ad8 100644 --- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/ExtendedFrontierImpl.java +++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/ExtendedFrontierImpl.java @@ -62,8 +62,8 @@ public ExtendedFrontierImpl(UriNormalizer
normalizer, KnownUriFilter knownUriFil * crawled. * @param doesRecrawling used to select if URIs should be recrawled. */ - public ExtendedFrontierImpl(UriNormalizer normalizer, KnownUriFilter knownUriFilter, URIReferences uriReferences, UriQueue queue, boolean doesRecrawling, OutDatedUriRetriever outDatedUriRetriever) { - super(normalizer, knownUriFilter, uriReferences, queue, doesRecrawling, outDatedUriRetriever); + public ExtendedFrontierImpl(UriNormalizer normalizer, KnownUriFilter knownUriFilter, URIReferences uriReferences, UriQueue queue, boolean doesRecrawling) { + super(normalizer, knownUriFilter, uriReferences, queue, doesRecrawling); } @Override From 6e23d5e15fdb45bef0d6d51cdea0c647c8cda49e Mon Sep 17 00:00:00 2001 From: Denis Kuchelev Date: Mon, 16 Dec 2019 04:26:18 +0100 Subject: [PATCH 039/102] Fix the formatting of test parameters --- .../squirrel/analyzer/impl/RDFAnalyzerTest.java | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/squirrel.worker/src/test/java/org/dice_research/squirrel/analyzer/impl/RDFAnalyzerTest.java b/squirrel.worker/src/test/java/org/dice_research/squirrel/analyzer/impl/RDFAnalyzerTest.java index a37ba33bb..2cd3c2661 100644 --- a/squirrel.worker/src/test/java/org/dice_research/squirrel/analyzer/impl/RDFAnalyzerTest.java +++ b/squirrel.worker/src/test/java/org/dice_research/squirrel/analyzer/impl/RDFAnalyzerTest.java @@ -38,16 +38,20 @@ public RDFAnalyzerTest(String resourceName, int expectedNumberOfTriples) { @Parameters public static Collection data() throws Exception { - return Arrays.asList(new Object[][] { { "rdf_analyzer/new_york/new_york_jsonld", 8603 }, - { "rdf_analyzer/new_york/new_york_n3", 8603 }, { "rdf_analyzer/new_york/new_york_rdf", 8603 }, - { "rdf_analyzer/new_york/new_york_rdfjson", 8603 }, { "rdf_analyzer/new_york/new_york_ttl", 8603 }, + return Arrays.asList(new Object[][] { + { "rdf_analyzer/new_york/new_york_jsonld", 8603 }, + { "rdf_analyzer/new_york/new_york_n3", 8603 }, + { "rdf_analyzer/new_york/new_york_rdf", 8603 }, + { "rdf_analyzer/new_york/new_york_rdfjson", 8603 }, + { "rdf_analyzer/new_york/new_york_ttl", 8603 }, { "rdf_analyzer/new_york/new_york_turtle", 8603 }, - { "rdf_analyzer/genders_en/genders_en_jsonld", 8408 }, { "rdf_analyzer/genders_en/genders_en_rdf", 8408 }, { "rdf_analyzer/genders_en/genders_en_rdfjson", 8408 }, - { "rdf_analyzer/genders_en/genders_en_tql", 8408 }, { "rdf_analyzer/genders_en/genders_en_ttl", 8408 }, - { "rdf_analyzer/genders_en/genders_en_turtle", 8408 } }); + { "rdf_analyzer/genders_en/genders_en_tql", 8408 }, + { "rdf_analyzer/genders_en/genders_en_ttl", 8408 }, + { "rdf_analyzer/genders_en/genders_en_turtle", 8408 }, + }); } @Test From ab374fc1ac13216faa539e342b51dfb5e157709f Mon Sep 17 00:00:00 2001 From: Denis Kuchelev Date: Mon, 16 Dec 2019 04:27:26 +0100 Subject: [PATCH 040/102] Ignore .factorypath files --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 91d057d6c..c130080fe 100644 --- a/.gitignore +++ b/.gitignore @@ -2,6 +2,7 @@ target *.log *.class .classpath +.factorypath .project .settings .idea From b7479e4cf5462790c1580170107946e4bc7c83fd Mon Sep 17 00:00:00 2001 From: Denis Kuchelev Date: Mon, 16 Dec 2019 04:29:18 +0100 Subject: [PATCH 041/102] Add an assertion message --- .../dice_research/squirrel/analyzer/impl/RDFAnalyzerTest.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/squirrel.worker/src/test/java/org/dice_research/squirrel/analyzer/impl/RDFAnalyzerTest.java 
b/squirrel.worker/src/test/java/org/dice_research/squirrel/analyzer/impl/RDFAnalyzerTest.java index 2cd3c2661..b8ddff5b2 100644 --- a/squirrel.worker/src/test/java/org/dice_research/squirrel/analyzer/impl/RDFAnalyzerTest.java +++ b/squirrel.worker/src/test/java/org/dice_research/squirrel/analyzer/impl/RDFAnalyzerTest.java @@ -76,7 +76,7 @@ public void test() throws URISyntaxException, IOException { // Analyze the file analyzer.analyze(curi, dataFile, sink); // Check the result and close the sink and collector - Assert.assertEquals(expectedNumberOfTriples, collector.getSize()); + Assert.assertEquals("Number of triples in " + resourceName, expectedNumberOfTriples, collector.getSize()); sink.closeSinkForUri(curi); collector.closeSinkForUri(curi); } From 5e81c37c60448b2bd714ec15e995e933065f68c7 Mon Sep 17 00:00:00 2001 From: Denis Kuchelev Date: Mon, 16 Dec 2019 04:30:36 +0100 Subject: [PATCH 042/102] Test amount of failed parse attempts --- .../org/dice_research/squirrel/analyzer/impl/RDFAnalyzer.java | 3 +++ .../dice_research/squirrel/analyzer/impl/RDFAnalyzerTest.java | 2 ++ 2 files changed, 5 insertions(+) diff --git a/squirrel.worker/src/main/java/org/dice_research/squirrel/analyzer/impl/RDFAnalyzer.java b/squirrel.worker/src/main/java/org/dice_research/squirrel/analyzer/impl/RDFAnalyzer.java index 27ffa59ad..f46f6634b 100644 --- a/squirrel.worker/src/main/java/org/dice_research/squirrel/analyzer/impl/RDFAnalyzer.java +++ b/squirrel.worker/src/main/java/org/dice_research/squirrel/analyzer/impl/RDFAnalyzer.java @@ -41,6 +41,8 @@ public class RDFAnalyzer extends AbstractAnalyzer { private List listLangs = new ArrayList(); private Set jenaContentTypes = new HashSet(); + protected long failedParseAttempts = 0; + public RDFAnalyzer(UriCollector collector) { super(collector); @@ -94,6 +96,7 @@ public Iterator analyze(CrawleableUri curi, File data, Sink sink) { break; } catch (Exception e) { LOGGER.warn("Could not parse file as " + l.getName()); + failedParseAttempts += 1; } } } diff --git a/squirrel.worker/src/test/java/org/dice_research/squirrel/analyzer/impl/RDFAnalyzerTest.java b/squirrel.worker/src/test/java/org/dice_research/squirrel/analyzer/impl/RDFAnalyzerTest.java index b8ddff5b2..1c93e477f 100644 --- a/squirrel.worker/src/test/java/org/dice_research/squirrel/analyzer/impl/RDFAnalyzerTest.java +++ b/squirrel.worker/src/test/java/org/dice_research/squirrel/analyzer/impl/RDFAnalyzerTest.java @@ -77,6 +77,8 @@ public void test() throws URISyntaxException, IOException { analyzer.analyze(curi, dataFile, sink); // Check the result and close the sink and collector Assert.assertEquals("Number of triples in " + resourceName, expectedNumberOfTriples, collector.getSize()); + Assert.assertTrue("Failed parse attempts for " + resourceName + ": " + ((RDFAnalyzer)analyzer).failedParseAttempts + " <= 6", + ((RDFAnalyzer)analyzer).failedParseAttempts <= 6); sink.closeSinkForUri(curi); collector.closeSinkForUri(curi); } From 9f0b58e286271ab404edc4c75832002fe30be948 Mon Sep 17 00:00:00 2001 From: Denis Kuchelev Date: Mon, 16 Dec 2019 04:43:04 +0100 Subject: [PATCH 043/102] Remove TTL as an alias for TURTLE --- .../org/dice_research/squirrel/analyzer/impl/RDFAnalyzer.java | 1 - 1 file changed, 1 deletion(-) diff --git a/squirrel.worker/src/main/java/org/dice_research/squirrel/analyzer/impl/RDFAnalyzer.java b/squirrel.worker/src/main/java/org/dice_research/squirrel/analyzer/impl/RDFAnalyzer.java index f46f6634b..d4a4907d1 100644 --- 
a/squirrel.worker/src/main/java/org/dice_research/squirrel/analyzer/impl/RDFAnalyzer.java +++ b/squirrel.worker/src/main/java/org/dice_research/squirrel/analyzer/impl/RDFAnalyzer.java @@ -54,7 +54,6 @@ public RDFAnalyzer(UriCollector collector) { listLangs.add(Lang.JSONLD); listLangs.add(Lang.TRIG); listLangs.add(Lang.TRIX); - listLangs.add(Lang.TTL); listLangs.add(Lang.TURTLE); for (Lang lang : RDFLanguages.getRegisteredLanguages()) { From 3e8e3a563ed1267b32f8ba4157d73b6318baa5f2 Mon Sep 17 00:00:00 2001 From: Denis Kuchelev Date: Mon, 16 Dec 2019 04:44:50 +0100 Subject: [PATCH 044/102] Remove NT as a subset of NQUADS --- .../org/dice_research/squirrel/analyzer/impl/RDFAnalyzer.java | 1 - .../dice_research/squirrel/analyzer/impl/RDFAnalyzerTest.java | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/squirrel.worker/src/main/java/org/dice_research/squirrel/analyzer/impl/RDFAnalyzer.java b/squirrel.worker/src/main/java/org/dice_research/squirrel/analyzer/impl/RDFAnalyzer.java index d4a4907d1..fc0e7a8f5 100644 --- a/squirrel.worker/src/main/java/org/dice_research/squirrel/analyzer/impl/RDFAnalyzer.java +++ b/squirrel.worker/src/main/java/org/dice_research/squirrel/analyzer/impl/RDFAnalyzer.java @@ -46,7 +46,6 @@ public class RDFAnalyzer extends AbstractAnalyzer { public RDFAnalyzer(UriCollector collector) { super(collector); - listLangs.add(Lang.NT); listLangs.add(Lang.NQUADS); listLangs.add(Lang.RDFJSON); listLangs.add(Lang.RDFTHRIFT); diff --git a/squirrel.worker/src/test/java/org/dice_research/squirrel/analyzer/impl/RDFAnalyzerTest.java b/squirrel.worker/src/test/java/org/dice_research/squirrel/analyzer/impl/RDFAnalyzerTest.java index 1c93e477f..14fccb694 100644 --- a/squirrel.worker/src/test/java/org/dice_research/squirrel/analyzer/impl/RDFAnalyzerTest.java +++ b/squirrel.worker/src/test/java/org/dice_research/squirrel/analyzer/impl/RDFAnalyzerTest.java @@ -77,8 +77,8 @@ public void test() throws URISyntaxException, IOException { analyzer.analyze(curi, dataFile, sink); // Check the result and close the sink and collector Assert.assertEquals("Number of triples in " + resourceName, expectedNumberOfTriples, collector.getSize()); - Assert.assertTrue("Failed parse attempts for " + resourceName + ": " + ((RDFAnalyzer)analyzer).failedParseAttempts + " <= 6", - ((RDFAnalyzer)analyzer).failedParseAttempts <= 6); + Assert.assertTrue("Failed parse attempts for " + resourceName + ": " + ((RDFAnalyzer)analyzer).failedParseAttempts + " <= 5", + ((RDFAnalyzer)analyzer).failedParseAttempts <= 5); sink.closeSinkForUri(curi); collector.closeSinkForUri(curi); } From 327d6ec58db1e3940452a776c0df935cfb94c027 Mon Sep 17 00:00:00 2001 From: Denis Kuchelev Date: Wed, 15 Jan 2020 03:14:11 +0100 Subject: [PATCH 045/102] Add an example TriG test file --- .../analyzer/impl/RDFAnalyzerTest.java | 1 + .../test/resources/rdf_analyzer/trig_example | 22 +++++++++++++++++++ 2 files changed, 23 insertions(+) create mode 100644 squirrel.worker/src/test/resources/rdf_analyzer/trig_example diff --git a/squirrel.worker/src/test/java/org/dice_research/squirrel/analyzer/impl/RDFAnalyzerTest.java b/squirrel.worker/src/test/java/org/dice_research/squirrel/analyzer/impl/RDFAnalyzerTest.java index 14fccb694..f7a80db52 100644 --- a/squirrel.worker/src/test/java/org/dice_research/squirrel/analyzer/impl/RDFAnalyzerTest.java +++ b/squirrel.worker/src/test/java/org/dice_research/squirrel/analyzer/impl/RDFAnalyzerTest.java @@ -51,6 +51,7 @@ public static Collection data() throws Exception { { 
"rdf_analyzer/genders_en/genders_en_tql", 8408 }, { "rdf_analyzer/genders_en/genders_en_ttl", 8408 }, { "rdf_analyzer/genders_en/genders_en_turtle", 8408 }, + { "rdf_analyzer/trig_example", 15 }, }); } diff --git a/squirrel.worker/src/test/resources/rdf_analyzer/trig_example b/squirrel.worker/src/test/resources/rdf_analyzer/trig_example new file mode 100644 index 000000000..e3508dc73 --- /dev/null +++ b/squirrel.worker/src/test/resources/rdf_analyzer/trig_example @@ -0,0 +1,22 @@ +@prefix rdf: . +@prefix dc: . +@prefix foaf: . + +# default graph - no {} used. + dc:publisher "Bob" . + dc:publisher "Alice" . + +# GRAPH keyword to highlight a named graph +# Abbreviation of triples using ; +GRAPH +{ + [] foaf:name "Bob" ; + foaf:mbox ; + foaf:knows _:b . +} + +GRAPH +{ + _:b foaf:name "Alice" ; + foaf:mbox +} From e64128db213caea5c9dc2c6da2bed5befcd89d5f Mon Sep 17 00:00:00 2001 From: Denis Kuchelev Date: Wed, 15 Jan 2020 03:15:15 +0100 Subject: [PATCH 046/102] Time RDFAnalyzer tests --- .../analyzer/impl/RDFAnalyzerTest.java | 30 ++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/squirrel.worker/src/test/java/org/dice_research/squirrel/analyzer/impl/RDFAnalyzerTest.java b/squirrel.worker/src/test/java/org/dice_research/squirrel/analyzer/impl/RDFAnalyzerTest.java index f7a80db52..67d8b8574 100644 --- a/squirrel.worker/src/test/java/org/dice_research/squirrel/analyzer/impl/RDFAnalyzerTest.java +++ b/squirrel.worker/src/test/java/org/dice_research/squirrel/analyzer/impl/RDFAnalyzerTest.java @@ -10,6 +10,7 @@ import java.net.URISyntaxException; import java.util.Arrays; import java.util.Collection; +import java.util.concurrent.TimeUnit; import org.apache.commons.io.IOUtils; import org.dice_research.squirrel.analyzer.Analyzer; @@ -19,14 +20,34 @@ import org.dice_research.squirrel.data.uri.serialize.java.GzipJavaUriSerializer; import org.dice_research.squirrel.sink.Sink; import org.dice_research.squirrel.sink.impl.mem.InMemorySink; +import org.junit.AfterClass; import org.junit.Assert; +import org.junit.Rule; import org.junit.Test; +import org.junit.rules.Stopwatch; +import org.junit.runner.Description; import org.junit.runner.RunWith; import org.junit.runners.Parameterized; import org.junit.runners.Parameterized.Parameters; @RunWith(Parameterized.class) public class RDFAnalyzerTest { + long startTime; + long endTime; + static long totalTime; + + @Rule + public Stopwatch stopwatch = new Stopwatch() { + @Override + protected void succeeded(long ns, Description desc) { + totalTime += (endTime - startTime); + } + }; + + @AfterClass + public static void afterClass() { + System.err.println(String.format("RDFAnalyzerTest %d", totalTime)); + } private String resourceName; private int expectedNumberOfTriples; @@ -48,7 +69,7 @@ public static Collection data() throws Exception { { "rdf_analyzer/genders_en/genders_en_jsonld", 8408 }, { "rdf_analyzer/genders_en/genders_en_rdf", 8408 }, { "rdf_analyzer/genders_en/genders_en_rdfjson", 8408 }, - { "rdf_analyzer/genders_en/genders_en_tql", 8408 }, + { "rdf_analyzer/genders_en/genders_en_tql", 8410 }, { "rdf_analyzer/genders_en/genders_en_ttl", 8408 }, { "rdf_analyzer/genders_en/genders_en_turtle", 8408 }, { "rdf_analyzer/trig_example", 15 }, @@ -74,8 +95,15 @@ public void test() throws URISyntaxException, IOException { // Open the sink and collector sink.openSinkForUri(curi); collector.openSinkForUri(curi); + + // Need to do this even if other things are moved to Before/After due how Stopwatch works. 
+ startTime = stopwatch.runtime(TimeUnit.MILLISECONDS); + // Analyze the file analyzer.analyze(curi, dataFile, sink); + + endTime = stopwatch.runtime(TimeUnit.MILLISECONDS); + // Check the result and close the sink and collector Assert.assertEquals("Number of triples in " + resourceName, expectedNumberOfTriples, collector.getSize()); Assert.assertTrue("Failed parse attempts for " + resourceName + ": " + ((RDFAnalyzer)analyzer).failedParseAttempts + " <= 5", From 8b7fdcf9d38ef2962bb4cecbac607697815f620f Mon Sep 17 00:00:00 2001 From: Micha Date: Sun, 5 Apr 2020 12:45:16 +0200 Subject: [PATCH 047/102] Cleaned up parent pom.xml. --- pom.xml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pom.xml b/pom.xml index a88909b56..33016377b 100644 --- a/pom.xml +++ b/pom.xml @@ -60,12 +60,12 @@ maven.aksw.internal AKSW Internal Release Repository - http://maven.aksw.org/repository/internal/ + https://maven.aksw.org/repository/internal/ maven.aksw.snapshots University Leipzig, AKSW Maven2 Repository - http://maven.aksw.org/repository/snapshots + https://maven.aksw.org/repository/snapshots spring-releases From 6d79c349818cc8c4a9b27e63d31103596641b8dc Mon Sep 17 00:00:00 2001 From: Geraldo Date: Fri, 17 Apr 2020 18:23:00 +0200 Subject: [PATCH 048/102] fixed queries for same predicate --- .../impl/html/scraper/HtmlScraper.java | 41 +++++++++++++++++-- 1 file changed, 37 insertions(+), 4 deletions(-) diff --git a/squirrel.worker/src/main/java/org/dice_research/squirrel/analyzer/impl/html/scraper/HtmlScraper.java b/squirrel.worker/src/main/java/org/dice_research/squirrel/analyzer/impl/html/scraper/HtmlScraper.java index 8c9bff0fe..5b17736e8 100644 --- a/squirrel.worker/src/main/java/org/dice_research/squirrel/analyzer/impl/html/scraper/HtmlScraper.java +++ b/squirrel.worker/src/main/java/org/dice_research/squirrel/analyzer/impl/html/scraper/HtmlScraper.java @@ -19,7 +19,6 @@ import org.apache.jena.graph.NodeFactory; import org.apache.jena.graph.Triple; import org.apache.jena.rdf.model.ResourceFactory; -import org.apache.jena.tdb.store.Hash; import org.dice_research.squirrel.analyzer.impl.html.scraper.exceptions.SyntaxParserException; import org.dice_research.squirrel.data.uri.UriUtils; import org.jsoup.Jsoup; @@ -77,13 +76,15 @@ public List scrape(String uri, File filetToScrape) throws Exception { this.uri = uri; if (uri.contains("?")) { - this.label = uri.substring(uri.lastIndexOf("/") + 1, uri.lastIndexOf("?")); + String temp = uri.substring(0 , uri.indexOf("?")+1); + this.label = temp.substring(temp.lastIndexOf("/") + 1, temp.lastIndexOf("?")); } else { this.label = uri.substring(uri.lastIndexOf("/") + 1, uri.length()); } if ((boolean) yamlFile.getFile_descriptor().get(YamlFileAtributes.SEARCH_CHECK).get("ignore-request") && uri.contains("?")) { - label = uri.substring(uri.lastIndexOf("/") + 1, uri.lastIndexOf("?")); + String temp = uri.substring(0 , uri.indexOf("?")+1); + this.label = temp.substring(temp.lastIndexOf("/") + 1, temp.lastIndexOf("?")); this.uri = uri.substring(0, uri.indexOf("?")); } @@ -274,7 +275,39 @@ private Set scrapeTree(Map mapEntry, Set triples stackNode.push(node); triples.addAll(scrapeTree((Map) entry.getValue(), triples, stackNode)); - } else if (entry.getValue() instanceof String) { + }else if(entry.getValue() instanceof ArrayList) { + + List listValues = (ArrayList)entry.getValue(); + + Node p = ResourceFactory.createResource(entry.getKey()).asNode(); + + for(String v : listValues) { + + + List o = jsoupQuery(v); + if (o.isEmpty()) { + 
LOGGER.warn("Element " + entry.getKey() + ": " + v + " not found or does not exist"); + continue; + } + int i=0; + for (Node n : o) { + if(listIterableObjects.contains(stackNode.peek().toString())) { + Triple t = new Triple(NodeFactory.createURI(stackNode.peek().toString() + "_" + i), p, n); + updatedObjects.add(NodeFactory.createURI(stackNode.peek().toString() + "_" + i)); + triples.add(t); + i = i + 1; + + }else { + Triple t = new Triple(NodeFactory.createURI(stackNode.peek().toString()), p, n); + triples.add(t); + } + } + + + } + + + }else if (entry.getValue() instanceof String) { Node p = ResourceFactory.createResource(entry.getKey()).asNode(); From 0c8bd5688ccf03eb85a7253b483601642d0accfd Mon Sep 17 00:00:00 2001 From: Geraldo Date: Mon, 4 May 2020 11:00:39 +0200 Subject: [PATCH 049/102] uri depth impl --- .../org/dice_research/squirrel/Constants.java | 1 + .../squirrel/data/uri/UriSeedReader.java | 8 +- .../squirrel/queue/AbstractGroupingQueue.java | 13 + .../squirrel/queue/InMemoryQueue.java | 6 + .../squirrel/queue/UriQueue.java | 5 + .../components/FrontierComponent.java | 13 +- .../data/uri/filter/MongoDBKnowUriFilter.java | 367 +++++++++--------- .../domainbased/MongoDBDomainBasedQueue.java | 12 +- .../queue/ipbased/MongoDBIpBasedQueue.java | 22 +- .../frontier/impl/FrontierImplTest.java | 4 +- .../domainbased/MongoDBDomainQueueTest.java | 5 +- .../ipbased/MongoDBIpBasedQueueTest.java | 2 +- .../collect/SqlBasedUriCollector.java | 9 +- .../squirrel/collect/UriCollector.java | 1 + 14 files changed, 271 insertions(+), 197 deletions(-) diff --git a/squirrel.api/src/main/java/org/dice_research/squirrel/Constants.java b/squirrel.api/src/main/java/org/dice_research/squirrel/Constants.java index 1683d7641..195add74c 100644 --- a/squirrel.api/src/main/java/org/dice_research/squirrel/Constants.java +++ b/squirrel.api/src/main/java/org/dice_research/squirrel/Constants.java @@ -37,6 +37,7 @@ public class Constants { public static final String URI_CRAWLING_ACTIVITY_URI = "activity-uri"; public static final String URI_CRAWLING_ACTIVITY = "activity"; + public static final String URI_DEPTH = "depth"; public static final String URI_HASH_KEY = "HashValue"; public static final String UUID_KEY = "UUID"; diff --git a/squirrel.api/src/main/java/org/dice_research/squirrel/data/uri/UriSeedReader.java b/squirrel.api/src/main/java/org/dice_research/squirrel/data/uri/UriSeedReader.java index 724c0accc..9ae0dfb33 100644 --- a/squirrel.api/src/main/java/org/dice_research/squirrel/data/uri/UriSeedReader.java +++ b/squirrel.api/src/main/java/org/dice_research/squirrel/data/uri/UriSeedReader.java @@ -1,10 +1,12 @@ package org.dice_research.squirrel.data.uri; +import java.io.File; import java.io.FileReader; import java.io.IOException; import java.io.Reader; import java.net.URI; import java.net.URISyntaxException; +import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.List; import java.util.Map; @@ -13,6 +15,7 @@ import org.apache.commons.csv.CSVFormat; import org.apache.commons.csv.CSVRecord; +import org.apache.commons.io.FileUtils; import org.apache.tika.Tika; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -30,12 +33,15 @@ public class UriSeedReader { private static final String URI = "uri"; private boolean isCsv = true; private Reader in; + String filePath = ""; + private static final Logger LOGGER = LoggerFactory.getLogger(UriSeedReader.class); public UriSeedReader(String seedFile) { + this.filePath = seedFile; Tika tika = new Tika(); String mimetype = 
tika.detect(seedFile); isCsv = "text/csv".equals(mimetype); @@ -76,7 +82,7 @@ public List getUris() throws IllegalArgumentException, IOExceptio } }else { - LOGGER.error("Seed file is not a CSV file"); + listUris = UriUtils.createCrawleableUriList(FileUtils.readLines(new File(filePath), StandardCharsets.UTF_8)); } return listUris; diff --git a/squirrel.api/src/main/java/org/dice_research/squirrel/queue/AbstractGroupingQueue.java b/squirrel.api/src/main/java/org/dice_research/squirrel/queue/AbstractGroupingQueue.java index cb602c1f1..8e84b3d11 100644 --- a/squirrel.api/src/main/java/org/dice_research/squirrel/queue/AbstractGroupingQueue.java +++ b/squirrel.api/src/main/java/org/dice_research/squirrel/queue/AbstractGroupingQueue.java @@ -30,6 +30,11 @@ public abstract class AbstractGroupingQueue implements BlockingQueue { * Set of blocked key values. */ private Set blockedKeys = new HashSet(); + + /** + * Whether the queue will store the crawl depth or not. + */ + protected boolean includeDepth = false; /** * Constructor. @@ -134,5 +139,13 @@ public SimpleEntry> next() { * set of URIs which should be removed */ protected abstract void deleteUris(T groupKey, List uris); + + /** + * Returns whether the queue is storing the crawled depth. + * + */ + public boolean isDepthIncluded() { + return this.includeDepth; + } } diff --git a/squirrel.api/src/main/java/org/dice_research/squirrel/queue/InMemoryQueue.java b/squirrel.api/src/main/java/org/dice_research/squirrel/queue/InMemoryQueue.java index 159cbca16..204f625ff 100644 --- a/squirrel.api/src/main/java/org/dice_research/squirrel/queue/InMemoryQueue.java +++ b/squirrel.api/src/main/java/org/dice_research/squirrel/queue/InMemoryQueue.java @@ -92,4 +92,10 @@ public int compare(InetAddress a1, InetAddress a2) { return 0; } + @Override + public boolean isDepthIncluded() { + // TODO Auto-generated method stub + return false; + } + } diff --git a/squirrel.api/src/main/java/org/dice_research/squirrel/queue/UriQueue.java b/squirrel.api/src/main/java/org/dice_research/squirrel/queue/UriQueue.java index 8be207c7c..ff0b7baf4 100644 --- a/squirrel.api/src/main/java/org/dice_research/squirrel/queue/UriQueue.java +++ b/squirrel.api/src/main/java/org/dice_research/squirrel/queue/UriQueue.java @@ -47,4 +47,9 @@ public interface UriQueue { * Opens the queue and allocates necessary resources.
*/ public void open(); + + /** + * Check if the queue is storing the depth + */ + public boolean isDepthIncluded(); } diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/components/FrontierComponent.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/components/FrontierComponent.java index 389bbf2bd..4eb9ea714 100644 --- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/components/FrontierComponent.java +++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/components/FrontierComponent.java @@ -3,7 +3,6 @@ import java.io.Closeable; import java.io.File; import java.io.IOException; -import java.nio.charset.StandardCharsets; import java.util.List; import java.util.Map; import java.util.Map.Entry; @@ -11,7 +10,6 @@ import java.util.TimerTask; import java.util.concurrent.Semaphore; -import org.apache.commons.io.FileUtils; import org.dice_research.squirrel.Constants; import org.dice_research.squirrel.configurator.MongoConfiguration; import org.dice_research.squirrel.configurator.SeedConfiguration; @@ -19,12 +17,10 @@ import org.dice_research.squirrel.configurator.WhiteListConfiguration; import org.dice_research.squirrel.data.uri.CrawleableUri; import org.dice_research.squirrel.data.uri.UriSeedReader; -import org.dice_research.squirrel.data.uri.UriUtils; import org.dice_research.squirrel.data.uri.filter.InMemoryKnownUriFilter; import org.dice_research.squirrel.data.uri.filter.KnownUriFilter; import org.dice_research.squirrel.data.uri.filter.RegexBasedWhiteListFilter; import org.dice_research.squirrel.data.uri.info.URIReferences; -import org.dice_research.squirrel.data.uri.norm.NormalizerImpl; import org.dice_research.squirrel.data.uri.norm.UriGenerator; import org.dice_research.squirrel.data.uri.norm.UriNormalizer; import org.dice_research.squirrel.data.uri.serialize.Serializer; @@ -255,15 +251,18 @@ private void responseToUriSetRequest(ResponseHandler handler, String responseQue LOGGER.warn("Got a UriSetRequest object without a ResponseHandler. No response will be sent."); } } + + private List initializeDepth(List listUris){ + listUris.forEach(uri -> uri.addData(Constants.URI_DEPTH, 1)); + return listUris; + } protected void processSeedFile(String seedFile) { try { - List listSeeds = new UriSeedReader(seedFile).getUris(); + List listSeeds = initializeDepth(new UriSeedReader(seedFile).getUris()); if (!listSeeds.isEmpty()) frontier.addNewUris(listSeeds); - List lines = FileUtils.readLines(new File(seedFile), StandardCharsets.UTF_8); - frontier.addNewUris(UriUtils.createCrawleableUriList(lines)); } catch (Exception e) { LOGGER.error("Couldn't process seed file. 
It will be ignored.", e); } diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/data/uri/filter/MongoDBKnowUriFilter.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/data/uri/filter/MongoDBKnowUriFilter.java index f9a8e022c..6bce33717 100644 --- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/data/uri/filter/MongoDBKnowUriFilter.java +++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/data/uri/filter/MongoDBKnowUriFilter.java @@ -13,6 +13,7 @@ import org.bson.Document; import org.bson.conversions.Bson; +import org.dice_research.squirrel.Constants; import org.dice_research.squirrel.configurator.MongoConfiguration; import org.dice_research.squirrel.data.uri.CrawleableUri; import org.dice_research.squirrel.data.uri.UriType; @@ -41,188 +42,208 @@ */ @SuppressWarnings("deprecation") -public class MongoDBKnowUriFilter implements KnownUriFilter, Cloneable, Closeable,UriHashCustodian { - - private static final Logger LOGGER = LoggerFactory.getLogger(MongoDBKnowUriFilter.class); - - private MongoClient client; - private MongoDatabase mongoDB; - public static final String DB_NAME = "squirrel"; - private Integer recrawlEveryWeek = 60 * 60 * 24 * 7 * 1000; // in miiliseconds - public static final String COLLECTION_NAME = "knownurifilter"; - - public static final String COLUMN_TIMESTAMP_LAST_CRAWL = "timestampLastCrawl"; - public static final String COLUMN_URI = "uri"; - public static final String COLUMN_CRAWLING_IN_PROCESS = "crawlingInProcess"; - public static final String COLUMN_TIMESTAMP_NEXT_CRAWL = "timestampNextCrawl"; - public static final String COLUMN_IP = "ipAddress"; - public static final String COLUMN_TYPE = "type"; - public static final String COLUMN_HASH_VALUE = "hashValue"; - private static final boolean PERSIST = System.getenv("QUEUE_FILTER_PERSIST") == null ? false : Boolean.parseBoolean(System.getenv("QUEUE_FILTER_PERSIST")); - /** - * Used as a default hash value for URIS, will be replaced by real hash value as soon as it has been computed. 
- */ - private static final String DUMMY_HASH_VALUE = "dummyValue"; - - public MongoDBKnowUriFilter(String hostName, Integer port) { - - LOGGER.info("Filter Persistance: " + PERSIST); - - - MongoClientOptions.Builder optionsBuilder = MongoClientOptions.builder(); - MongoConfiguration mongoConfiguration = MongoConfiguration.getMDBConfiguration(); - - if(mongoConfiguration != null &&(mongoConfiguration.getConnectionTimeout() != null && mongoConfiguration.getSocketTimeout() != null && mongoConfiguration.getServerTimeout() != null)) { +public class MongoDBKnowUriFilter implements KnownUriFilter, Cloneable, Closeable, UriHashCustodian { + + private static final Logger LOGGER = LoggerFactory.getLogger(MongoDBKnowUriFilter.class); + + private MongoClient client; + private MongoDatabase mongoDB; + private int max_depth; + public static final String DB_NAME = "squirrel"; + private Integer recrawlEveryWeek = 60 * 60 * 24 * 7 * 1000; // in milliseconds + public static final String COLLECTION_NAME = "knownurifilter"; + + public static final String COLUMN_TIMESTAMP_LAST_CRAWL = "timestampLastCrawl"; + public static final String COLUMN_URI = "uri"; + public static final String COLUMN_CRAWLING_IN_PROCESS = "crawlingInProcess"; + public static final String COLUMN_TIMESTAMP_NEXT_CRAWL = "timestampNextCrawl"; + public static final String COLUMN_IP = "ipAddress"; + public static final String COLUMN_TYPE = "type"; + public static final String COLUMN_HASH_VALUE = "hashValue"; + private static final boolean PERSIST = System.getenv("QUEUE_FILTER_PERSIST") == null ? false + : Boolean.parseBoolean(System.getenv("QUEUE_FILTER_PERSIST")); + /** + * Used as a default hash value for URIs, will be replaced by the real hash value as + * soon as it has been computed. + */ + private static final String DUMMY_HASH_VALUE = "dummyValue"; + + public MongoDBKnowUriFilter(String hostName, Integer port) { + + LOGGER.info("Filter Persistence: " + PERSIST); + + MongoClientOptions.Builder optionsBuilder = MongoClientOptions.builder(); + MongoConfiguration mongoConfiguration = MongoConfiguration.getMDBConfiguration(); + + if (mongoConfiguration != null && (mongoConfiguration.getConnectionTimeout() != null + && mongoConfiguration.getSocketTimeout() != null && mongoConfiguration.getServerTimeout() != null)) { optionsBuilder.connectTimeout(mongoConfiguration.getConnectionTimeout()); optionsBuilder.socketTimeout(mongoConfiguration.getSocketTimeout()); optionsBuilder.serverSelectionTimeout(mongoConfiguration.getServerTimeout()); - + MongoClientOptions options = optionsBuilder.build(); - client = new MongoClient(new ServerAddress(hostName, port),options); - - }else { + client = new MongoClient(new ServerAddress(hostName, port), options); + + } else { client = new MongoClient(hostName, port); } - } - - @Override - public boolean isUriGood(CrawleableUri uri) { - MongoCursor cursor = mongoDB.getCollection(COLLECTION_NAME) - .find(new Document("uri", uri.getUri().toString())).iterator(); - - if (cursor.hasNext()) { - LOGGER.debug("URI {} is not good", uri.toString()); - Document doc = cursor.next(); - Long timestampRetrieved = Long.parseLong(doc.get(COLUMN_TIMESTAMP_LAST_CRAWL).toString()); - cursor.close(); - if ((System.currentTimeMillis() - timestampRetrieved) < recrawlEveryWeek) { - return false; - } else { - return true; - } - } else { - LOGGER.debug("URI {} is good", uri.toString()); - cursor.close(); - return true; - } - - } - - @Override - public void add(CrawleableUri uri, long nextCrawlTimestamp) { - add(uri, System.currentTimeMillis(),
nextCrawlTimestamp); - } - - public Document crawleableUriToMongoDocument(CrawleableUri uri) { - - UriType uriType = uri.getType(); - - return new Document("uri", uri.getUri().toString()).append("type", uriType.toString()); - - } - - @Override - public void close() throws IOException { - if(!PERSIST) { - mongoDB.getCollection(COLLECTION_NAME).drop(); - - } - client.close(); - } - - public void open() { - mongoDB = client.getDatabase(DB_NAME); - if (!knowUriTableExists()) { - mongoDB.createCollection(COLLECTION_NAME); - MongoCollection mongoCollection = mongoDB.getCollection(COLLECTION_NAME); - mongoCollection.createIndex(Indexes.compoundIndex(Indexes.ascending("uri"))); - } - } - - public boolean knowUriTableExists() { - for (String collection : mongoDB.listCollectionNames()) { - if (collection.toLowerCase().equals(COLLECTION_NAME.toLowerCase())) { - return true; - } else { - return false; - } - } - return false; - } - - @Override - public void add(CrawleableUri uri, long lastCrawlTimestamp, long nextCrawlTimestamp) { - mongoDB.getCollection(COLLECTION_NAME) - .insertOne(crawleableUriToMongoDocument(uri) - .append(COLUMN_TIMESTAMP_LAST_CRAWL, lastCrawlTimestamp) - .append(COLUMN_TIMESTAMP_NEXT_CRAWL, nextCrawlTimestamp) - .append(COLUMN_CRAWLING_IN_PROCESS, false) - .append(COLUMN_HASH_VALUE, DUMMY_HASH_VALUE) - ); - LOGGER.debug("Adding URI {} to the known uri filter list", uri.toString()); - } - - @Override - public void addHashValuesForUris(List uris) { - - } - - - public void purge() { - mongoDB.getCollection(COLLECTION_NAME).drop(); - } - - @Override - public List getOutdatedUris() { - // get all uris with the following property: - // (nextCrawlTimestamp has passed) AND (crawlingInProcess==false OR lastCrawlTimestamp is 3 times older than generalRecrawlTime) - - long generalRecrawlTime = Math.max(FrontierImpl.DEFAULT_GENERAL_RECRAWL_TIME, FrontierImpl.getGeneralRecrawlTime()); - - Bson filter = Filters.and(Filters.eq("COLUMN_TIMESTAMP_NEXT_CRAWL", System.currentTimeMillis()), - Filters.or( - Filters.eq("COLUMN_CRAWLING_IN_PROCESS", false), - Filters.eq("COLUMN_TIMESTAMP_LAST_CRAWL", System.currentTimeMillis() - generalRecrawlTime * 3) - )); - - Iterator uriDocs = mongoDB.getCollection(COLLECTION_NAME).find(filter).iterator(); - - List urisToRecrawl = new ArrayList<>(); - while (uriDocs.hasNext()) { - try { - Document doc = uriDocs.next(); - String ipString = (String) doc.get(COLUMN_IP); - if (ipString.contains("/")) { - ipString = ipString.split("/")[1]; - } - urisToRecrawl.add(new CrawleableUri(new URI((String) doc.get(COLUMN_URI)), InetAddress.getByName(ipString))); - } catch (URISyntaxException | UnknownHostException e) { - LOGGER.warn(e.toString()); - } - } - - // mark that the uris are in process now - for (CrawleableUri uri : urisToRecrawl) { - - BasicDBObject newDocument = new BasicDBObject(); - newDocument.append("$set", new BasicDBObject().append(COLUMN_CRAWLING_IN_PROCESS, true)); - - BasicDBObject searchQuery = new BasicDBObject().append(COLUMN_URI, uri.getUri().toString()); - - mongoDB.getCollection(COLLECTION_NAME).updateMany(searchQuery, newDocument); - - } + } + + public MongoDBKnowUriFilter(String hostName, Integer port, int max_depth) { + this(hostName, port); + this.max_depth = max_depth; + } + + @Override + public boolean isUriGood(CrawleableUri uri) { + MongoCursor cursor = mongoDB.getCollection(COLLECTION_NAME) + .find(new Document("uri", uri.getUri().toString())).iterator(); + + if (cursor.hasNext()) { + LOGGER.debug("URI {} is not good", uri.toString()); + 
Document doc = cursor.next(); + Long timestampRetrieved = Long.parseLong(doc.get(COLUMN_TIMESTAMP_LAST_CRAWL).toString()); + cursor.close(); + if ((System.currentTimeMillis() - timestampRetrieved) < recrawlEveryWeek) { + return false; + } else { + return true; + } + } else { + + if (uri.getData().containsKey(Constants.URI_DEPTH)) { + int depth = Integer.parseInt(uri.getData(Constants.URI_DEPTH).toString()); + if (depth > max_depth) { + LOGGER.debug("Max Depth reached. Uri {} is not good", uri.toString()); + cursor.close(); + return false; + } + + } + + LOGGER.debug("URI {} is good", uri.toString()); + cursor.close(); + return true; + } + + } + + @Override + public void add(CrawleableUri uri, long nextCrawlTimestamp) { + add(uri, System.currentTimeMillis(), nextCrawlTimestamp); + } + + public Document crawleableUriToMongoDocument(CrawleableUri uri) { + + UriType uriType = uri.getType(); + + if (uri.getData().containsKey(Constants.URI_DEPTH)) + return new Document("uri", uri.getUri().toString()).append("type", uriType.toString()).append("depth", + Integer.parseInt(uri.getData(Constants.URI_DEPTH).toString())); + + return new Document("uri", uri.getUri().toString()).append("type", uriType.toString()); + + } + + @Override + public void close() throws IOException { + if (!PERSIST) { + mongoDB.getCollection(COLLECTION_NAME).drop(); + + } + client.close(); + } + + public void open() { + mongoDB = client.getDatabase(DB_NAME); + if (!knowUriTableExists()) { + mongoDB.createCollection(COLLECTION_NAME); + MongoCollection mongoCollection = mongoDB.getCollection(COLLECTION_NAME); + mongoCollection.createIndex(Indexes.compoundIndex(Indexes.ascending("uri"))); + } + } + + public boolean knowUriTableExists() { + for (String collection : mongoDB.listCollectionNames()) { + if (collection.toLowerCase().equals(COLLECTION_NAME.toLowerCase())) { + return true; + } else { + return false; + } + } + return false; + } + + @Override + public void add(CrawleableUri uri, long lastCrawlTimestamp, long nextCrawlTimestamp) { + mongoDB.getCollection(COLLECTION_NAME) + .insertOne(crawleableUriToMongoDocument(uri).append(COLUMN_TIMESTAMP_LAST_CRAWL, lastCrawlTimestamp) + .append(COLUMN_TIMESTAMP_NEXT_CRAWL, nextCrawlTimestamp) + .append(COLUMN_CRAWLING_IN_PROCESS, false).append(COLUMN_HASH_VALUE, DUMMY_HASH_VALUE)); + LOGGER.debug("Adding URI {} to the known uri filter list", uri.toString()); + } + + @Override + public void addHashValuesForUris(List uris) { + + } + + public void purge() { + mongoDB.getCollection(COLLECTION_NAME).drop(); + } + + @Override + public List getOutdatedUris() { + // get all uris with the following property: + // (nextCrawlTimestamp has passed) AND (crawlingInProcess==false OR + // lastCrawlTimestamp is 3 times older than generalRecrawlTime) + + long generalRecrawlTime = Math.max(FrontierImpl.DEFAULT_GENERAL_RECRAWL_TIME, + FrontierImpl.getGeneralRecrawlTime()); + + Bson filter = Filters.and(Filters.eq("COLUMN_TIMESTAMP_NEXT_CRAWL", System.currentTimeMillis()), Filters.or( + Filters.eq("COLUMN_CRAWLING_IN_PROCESS", false), + Filters.eq("COLUMN_TIMESTAMP_LAST_CRAWL", System.currentTimeMillis() - generalRecrawlTime * 3))); + + Iterator uriDocs = mongoDB.getCollection(COLLECTION_NAME).find(filter).iterator(); + + List urisToRecrawl = new ArrayList<>(); + while (uriDocs.hasNext()) { + try { + Document doc = uriDocs.next(); + String ipString = (String) doc.get(COLUMN_IP); + if (ipString.contains("/")) { + ipString = ipString.split("/")[1]; + } + urisToRecrawl + .add(new CrawleableUri(new URI((String) 
doc.get(COLUMN_URI)), InetAddress.getByName(ipString))); + } catch (URISyntaxException | UnknownHostException e) { + LOGGER.warn(e.toString()); + } + } + + // mark that the uris are in process now + for (CrawleableUri uri : urisToRecrawl) { + + BasicDBObject newDocument = new BasicDBObject(); + newDocument.append("$set", new BasicDBObject().append(COLUMN_CRAWLING_IN_PROCESS, true)); + + BasicDBObject searchQuery = new BasicDBObject().append(COLUMN_URI, uri.getUri().toString()); + + mongoDB.getCollection(COLLECTION_NAME).updateMany(searchQuery, newDocument); + + } // cursor.close(); - return urisToRecrawl; - } - - @Override - public long count() { - // TODO Auto-generated method stub - return 0; - } + return urisToRecrawl; + } + + @Override + public long count() { + // TODO Auto-generated method stub + return 0; + } @Override public Set getUrisWithSameHashValues(Set hashValuesForComparison) { diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/queue/domainbased/MongoDBDomainBasedQueue.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/queue/domainbased/MongoDBDomainBasedQueue.java index d852cc7c3..4df08ee6f 100644 --- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/queue/domainbased/MongoDBDomainBasedQueue.java +++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/queue/domainbased/MongoDBDomainBasedQueue.java @@ -12,6 +12,7 @@ import org.dice_research.squirrel.data.uri.serialize.Serializer; import org.dice_research.squirrel.data.uri.serialize.java.SnappyJavaUriSerializer; import org.dice_research.squirrel.queue.AbstractDomainBasedQueue; +import org.rdfhdt.hdt.util.Histogram; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -46,8 +47,12 @@ public class MongoDBDomainBasedQueue extends AbstractDomainBasedQueue { private static final Logger LOGGER = LoggerFactory.getLogger(MongoDBDomainBasedQueue.class); - public MongoDBDomainBasedQueue(String hostName, Integer port, Serializer serializer) { + public MongoDBDomainBasedQueue(String hostName, Integer port, Serializer serializer, boolean includeDepth) { this.serializer = serializer; + + this.includeDepth = includeDepth; + if (this.includeDepth) + LOGGER.info("Depth Persistence Enabled."); LOGGER.info("Queue Persistance: " + PERSIST); @@ -70,9 +75,8 @@ public MongoDBDomainBasedQueue(String hostName, Integer port, Serializer seriali } - public MongoDBDomainBasedQueue(String hostName, Integer port) { - client = new MongoClient(hostName, port); - serializer = new SnappyJavaUriSerializer(); + public MongoDBDomainBasedQueue(String hostName, Integer port, boolean includeDepth) { + this(hostName, port, new SnappyJavaUriSerializer(), includeDepth); } public void purge() {
AbstractIpAddressBasedQueue { : Boolean.parseBoolean(System.getenv("QUEUE_FILTER_PERSIST")); private static final Logger LOGGER = LoggerFactory.getLogger(MongoDBIpBasedQueue.class); + + public MongoDBIpBasedQueue(String hostName, Integer port, boolean includeDepth) { + this(hostName, port, new SnappyJavaUriSerializer(), includeDepth); - public MongoDBIpBasedQueue(String hostName, Integer port, Serializer serializer) { + } - LOGGER.info("Queue Persistance: " + PERSIST); + public MongoDBIpBasedQueue(String hostName, Integer port, Serializer serializer, boolean includeDepth) { + LOGGER.info("Queue Persistence: " + PERSIST); + + this.includeDepth = includeDepth; + if (this.includeDepth) + LOGGER.info("Depth Persistence Enabled."); + this.serializer = serializer; MongoClientOptions.Builder optionsBuilder = MongoClientOptions.builder(); @@ -74,10 +84,7 @@ public MongoDBIpBasedQueue(String hostName, Integer port, Serializer serializer) } - public MongoDBIpBasedQueue(String hostName, Integer port) { - client = new MongoClient(hostName, port); - serializer = new SnappyJavaUriSerializer(); - } + + public void purge() { mongoDB.getCollection(COLLECTION_QUEUE).drop(); @@ -225,6 +232,9 @@ public Document getUriDocument(CrawleableUri uri) { docUri.put("_id", uri.getUri().hashCode()); docUri.put("ipAddress", ipAddress.getHostAddress()); docUri.put("type", DEFAULT_TYPE); + if (includeDepth) + docUri.put("depth", uri.getData(Constants.URI_DEPTH)); + docUri.put("uri", new Binary(suri)); return docUri; } diff --git a/squirrel.frontier/src/test/java/org/dice_research/squirrel/frontier/impl/FrontierImplTest.java b/squirrel.frontier/src/test/java/org/dice_research/squirrel/frontier/impl/FrontierImplTest.java index 494c31adc..d858d9049 100644 --- a/squirrel.frontier/src/test/java/org/dice_research/squirrel/frontier/impl/FrontierImplTest.java +++ b/squirrel.frontier/src/test/java/org/dice_research/squirrel/frontier/impl/FrontierImplTest.java @@ -17,10 +17,8 @@ import org.dice_research.squirrel.data.uri.CrawleableUriFactory4Tests; import org.dice_research.squirrel.data.uri.UriType; import org.dice_research.squirrel.data.uri.filter.MongoDBKnowUriFilter; -import org.dice_research.squirrel.data.uri.norm.DomainBasedUriGenerator; import org.dice_research.squirrel.data.uri.norm.NormalizerImpl; import org.dice_research.squirrel.data.uri.norm.UriGenerator; -import org.dice_research.squirrel.data.uri.norm.WellKnownPathUriGenerator; import org.dice_research.squirrel.queue.ipbased.MongoDBIpBasedQueue; import org.junit.After; import org.junit.Assert; @@ -45,7 +43,7 @@ public void setUp() throws Exception { MongoDBBasedTest.setUpMDB(); filter = new MongoDBKnowUriFilter("localhost", 58027); - queue = new MongoDBIpBasedQueue("localhost", 58027); + queue = new MongoDBIpBasedQueue("localhost", 58027, false); filter.open(); queue.open(); List uriGenerators = new ArrayList(); diff --git a/squirrel.frontier/src/test/java/org/dice_research/squirrel/queue/domainbased/MongoDBDomainQueueTest.java b/squirrel.frontier/src/test/java/org/dice_research/squirrel/queue/domainbased/MongoDBDomainQueueTest.java index 3b4a7aa13..c3d7a816c 100644 --- a/squirrel.frontier/src/test/java/org/dice_research/squirrel/queue/domainbased/MongoDBDomainQueueTest.java +++ b/squirrel.frontier/src/test/java/org/dice_research/squirrel/queue/domainbased/MongoDBDomainQueueTest.java @@ -30,7 +30,7 @@ public class MongoDBDomainQueueTest extends MongoDBBasedTest { @Before public void setUp() throws Exception { - mongodbQueue = new MongoDBDomainBasedQueue("localhost", 58027); + 
mongodbQueue = new MongoDBDomainBasedQueue("localhost", 58027,false); CrawleableUriFactory4Tests cuf = new CrawleableUriFactory4Tests(); uris.add(cuf.create(new URI("http://localhost/sparql"), InetAddress.getByName("127.0.0.1"), UriType.SPARQL)); @@ -121,6 +121,8 @@ public void getUris() throws Exception { for (CrawleableUri uri : uris) { mongodbQueue.addUri(uri); } + + List listUris = mongodbQueue.getNextUris(); Iterator iter = mongodbQueue.getGroupIterator(); int count = 0; while (iter.hasNext()) { @@ -131,6 +133,7 @@ public void getUris() throws Exception { ++count; } } + assertEquals(uris.size(), count); mongodbQueue.close(); } diff --git a/squirrel.frontier/src/test/java/org/dice_research/squirrel/queue/ipbased/MongoDBIpBasedQueueTest.java b/squirrel.frontier/src/test/java/org/dice_research/squirrel/queue/ipbased/MongoDBIpBasedQueueTest.java index 32c975c2a..75403bca1 100644 --- a/squirrel.frontier/src/test/java/org/dice_research/squirrel/queue/ipbased/MongoDBIpBasedQueueTest.java +++ b/squirrel.frontier/src/test/java/org/dice_research/squirrel/queue/ipbased/MongoDBIpBasedQueueTest.java @@ -29,7 +29,7 @@ public class MongoDBIpBasedQueueTest extends MongoDBBasedTest{ @Before public void setUp() throws Exception { - mongodbQueue = new MongoDBIpBasedQueue("localhost", 58027); + mongodbQueue = new MongoDBIpBasedQueue("localhost", 58027,false); // mongodbQueue = new MongoDBQueue("localhost", 27017); diff --git a/squirrel.worker/src/main/java/org/dice_research/squirrel/collect/SqlBasedUriCollector.java b/squirrel.worker/src/main/java/org/dice_research/squirrel/collect/SqlBasedUriCollector.java index bce6eddeb..e0d6d2082 100644 --- a/squirrel.worker/src/main/java/org/dice_research/squirrel/collect/SqlBasedUriCollector.java +++ b/squirrel.worker/src/main/java/org/dice_research/squirrel/collect/SqlBasedUriCollector.java @@ -22,6 +22,7 @@ import org.apache.http.annotation.NotThreadSafe; import org.apache.jena.graph.Node; import org.apache.jena.graph.Triple; +import org.dice_research.squirrel.Constants; import org.dice_research.squirrel.data.uri.CrawleableUri; import org.dice_research.squirrel.data.uri.serialize.Serializer; import org.dice_research.squirrel.iterators.SqlBasedIterator; @@ -39,7 +40,7 @@ * stateless. 
*/ @NotThreadSafe -public class SqlBasedUriCollector implements UriCollector, Closeable { +public class SqlBasedUriCollector implements UriCollector,Closeable { private static final Logger LOGGER = LoggerFactory.getLogger(SqlBasedUriCollector.class); @@ -161,6 +162,12 @@ protected void addUri(CrawleableUri uri, Node node) { @Override public void addNewUri(CrawleableUri uri, CrawleableUri newUri) { + + if (uri.getData(Constants.URI_DEPTH) != null) { + int nextDepth = Integer.parseInt(uri.getData(Constants.URI_DEPTH).toString()) + 1; + newUri.addData(Constants.URI_DEPTH, nextDepth); + } + String uriString = uri.getUri().toString(); if (knownUris.containsKey(uriString)) { UriTableStatus table = knownUris.get(uriString); diff --git a/squirrel.worker/src/main/java/org/dice_research/squirrel/collect/UriCollector.java b/squirrel.worker/src/main/java/org/dice_research/squirrel/collect/UriCollector.java index 05fa7a9b7..a54b71d6d 100644 --- a/squirrel.worker/src/main/java/org/dice_research/squirrel/collect/UriCollector.java +++ b/squirrel.worker/src/main/java/org/dice_research/squirrel/collect/UriCollector.java @@ -5,6 +5,7 @@ import org.apache.jena.graph.Node; import org.apache.jena.graph.Triple; +import org.dice_research.squirrel.Constants; import org.dice_research.squirrel.data.uri.CrawleableUri; import org.dice_research.squirrel.sink.SinkBase; From ef19fad02d8e3cc6cdf33d111bef966b5f394054 Mon Sep 17 00:00:00 2001 From: Geraldo Date: Mon, 4 May 2020 11:41:51 +0200 Subject: [PATCH 050/102] included depth information on metadata --- .../src/main/java/org/dice_research/squirrel/vocab/Squirrel.java | 1 + .../org/dice_research/squirrel/metadata/CrawlingActivity.java | 1 + 2 files changed, 2 insertions(+) diff --git a/squirrel.api/src/main/java/org/dice_research/squirrel/vocab/Squirrel.java b/squirrel.api/src/main/java/org/dice_research/squirrel/vocab/Squirrel.java index 32512cf70..4810edfa2 100644 --- a/squirrel.api/src/main/java/org/dice_research/squirrel/vocab/Squirrel.java +++ b/squirrel.api/src/main/java/org/dice_research/squirrel/vocab/Squirrel.java @@ -33,6 +33,7 @@ protected static final Property property(String local) { public static final Resource ResultFile = resource("ResultFile"); public static final Property approxNumberOfTriples = property("approxNumberOfTriples"); + public static final Property depth = property("depth"); public static final Property crawled = property("crawled"); public static final Property uriHostedOn = property("uriHostedOn"); public static final Property status = property("status"); diff --git a/squirrel.worker/src/main/java/org/dice_research/squirrel/metadata/CrawlingActivity.java b/squirrel.worker/src/main/java/org/dice_research/squirrel/metadata/CrawlingActivity.java index 254fe4f1d..e87387db7 100644 --- a/squirrel.worker/src/main/java/org/dice_research/squirrel/metadata/CrawlingActivity.java +++ b/squirrel.worker/src/main/java/org/dice_research/squirrel/metadata/CrawlingActivity.java @@ -123,6 +123,7 @@ public Model prepareMetadataModel() { model.add(activity, PROV_O.startedAtTime, model.createTypedLiteral(dateStarted)); model.add(activity, PROV_O.endedAtTime, model.createTypedLiteral(dateEnded)); model.add(activity, Squirrel.approxNumberOfTriples, model.createTypedLiteral(numberOfTriples)); + model.add(activity, Squirrel.depth, model.createTypedLiteral(Integer.parseInt(uri.getData(Constants.URI_DEPTH).toString()))); Resource association = model.createResource(activityUri + "_workerAssoc"); model.add(association, RDF.type, PROV_O.Association); From 
0cca9312836b5fe797f106206411e61ef23140a0 Mon Sep 17 00:00:00 2001 From: Geraldo Date: Mon, 4 May 2020 11:43:00 +0200 Subject: [PATCH 051/102] included filter and queue constructors --- spring-config/frontier-context.xml | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/spring-config/frontier-context.xml b/spring-config/frontier-context.xml index 1fad2092d..3f5b2df30 100644 --- a/spring-config/frontier-context.xml +++ b/spring-config/frontier-context.xml @@ -63,6 +63,7 @@ + + + + - + From da65f142cae6e807cf19fa6228054818f5f544ae Mon Sep 17 00:00:00 2001 From: Geraldo Date: Mon, 4 May 2020 12:29:34 +0200 Subject: [PATCH 052/102] fixed test --- spring-config/frontier-context.xml | 4 ++-- .../dice_research/squirrel/configurator/Configuration.java | 6 +++--- .../squirrel/queue/ipbased/MongoDBIpBasedQueue.java | 4 ++-- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/spring-config/frontier-context.xml b/spring-config/frontier-context.xml index 3f5b2df30..ec9a45c51 100644 --- a/spring-config/frontier-context.xml +++ b/spring-config/frontier-context.xml @@ -79,10 +79,10 @@ - + diff --git a/squirrel.api/src/main/java/org/dice_research/squirrel/configurator/Configuration.java b/squirrel.api/src/main/java/org/dice_research/squirrel/configurator/Configuration.java index b2e241087..5e3ff4697 100644 --- a/squirrel.api/src/main/java/org/dice_research/squirrel/configurator/Configuration.java +++ b/squirrel.api/src/main/java/org/dice_research/squirrel/configurator/Configuration.java @@ -21,7 +21,7 @@ public static int getEnvInteger(String envVariableName, Logger logger) { try { return Integer.parseInt(toInt); } catch (Exception e) { - logger.error(envVariableName + " not found.", e); + logger.error(envVariableName + " not found."); return 0; } } @@ -31,7 +31,7 @@ public static boolean getEnvBoolean(String envVariableName, Logger logger) { try { return Boolean.parseBoolean(toBool); } catch (Exception e) { - logger.error(envVariableName + " not found.", e); + logger.error(envVariableName + " not found."); return false; } } @@ -41,7 +41,7 @@ public static long getEnvLong(String envVariableName, Logger logger) { try { return toLong != null ? 
Long.parseLong(toLong) : 0L; } catch (Exception e) { - logger.error(envVariableName + " not found.", e); + logger.error(envVariableName + " not found."); return 0L; } } diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/queue/ipbased/MongoDBIpBasedQueue.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/queue/ipbased/MongoDBIpBasedQueue.java index 816c685cc..c3fedf869 100644 --- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/queue/ipbased/MongoDBIpBasedQueue.java +++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/queue/ipbased/MongoDBIpBasedQueue.java @@ -68,8 +68,8 @@ public MongoDBIpBasedQueue(String hostName, Integer port, Serializer serializer, MongoClientOptions.Builder optionsBuilder = MongoClientOptions.builder(); MongoConfiguration mongoConfiguration = MongoConfiguration.getMDBConfiguration(); - if (mongoConfiguration.getConnectionTimeout() != null && mongoConfiguration.getSocketTimeout() != null - && mongoConfiguration.getServerTimeout() != null) { + if (mongoConfiguration != null && (mongoConfiguration.getConnectionTimeout() != null && mongoConfiguration.getSocketTimeout() != null + && mongoConfiguration.getServerTimeout() != null)) { optionsBuilder.connectTimeout(mongoConfiguration.getConnectionTimeout()); optionsBuilder.socketTimeout(mongoConfiguration.getSocketTimeout()); optionsBuilder.serverSelectionTimeout(mongoConfiguration.getServerTimeout()); From 6aa5e6ae411e2cf2a77525f57d62b1678b21ffda Mon Sep 17 00:00:00 2001 From: Geraldo Date: Mon, 4 May 2020 12:41:18 +0200 Subject: [PATCH 053/102] fixed test --- .../squirrel/queue/domainbased/MongoDBDomainBasedQueue.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/queue/domainbased/MongoDBDomainBasedQueue.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/queue/domainbased/MongoDBDomainBasedQueue.java index 4df08ee6f..82df936a4 100644 --- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/queue/domainbased/MongoDBDomainBasedQueue.java +++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/queue/domainbased/MongoDBDomainBasedQueue.java @@ -59,8 +59,8 @@ public MongoDBDomainBasedQueue(String hostName, Integer port, Serializer seriali MongoClientOptions.Builder optionsBuilder = MongoClientOptions.builder(); MongoConfiguration mongoConfiguration = MongoConfiguration.getMDBConfiguration(); - if (mongoConfiguration.getConnectionTimeout() != null && mongoConfiguration.getSocketTimeout() != null - && mongoConfiguration.getServerTimeout() != null) { + if (mongoConfiguration != null && (mongoConfiguration.getConnectionTimeout() != null && mongoConfiguration.getSocketTimeout() != null + && mongoConfiguration.getServerTimeout() != null)) { optionsBuilder.connectTimeout(mongoConfiguration.getConnectionTimeout()); optionsBuilder.socketTimeout(mongoConfiguration.getSocketTimeout()); optionsBuilder.serverSelectionTimeout(mongoConfiguration.getServerTimeout()); From 57051d55c73f4f0a41f592bd9f5cfdc2e9b90c5c Mon Sep 17 00:00:00 2001 From: Denis Kuchelev Date: Mon, 16 Dec 2019 04:58:17 +0100 Subject: [PATCH 054/102] Use rdfdetector to determine possible serializations --- squirrel.worker/pom.xml | 6 ++++ .../squirrel/analyzer/impl/RDFAnalyzer.java | 34 ++++++++++++------- .../analyzer/impl/RDFAnalyzerTest.java | 6 ++-- 3 files changed, 30 insertions(+), 16 deletions(-) diff --git a/squirrel.worker/pom.xml b/squirrel.worker/pom.xml index 71000042e..36cc12a2d 100644 --- 
a/squirrel.worker/pom.xml +++ b/squirrel.worker/pom.xml @@ -22,6 +22,12 @@ httpclient-cache --> + + org.dice-research + rdfdetector + 1.0.0 + + eu.trentorise.opendata jackan diff --git a/squirrel.worker/src/main/java/org/dice_research/squirrel/analyzer/impl/RDFAnalyzer.java b/squirrel.worker/src/main/java/org/dice_research/squirrel/analyzer/impl/RDFAnalyzer.java index fc0e7a8f5..5e383a132 100644 --- a/squirrel.worker/src/main/java/org/dice_research/squirrel/analyzer/impl/RDFAnalyzer.java +++ b/squirrel.worker/src/main/java/org/dice_research/squirrel/analyzer/impl/RDFAnalyzer.java @@ -1,12 +1,13 @@ package org.dice_research.squirrel.analyzer.impl; +import java.io.BufferedInputStream; import java.io.File; import java.io.FileInputStream; import java.io.IOException; -import java.util.ArrayList; +import java.nio.file.Files; +import java.util.Collection; import java.util.HashSet; import java.util.Iterator; -import java.util.List; import java.util.Set; import org.apache.jena.riot.Lang; @@ -15,6 +16,7 @@ import org.apache.jena.riot.system.StreamRDF; import org.apache.tika.Tika; import org.apache.tika.io.IOUtils; +import org.dice_research.rdfdetector.RdfSerializationDetector; import org.dice_research.squirrel.Constants; import org.dice_research.squirrel.analyzer.AbstractAnalyzer; import org.dice_research.squirrel.analyzer.commons.FilterSinkRDF; @@ -38,22 +40,14 @@ public class RDFAnalyzer extends AbstractAnalyzer { private static final Logger LOGGER = LoggerFactory.getLogger(RDFAnalyzer.class); - private List listLangs = new ArrayList(); private Set jenaContentTypes = new HashSet(); + private RdfSerializationDetector serializationDetector = new RdfSerializationDetector(50); protected long failedParseAttempts = 0; public RDFAnalyzer(UriCollector collector) { super(collector); - listLangs.add(Lang.NQUADS); - listLangs.add(Lang.RDFJSON); - listLangs.add(Lang.RDFTHRIFT); - listLangs.add(Lang.RDFXML); - listLangs.add(Lang.JSONLD); - listLangs.add(Lang.TRIG); - listLangs.add(Lang.TRIX); - listLangs.add(Lang.TURTLE); for (Lang lang : RDFLanguages.getRegisteredLanguages()) { if (!RDFLanguages.RDFNULL.equals(lang)) { @@ -88,13 +82,27 @@ public Iterator analyze(CrawleableUri curi, File data, Sink sink) { } } else { LOGGER.info("Content Type is null"); - for (Lang l : listLangs) { + + BufferedInputStream dataStream = new BufferedInputStream(Files.newInputStream(data.toPath())); + Collection langs = serializationDetector.detect(dataStream); + LOGGER.info("Detected languages: {}", langs); + for (Lang l : langs) { try { - RDFDataMgr.parse(filtered, data.getAbsolutePath(), l); + if (dataStream != null) { + // Reuse dataStream for the first attempted language. 
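+                        // Assumption: the detector has only peeked at the buffered stream (mark/reset) while sniffing, so this first parse attempt still reads the data from the beginning; the finally block below closes the stream after the first attempt, and every remaining candidate language re-parses the file from its path instead.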
+ RDFDataMgr.parse(filtered, dataStream, l); + } else { + RDFDataMgr.parse(filtered, data.getAbsolutePath(), l); + } break; } catch (Exception e) { LOGGER.warn("Could not parse file as " + l.getName()); failedParseAttempts += 1; + } finally { + if (dataStream != null) { + dataStream.close(); + dataStream = null; + } } } } diff --git a/squirrel.worker/src/test/java/org/dice_research/squirrel/analyzer/impl/RDFAnalyzerTest.java b/squirrel.worker/src/test/java/org/dice_research/squirrel/analyzer/impl/RDFAnalyzerTest.java index 67d8b8574..70eaab0a0 100644 --- a/squirrel.worker/src/test/java/org/dice_research/squirrel/analyzer/impl/RDFAnalyzerTest.java +++ b/squirrel.worker/src/test/java/org/dice_research/squirrel/analyzer/impl/RDFAnalyzerTest.java @@ -46,7 +46,7 @@ protected void succeeded(long ns, Description desc) { @AfterClass public static void afterClass() { - System.err.println(String.format("RDFAnalyzerTest %d", totalTime)); + System.err.println(String.format("RDFAnalyzerTest total time: %d", totalTime)); } private String resourceName; @@ -106,8 +106,8 @@ public void test() throws URISyntaxException, IOException { // Check the result and close the sink and collector Assert.assertEquals("Number of triples in " + resourceName, expectedNumberOfTriples, collector.getSize()); - Assert.assertTrue("Failed parse attempts for " + resourceName + ": " + ((RDFAnalyzer)analyzer).failedParseAttempts + " <= 5", - ((RDFAnalyzer)analyzer).failedParseAttempts <= 5); + Assert.assertTrue("Failed parse attempts for " + resourceName + ": " + ((RDFAnalyzer)analyzer).failedParseAttempts + " <= 2", + ((RDFAnalyzer)analyzer).failedParseAttempts <= 2); sink.closeSinkForUri(curi); collector.closeSinkForUri(curi); } From 27a97583124a03c211ee725be3835be9db0e1e6b Mon Sep 17 00:00:00 2001 From: Geraldo Date: Tue, 5 May 2020 17:50:52 +0200 Subject: [PATCH 055/102] returned exception stack trace --- .../dice_research/squirrel/configurator/Configuration.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/squirrel.api/src/main/java/org/dice_research/squirrel/configurator/Configuration.java b/squirrel.api/src/main/java/org/dice_research/squirrel/configurator/Configuration.java index 5e3ff4697..cffa30034 100644 --- a/squirrel.api/src/main/java/org/dice_research/squirrel/configurator/Configuration.java +++ b/squirrel.api/src/main/java/org/dice_research/squirrel/configurator/Configuration.java @@ -21,7 +21,7 @@ public static int getEnvInteger(String envVariableName, Logger logger) { try { return Integer.parseInt(toInt); } catch (Exception e) { - logger.error(envVariableName + " not found."); + logger.error(envVariableName + " not found.",e); return 0; } } @@ -31,7 +31,7 @@ public static boolean getEnvBoolean(String envVariableName, Logger logger) { try { return Boolean.parseBoolean(toBool); } catch (Exception e) { - logger.error(envVariableName + " not found."); + logger.error(envVariableName + " not found.",e); return false; } } @@ -41,7 +41,7 @@ public static long getEnvLong(String envVariableName, Logger logger) { try { return toLong != null ? 
Long.parseLong(toLong) : 0L; } catch (Exception e) { - logger.error(envVariableName + " not found."); + logger.error(envVariableName + " not found.",e); return 0L; } } From 66e66dd5fbf04fc61d7d821d13cdfcc48fabb55f Mon Sep 17 00:00:00 2001 From: Geraldo Date: Tue, 5 May 2020 17:51:43 +0200 Subject: [PATCH 056/102] removed unused variable --- .../java/org/dice_research/squirrel/data/uri/UriSeedReader.java | 1 - 1 file changed, 1 deletion(-) diff --git a/squirrel.api/src/main/java/org/dice_research/squirrel/data/uri/UriSeedReader.java b/squirrel.api/src/main/java/org/dice_research/squirrel/data/uri/UriSeedReader.java index 9ae0dfb33..a8eec0b3d 100644 --- a/squirrel.api/src/main/java/org/dice_research/squirrel/data/uri/UriSeedReader.java +++ b/squirrel.api/src/main/java/org/dice_research/squirrel/data/uri/UriSeedReader.java @@ -33,7 +33,6 @@ public class UriSeedReader { private static final String URI = "uri"; private boolean isCsv = true; private Reader in; - String filePath = ""; private static final Logger LOGGER = LoggerFactory.getLogger(UriSeedReader.class); From a9b2286f6e862ef33b6d78823b260cd07e0a55c1 Mon Sep 17 00:00:00 2001 From: Geraldo Date: Tue, 5 May 2020 17:52:39 +0200 Subject: [PATCH 057/102] visibility of attribute --- .../java/org/dice_research/squirrel/data/uri/UriSeedReader.java | 1 + 1 file changed, 1 insertion(+) diff --git a/squirrel.api/src/main/java/org/dice_research/squirrel/data/uri/UriSeedReader.java b/squirrel.api/src/main/java/org/dice_research/squirrel/data/uri/UriSeedReader.java index a8eec0b3d..40f9d927f 100644 --- a/squirrel.api/src/main/java/org/dice_research/squirrel/data/uri/UriSeedReader.java +++ b/squirrel.api/src/main/java/org/dice_research/squirrel/data/uri/UriSeedReader.java @@ -33,6 +33,7 @@ public class UriSeedReader { private static final String URI = "uri"; private boolean isCsv = true; private Reader in; + private String filePath = ""; private static final Logger LOGGER = LoggerFactory.getLogger(UriSeedReader.class); From a6782e2e8aadd9ec7990bb437f18631137e8cee9 Mon Sep 17 00:00:00 2001 From: Geraldo Date: Tue, 5 May 2020 17:54:34 +0200 Subject: [PATCH 058/102] removed unused method --- .../main/java/org/dice_research/squirrel/queue/UriQueue.java | 4 ---- 1 file changed, 4 deletions(-) diff --git a/squirrel.api/src/main/java/org/dice_research/squirrel/queue/UriQueue.java b/squirrel.api/src/main/java/org/dice_research/squirrel/queue/UriQueue.java index ff0b7baf4..cea142ae5 100644 --- a/squirrel.api/src/main/java/org/dice_research/squirrel/queue/UriQueue.java +++ b/squirrel.api/src/main/java/org/dice_research/squirrel/queue/UriQueue.java @@ -48,8 +48,4 @@ public interface UriQueue { */ public void open(); - /** - * Check if the queue is storing the depth - */ - public boolean isDepthIncluded(); } From 8550ca2b1bd9b0948e82ec63d60a9a0d84a03a82 Mon Sep 17 00:00:00 2001 From: Geraldo Date: Tue, 5 May 2020 18:08:41 +0200 Subject: [PATCH 059/102] checks if the key exists --- .../org/dice_research/squirrel/metadata/CrawlingActivity.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/squirrel.worker/src/main/java/org/dice_research/squirrel/metadata/CrawlingActivity.java b/squirrel.worker/src/main/java/org/dice_research/squirrel/metadata/CrawlingActivity.java index e87387db7..5e405329e 100644 --- a/squirrel.worker/src/main/java/org/dice_research/squirrel/metadata/CrawlingActivity.java +++ b/squirrel.worker/src/main/java/org/dice_research/squirrel/metadata/CrawlingActivity.java @@ -123,7 +123,8 @@ public Model prepareMetadataModel() { 
model.add(activity, PROV_O.startedAtTime, model.createTypedLiteral(dateStarted)); model.add(activity, PROV_O.endedAtTime, model.createTypedLiteral(dateEnded)); model.add(activity, Squirrel.approxNumberOfTriples, model.createTypedLiteral(numberOfTriples)); - model.add(activity, Squirrel.depth, model.createTypedLiteral(Integer.parseInt(uri.getData(Constants.URI_DEPTH).toString()))); + if(uri.getData().containsKey(Constants.URI_DEPTH)) + model.add(activity, Squirrel.depth, model.createTypedLiteral(Integer.parseInt(uri.getData(Constants.URI_DEPTH).toString()))); Resource association = model.createResource(activityUri + "_workerAssoc"); model.add(association, RDF.type, PROV_O.Association); From 9f99ed2ac672388b35858c0c474e43987fbbebc7 Mon Sep 17 00:00:00 2001 From: Geraldo Date: Fri, 8 May 2020 15:44:20 +0200 Subject: [PATCH 060/102] included RelationalUriFilter interface and AndConcatenatingUriFilter --- spring-config/frontier-context.xml | 11 +++- .../data/uri/filter/KnownUriFilter.java | 4 ++ .../squirrel/data/uri/filter/UriFilter.java | 9 +++ .../relational/AndConcatenatingUriFilter.java | 57 +++++++++++++++++ .../relational/RelationalUriFilter.java | 12 ++++ .../components/FrontierComponent.java | 20 +++--- .../squirrel/data/uri/filter/DepthFilter.java | 62 +++++++++++++++++++ .../data/uri/filter/MongoDBKnowUriFilter.java | 18 ------ .../data/uri/filter/SchemeBasedUriFilter.java | 6 ++ .../frontier/impl/ExtendedFrontierImpl.java | 14 ++--- .../squirrel/frontier/impl/FrontierImpl.java | 41 ++++++------ .../impl/FrontierSenderToWebservice.java | 9 +-- .../frontier/impl/FrontierImplTest.java | 8 ++- .../impl/CkanSeedGeneratorImplTest.java | 6 +- 14 files changed, 215 insertions(+), 62 deletions(-) create mode 100644 squirrel.api/src/main/java/org/dice_research/squirrel/data/uri/filter/relational/AndConcatenatingUriFilter.java create mode 100644 squirrel.api/src/main/java/org/dice_research/squirrel/data/uri/filter/relational/RelationalUriFilter.java create mode 100644 squirrel.frontier/src/main/java/org/dice_research/squirrel/data/uri/filter/DepthFilter.java diff --git a/spring-config/frontier-context.xml b/spring-config/frontier-context.xml index ec9a45c51..85e42ca75 100644 --- a/spring-config/frontier-context.xml +++ b/spring-config/frontier-context.xml @@ -65,6 +65,16 @@ + + + + + + + + + @@ -73,7 +83,6 @@ - diff --git a/squirrel.api/src/main/java/org/dice_research/squirrel/data/uri/filter/KnownUriFilter.java b/squirrel.api/src/main/java/org/dice_research/squirrel/data/uri/filter/KnownUriFilter.java index dc4aaadd8..1fdb7df05 100644 --- a/squirrel.api/src/main/java/org/dice_research/squirrel/data/uri/filter/KnownUriFilter.java +++ b/squirrel.api/src/main/java/org/dice_research/squirrel/data/uri/filter/KnownUriFilter.java @@ -28,6 +28,10 @@ public interface KnownUriFilter extends UriFilter { * @param nextCrawlTimestamp The time at which the given URI should be crawled next. */ void add(CrawleableUri uri, long lastCrawlTimestamp, long nextCrawlTimestamp); + + public default void add(CrawleableUri uri) { + add(uri, System.currentTimeMillis()); + } /** * Returns all {@link CrawleableUri}s which have to be recrawled. This means their time to next crawl has passed. 
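The intention behind the new filter composition is easiest to see in a short usage sketch (illustrative only, not part of the patch: host, port, the maximum depth of 3 and the single-argument CrawleableUri constructor are assumptions here). The KnownUriFilter decides whether a URI was crawled recently enough, a second UriFilter such as the new DepthFilter adds a further acceptance condition, and AndConcatenatingUriFilter accepts a URI only when both checks pass:

    KnownUriFilter known = new MongoDBKnowUriFilter("localhost", 27017);
    UriFilter depth = new DepthFilter(3);   // rejects URIs whose recorded depth exceeds 3
    RelationalUriFilter filter = new AndConcatenatingUriFilter(known, depth);

    CrawleableUri uri = new CrawleableUri(new URI("http://example.org/dataset"));
    uri.addData(Constants.URI_DEPTH, 1);    // seeds start at depth 1, as FrontierComponent sets it
    if (filter.isUriGood(uri)) {            // the known-URI check AND the depth check must both accept
        filter.add(uri);                    // marks the URI as known (DepthFilter.add is a no-op stub)
    }

Note that the single-argument AndConcatenatingUriFilter constructor leaves the second filter unset: isUriGood guards against that case with a null check, but add(CrawleableUri) does not, so the two-argument constructor is the safer entry point.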
diff --git a/squirrel.api/src/main/java/org/dice_research/squirrel/data/uri/filter/UriFilter.java b/squirrel.api/src/main/java/org/dice_research/squirrel/data/uri/filter/UriFilter.java index b89add778..f821f43b1 100644 --- a/squirrel.api/src/main/java/org/dice_research/squirrel/data/uri/filter/UriFilter.java +++ b/squirrel.api/src/main/java/org/dice_research/squirrel/data/uri/filter/UriFilter.java @@ -21,4 +21,13 @@ public interface UriFilter { * requirements imposed by this filter. Otherwise false is returned. */ public boolean isUriGood(CrawleableUri uri); + + + /** + * Adds the given URI to the list of already known URIs. Works like calling {@link #add(CrawleableUri, long)} with the current system time. + * + * @param uri the URI that should be added to the list. + * + */ + public void add(CrawleableUri uri); } diff --git a/squirrel.api/src/main/java/org/dice_research/squirrel/data/uri/filter/relational/AndConcatenatingUriFilter.java b/squirrel.api/src/main/java/org/dice_research/squirrel/data/uri/filter/relational/AndConcatenatingUriFilter.java new file mode 100644 index 000000000..255859e73 --- /dev/null +++ b/squirrel.api/src/main/java/org/dice_research/squirrel/data/uri/filter/relational/AndConcatenatingUriFilter.java @@ -0,0 +1,57 @@ +package org.dice_research.squirrel.data.uri.filter.relational; + +import org.dice_research.squirrel.data.uri.CrawleableUri; +import org.dice_research.squirrel.data.uri.filter.KnownUriFilter; +import org.dice_research.squirrel.data.uri.filter.UriFilter; + + +/** + * + * Relational Uri Filter for the AND operator + * + * @author Geraldo de Souza Junior (gsjunior@mail.uni-paderborn.de) + * + */ + +public class AndConcatenatingUriFilter implements RelationalUriFilter{ + + private KnownUriFilter knownUriFilter; + private UriFilter uriFilter; + + public AndConcatenatingUriFilter(KnownUriFilter knownUriFilter, UriFilter uriFilter) { + this.knownUriFilter = knownUriFilter; + this.uriFilter = uriFilter; + } + + public AndConcatenatingUriFilter(KnownUriFilter knownUriFilter) { + this.knownUriFilter = knownUriFilter; + } + + @Override + public boolean isUriGood(CrawleableUri uri) { + + if(uriFilter != null) + return knownUriFilter.isUriGood(uri) && uriFilter.isUriGood(uri); + else + return knownUriFilter.isUriGood(uri); + } + + @Override + public void add(CrawleableUri uri) { + knownUriFilter.add(uri); + uriFilter.add(uri); + } + + @Override + public KnownUriFilter getKnownUriFilter() { + return this.knownUriFilter; + } + + @Override + public void setKnownUriFilter(KnownUriFilter knownUriFilter) { + this.knownUriFilter = knownUriFilter; + } + + + +} diff --git a/squirrel.api/src/main/java/org/dice_research/squirrel/data/uri/filter/relational/RelationalUriFilter.java b/squirrel.api/src/main/java/org/dice_research/squirrel/data/uri/filter/relational/RelationalUriFilter.java new file mode 100644 index 000000000..b5eb03feb --- /dev/null +++ b/squirrel.api/src/main/java/org/dice_research/squirrel/data/uri/filter/relational/RelationalUriFilter.java @@ -0,0 +1,12 @@ +package org.dice_research.squirrel.data.uri.filter.relational; + +import org.dice_research.squirrel.data.uri.filter.KnownUriFilter; +import org.dice_research.squirrel.data.uri.filter.UriFilter; + +public interface RelationalUriFilter extends UriFilter { + + public KnownUriFilter getKnownUriFilter(); + + public void setKnownUriFilter(KnownUriFilter knownUriFilter); + +} diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/components/FrontierComponent.java 
b/squirrel.frontier/src/main/java/org/dice_research/squirrel/components/FrontierComponent.java index 4eb9ea714..0d9c4b5a4 100644 --- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/components/FrontierComponent.java +++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/components/FrontierComponent.java @@ -20,6 +20,8 @@ import org.dice_research.squirrel.data.uri.filter.InMemoryKnownUriFilter; import org.dice_research.squirrel.data.uri.filter.KnownUriFilter; import org.dice_research.squirrel.data.uri.filter.RegexBasedWhiteListFilter; +import org.dice_research.squirrel.data.uri.filter.UriFilter; +import org.dice_research.squirrel.data.uri.filter.relational.RelationalUriFilter; import org.dice_research.squirrel.data.uri.info.URIReferences; import org.dice_research.squirrel.data.uri.norm.UriGenerator; import org.dice_research.squirrel.data.uri.norm.UriNormalizer; @@ -61,9 +63,9 @@ public class FrontierComponent extends AbstractComponent implements RespondingDa @Qualifier("queueBean") @Autowired protected UriQueue queue; - @Qualifier("knowUriFilterBean") + @Qualifier("UriFilterBean") @Autowired - private KnownUriFilter knownUriFilter; + private RelationalUriFilter uriFilter; private URIReferences uriReferences = null; private Frontier frontier; private RabbitQueue rabbitQueue; @@ -95,12 +97,12 @@ public void init() throws Exception { if (mongoConfiguration != null) { queue.open(); - knownUriFilter.open(); + uriFilter.getKnownUriFilter().open(); WhiteListConfiguration whiteListConfiguration = WhiteListConfiguration.getWhiteListConfiguration(); if (whiteListConfiguration != null) { File whitelistFile = new File(whiteListConfiguration.getWhiteListURI()); - knownUriFilter = RegexBasedWhiteListFilter.create(knownUriFilter, whitelistFile); + uriFilter.setKnownUriFilter(RegexBasedWhiteListFilter.create(uriFilter.getKnownUriFilter(), whitelistFile)); } // TODO Reactivate me but with a different configuration @@ -111,11 +113,11 @@ public void init() throws Exception { } else { LOGGER.warn("Couldn't get MDBConfiguration. 
An in-memory queue will be used."); queue = new InMemoryQueue(); - knownUriFilter = new InMemoryKnownUriFilter(doRecrawling, recrawlingTime); + uriFilter.setKnownUriFilter(new InMemoryKnownUriFilter(doRecrawling, recrawlingTime)); } // Build frontier - frontier = new ExtendedFrontierImpl(normalizer, knownUriFilter, uriReferences, queue,uriGenerator, doRecrawling); + frontier = new ExtendedFrontierImpl(normalizer, uriFilter, uriReferences, queue,uriGenerator, doRecrawling); rabbitQueue = this.incomingDataQueueFactory.createDefaultRabbitQueue(Constants.FRONTIER_QUEUE_NAME); @@ -131,7 +133,7 @@ public void init() throws Exception { if (webConfiguration.isCommunicationWithWebserviceEnabled()) { final FrontierSenderToWebservice sender = new FrontierSenderToWebservice(outgoingDataQueuefactory, - workerGuard, queue, knownUriFilter, uriReferences); + workerGuard, queue, uriFilter, uriReferences); LOGGER.trace("FrontierSenderToWebservice -> sendCrawledGraph is set to " + webConfiguration.isVisualizationOfCrawledGraphEnabled()); Thread senderThread = new Thread(sender); @@ -166,8 +168,8 @@ public void close() throws IOException { queue.close(); if (uriReferences != null) uriReferences.close(); - if (knownUriFilter instanceof Closeable) { - ((Closeable) knownUriFilter).close(); + if (uriFilter instanceof Closeable) { + ((Closeable) uriFilter).close(); } workerGuard.shutdown(); if (frontier != null) diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/data/uri/filter/DepthFilter.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/data/uri/filter/DepthFilter.java new file mode 100644 index 000000000..4e145231a --- /dev/null +++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/data/uri/filter/DepthFilter.java @@ -0,0 +1,62 @@ +package org.dice_research.squirrel.data.uri.filter; + +import org.dice_research.squirrel.Constants; +import org.dice_research.squirrel.data.uri.CrawleableUri; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * + * Depth Filter implementation + * + * * @author Geraldo Souza Junior (gsjunior@mail.uni-paderborn.de) + * + */ + +public class DepthFilter implements UriFilter { + + private static final Logger LOGGER = LoggerFactory.getLogger(MongoDBKnowUriFilter.class); + + + private int max_depth; + + + + + public DepthFilter(int max_depth) { + this.max_depth = max_depth; + } + + @Override + public boolean isUriGood(CrawleableUri uri) { + + if (uri.getData().containsKey(Constants.URI_DEPTH)) { + int depth = Integer.parseInt(uri.getData(Constants.URI_DEPTH).toString()); + if (depth > max_depth) { + LOGGER.debug("Max Depth reached. Uri {} is not good", uri.toString()); + return false; + }else { + LOGGER.debug("URI {} is good", uri.toString()); + return true; + } + + } else { + LOGGER.debug("Depth depth is not being stored for Uri :{} . 
Please check the queue parameters.", uri.toString()); + return false; + } + + + } + + public void purge() { + + } + + @Override + public void add(CrawleableUri uri) { + // TODO Auto-generated method stub + + } + + +} diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/data/uri/filter/MongoDBKnowUriFilter.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/data/uri/filter/MongoDBKnowUriFilter.java index 6bce33717..cf11dfee9 100644 --- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/data/uri/filter/MongoDBKnowUriFilter.java +++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/data/uri/filter/MongoDBKnowUriFilter.java @@ -90,11 +90,6 @@ public MongoDBKnowUriFilter(String hostName, Integer port) { } } - public MongoDBKnowUriFilter(String hostName, Integer port, int max_depth) { - this(hostName, port); - this.max_depth = max_depth; - } - @Override public boolean isUriGood(CrawleableUri uri) { MongoCursor cursor = mongoDB.getCollection(COLLECTION_NAME) @@ -112,16 +107,6 @@ public boolean isUriGood(CrawleableUri uri) { } } else { - if (uri.getData().containsKey(Constants.URI_DEPTH)) { - int depth = Integer.parseInt(uri.getData(Constants.URI_DEPTH).toString()); - if (depth > max_depth) { - LOGGER.debug("Max Depth reached. Uri {} is not good", uri.toString()); - cursor.close(); - return false; - } - - } - LOGGER.debug("URI {} is good", uri.toString()); cursor.close(); return true; @@ -138,9 +123,6 @@ public Document crawleableUriToMongoDocument(CrawleableUri uri) { UriType uriType = uri.getType(); - if (uri.getData().containsKey(Constants.URI_DEPTH)) - return new Document("uri", uri.getUri().toString()).append("type", uriType.toString()).append("depth", - Integer.parseInt(uri.getData(Constants.URI_DEPTH).toString())); return new Document("uri", uri.getUri().toString()).append("type", uriType.toString()); diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/data/uri/filter/SchemeBasedUriFilter.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/data/uri/filter/SchemeBasedUriFilter.java index efe34011f..6ff588023 100644 --- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/data/uri/filter/SchemeBasedUriFilter.java +++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/data/uri/filter/SchemeBasedUriFilter.java @@ -56,4 +56,10 @@ public void setSchemes(Set schemes) { this.schemes = schemes; } + @Override + public void add(CrawleableUri uri) { + // TODO Auto-generated method stub + + } + } diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/ExtendedFrontierImpl.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/ExtendedFrontierImpl.java index 6ba330eaf..ff27edcd9 100644 --- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/ExtendedFrontierImpl.java +++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/ExtendedFrontierImpl.java @@ -6,8 +6,8 @@ import java.util.Set; import org.dice_research.squirrel.data.uri.CrawleableUri; -import org.dice_research.squirrel.data.uri.filter.KnownUriFilter; import org.dice_research.squirrel.data.uri.filter.UriFilter; +import org.dice_research.squirrel.data.uri.filter.relational.RelationalUriFilter; import org.dice_research.squirrel.data.uri.info.URIReferences; import org.dice_research.squirrel.data.uri.norm.UriGenerator; import org.dice_research.squirrel.data.uri.norm.UriNormalizer; @@ -32,9 +32,9 @@ public class ExtendedFrontierImpl extends 
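As an aside on the DepthFilter introduced above: a minimal JUnit sketch of its contract. This is purely illustrative, not part of the patch; it assumes CrawleableUri offers a plain URI constructor alongside the addData/getData map used elsewhere in these patches, and the test class name is hypothetical.

package org.dice_research.squirrel.data.uri.filter;

import java.net.URI;

import org.dice_research.squirrel.Constants;
import org.dice_research.squirrel.data.uri.CrawleableUri;
import org.junit.Assert;
import org.junit.Test;

public class DepthFilterSketchTest {

    @Test
    public void filtersByDepth() throws Exception {
        // Reject every URI that lies deeper than two hops from its seed.
        DepthFilter filter = new DepthFilter(2);

        CrawleableUri shallow = new CrawleableUri(new URI("http://example.org/a"));
        shallow.addData(Constants.URI_DEPTH, 1);
        Assert.assertTrue(filter.isUriGood(shallow));

        CrawleableUri deep = new CrawleableUri(new URI("http://example.org/a/b/c"));
        deep.addData(Constants.URI_DEPTH, 3);
        Assert.assertFalse(filter.isUriGood(deep));

        // Without depth information the filter cannot judge the URI and rejects it.
        CrawleableUri unknown = new CrawleableUri(new URI("http://example.org/x"));
        Assert.assertFalse(filter.isUriGood(unknown));
    }
}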
FrontierImpl implements ExtendedFronti * @param uriHashCustodian used to access and write hash values for uris. */ @SuppressWarnings("unused") - public ExtendedFrontierImpl(UriNormalizer normalizer, KnownUriFilter knownUriFilter, UriQueue queue,List uriGenerators, boolean doesRecrawling, + public ExtendedFrontierImpl(UriNormalizer normalizer, RelationalUriFilter relationalUriFilter, UriQueue queue,List uriGenerators, boolean doesRecrawling, long generalRecrawlTime, long timerPeriod, UriHashCustodian uriHashCustodian) { - super(normalizer, knownUriFilter, queue, uriGenerators,doesRecrawling, generalRecrawlTime, timerPeriod, uriHashCustodian); + super(normalizer, relationalUriFilter, queue, uriGenerators,doesRecrawling, generalRecrawlTime, timerPeriod, uriHashCustodian); } /** @@ -47,8 +47,8 @@ public ExtendedFrontierImpl(UriNormalizer normalizer, KnownUriFilter knownUriFil * crawled. * @param doesRecrawling used to select if URIs should be recrawled. */ - public ExtendedFrontierImpl(UriNormalizer normalizer, KnownUriFilter knownUriFilter, IpAddressBasedQueue queue, List uriGenerators, boolean doesRecrawling) { - super(normalizer, knownUriFilter, queue, uriGenerators, doesRecrawling); + public ExtendedFrontierImpl(UriNormalizer normalizer, RelationalUriFilter relationalUriFilter, IpAddressBasedQueue queue, List uriGenerators, boolean doesRecrawling) { + super(normalizer, relationalUriFilter, queue, uriGenerators, doesRecrawling); } /** @@ -62,8 +62,8 @@ public ExtendedFrontierImpl(UriNormalizer normalizer, KnownUriFilter knownUriFil * crawled. * @param doesRecrawling used to select if URIs should be recrawled. */ - public ExtendedFrontierImpl(UriNormalizer normalizer, KnownUriFilter knownUriFilter, URIReferences uriReferences, UriQueue queue,List uriGenerators, boolean doesRecrawling) { - super(normalizer, knownUriFilter, uriReferences, queue,uriGenerators, doesRecrawling); + public ExtendedFrontierImpl(UriNormalizer normalizer, RelationalUriFilter relationalUriFilter, URIReferences uriReferences, UriQueue queue,List uriGenerators, boolean doesRecrawling) { + super(normalizer, relationalUriFilter, uriReferences, queue,uriGenerators, doesRecrawling); } @Override diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/FrontierImpl.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/FrontierImpl.java index c2a6a46b7..aa046aae0 100644 --- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/FrontierImpl.java +++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/FrontierImpl.java @@ -10,6 +10,7 @@ import org.dice_research.squirrel.data.uri.filter.KnownUriFilter; import org.dice_research.squirrel.data.uri.filter.SchemeBasedUriFilter; import org.dice_research.squirrel.data.uri.filter.UriFilter; +import org.dice_research.squirrel.data.uri.filter.relational.RelationalUriFilter; import org.dice_research.squirrel.data.uri.info.URIReferences; import org.dice_research.squirrel.data.uri.norm.UriGenerator; import org.dice_research.squirrel.data.uri.norm.UriNormalizer; @@ -25,7 +26,7 @@ /** * Standard implementation of the {@link Frontier} interface containing a - * {@link #queue} and a {@link #knownUriFilter}. + * {@link #queue} and a {@link #relationalUriFilter}. * * @author Michael Röder (roeder@informatik.uni-leipzig.de) */ @@ -41,7 +42,7 @@ public class FrontierImpl implements Frontier { /** * {@link KnownUriFilter} used to identify URIs that already have been crawled. 
*/ - protected KnownUriFilter knownUriFilter; + protected RelationalUriFilter relationalUriFilter; /** * {@link org.dice_research.squirrel.data.uri.info.URIReferences} used to @@ -124,9 +125,9 @@ public class FrontierImpl implements Frontier { * @param timerPeriod * used to select if URIs should be recrawled. */ - public FrontierImpl(UriNormalizer normalizer, KnownUriFilter knownUriFilter, UriQueue queue, List uriGenerators, + public FrontierImpl(UriNormalizer normalizer, RelationalUriFilter relationalUriFilter, UriQueue queue, List uriGenerators, GraphLogger graphLogger, boolean doesRecrawling, long generalRecrawlTime, long timerPeriod) { - this(normalizer, knownUriFilter, null, queue,uriGenerators, graphLogger, doesRecrawling, generalRecrawlTime, timerPeriod); + this(normalizer, relationalUriFilter, null, queue,uriGenerators, graphLogger, doesRecrawling, generalRecrawlTime, timerPeriod); } /** @@ -148,9 +149,9 @@ public FrontierImpl(UriNormalizer normalizer, KnownUriFilter knownUriFilter, Uri * @param timerPeriod * used to select if URIs should be recrawled. */ - public FrontierImpl(UriNormalizer normalizer, KnownUriFilter knownUriFilter, UriQueue queue,List uriGenerators, boolean doesRecrawling, + public FrontierImpl(UriNormalizer normalizer, RelationalUriFilter relationalUriFilter, UriQueue queue,List uriGenerators, boolean doesRecrawling, long generalRecrawlTime, long timerPeriod, UriHashCustodian uriHashCustodian) { - this(normalizer, knownUriFilter, queue, uriGenerators, null, doesRecrawling, generalRecrawlTime, timerPeriod); + this(normalizer, relationalUriFilter, queue, uriGenerators, null, doesRecrawling, generalRecrawlTime, timerPeriod); } /** @@ -169,9 +170,9 @@ public FrontierImpl(UriNormalizer normalizer, KnownUriFilter knownUriFilter, Uri * @param doesRecrawling * Value for {@link #doesRecrawling}. */ - public FrontierImpl(UriNormalizer normalizer, KnownUriFilter knownUriFilter, URIReferences uriReferences, + public FrontierImpl(UriNormalizer normalizer, RelationalUriFilter relationalUriFilter, URIReferences uriReferences, UriQueue queue,List uriGenerators, boolean doesRecrawling) { - this(normalizer, knownUriFilter, uriReferences, queue,uriGenerators, null, doesRecrawling, DEFAULT_GENERAL_RECRAWL_TIME, + this(normalizer, relationalUriFilter, uriReferences, queue,uriGenerators, null, doesRecrawling, DEFAULT_GENERAL_RECRAWL_TIME, DEFAULT_TIMER_PERIOD); } @@ -189,9 +190,9 @@ public FrontierImpl(UriNormalizer normalizer, KnownUriFilter knownUriFilter, URI * @param doesRecrawling * Value for {@link #doesRecrawling}. */ - public FrontierImpl(UriNormalizer normalizer, KnownUriFilter knownUriFilter, UriQueue queue,List uriGenerators, + public FrontierImpl(UriNormalizer normalizer, RelationalUriFilter relationalUriFilter, UriQueue queue,List uriGenerators, boolean doesRecrawling) { - this(normalizer, knownUriFilter, queue,uriGenerators, null, doesRecrawling, DEFAULT_GENERAL_RECRAWL_TIME, + this(normalizer, relationalUriFilter, queue,uriGenerators, null, doesRecrawling, DEFAULT_GENERAL_RECRAWL_TIME, DEFAULT_TIMER_PERIOD); } @@ -207,8 +208,8 @@ public FrontierImpl(UriNormalizer normalizer, KnownUriFilter knownUriFilter, Uri * @param queue * {@link UriQueue} used to manage the URIs that should be crawled. 
*/ - public FrontierImpl(UriNormalizer normalizer, KnownUriFilter knownUriFilter, UriQueue queue,List uriGenerators) { - this(normalizer, knownUriFilter, queue,uriGenerators, null, false, DEFAULT_GENERAL_RECRAWL_TIME, DEFAULT_TIMER_PERIOD); + public FrontierImpl(UriNormalizer normalizer, RelationalUriFilter relationalUriFilter, UriQueue queue,List uriGenerators) { + this(normalizer, relationalUriFilter, queue,uriGenerators, null, false, DEFAULT_GENERAL_RECRAWL_TIME, DEFAULT_TIMER_PERIOD); } /** @@ -234,11 +235,11 @@ public FrontierImpl(UriNormalizer normalizer, KnownUriFilter knownUriFilter, Uri * @param timerPeriod * used to select if URIs should be recrawled. */ - public FrontierImpl(UriNormalizer normalizer, KnownUriFilter knownUriFilter, URIReferences uriReferences, + public FrontierImpl(UriNormalizer normalizer, RelationalUriFilter relationalUriFilter, URIReferences uriReferences, UriQueue queue, List uriGenerators, GraphLogger graphLogger, boolean doesRecrawling, long generalRecrawlTime, long timerPeriod) { this.normalizer = normalizer; - this.knownUriFilter = knownUriFilter; + this.relationalUriFilter = relationalUriFilter; this.uriReferences = uriReferences; this.uriGenerator = uriGenerators; this.queue = queue; @@ -255,7 +256,7 @@ public FrontierImpl(UriNormalizer normalizer, KnownUriFilter knownUriFilter, URI timerRecrawling.schedule(new TimerTask() { @Override public void run() { - List urisToRecrawl = knownUriFilter.getOutdatedUris(); + List urisToRecrawl = relationalUriFilter.getKnownUriFilter().getOutdatedUris(); urisToRecrawl.forEach(uri -> queue.addUri(uriProcessor.recognizeUriType(uri))); } }, this.timerPeriod, this.timerPeriod); @@ -299,8 +300,8 @@ public void addNewUri(CrawleableUri uri) { } protected void addNormalizedUri(CrawleableUri uri){ - if (knownUriFilter.isUriGood(uri)) { - LOGGER.debug("addNewUri(" + uri + "): URI is good [" + knownUriFilter + "]"); + if (relationalUriFilter.isUriGood(uri)) { + LOGGER.debug("addNewUri(" + uri + "): URI is good [" + relationalUriFilter + "]"); if (schemeUriFilter.isUriGood(uri)) { LOGGER.trace("addNewUri(" + uri.getUri() + "): URI scheme is OK [" + schemeUriFilter + "]"); // Make sure that the IP is known @@ -315,14 +316,14 @@ protected void addNormalizedUri(CrawleableUri uri){ } else { LOGGER.error("Couldn't determine the Inet address of \"{}\". It will be ignored.", uri.getUri()); } - knownUriFilter.add(uri, System.currentTimeMillis()); + relationalUriFilter.getKnownUriFilter().add(uri, System.currentTimeMillis()); } else { LOGGER.warn("addNewUri(" + uri + "): " + uri.getUri().getScheme() + " is not supported, only " + schemeUriFilter.getSchemes() + ". Will not be added!"); } } else { - LOGGER.debug("addNewUri(" + uri + "): URI is not good [" + knownUriFilter + "]. Will not be added!"); + LOGGER.debug("addNewUri(" + uri + "): URI is not good [" + relationalUriFilter + "]. 
Will not be added!"); } } @@ -360,7 +361,7 @@ public void crawlingDone(List uris) { recrawlUri.addData(Constants.URI_TYPE_KEY, uri.getData(Constants.URI_TYPE_KEY)); addNewUri(recrawlUri); } else { - knownUriFilter.add(uri, System.currentTimeMillis()); + relationalUriFilter.getKnownUriFilter().add(uri, System.currentTimeMillis()); } } } diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/FrontierSenderToWebservice.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/FrontierSenderToWebservice.java index 6739f7a7e..132cd12e9 100644 --- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/FrontierSenderToWebservice.java +++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/FrontierSenderToWebservice.java @@ -19,6 +19,7 @@ import org.apache.commons.io.IOUtils; import org.dice_research.squirrel.data.uri.CrawleableUri; import org.dice_research.squirrel.data.uri.filter.KnownUriFilter; +import org.dice_research.squirrel.data.uri.filter.relational.RelationalUriFilter; import org.dice_research.squirrel.data.uri.info.URIReferences; import org.dice_research.squirrel.data.uri.serialize.Serializer; import org.dice_research.squirrel.data.uri.serialize.java.GzipJavaUriSerializer; @@ -41,7 +42,7 @@ public class FrontierSenderToWebservice implements Runnable, Closeable { private final long startRunTime = System.currentTimeMillis(); private WorkerGuard workerGuard; private UriQueue queue; - private KnownUriFilter knownUriFilter; + private RelationalUriFilter relationalUriFilter; private URIReferences uriReferences; private final static String WEB_QUEUE_GENERAL_NAME = "squirrel.web.in"; private RabbitQueueFactory factory; @@ -62,11 +63,11 @@ public class FrontierSenderToWebservice implements Runnable, Closeable { * @param knownUriFilter has information about the crawled URIs * @param uriReferences has information for the crawled graph. if it is {@code null}, the feature of creating a crawled graph is disabled */ - public FrontierSenderToWebservice(RabbitQueueFactory factory, WorkerGuard workerGuard, UriQueue queue, KnownUriFilter knownUriFilter, URIReferences uriReferences) { + public FrontierSenderToWebservice(RabbitQueueFactory factory, WorkerGuard workerGuard, UriQueue queue, RelationalUriFilter relationalUriFilter, URIReferences uriReferences) { this.factory = factory; this.workerGuard = workerGuard; this.queue = queue; - this.knownUriFilter = knownUriFilter; + this.relationalUriFilter = relationalUriFilter; this.uriReferences = uriReferences; } @@ -191,7 +192,7 @@ private SquirrelWebObject generateSquirrelWebObject() throws IllegalAccessExcept //Michael remarks, that's not a good idea to pass all crawled URIs, because that takes to much time... 
//newObject.setCrawledURIs(Collections.EMPTY_LIST); - newObject.setCountOfCrawledURIs((int) knownUriFilter.count()); + newObject.setCountOfCrawledURIs((int) relationalUriFilter.getKnownUriFilter().count()); return newObject; } diff --git a/squirrel.frontier/src/test/java/org/dice_research/squirrel/frontier/impl/FrontierImplTest.java b/squirrel.frontier/src/test/java/org/dice_research/squirrel/frontier/impl/FrontierImplTest.java index d858d9049..ce28b37f8 100644 --- a/squirrel.frontier/src/test/java/org/dice_research/squirrel/frontier/impl/FrontierImplTest.java +++ b/squirrel.frontier/src/test/java/org/dice_research/squirrel/frontier/impl/FrontierImplTest.java @@ -17,6 +17,8 @@ import org.dice_research.squirrel.data.uri.CrawleableUriFactory4Tests; import org.dice_research.squirrel.data.uri.UriType; import org.dice_research.squirrel.data.uri.filter.MongoDBKnowUriFilter; +import org.dice_research.squirrel.data.uri.filter.relational.AndConcatenatingUriFilter; +import org.dice_research.squirrel.data.uri.filter.relational.RelationalUriFilter; import org.dice_research.squirrel.data.uri.norm.NormalizerImpl; import org.dice_research.squirrel.data.uri.norm.UriGenerator; import org.dice_research.squirrel.queue.ipbased.MongoDBIpBasedQueue; @@ -51,8 +53,10 @@ public void setUp() throws Exception { // uriGenerators.add(new WellKnownPathUriGenerator()); List sessionIDs = new ArrayList(); Map mapDefaultPort = new HashedMap(); + + RelationalUriFilter relationalUriFilter = new AndConcatenatingUriFilter(filter); - frontier = new FrontierImpl(new NormalizerImpl(sessionIDs,mapDefaultPort), filter, queue,uriGenerators,true); + frontier = new FrontierImpl(new NormalizerImpl(sessionIDs,mapDefaultPort), relationalUriFilter, queue,uriGenerators,true); uris.add(cuf.create(new URI("http://dbpedia.org/resource/New_York"), InetAddress.getByName("127.0.0.1"), UriType.DEREFERENCEABLE)); @@ -113,7 +117,7 @@ public void crawlingDone() throws Exception { // filter.add(uri_1, 100); frontier.crawlingDone(crawledUris); - assertFalse("uri_1 has been already crawled", frontier.knownUriFilter.isUriGood(uri_1)); + assertFalse("uri_1 has been already crawled", frontier.relationalUriFilter.isUriGood(uri_1)); } @Test diff --git a/squirrel.frontier/src/test/java/org/dice_research/squirrel/seed/generator/impl/CkanSeedGeneratorImplTest.java b/squirrel.frontier/src/test/java/org/dice_research/squirrel/seed/generator/impl/CkanSeedGeneratorImplTest.java index b319b3cbe..d10755d66 100644 --- a/squirrel.frontier/src/test/java/org/dice_research/squirrel/seed/generator/impl/CkanSeedGeneratorImplTest.java +++ b/squirrel.frontier/src/test/java/org/dice_research/squirrel/seed/generator/impl/CkanSeedGeneratorImplTest.java @@ -6,6 +6,8 @@ import org.apache.commons.collections15.map.HashedMap; import org.dice_research.squirrel.data.uri.filter.InMemoryKnownUriFilter; +import org.dice_research.squirrel.data.uri.filter.relational.AndConcatenatingUriFilter; +import org.dice_research.squirrel.data.uri.filter.relational.RelationalUriFilter; import org.dice_research.squirrel.data.uri.norm.DomainBasedUriGenerator; import org.dice_research.squirrel.data.uri.norm.NormalizerImpl; import org.dice_research.squirrel.data.uri.norm.UriGenerator; @@ -39,7 +41,9 @@ public void setUp() { List sessionIDs = new ArrayList(); Map mapDefaultPort = new HashedMap(); - frontier = new FrontierImpl(new NormalizerImpl(sessionIDs,mapDefaultPort), new InMemoryKnownUriFilter(false, -1), queue,uriGenerators); + RelationalUriFilter relationalUriFilter = new 
AndConcatenatingUriFilter(new InMemoryKnownUriFilter(false, -1)); + + frontier = new FrontierImpl(new NormalizerImpl(sessionIDs,mapDefaultPort), relationalUriFilter, queue,uriGenerators); ckanSeedGenerator = new CkanSeedGeneratorImpl(frontier); } From abd375a146e7ea65ec0022230111357c5f0cd2b0 Mon Sep 17 00:00:00 2001 From: Geraldo Date: Fri, 8 May 2020 19:48:51 +0200 Subject: [PATCH 061/102] created the MultipleUriFilters class to allow the use of multiple filters. Removed unused imports from the Generator classes and fixed tests --- spring-config/frontier-context.xml | 26 ++++-- .../data/uri/filter/MultipleUriFilters.java | 83 +++++++++++++++++++ .../data/uri/filter/UriFilterComposer.java | 9 ++ .../relational/AndConcatenatingUriFilter.java | 57 ------------- .../relational/RelationalUriFilter.java | 12 --- .../components/FrontierComponent.java | 16 +++- .../uri/norm/DomainBasedUriGenerator.java | 8 +- .../data/uri/norm/NormalizerImpl.java | 8 +- .../squirrel/data/uri/norm/UriGenerator.java | 2 - .../uri/norm/WellKnownPathUriGenerator.java | 7 +- .../frontier/impl/ExtendedFrontierImpl.java | 8 +- .../squirrel/frontier/impl/FrontierImpl.java | 16 ++-- .../impl/FrontierSenderToWebservice.java | 6 +- .../frontier/impl/FrontierImplTest.java | 6 +- .../impl/CkanSeedGeneratorImplTest.java | 6 +- 15 files changed, 154 insertions(+), 116 deletions(-) create mode 100644 squirrel.api/src/main/java/org/dice_research/squirrel/data/uri/filter/MultipleUriFilters.java create mode 100644 squirrel.api/src/main/java/org/dice_research/squirrel/data/uri/filter/UriFilterComposer.java delete mode 100644 squirrel.api/src/main/java/org/dice_research/squirrel/data/uri/filter/relational/AndConcatenatingUriFilter.java delete mode 100644 squirrel.api/src/main/java/org/dice_research/squirrel/data/uri/filter/relational/RelationalUriFilter.java diff --git a/spring-config/frontier-context.xml b/spring-config/frontier-context.xml index 85e42ca75..cce72375e 100644 --- a/spring-config/frontier-context.xml +++ b/spring-config/frontier-context.xml @@ -66,17 +66,19 @@ [bean definition XML lost in extraction] @@ -86,6 +88,12 @@ [bean definition XML lost in extraction] diff --git a/squirrel.api/src/main/java/org/dice_research/squirrel/data/uri/filter/MultipleUriFilters.java b/squirrel.api/src/main/java/org/dice_research/squirrel/data/uri/filter/MultipleUriFilters.java new file mode 100644 index 000000000..3742f11e4 --- /dev/null +++ b/squirrel.api/src/main/java/org/dice_research/squirrel/data/uri/filter/MultipleUriFilters.java @@ -0,0 +1,83 @@ +package org.dice_research.squirrel.data.uri.filter; + +import java.util.ArrayList; +import java.util.List; + +import org.dice_research.squirrel.data.uri.CrawleableUri; + +/** + * + * Relational Uri Filter for the AND and OR operators + * + * @author Geraldo de Souza Junior (gsjunior@mail.uni-paderborn.de) + * + */ + +public class MultipleUriFilters implements UriFilterComposer { + + private KnownUriFilter knownUriFilter; + private List listUriFilters; + private final String OPERATOR; + + public MultipleUriFilters(KnownUriFilter knownUriFilter, List listUriFilters,String operator) { + this.knownUriFilter = knownUriFilter; + this.listUriFilters = listUriFilters; + this.OPERATOR = operator; + } + + public MultipleUriFilters(KnownUriFilter knownUriFilter,String operator) { + this.knownUriFilter = knownUriFilter; + this.listUriFilters = new ArrayList(); + this.OPERATOR = operator; + } + + @Override + public boolean isUriGood(CrawleableUri uri) { + + if(this.OPERATOR.equals("OR")) + return computeOrOperation(uri); + else 
return computeAndOperation(uri); + } + + private boolean computeAndOperation(CrawleableUri uri) { + boolean isUrisGood = false; + + for (UriFilter uriFilter : listUriFilters) { + isUrisGood = uriFilter.isUriGood(uri); + if(!isUrisGood) + break; + } + + return isUrisGood && knownUriFilter.isUriGood(uri); + } + + private boolean computeOrOperation(CrawleableUri uri) { + boolean isUrisGood = false; + + for (UriFilter uriFilter : listUriFilters) { + isUrisGood = uriFilter.isUriGood(uri); + if(isUrisGood) + break; + } + + return isUrisGood || knownUriFilter.isUriGood(uri); + } + + @Override + public void add(CrawleableUri uri) { + knownUriFilter.add(uri); + listUriFilters.forEach(f -> f.add(uri)); + } + + @Override + public KnownUriFilter getKnownUriFilter() { + return this.knownUriFilter; + } + + @Override + public void setKnownUriFilter(KnownUriFilter knownUriFilter) { + this.knownUriFilter = knownUriFilter; + } + +} diff --git a/squirrel.api/src/main/java/org/dice_research/squirrel/data/uri/filter/UriFilterComposer.java b/squirrel.api/src/main/java/org/dice_research/squirrel/data/uri/filter/UriFilterComposer.java new file mode 100644 index 000000000..0c358c81a --- /dev/null +++ b/squirrel.api/src/main/java/org/dice_research/squirrel/data/uri/filter/UriFilterComposer.java @@ -0,0 +1,9 @@ +package org.dice_research.squirrel.data.uri.filter; + +public interface UriFilterComposer extends UriFilter { + + public KnownUriFilter getKnownUriFilter(); + + public void setKnownUriFilter(KnownUriFilter knownUriFilter); + +} diff --git a/squirrel.api/src/main/java/org/dice_research/squirrel/data/uri/filter/relational/AndConcatenatingUriFilter.java b/squirrel.api/src/main/java/org/dice_research/squirrel/data/uri/filter/relational/AndConcatenatingUriFilter.java deleted file mode 100644 index 255859e73..000000000 --- a/squirrel.api/src/main/java/org/dice_research/squirrel/data/uri/filter/relational/AndConcatenatingUriFilter.java +++ /dev/null @@ -1,57 +0,0 @@ -package org.dice_research.squirrel.data.uri.filter.relational; - -import org.dice_research.squirrel.data.uri.CrawleableUri; -import org.dice_research.squirrel.data.uri.filter.KnownUriFilter; -import org.dice_research.squirrel.data.uri.filter.UriFilter; - - -/** - * - * Relational Uri Filter for the AND operator - * - * @author Geraldo de Souza Junior (gsjunior@mail.uni-paderborn.de) - * - */ - -public class AndConcatenatingUriFilter implements RelationalUriFilter{ - - private KnownUriFilter knownUriFilter; - private UriFilter uriFilter; - - public AndConcatenatingUriFilter(KnownUriFilter knownUriFilter, UriFilter uriFilter) { - this.knownUriFilter = knownUriFilter; - this.uriFilter = uriFilter; - } - - public AndConcatenatingUriFilter(KnownUriFilter knownUriFilter) { - this.knownUriFilter = knownUriFilter; - } - - @Override - public boolean isUriGood(CrawleableUri uri) { - - if(uriFilter != null) - return knownUriFilter.isUriGood(uri) && uriFilter.isUriGood(uri); - else - return knownUriFilter.isUriGood(uri); - } - - @Override - public void add(CrawleableUri uri) { - knownUriFilter.add(uri); - uriFilter.add(uri); - } - - @Override - public KnownUriFilter getKnownUriFilter() { - return this.knownUriFilter; - } - - @Override - public void setKnownUriFilter(KnownUriFilter knownUriFilter) { - this.knownUriFilter = knownUriFilter; - } - - - -} diff --git a/squirrel.api/src/main/java/org/dice_research/squirrel/data/uri/filter/relational/RelationalUriFilter.java 
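A short usage sketch for the MultipleUriFilters composer defined above. The wiring is hypothetical: the knownUriFilter and uri variables are assumed to exist, and DepthFilter is the filter added in an earlier patch of this series. Note that the AND operator needs a non-empty filter list to be useful (see the note at patch 063 below).

List<UriFilter> extraFilters = new ArrayList<>();
extraFilters.add(new DepthFilter(3)); // from the earlier patch in this series

// "AND": a URI is good only if it passes every extra filter
// and is also accepted by the wrapped KnownUriFilter.
UriFilterComposer andComposer =
        new MultipleUriFilters(knownUriFilter, extraFilters, "AND");

// "OR": passing any extra filter or the KnownUriFilter is enough.
UriFilterComposer orComposer =
        new MultipleUriFilters(knownUriFilter, extraFilters, "OR");

boolean accepted = andComposer.isUriGood(uri);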
b/squirrel.api/src/main/java/org/dice_research/squirrel/data/uri/filter/relational/RelationalUriFilter.java deleted file mode 100644 index b5eb03feb..000000000 --- a/squirrel.api/src/main/java/org/dice_research/squirrel/data/uri/filter/relational/RelationalUriFilter.java +++ /dev/null @@ -1,12 +0,0 @@ -package org.dice_research.squirrel.data.uri.filter.relational; - -import org.dice_research.squirrel.data.uri.filter.KnownUriFilter; -import org.dice_research.squirrel.data.uri.filter.UriFilter; - -public interface RelationalUriFilter extends UriFilter { - - public KnownUriFilter getKnownUriFilter(); - - public void setKnownUriFilter(KnownUriFilter knownUriFilter); - -} diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/components/FrontierComponent.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/components/FrontierComponent.java index 0d9c4b5a4..e1d159a71 100644 --- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/components/FrontierComponent.java +++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/components/FrontierComponent.java @@ -19,9 +19,10 @@ import org.dice_research.squirrel.data.uri.UriSeedReader; import org.dice_research.squirrel.data.uri.filter.InMemoryKnownUriFilter; import org.dice_research.squirrel.data.uri.filter.KnownUriFilter; +import org.dice_research.squirrel.data.uri.filter.MultipleUriFilters; import org.dice_research.squirrel.data.uri.filter.RegexBasedWhiteListFilter; import org.dice_research.squirrel.data.uri.filter.UriFilter; -import org.dice_research.squirrel.data.uri.filter.relational.RelationalUriFilter; +import org.dice_research.squirrel.data.uri.filter.UriFilterComposer; import org.dice_research.squirrel.data.uri.info.URIReferences; import org.dice_research.squirrel.data.uri.norm.UriGenerator; import org.dice_research.squirrel.data.uri.norm.UriNormalizer; @@ -65,7 +66,8 @@ public class FrontierComponent extends AbstractComponent implements RespondingDa protected UriQueue queue; @Qualifier("UriFilterBean") @Autowired - private RelationalUriFilter uriFilter; + private UriFilterComposer uriFilter; + private URIReferences uriReferences = null; private Frontier frontier; private RabbitQueue rabbitQueue; @@ -80,12 +82,14 @@ public class FrontierComponent extends AbstractComponent implements RespondingDa @Qualifier("listUriGenerator") private List uriGenerator; + + private final Semaphore terminationMutex = new Semaphore(0); private final WorkerGuard workerGuard = new WorkerGuard(this); private final boolean doRecrawling = true; private long recrawlingTime = 1000L * 60L * 60L * 24L * 30; private Timer timerTerminator; - + public static final boolean RECRAWLING_ACTIVE = true; @Override @@ -94,10 +98,14 @@ public void init() throws Exception { serializer = new GzipJavaUriSerializer(); MongoConfiguration mongoConfiguration = MongoConfiguration.getMDBConfiguration(); WebConfiguration webConfiguration = WebConfiguration.getWebConfiguration(); + if (mongoConfiguration != null) { queue.open(); + uriFilter.getKnownUriFilter().open(); + + WhiteListConfiguration whiteListConfiguration = WhiteListConfiguration.getWhiteListConfiguration(); if (whiteListConfiguration != null) { @@ -115,6 +123,8 @@ public void init() throws Exception { queue = new InMemoryQueue(); uriFilter.setKnownUriFilter(new InMemoryKnownUriFilter(doRecrawling, recrawlingTime)); } + + // Build frontier frontier = new ExtendedFrontierImpl(normalizer, uriFilter, uriReferences, queue,uriGenerator, doRecrawling); diff --git 
a/squirrel.frontier/src/main/java/org/dice_research/squirrel/data/uri/norm/DomainBasedUriGenerator.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/data/uri/norm/DomainBasedUriGenerator.java index c1c4dce00..895426855 100644 --- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/data/uri/norm/DomainBasedUriGenerator.java +++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/data/uri/norm/DomainBasedUriGenerator.java @@ -1,13 +1,11 @@ package org.dice_research.squirrel.data.uri.norm; -import org.apache.http.client.utils.URIBuilder; +import java.net.URI; +import java.net.URISyntaxException; + import org.dice_research.squirrel.data.uri.CrawleableUri; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import org.springframework.stereotype.Component; - -import java.net.URI; -import java.net.URISyntaxException; /** diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/data/uri/norm/NormalizerImpl.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/data/uri/norm/NormalizerImpl.java index 0dc3a45aa..7503ba5f5 100644 --- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/data/uri/norm/NormalizerImpl.java +++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/data/uri/norm/NormalizerImpl.java @@ -2,7 +2,11 @@ import java.net.URI; import java.net.URISyntaxException; -import java.util.*; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.BitSet; +import java.util.List; +import java.util.Map; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -10,8 +14,6 @@ import org.dice_research.squirrel.data.uri.CrawleableUri; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import org.springframework.beans.factory.annotation.Qualifier; -import org.springframework.stereotype.Component; /** * Parts of the code borrowed from uriGenerators, boolean doesRecrawling, + public ExtendedFrontierImpl(UriNormalizer normalizer, UriFilterComposer relationalUriFilter, UriQueue queue,List uriGenerators, boolean doesRecrawling, long generalRecrawlTime, long timerPeriod, UriHashCustodian uriHashCustodian) { super(normalizer, relationalUriFilter, queue, uriGenerators,doesRecrawling, generalRecrawlTime, timerPeriod, uriHashCustodian); } @@ -47,7 +47,7 @@ public ExtendedFrontierImpl(UriNormalizer normalizer, RelationalUriFilter relati * crawled. * @param doesRecrawling used to select if URIs should be recrawled. */ - public ExtendedFrontierImpl(UriNormalizer normalizer, RelationalUriFilter relationalUriFilter, IpAddressBasedQueue queue, List uriGenerators, boolean doesRecrawling) { + public ExtendedFrontierImpl(UriNormalizer normalizer, UriFilterComposer relationalUriFilter, IpAddressBasedQueue queue, List uriGenerators, boolean doesRecrawling) { super(normalizer, relationalUriFilter, queue, uriGenerators, doesRecrawling); } @@ -62,7 +62,7 @@ public ExtendedFrontierImpl(UriNormalizer normalizer, RelationalUriFilter relati * crawled. * @param doesRecrawling used to select if URIs should be recrawled. 
*/ - public ExtendedFrontierImpl(UriNormalizer normalizer, RelationalUriFilter relationalUriFilter, URIReferences uriReferences, UriQueue queue,List uriGenerators, boolean doesRecrawling) { + public ExtendedFrontierImpl(UriNormalizer normalizer, UriFilterComposer relationalUriFilter, URIReferences uriReferences, UriQueue queue,List uriGenerators, boolean doesRecrawling) { super(normalizer, relationalUriFilter, uriReferences, queue,uriGenerators, doesRecrawling); } diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/FrontierImpl.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/FrontierImpl.java index aa046aae0..3862ef513 100644 --- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/FrontierImpl.java +++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/FrontierImpl.java @@ -10,7 +10,7 @@ import org.dice_research.squirrel.data.uri.filter.KnownUriFilter; import org.dice_research.squirrel.data.uri.filter.SchemeBasedUriFilter; import org.dice_research.squirrel.data.uri.filter.UriFilter; -import org.dice_research.squirrel.data.uri.filter.relational.RelationalUriFilter; +import org.dice_research.squirrel.data.uri.filter.UriFilterComposer; import org.dice_research.squirrel.data.uri.info.URIReferences; import org.dice_research.squirrel.data.uri.norm.UriGenerator; import org.dice_research.squirrel.data.uri.norm.UriNormalizer; @@ -42,7 +42,7 @@ public class FrontierImpl implements Frontier { /** * {@link KnownUriFilter} used to identify URIs that already have been crawled. */ - protected RelationalUriFilter relationalUriFilter; + protected UriFilterComposer relationalUriFilter; /** * {@link org.dice_research.squirrel.data.uri.info.URIReferences} used to @@ -125,7 +125,7 @@ public class FrontierImpl implements Frontier { * @param timerPeriod * used to select if URIs should be recrawled. */ - public FrontierImpl(UriNormalizer normalizer, RelationalUriFilter relationalUriFilter, UriQueue queue, List uriGenerators, + public FrontierImpl(UriNormalizer normalizer, UriFilterComposer relationalUriFilter, UriQueue queue, List uriGenerators, GraphLogger graphLogger, boolean doesRecrawling, long generalRecrawlTime, long timerPeriod) { this(normalizer, relationalUriFilter, null, queue,uriGenerators, graphLogger, doesRecrawling, generalRecrawlTime, timerPeriod); } @@ -149,7 +149,7 @@ public FrontierImpl(UriNormalizer normalizer, RelationalUriFilter relationalUriF * @param timerPeriod * used to select if URIs should be recrawled. */ - public FrontierImpl(UriNormalizer normalizer, RelationalUriFilter relationalUriFilter, UriQueue queue,List uriGenerators, boolean doesRecrawling, + public FrontierImpl(UriNormalizer normalizer, UriFilterComposer relationalUriFilter, UriQueue queue,List uriGenerators, boolean doesRecrawling, long generalRecrawlTime, long timerPeriod, UriHashCustodian uriHashCustodian) { this(normalizer, relationalUriFilter, queue, uriGenerators, null, doesRecrawling, generalRecrawlTime, timerPeriod); } @@ -170,7 +170,7 @@ public FrontierImpl(UriNormalizer normalizer, RelationalUriFilter relationalUriF * @param doesRecrawling * Value for {@link #doesRecrawling}. 
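After this signature change, creating a frontier means handing in a composer rather than a bare KnownUriFilter. A minimal sketch, mirroring the test setup later in this patch; the normalizer, queue and uriGenerators variables are assumed to exist, and the "OR" operator anticipates the fix in patch 063:

UriFilterComposer composer =
        new MultipleUriFilters(new InMemoryKnownUriFilter(false, -1), "OR");
Frontier frontier = new FrontierImpl(
        normalizer,     // UriNormalizer, e.g. a NormalizerImpl
        composer,       // wraps the known-URI filter plus optional extra filters
        queue,          // UriQueue implementation
        uriGenerators,  // List of UriGenerator instances
        true);          // doesRecrawling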
*/ - public FrontierImpl(UriNormalizer normalizer, RelationalUriFilter relationalUriFilter, URIReferences uriReferences, + public FrontierImpl(UriNormalizer normalizer, UriFilterComposer relationalUriFilter, URIReferences uriReferences, UriQueue queue,List uriGenerators, boolean doesRecrawling) { this(normalizer, relationalUriFilter, uriReferences, queue,uriGenerators, null, doesRecrawling, DEFAULT_GENERAL_RECRAWL_TIME, DEFAULT_TIMER_PERIOD); @@ -190,7 +190,7 @@ public FrontierImpl(UriNormalizer normalizer, RelationalUriFilter relationalUriF * @param doesRecrawling * Value for {@link #doesRecrawling}. */ - public FrontierImpl(UriNormalizer normalizer, RelationalUriFilter relationalUriFilter, UriQueue queue,List uriGenerators, + public FrontierImpl(UriNormalizer normalizer, UriFilterComposer relationalUriFilter, UriQueue queue,List uriGenerators, boolean doesRecrawling) { this(normalizer, relationalUriFilter, queue,uriGenerators, null, doesRecrawling, DEFAULT_GENERAL_RECRAWL_TIME, DEFAULT_TIMER_PERIOD); @@ -208,7 +208,7 @@ public FrontierImpl(UriNormalizer normalizer, RelationalUriFilter relationalUriF * @param queue * {@link UriQueue} used to manage the URIs that should be crawled. */ - public FrontierImpl(UriNormalizer normalizer, RelationalUriFilter relationalUriFilter, UriQueue queue,List uriGenerators) { + public FrontierImpl(UriNormalizer normalizer, UriFilterComposer relationalUriFilter, UriQueue queue,List uriGenerators) { this(normalizer, relationalUriFilter, queue,uriGenerators, null, false, DEFAULT_GENERAL_RECRAWL_TIME, DEFAULT_TIMER_PERIOD); } @@ -235,7 +235,7 @@ public FrontierImpl(UriNormalizer normalizer, RelationalUriFilter relationalUriF * @param timerPeriod * used to select if URIs should be recrawled. */ - public FrontierImpl(UriNormalizer normalizer, RelationalUriFilter relationalUriFilter, URIReferences uriReferences, + public FrontierImpl(UriNormalizer normalizer, UriFilterComposer relationalUriFilter, URIReferences uriReferences, UriQueue queue, List uriGenerators, GraphLogger graphLogger, boolean doesRecrawling, long generalRecrawlTime, long timerPeriod) { this.normalizer = normalizer; diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/FrontierSenderToWebservice.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/FrontierSenderToWebservice.java index 132cd12e9..9ab91efc6 100644 --- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/FrontierSenderToWebservice.java +++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/FrontierSenderToWebservice.java @@ -19,7 +19,7 @@ import org.apache.commons.io.IOUtils; import org.dice_research.squirrel.data.uri.CrawleableUri; import org.dice_research.squirrel.data.uri.filter.KnownUriFilter; -import org.dice_research.squirrel.data.uri.filter.relational.RelationalUriFilter; +import org.dice_research.squirrel.data.uri.filter.UriFilterComposer; import org.dice_research.squirrel.data.uri.info.URIReferences; import org.dice_research.squirrel.data.uri.serialize.Serializer; import org.dice_research.squirrel.data.uri.serialize.java.GzipJavaUriSerializer; @@ -42,7 +42,7 @@ public class FrontierSenderToWebservice implements Runnable, Closeable { private final long startRunTime = System.currentTimeMillis(); private WorkerGuard workerGuard; private UriQueue queue; - private RelationalUriFilter relationalUriFilter; + private UriFilterComposer relationalUriFilter; private URIReferences uriReferences; private final static String 
WEB_QUEUE_GENERAL_NAME = "squirrel.web.in"; private RabbitQueueFactory factory; @@ -63,7 +63,7 @@ public class FrontierSenderToWebservice implements Runnable, Closeable { * @param knownUriFilter has information about the crawled URIs * @param uriReferences has information for the crawled graph. if it is {@code null}, the feature of creating a crawled graph is disabled */ - public FrontierSenderToWebservice(RabbitQueueFactory factory, WorkerGuard workerGuard, UriQueue queue, RelationalUriFilter relationalUriFilter, URIReferences uriReferences) { + public FrontierSenderToWebservice(RabbitQueueFactory factory, WorkerGuard workerGuard, UriQueue queue, UriFilterComposer relationalUriFilter, URIReferences uriReferences) { this.factory = factory; this.workerGuard = workerGuard; this.queue = queue; diff --git a/squirrel.frontier/src/test/java/org/dice_research/squirrel/frontier/impl/FrontierImplTest.java b/squirrel.frontier/src/test/java/org/dice_research/squirrel/frontier/impl/FrontierImplTest.java index ce28b37f8..f59d5eece 100644 --- a/squirrel.frontier/src/test/java/org/dice_research/squirrel/frontier/impl/FrontierImplTest.java +++ b/squirrel.frontier/src/test/java/org/dice_research/squirrel/frontier/impl/FrontierImplTest.java @@ -16,9 +16,9 @@ import org.dice_research.squirrel.data.uri.CrawleableUri; import org.dice_research.squirrel.data.uri.CrawleableUriFactory4Tests; import org.dice_research.squirrel.data.uri.UriType; +import org.dice_research.squirrel.data.uri.filter.MultipleUriFilters; import org.dice_research.squirrel.data.uri.filter.MongoDBKnowUriFilter; -import org.dice_research.squirrel.data.uri.filter.relational.AndConcatenatingUriFilter; -import org.dice_research.squirrel.data.uri.filter.relational.RelationalUriFilter; +import org.dice_research.squirrel.data.uri.filter.UriFilterComposer; import org.dice_research.squirrel.data.uri.norm.NormalizerImpl; import org.dice_research.squirrel.data.uri.norm.UriGenerator; import org.dice_research.squirrel.queue.ipbased.MongoDBIpBasedQueue; @@ -54,7 +54,7 @@ public void setUp() throws Exception { List sessionIDs = new ArrayList(); Map mapDefaultPort = new HashedMap(); - RelationalUriFilter relationalUriFilter = new AndConcatenatingUriFilter(filter); + UriFilterComposer relationalUriFilter = new MultipleUriFilters(filter,""); frontier = new FrontierImpl(new NormalizerImpl(sessionIDs,mapDefaultPort), relationalUriFilter, queue,uriGenerators,true); diff --git a/squirrel.frontier/src/test/java/org/dice_research/squirrel/seed/generator/impl/CkanSeedGeneratorImplTest.java b/squirrel.frontier/src/test/java/org/dice_research/squirrel/seed/generator/impl/CkanSeedGeneratorImplTest.java index d10755d66..6142bdc6c 100644 --- a/squirrel.frontier/src/test/java/org/dice_research/squirrel/seed/generator/impl/CkanSeedGeneratorImplTest.java +++ b/squirrel.frontier/src/test/java/org/dice_research/squirrel/seed/generator/impl/CkanSeedGeneratorImplTest.java @@ -6,8 +6,8 @@ import org.apache.commons.collections15.map.HashedMap; import org.dice_research.squirrel.data.uri.filter.InMemoryKnownUriFilter; -import org.dice_research.squirrel.data.uri.filter.relational.AndConcatenatingUriFilter; -import org.dice_research.squirrel.data.uri.filter.relational.RelationalUriFilter; +import org.dice_research.squirrel.data.uri.filter.MultipleUriFilters; +import org.dice_research.squirrel.data.uri.filter.UriFilterComposer; import org.dice_research.squirrel.data.uri.norm.DomainBasedUriGenerator; import org.dice_research.squirrel.data.uri.norm.NormalizerImpl; import 
org.dice_research.squirrel.data.uri.norm.UriGenerator; @@ -41,7 +41,7 @@ public void setUp() { List sessionIDs = new ArrayList(); Map mapDefaultPort = new HashedMap(); - RelationalUriFilter relationalUriFilter = new AndConcatenatingUriFilter(new InMemoryKnownUriFilter(false, -1)); + UriFilterComposer relationalUriFilter = new MultipleUriFilters(new InMemoryKnownUriFilter(false, -1),""); frontier = new FrontierImpl(new NormalizerImpl(sessionIDs,mapDefaultPort), relationalUriFilter, queue,uriGenerators); ckanSeedGenerator = new CkanSeedGeneratorImpl(frontier); From d860b48df686fb5aecb49809133a0624d318807e Mon Sep 17 00:00:00 2001 From: Geraldo Date: Fri, 8 May 2020 20:12:38 +0200 Subject: [PATCH 062/102] Changed to UriFilterConfigurator --- spring-config/frontier-context.xml | 2 +- .../{MultipleUriFilters.java => UriFilterConfigurator.java} | 6 +++--- .../squirrel/components/FrontierComponent.java | 3 --- .../squirrel/frontier/impl/FrontierImplTest.java | 4 ++-- .../seed/generator/impl/CkanSeedGeneratorImplTest.java | 4 ++-- 5 files changed, 8 insertions(+), 11 deletions(-) rename squirrel.api/src/main/java/org/dice_research/squirrel/data/uri/filter/{MultipleUriFilters.java => UriFilterConfigurator.java} (86%) diff --git a/spring-config/frontier-context.xml b/spring-config/frontier-context.xml index cce72375e..1297bd758 100644 --- a/spring-config/frontier-context.xml +++ b/spring-config/frontier-context.xml @@ -67,7 +67,7 @@ - + diff --git a/squirrel.api/src/main/java/org/dice_research/squirrel/data/uri/filter/MultipleUriFilters.java b/squirrel.api/src/main/java/org/dice_research/squirrel/data/uri/filter/UriFilterConfigurator.java similarity index 86% rename from squirrel.api/src/main/java/org/dice_research/squirrel/data/uri/filter/MultipleUriFilters.java rename to squirrel.api/src/main/java/org/dice_research/squirrel/data/uri/filter/UriFilterConfigurator.java index 3742f11e4..086140070 100644 --- a/squirrel.api/src/main/java/org/dice_research/squirrel/data/uri/filter/MultipleUriFilters.java +++ b/squirrel.api/src/main/java/org/dice_research/squirrel/data/uri/filter/UriFilterConfigurator.java @@ -13,19 +13,19 @@ * */ -public class MultipleUriFilters implements UriFilterComposer { +public class UriFilterConfigurator implements UriFilterComposer { private KnownUriFilter knownUriFilter; private List listUriFilters; private final String OPERATOR; - public MultipleUriFilters(KnownUriFilter knownUriFilter, List listUriFilters,String operator) { + public UriFilterConfigurator(KnownUriFilter knownUriFilter, List listUriFilters,String operator) { this.knownUriFilter = knownUriFilter; this.listUriFilters = listUriFilters; this.OPERATOR = operator; } - public MultipleUriFilters(KnownUriFilter knownUriFilter,String operator) { + public UriFilterConfigurator(KnownUriFilter knownUriFilter,String operator) { this.knownUriFilter = knownUriFilter; this.listUriFilters = new ArrayList(); this.OPERATOR = operator; diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/components/FrontierComponent.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/components/FrontierComponent.java index e1d159a71..79d5b6e04 100644 --- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/components/FrontierComponent.java +++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/components/FrontierComponent.java @@ -18,10 +18,7 @@ import org.dice_research.squirrel.data.uri.CrawleableUri; import org.dice_research.squirrel.data.uri.UriSeedReader; import 
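Since the frontier-context.xml markup was lost when this document was extracted, the exact bean wiring for the renamed class is unknown. Purely as an illustration, an equivalent Java-based Spring configuration consistent with the @Qualifier("UriFilterBean") injection point in FrontierComponent could look like this; the configuration class and method names are hypothetical, only the filter class names come from the patch.

package org.dice_research.squirrel.configurator; // hypothetical location

import org.dice_research.squirrel.data.uri.filter.KnownUriFilter;
import org.dice_research.squirrel.data.uri.filter.UriFilterComposer;
import org.dice_research.squirrel.data.uri.filter.UriFilterConfigurator;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;

@Configuration
public class UriFilterConfig {

    // The real project defines this bean in spring-config/frontier-context.xml.
    @Bean(name = "UriFilterBean")
    public UriFilterComposer uriFilterBean(KnownUriFilter knownUriFilter) {
        // "OR" matches the operator the fixed tests use in patch 063.
        return new UriFilterConfigurator(knownUriFilter, "OR");
    }
}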
org.dice_research.squirrel.data.uri.filter.InMemoryKnownUriFilter; -import org.dice_research.squirrel.data.uri.filter.KnownUriFilter; -import org.dice_research.squirrel.data.uri.filter.MultipleUriFilters; import org.dice_research.squirrel.data.uri.filter.RegexBasedWhiteListFilter; -import org.dice_research.squirrel.data.uri.filter.UriFilter; import org.dice_research.squirrel.data.uri.filter.UriFilterComposer; import org.dice_research.squirrel.data.uri.info.URIReferences; import org.dice_research.squirrel.data.uri.norm.UriGenerator; diff --git a/squirrel.frontier/src/test/java/org/dice_research/squirrel/frontier/impl/FrontierImplTest.java b/squirrel.frontier/src/test/java/org/dice_research/squirrel/frontier/impl/FrontierImplTest.java index f59d5eece..1d5f8f7d5 100644 --- a/squirrel.frontier/src/test/java/org/dice_research/squirrel/frontier/impl/FrontierImplTest.java +++ b/squirrel.frontier/src/test/java/org/dice_research/squirrel/frontier/impl/FrontierImplTest.java @@ -16,8 +16,8 @@ import org.dice_research.squirrel.data.uri.CrawleableUri; import org.dice_research.squirrel.data.uri.CrawleableUriFactory4Tests; import org.dice_research.squirrel.data.uri.UriType; -import org.dice_research.squirrel.data.uri.filter.MultipleUriFilters; import org.dice_research.squirrel.data.uri.filter.MongoDBKnowUriFilter; +import org.dice_research.squirrel.data.uri.filter.UriFilterConfigurator; import org.dice_research.squirrel.data.uri.filter.UriFilterComposer; import org.dice_research.squirrel.data.uri.norm.NormalizerImpl; import org.dice_research.squirrel.data.uri.norm.UriGenerator; @@ -54,7 +54,7 @@ public void setUp() throws Exception { List sessionIDs = new ArrayList(); Map mapDefaultPort = new HashedMap(); - UriFilterComposer relationalUriFilter = new MultipleUriFilters(filter,""); + UriFilterComposer relationalUriFilter = new UriFilterConfigurator(filter,""); frontier = new FrontierImpl(new NormalizerImpl(sessionIDs,mapDefaultPort), relationalUriFilter, queue,uriGenerators,true); diff --git a/squirrel.frontier/src/test/java/org/dice_research/squirrel/seed/generator/impl/CkanSeedGeneratorImplTest.java b/squirrel.frontier/src/test/java/org/dice_research/squirrel/seed/generator/impl/CkanSeedGeneratorImplTest.java index 6142bdc6c..637c6b0ef 100644 --- a/squirrel.frontier/src/test/java/org/dice_research/squirrel/seed/generator/impl/CkanSeedGeneratorImplTest.java +++ b/squirrel.frontier/src/test/java/org/dice_research/squirrel/seed/generator/impl/CkanSeedGeneratorImplTest.java @@ -6,7 +6,7 @@ import org.apache.commons.collections15.map.HashedMap; import org.dice_research.squirrel.data.uri.filter.InMemoryKnownUriFilter; -import org.dice_research.squirrel.data.uri.filter.MultipleUriFilters; +import org.dice_research.squirrel.data.uri.filter.UriFilterConfigurator; import org.dice_research.squirrel.data.uri.filter.UriFilterComposer; import org.dice_research.squirrel.data.uri.norm.DomainBasedUriGenerator; import org.dice_research.squirrel.data.uri.norm.NormalizerImpl; @@ -41,7 +41,7 @@ public void setUp() { List sessionIDs = new ArrayList(); Map mapDefaultPort = new HashedMap(); - UriFilterComposer relationalUriFilter = new MultipleUriFilters(new InMemoryKnownUriFilter(false, -1),""); + UriFilterComposer relationalUriFilter = new UriFilterConfigurator(new InMemoryKnownUriFilter(false, -1),""); frontier = new FrontierImpl(new NormalizerImpl(sessionIDs,mapDefaultPort), relationalUriFilter, queue,uriGenerators); ckanSeedGenerator = new CkanSeedGeneratorImpl(frontier); From cf70c3c138e88e187cd7a5ae48e6d04b5c274d15 Mon 
Sep 17 00:00:00 2001 From: Geraldo Date: Mon, 11 May 2020 16:38:08 +0200 Subject: [PATCH 063/102] fixed test with OR condition --- .../squirrel/frontier/impl/FrontierImpl.java | 16 ++++++++-------- .../squirrel/frontier/impl/FrontierImplTest.java | 7 +++++-- 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/FrontierImpl.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/FrontierImpl.java index 3862ef513..d9837bb2e 100644 --- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/FrontierImpl.java +++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/FrontierImpl.java @@ -26,7 +26,7 @@ /** * Standard implementation of the {@link Frontier} interface containing a - * {@link #queue} and a {@link #relationalUriFilter}. + * {@link #queue} and a {@link #uriFilter}. * * @author Michael Röder (roeder@informatik.uni-leipzig.de) */ @@ -42,7 +42,7 @@ public class FrontierImpl implements Frontier { /** * {@link KnownUriFilter} used to identify URIs that already have been crawled. */ - protected UriFilterComposer relationalUriFilter; + protected UriFilterComposer uriFilter; /** * {@link org.dice_research.squirrel.data.uri.info.URIReferences} used to @@ -239,7 +239,7 @@ public FrontierImpl(UriNormalizer normalizer, UriFilterComposer relationalUriFil UriQueue queue, List uriGenerators, GraphLogger graphLogger, boolean doesRecrawling, long generalRecrawlTime, long timerPeriod) { this.normalizer = normalizer; - this.relationalUriFilter = relationalUriFilter; + this.uriFilter = relationalUriFilter; this.uriReferences = uriReferences; this.uriGenerator = uriGenerators; this.queue = queue; @@ -300,8 +300,8 @@ public void addNewUri(CrawleableUri uri) { } protected void addNormalizedUri(CrawleableUri uri){ - if (relationalUriFilter.isUriGood(uri)) { - LOGGER.debug("addNewUri(" + uri + "): URI is good [" + relationalUriFilter + "]"); + if (uriFilter.isUriGood(uri)) { + LOGGER.debug("addNewUri(" + uri + "): URI is good [" + uriFilter + "]"); if (schemeUriFilter.isUriGood(uri)) { LOGGER.trace("addNewUri(" + uri.getUri() + "): URI scheme is OK [" + schemeUriFilter + "]"); // Make sure that the IP is known @@ -316,14 +316,14 @@ protected void addNormalizedUri(CrawleableUri uri){ } else { LOGGER.error("Couldn't determine the Inet address of \"{}\". It will be ignored.", uri.getUri()); } - relationalUriFilter.getKnownUriFilter().add(uri, System.currentTimeMillis()); + uriFilter.getKnownUriFilter().add(uri, System.currentTimeMillis()); } else { LOGGER.warn("addNewUri(" + uri + "): " + uri.getUri().getScheme() + " is not supported, only " + schemeUriFilter.getSchemes() + ". Will not be added!"); } } else { - LOGGER.debug("addNewUri(" + uri + "): URI is not good [" + relationalUriFilter + "]. Will not be added!"); + LOGGER.debug("addNewUri(" + uri + "): URI is not good [" + uriFilter + "]. 
Will not be added!"); } } @@ -361,7 +361,7 @@ public void crawlingDone(List uris) { recrawlUri.addData(Constants.URI_TYPE_KEY, uri.getData(Constants.URI_TYPE_KEY)); addNewUri(recrawlUri); } else { - relationalUriFilter.getKnownUriFilter().add(uri, System.currentTimeMillis()); + uriFilter.getKnownUriFilter().add(uri, System.currentTimeMillis()); } } } diff --git a/squirrel.frontier/src/test/java/org/dice_research/squirrel/frontier/impl/FrontierImplTest.java b/squirrel.frontier/src/test/java/org/dice_research/squirrel/frontier/impl/FrontierImplTest.java index 1d5f8f7d5..a0046a6b3 100644 --- a/squirrel.frontier/src/test/java/org/dice_research/squirrel/frontier/impl/FrontierImplTest.java +++ b/squirrel.frontier/src/test/java/org/dice_research/squirrel/frontier/impl/FrontierImplTest.java @@ -21,6 +21,7 @@ import org.dice_research.squirrel.data.uri.filter.UriFilterComposer; import org.dice_research.squirrel.data.uri.norm.NormalizerImpl; import org.dice_research.squirrel.data.uri.norm.UriGenerator; +import org.dice_research.squirrel.frontier.impl.FrontierImpl; import org.dice_research.squirrel.queue.ipbased.MongoDBIpBasedQueue; import org.junit.After; import org.junit.Assert; @@ -54,7 +55,7 @@ public void setUp() throws Exception { List sessionIDs = new ArrayList(); Map mapDefaultPort = new HashedMap(); - UriFilterComposer relationalUriFilter = new UriFilterConfigurator(filter,""); + UriFilterComposer relationalUriFilter = new UriFilterConfigurator(filter,"OR"); frontier = new FrontierImpl(new NormalizerImpl(sessionIDs,mapDefaultPort), relationalUriFilter, queue,uriGenerators,true); @@ -117,12 +118,14 @@ public void crawlingDone() throws Exception { // filter.add(uri_1, 100); frontier.crawlingDone(crawledUris); - assertFalse("uri_1 has been already crawled", frontier.relationalUriFilter.isUriGood(uri_1)); + assertFalse("uri_1 has been already crawled", frontier.uriFilter.isUriGood(uri_1)); } @Test public void getNumberOfPendingUris() throws Exception { frontier.addNewUris(uris); +// for(CrawleableUri curi: uris) +// queue.addUri(curi); List nextUris = frontier.getNextUris(); int numberOfPendingUris = frontier.getNumberOfPendingUris(); assertEquals(1, numberOfPendingUris); From 7082746c33620f4a5b51be982942f943c4fcb34f Mon Sep 17 00:00:00 2001 From: Geraldo Date: Mon, 11 May 2020 16:44:52 +0200 Subject: [PATCH 064/102] added moved uri variant to addNewUri --- .../squirrel/frontier/impl/FrontierImpl.java | 714 +++++++++--------- 1 file changed, 347 insertions(+), 367 deletions(-) diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/FrontierImpl.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/FrontierImpl.java index d9837bb2e..b15e194d4 100644 --- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/FrontierImpl.java +++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/FrontierImpl.java @@ -32,372 +32,352 @@ */ public class FrontierImpl implements Frontier { - private static final Logger LOGGER = LoggerFactory.getLogger(FrontierImpl.class); - - /** - * {@link UriNormalizer} used to transform given URIs into a normal form. - */ - protected UriNormalizer normalizer; - - /** - * {@link KnownUriFilter} used to identify URIs that already have been crawled. - */ - protected UriFilterComposer uriFilter; - - /** - * {@link org.dice_research.squirrel.data.uri.info.URIReferences} used to - * identify URIs that already have been crawled. 
- */ - protected URIReferences uriReferences = null; - - /** - * {@link SchemeBasedUriFilter} used to identify URIs with known protocol. - */ - protected SchemeBasedUriFilter schemeUriFilter = new SchemeBasedUriFilter(); - /** - * {@link UriQueue} used to manage the URIs that should be crawled. - */ - protected UriQueue queue; - /** - * {@link UriProcessor} used to identify the type of incoming URIs: DUMP, - * SPARQL, DEREFERENCEABLE or UNKNOWN - */ - protected UriProcessor uriProcessor; - /** - * {@link UriGenerator} used to generate additional domain variants of a URI - */ - protected List uriGenerator; - /** - * {@link GraphLogger} that can be added to log the crawled graph. - */ - protected GraphLogger graphLogger; - - /** - * Indicates whether recrawling is active. - */ - private boolean doesRecrawling; - - /** - * The timer that schedules the recrawling. - */ - private Timer timerRecrawling; - - /** - * Time (in milliseconds) after which uris will be recrawled (only used if no - * specific time is configured for a URI). - */ - private static long generalRecrawlTime; - - /** - * Time interval(in milliseconds) at which the check for outdated uris is - * performed. - */ - private long timerPeriod; - - /** - * Default value for {@link #generalRecrawlTime} (one week). - */ - public static final long DEFAULT_GENERAL_RECRAWL_TIME = 1000 * 60 * 60 * 24 * 7; - - /** - * Default value for {@link #timerPeriod}. - */ - private static final long DEFAULT_TIMER_PERIOD = 1000 * 60 * 60; - - /** - * Constructor. - * - * @param normalizer - * {@link UriNormalizer} used to transform given URIs into a normal - * form - * @param knownUriFilter - * {@link UriFilter} used to identify URIs that already have been - * crawled. - * @param queue - * {@link UriQueue} used to manage the URIs that should be crawled. - * @param graphLogger - * {@link GraphLogger} used to log graphs. - * @param doesRecrawling - * used to select if URIs should be recrawled. - * @param generalRecrawlTime - * used to select the general Time after URIs should be recrawled. If - * Value is null the default Time is used. - * @param timerPeriod - * used to select if URIs should be recrawled. - */ - public FrontierImpl(UriNormalizer normalizer, UriFilterComposer relationalUriFilter, UriQueue queue, List uriGenerators, - GraphLogger graphLogger, boolean doesRecrawling, long generalRecrawlTime, long timerPeriod) { - this(normalizer, relationalUriFilter, null, queue,uriGenerators, graphLogger, doesRecrawling, generalRecrawlTime, timerPeriod); - } - - /** - * Constructor. - * - * @param normalizer - * {@link UriNormalizer} used to transform given URIs into a normal - * form - * @param knownUriFilter - * {@link UriFilter} used to identify URIs that already have been - * crawled. - * @param queue - * {@link UriQueue} used to manage the URIs that should be crawled. - * @param doesRecrawling - * used to select if URIs should be recrawled. - * @param generalRecrawlTime - * used to select the general Time after URIs should be recrawled. If - * Value is null the default Time is used. - * @param timerPeriod - * used to select if URIs should be recrawled. - */ - public FrontierImpl(UriNormalizer normalizer, UriFilterComposer relationalUriFilter, UriQueue queue,List uriGenerators, boolean doesRecrawling, - long generalRecrawlTime, long timerPeriod, UriHashCustodian uriHashCustodian) { - this(normalizer, relationalUriFilter, queue, uriGenerators, null, doesRecrawling, generalRecrawlTime, timerPeriod); - } - - /** - * Constructor. 
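
The recrawling machinery being rewritten here boils down to a java.util.Timer with a fixed period. A runnable miniature of that scheduling; the printed action stands in for fetching outdated URIs and re-queueing them:

    import java.util.Timer;
    import java.util.TimerTask;

    public class RecrawlTimerSketch {
        // Same default as above: a one-hour check period.
        static final long TIMER_PERIOD = 1000L * 60 * 60;

        public static void main(String[] args) {
            Timer timer = new Timer();
            timer.schedule(new TimerTask() {
                @Override
                public void run() {
                    // FrontierImpl asks the known-URI filter for outdated URIs
                    // here and pushes them back into the queue.
                    System.out.println("checking for outdated URIs");
                }
            }, TIMER_PERIOD, TIMER_PERIOD);
        }
    }
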
- * - * @param normalizer - * {@link UriNormalizer} used to transform given URIs into a normal - * form - * @param knownUriFilter - * {@link UriFilter} used to identify URIs that already have been - * crawled. - * @param uriReferences - * {@link URIReferences} used to manage URI references - * @param queue - * {@link UriQueue} used to manage the URIs that should be crawled. - * @param doesRecrawling - * Value for {@link #doesRecrawling}. - */ - public FrontierImpl(UriNormalizer normalizer, UriFilterComposer relationalUriFilter, URIReferences uriReferences, - UriQueue queue,List uriGenerators, boolean doesRecrawling) { - this(normalizer, relationalUriFilter, uriReferences, queue,uriGenerators, null, doesRecrawling, DEFAULT_GENERAL_RECRAWL_TIME, - DEFAULT_TIMER_PERIOD); - } - - /** - * Constructor. - * - * @param normalizer - * {@link UriNormalizer} used to transform given URIs into a normal - * form - * @param knownUriFilter - * {@link UriFilter} used to identify URIs that already have been - * crawled. - * @param queue - * {@link UriQueue} used to manage the URIs that should be crawled. - * @param doesRecrawling - * Value for {@link #doesRecrawling}. - */ - public FrontierImpl(UriNormalizer normalizer, UriFilterComposer relationalUriFilter, UriQueue queue,List uriGenerators, - boolean doesRecrawling) { - this(normalizer, relationalUriFilter, queue,uriGenerators, null, doesRecrawling, DEFAULT_GENERAL_RECRAWL_TIME, - DEFAULT_TIMER_PERIOD); - } - - /** - * Constructor. - * - * @param normalizer - * {@link UriNormalizer} used to transform given URIs into a normal - * form - * @param knownUriFilter - * {@link UriFilter} used to identify URIs that already have been - * crawled. - * @param queue - * {@link UriQueue} used to manage the URIs that should be crawled. - */ - public FrontierImpl(UriNormalizer normalizer, UriFilterComposer relationalUriFilter, UriQueue queue,List uriGenerators) { - this(normalizer, relationalUriFilter, queue,uriGenerators, null, false, DEFAULT_GENERAL_RECRAWL_TIME, DEFAULT_TIMER_PERIOD); - } - - /** - * Constructor. - * - * @param normalizer - * {@link UriNormalizer} used to transform given URIs into a normal - * form - * @param knownUriFilter - * {@link UriFilter} used to identify URIs that already have been - * crawled. - * @param uriReferences - * {@link URIReferences} used to manage URI references - * @param queue - * {@link UriQueue} used to manage the URIs that should be crawled. - * @param graphLogger - * {@link GraphLogger} used to log graphs. - * @param doesRecrawling - * used to select if URIs should be recrawled. - * @param generalRecrawlTime - * used to select the general Time after URIs should be recrawled. If - * Value is null the default Time is used. - * @param timerPeriod - * used to select if URIs should be recrawled. 
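
All of these constructors follow the telescoping pattern: each shorter overload delegates to the fullest constructor, supplying defaults, so the initialization logic exists exactly once. The skeleton of that pattern, with field names invented for the sketch:

    public class TelescopingSketch {
        private final long recrawlTime;
        private final long timerPeriod;

        // Convenience overload: delegates with the one-week / one-hour defaults.
        public TelescopingSketch() {
            this(1000L * 60 * 60 * 24 * 7, 1000L * 60 * 60);
        }

        // The single constructor that actually assigns state.
        public TelescopingSketch(long recrawlTime, long timerPeriod) {
            this.recrawlTime = recrawlTime;
            this.timerPeriod = timerPeriod;
        }
    }
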
- */ - public FrontierImpl(UriNormalizer normalizer, UriFilterComposer relationalUriFilter, URIReferences uriReferences, - UriQueue queue, List uriGenerators, GraphLogger graphLogger, boolean doesRecrawling, long generalRecrawlTime, - long timerPeriod) { - this.normalizer = normalizer; - this.uriFilter = relationalUriFilter; - this.uriReferences = uriReferences; - this.uriGenerator = uriGenerators; - this.queue = queue; - this.uriProcessor = new UriProcessor(); - this.graphLogger = graphLogger; - - this.queue.open(); - this.doesRecrawling = doesRecrawling; - this.timerPeriod = timerPeriod; - FrontierImpl.generalRecrawlTime = generalRecrawlTime; - - if (this.doesRecrawling) { - timerRecrawling = new Timer(); - timerRecrawling.schedule(new TimerTask() { - @Override - public void run() { - List urisToRecrawl = relationalUriFilter.getKnownUriFilter().getOutdatedUris(); - urisToRecrawl.forEach(uri -> queue.addUri(uriProcessor.recognizeUriType(uri))); - } - }, this.timerPeriod, this.timerPeriod); - } - } - - @Override - public List getNextUris() { - - // if(terminationCheck.shouldFrontierTerminate(this)) { - // LOGGER.error("FRONTIER IS TERMINATING!", new Exception()); - // } - - return queue.getNextUris(); - } - - - - @Override - public void addNewUris(List uris) { - for (CrawleableUri uri : uris) { - addNewUri(uri); - try { - for (UriGenerator u : uriGenerator) { - if (u.getUriVariant(uri) != null) - addNewUri(u.getUriVariant(uri)); - } - }catch (Exception e){ - LOGGER.info("Exception happened while generating additional URI variant for URI: " + uri.getUri().toString()); - } - } - } - - @Override - public void addNewUri(CrawleableUri uri) { - // After knownUriFilter uri should be classified according to - // UriProcessor - uri = normalizer.normalize(uri); - addNormalizedUri(uri); - - } - - protected void addNormalizedUri(CrawleableUri uri){ - if (uriFilter.isUriGood(uri)) { - LOGGER.debug("addNewUri(" + uri + "): URI is good [" + uriFilter + "]"); - if (schemeUriFilter.isUriGood(uri)) { - LOGGER.trace("addNewUri(" + uri.getUri() + "): URI schemes is OK [" + schemeUriFilter + "]"); - // Make sure that the IP is known - try { - uri = this.uriProcessor.recognizeInetAddress(uri); - - } catch (UnknownHostException e) { - LOGGER.error("Could not recognize IP for {}, unknown host", uri.getUri()); - } - if (uri.getIpAddress() != null) { - queue.addUri(this.uriProcessor.recognizeUriType(uri)); - } else { - LOGGER.error("Couldn't determine the Inet address of \"{}\". It will be ignored.", uri.getUri()); - } - uriFilter.getKnownUriFilter().add(uri, System.currentTimeMillis()); - } else { - LOGGER.warn("addNewUri(" + uri + "): " + uri.getUri().getScheme() + " is not supported, only " - + schemeUriFilter.getSchemes() + ". Will not added!"); - } - - } else { - LOGGER.debug("addNewUri(" + uri + "): URI is not good [" + uriFilter + "]. 
Will not be added!"); - } - } - - - @Override - public void crawlingDone(List uris) { - LOGGER.info("One worker finished his work and crawled " + uris.size() + " URIs."); - - // List newUris = new ArrayList<>(uriMap.size()); - // for (CrawleableUri uri : uriMap.keySet()) { - // newUris.addAll(uriMap.get(uri)); - // knownUriFilter.add(uri, System.currentTimeMillis(), - // uri.getTimestampNextCrawl()); - // if (uriReferences != null) { - // uriReferences.add(uri, uriMap.get(uri)); - // } - // } - - // // If there is a graph logger, log the data - // if (graphLogger != null) { - // graphLogger.log(new ArrayList<>(uriMap.keySet()), newUris); - // } - // If we should give the crawled IPs to the queue - if (queue instanceof BlockingQueue) { - ((BlockingQueue) queue).markUrisAsAccessible(uris); - } - // send list of crawled URIs to the knownUriFilter - for (CrawleableUri uri : uris) { - Long recrawlOn = (Long) uri.getData(Constants.URI_PREFERRED_RECRAWL_ON); - // If a recrawling is defined, check whether we can directly add it back to the - // queue - if ((recrawlOn != null) && (recrawlOn < System.currentTimeMillis())) { - // Create a new uri object reusing only meta data that is useful - CrawleableUri recrawlUri = new CrawleableUri(uri.getUri(), uri.getIpAddress()); - recrawlUri.addData(Constants.URI_TYPE_KEY, uri.getData(Constants.URI_TYPE_KEY)); - addNewUri(recrawlUri); - } else { - uriFilter.getKnownUriFilter().add(uri, System.currentTimeMillis()); - } - } - } - - @Override - public int getNumberOfPendingUris() { - // TODO this implementation does not fit to the semantics of the method name - // since it returns the number of URI groups instead of the number of URIs - if (queue instanceof BlockingQueue) { - return ((BlockingQueue) queue).getNumberOfBlockedKeys(); - } else { - return 0; - } - } - - @Override - public boolean doesRecrawling() { - return doesRecrawling; - } - - @Override - public void close() { - timerRecrawling.cancel(); - } - - public static long getGeneralRecrawlTime() { - return generalRecrawlTime; - } - - /** - * Getter for the {@link #queue}. - * - * @return The waiting queue for the URIs. - */ - public UriQueue getQueue() { - return queue; - } + private static final Logger LOGGER = LoggerFactory.getLogger(FrontierImpl.class); + + /** + * {@link UriNormalizer} used to transform given URIs into a normal form. + */ + protected UriNormalizer normalizer; + + /** + * {@link KnownUriFilter} used to identify URIs that already have been crawled. + */ + protected UriFilterComposer uriFilter; + + /** + * {@link org.dice_research.squirrel.data.uri.info.URIReferences} used to + * identify URIs that already have been crawled. + */ + protected URIReferences uriReferences = null; + + /** + * {@link SchemeBasedUriFilter} used to identify URIs with known protocol. + */ + protected SchemeBasedUriFilter schemeUriFilter = new SchemeBasedUriFilter(); + /** + * {@link UriQueue} used to manage the URIs that should be crawled. + */ + protected UriQueue queue; + /** + * {@link UriProcessor} used to identify the type of incoming URIs: DUMP, + * SPARQL, DEREFERENCEABLE or UNKNOWN + */ + protected UriProcessor uriProcessor; + /** + * {@link UriGenerator} used to generate additional domain variants of a URI + */ + protected List uriGenerator; + /** + * {@link GraphLogger} that can be added to log the crawled graph. + */ + protected GraphLogger graphLogger; + + /** + * Indicates whether recrawling is active. + */ + private boolean doesRecrawling; + + /** + * The timer that schedules the recrawling. 
+ */ + private Timer timerRecrawling; + + /** + * Time (in milliseconds) after which uris will be recrawled (only used if no + * specific time is configured for a URI). + */ + private static long generalRecrawlTime; + + /** + * Time interval(in milliseconds) at which the check for outdated uris is + * performed. + */ + private long timerPeriod; + + /** + * Default value for {@link #generalRecrawlTime} (one week). + */ + public static final long DEFAULT_GENERAL_RECRAWL_TIME = 1000 * 60 * 60 * 24 * 7; + + /** + * Default value for {@link #timerPeriod}. + */ + private static final long DEFAULT_TIMER_PERIOD = 1000 * 60 * 60; + + /** + * Constructor. + * + * @param normalizer {@link UriNormalizer} used to transform given URIs + * into a normal form + * @param knownUriFilter {@link UriFilter} used to identify URIs that + * already have been crawled. + * @param queue {@link UriQueue} used to manage the URIs that + * should be crawled. + * @param graphLogger {@link GraphLogger} used to log graphs. + * @param doesRecrawling used to select if URIs should be recrawled. + * @param generalRecrawlTime used to select the general Time after URIs should + * be recrawled. If Value is null the default Time is + * used. + * @param timerPeriod used to select if URIs should be recrawled. + */ + public FrontierImpl(UriNormalizer normalizer, UriFilterComposer relationalUriFilter, UriQueue queue, + List uriGenerators, GraphLogger graphLogger, boolean doesRecrawling, long generalRecrawlTime, + long timerPeriod) { + this(normalizer, relationalUriFilter, null, queue, uriGenerators, graphLogger, doesRecrawling, + generalRecrawlTime, timerPeriod); + } + + /** + * Constructor. + * + * @param normalizer {@link UriNormalizer} used to transform given URIs + * into a normal form + * @param knownUriFilter {@link UriFilter} used to identify URIs that + * already have been crawled. + * @param queue {@link UriQueue} used to manage the URIs that + * should be crawled. + * @param doesRecrawling used to select if URIs should be recrawled. + * @param generalRecrawlTime used to select the general Time after URIs should + * be recrawled. If Value is null the default Time is + * used. + * @param timerPeriod used to select if URIs should be recrawled. + */ + public FrontierImpl(UriNormalizer normalizer, UriFilterComposer relationalUriFilter, UriQueue queue, + List uriGenerators, boolean doesRecrawling, long generalRecrawlTime, long timerPeriod, + UriHashCustodian uriHashCustodian) { + this(normalizer, relationalUriFilter, queue, uriGenerators, null, doesRecrawling, generalRecrawlTime, + timerPeriod); + } + + /** + * Constructor. + * + * @param normalizer {@link UriNormalizer} used to transform given URIs into + * a normal form + * @param knownUriFilter {@link UriFilter} used to identify URIs that already + * have been crawled. + * @param uriReferences {@link URIReferences} used to manage URI references + * @param queue {@link UriQueue} used to manage the URIs that should be + * crawled. + * @param doesRecrawling Value for {@link #doesRecrawling}. + */ + public FrontierImpl(UriNormalizer normalizer, UriFilterComposer relationalUriFilter, URIReferences uriReferences, + UriQueue queue, List uriGenerators, boolean doesRecrawling) { + this(normalizer, relationalUriFilter, uriReferences, queue, uriGenerators, null, doesRecrawling, + DEFAULT_GENERAL_RECRAWL_TIME, DEFAULT_TIMER_PERIOD); + } + + /** + * Constructor. 
+ * + * @param normalizer {@link UriNormalizer} used to transform given URIs into + * a normal form + * @param knownUriFilter {@link UriFilter} used to identify URIs that already + * have been crawled. + * @param queue {@link UriQueue} used to manage the URIs that should be + * crawled. + * @param doesRecrawling Value for {@link #doesRecrawling}. + */ + public FrontierImpl(UriNormalizer normalizer, UriFilterComposer relationalUriFilter, UriQueue queue, + List uriGenerators, boolean doesRecrawling) { + this(normalizer, relationalUriFilter, queue, uriGenerators, null, doesRecrawling, DEFAULT_GENERAL_RECRAWL_TIME, + DEFAULT_TIMER_PERIOD); + } + + /** + * Constructor. + * + * @param normalizer {@link UriNormalizer} used to transform given URIs into + * a normal form + * @param knownUriFilter {@link UriFilter} used to identify URIs that already + * have been crawled. + * @param queue {@link UriQueue} used to manage the URIs that should be + * crawled. + */ + public FrontierImpl(UriNormalizer normalizer, UriFilterComposer relationalUriFilter, UriQueue queue, + List uriGenerators) { + this(normalizer, relationalUriFilter, queue, uriGenerators, null, false, DEFAULT_GENERAL_RECRAWL_TIME, + DEFAULT_TIMER_PERIOD); + } + + /** + * Constructor. + * + * @param normalizer {@link UriNormalizer} used to transform given URIs + * into a normal form + * @param knownUriFilter {@link UriFilter} used to identify URIs that + * already have been crawled. + * @param uriReferences {@link URIReferences} used to manage URI references + * @param queue {@link UriQueue} used to manage the URIs that + * should be crawled. + * @param graphLogger {@link GraphLogger} used to log graphs. + * @param doesRecrawling used to select if URIs should be recrawled. + * @param generalRecrawlTime used to select the general Time after URIs should + * be recrawled. If Value is null the default Time is + * used. + * @param timerPeriod used to select if URIs should be recrawled. 
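
The change this patch is named after follows just below: URI variants produced by the generators are no longer fed back through addNewUri, which would re-enter the whole pipeline, but are normalized individually and handed straight to addNormalizedUri. A runnable miniature of that flow, with plain strings standing in for CrawleableUri and trivial stand-ins for generator and normalizer:

    import java.util.Arrays;
    import java.util.List;

    public class VariantSketch {
        interface UriGenerator { String getUriVariant(String uri); }
        interface Normalizer { String normalize(String uri); }

        // Toy normalizer; the real NormalizerImpl does far more than lower-casing.
        static final Normalizer NORMALIZER = s -> s.toLowerCase();
        // Toy generator: one variant with a trailing slash, null if nothing to add.
        static final List<UriGenerator> GENERATORS =
                Arrays.asList(uri -> uri.endsWith("/") ? null : uri + "/");

        static void addNewUri(String uri) {
            addNormalizedUri(NORMALIZER.normalize(uri));
            for (UriGenerator g : GENERATORS) {
                String variant = g.getUriVariant(uri);
                if (variant != null) {
                    // Each variant takes the same normalize-then-filter path
                    // as the original URI, without recursing into addNewUri.
                    addNormalizedUri(NORMALIZER.normalize(variant));
                }
            }
        }

        static void addNormalizedUri(String uri) {
            System.out.println("filter/queue: " + uri);
        }

        public static void main(String[] args) {
            addNewUri("HTTP://Example.org/Data");
        }
    }
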
+ */ + public FrontierImpl(UriNormalizer normalizer, UriFilterComposer relationalUriFilter, URIReferences uriReferences, + UriQueue queue, List uriGenerators, GraphLogger graphLogger, boolean doesRecrawling, + long generalRecrawlTime, long timerPeriod) { + this.normalizer = normalizer; + this.uriFilter = relationalUriFilter; + this.uriReferences = uriReferences; + this.uriGenerator = uriGenerators; + this.queue = queue; + this.uriProcessor = new UriProcessor(); + this.graphLogger = graphLogger; + + this.queue.open(); + this.doesRecrawling = doesRecrawling; + this.timerPeriod = timerPeriod; + FrontierImpl.generalRecrawlTime = generalRecrawlTime; + + if (this.doesRecrawling) { + timerRecrawling = new Timer(); + timerRecrawling.schedule(new TimerTask() { + @Override + public void run() { + List urisToRecrawl = relationalUriFilter.getKnownUriFilter().getOutdatedUris(); + urisToRecrawl.forEach(uri -> queue.addUri(uriProcessor.recognizeUriType(uri))); + } + }, this.timerPeriod, this.timerPeriod); + } + } + + @Override + public List getNextUris() { + + // if(terminationCheck.shouldFrontierTerminate(this)) { + // LOGGER.error("FRONTIER IS TERMINATING!", new Exception()); + // } + + return queue.getNextUris(); + } + + @Override + public void addNewUris(List uris) { + for (CrawleableUri uri : uris) { + addNewUri(uri); + } + } + + @Override + public void addNewUri(CrawleableUri uri) { + // After knownUriFilter uri should be classified according to + // UriProcessor + uri = normalizer.normalize(uri); + addNormalizedUri(uri); + + try { + for (UriGenerator u : uriGenerator) { + if (u.getUriVariant(uri) != null) + addNormalizedUri(normalizer.normalize(u.getUriVariant(uri))); + } + } catch (Exception e) { + LOGGER.info( + "Exception happened while generating additional URI variant for URI: " + uri.getUri().toString()); + } + } + + protected void addNormalizedUri(CrawleableUri uri) { + if (uriFilter.isUriGood(uri)) { + LOGGER.debug("addNewUri(" + uri + "): URI is good [" + uriFilter + "]"); + if (schemeUriFilter.isUriGood(uri)) { + LOGGER.trace("addNewUri(" + uri.getUri() + "): URI schemes is OK [" + schemeUriFilter + "]"); + // Make sure that the IP is known + try { + uri = this.uriProcessor.recognizeInetAddress(uri); + + } catch (UnknownHostException e) { + LOGGER.error("Could not recognize IP for {}, unknown host", uri.getUri()); + } + if (uri.getIpAddress() != null) { + queue.addUri(this.uriProcessor.recognizeUriType(uri)); + } else { + LOGGER.error("Couldn't determine the Inet address of \"{}\". It will be ignored.", uri.getUri()); + } + uriFilter.getKnownUriFilter().add(uri, System.currentTimeMillis()); + } else { + LOGGER.warn("addNewUri(" + uri + "): " + uri.getUri().getScheme() + " is not supported, only " + + schemeUriFilter.getSchemes() + ". Will not added!"); + } + + } else { + LOGGER.debug("addNewUri(" + uri + "): URI is not good [" + uriFilter + "]. 
Will not be added!"); + } + } + + @Override + public void crawlingDone(List uris) { + LOGGER.info("One worker finished his work and crawled " + uris.size() + " URIs."); + + // List newUris = new ArrayList<>(uriMap.size()); + // for (CrawleableUri uri : uriMap.keySet()) { + // newUris.addAll(uriMap.get(uri)); + // knownUriFilter.add(uri, System.currentTimeMillis(), + // uri.getTimestampNextCrawl()); + // if (uriReferences != null) { + // uriReferences.add(uri, uriMap.get(uri)); + // } + // } + + // // If there is a graph logger, log the data + // if (graphLogger != null) { + // graphLogger.log(new ArrayList<>(uriMap.keySet()), newUris); + // } + // If we should give the crawled IPs to the queue + if (queue instanceof BlockingQueue) { + ((BlockingQueue) queue).markUrisAsAccessible(uris); + } + // send list of crawled URIs to the knownUriFilter + for (CrawleableUri uri : uris) { + Long recrawlOn = (Long) uri.getData(Constants.URI_PREFERRED_RECRAWL_ON); + // If a recrawling is defined, check whether we can directly add it back to the + // queue + if ((recrawlOn != null) && (recrawlOn < System.currentTimeMillis())) { + // Create a new uri object reusing only meta data that is useful + CrawleableUri recrawlUri = new CrawleableUri(uri.getUri(), uri.getIpAddress()); + recrawlUri.addData(Constants.URI_TYPE_KEY, uri.getData(Constants.URI_TYPE_KEY)); + addNewUri(recrawlUri); + } else { + uriFilter.getKnownUriFilter().add(uri, System.currentTimeMillis()); + } + } + } + + @Override + public int getNumberOfPendingUris() { + // TODO this implementation does not fit to the semantics of the method name + // since it returns the number of URI groups instead of the number of URIs + if (queue instanceof BlockingQueue) { + return ((BlockingQueue) queue).getNumberOfBlockedKeys(); + } else { + return 0; + } + } + + @Override + public boolean doesRecrawling() { + return doesRecrawling; + } + + @Override + public void close() { + timerRecrawling.cancel(); + } + + public static long getGeneralRecrawlTime() { + return generalRecrawlTime; + } + + /** + * Getter for the {@link #queue}. + * + * @return The waiting queue for the URIs. + */ + public UriQueue getQueue() { + return queue; + } } \ No newline at end of file From f567ec3367840bc0f7a87ccfe1585005e4ab4deb Mon Sep 17 00:00:00 2001 From: Geraldo Date: Tue, 12 May 2020 11:19:50 +0200 Subject: [PATCH 065/102] removed unused classes and logs --- .../squirrel/analyzer/commons/SquirrelClerezzaSink.java | 1 - .../org/dice_research/squirrel/analyzer/impl/RDFaAnalyzer.java | 1 - 2 files changed, 2 deletions(-) diff --git a/squirrel.worker/src/main/java/org/dice_research/squirrel/analyzer/commons/SquirrelClerezzaSink.java b/squirrel.worker/src/main/java/org/dice_research/squirrel/analyzer/commons/SquirrelClerezzaSink.java index 250edc47a..4f6bfb808 100644 --- a/squirrel.worker/src/main/java/org/dice_research/squirrel/analyzer/commons/SquirrelClerezzaSink.java +++ b/squirrel.worker/src/main/java/org/dice_research/squirrel/analyzer/commons/SquirrelClerezzaSink.java @@ -114,7 +114,6 @@ protected void addTriple(String subj, String pred, String obj) { Node p = objUrl ? 
NodeFactory.createURI(obj) : NodeFactory.createLiteral(obj); Triple t = new Triple(s,o,p); - System.out.println(t); sink.addTriple(curi, t); collector.addTriple(curi, t); } diff --git a/squirrel.worker/src/main/java/org/dice_research/squirrel/analyzer/impl/RDFaAnalyzer.java b/squirrel.worker/src/main/java/org/dice_research/squirrel/analyzer/impl/RDFaAnalyzer.java index 20ab41c81..8da33ea21 100644 --- a/squirrel.worker/src/main/java/org/dice_research/squirrel/analyzer/impl/RDFaAnalyzer.java +++ b/squirrel.worker/src/main/java/org/dice_research/squirrel/analyzer/impl/RDFaAnalyzer.java @@ -5,7 +5,6 @@ import java.io.FileWriter; import java.io.InputStream; import java.net.URI; -import java.net.URISyntaxException; import java.util.Iterator; import org.apache.tika.Tika; From af1daa168c04515efe72846d7d83fcedb3fe144a Mon Sep 17 00:00:00 2001 From: Geraldo Date: Fri, 22 May 2020 10:16:40 +0200 Subject: [PATCH 066/102] included the triple encoder --- .../squirrel/analyzer/AbstractAnalyzer.java | 4 + .../analyzer/commons/FilterSinkRDF.java | 17 +-- .../squirrel/analyzer/impl/JsonAnalyzer.java | 2 +- .../impl/MicroformatMF2JAnalyzer.java | 2 +- .../squirrel/analyzer/impl/RDFAnalyzer.java | 2 +- .../impl/ckan/CkanDatasetConsumer.java | 9 +- .../analyzer/impl/ckan/CkanJsonAnalyzer.java | 4 +- .../squirrel/encoder/TripleEncoder.java | 109 ++++++++++++++++++ 8 files changed, 134 insertions(+), 15 deletions(-) create mode 100644 squirrel.worker/src/main/java/org/dice_research/squirrel/encoder/TripleEncoder.java diff --git a/squirrel.worker/src/main/java/org/dice_research/squirrel/analyzer/AbstractAnalyzer.java b/squirrel.worker/src/main/java/org/dice_research/squirrel/analyzer/AbstractAnalyzer.java index a59b08683..20d1a1b8b 100644 --- a/squirrel.worker/src/main/java/org/dice_research/squirrel/analyzer/AbstractAnalyzer.java +++ b/squirrel.worker/src/main/java/org/dice_research/squirrel/analyzer/AbstractAnalyzer.java @@ -1,6 +1,7 @@ package org.dice_research.squirrel.analyzer; import org.dice_research.squirrel.collect.UriCollector; +import org.dice_research.squirrel.encoder.TripleEncoder; /** * Abstract class to define a constructor @@ -13,8 +14,11 @@ public abstract class AbstractAnalyzer implements Analyzer{ protected UriCollector collector; + protected TripleEncoder tripleEncoder = TripleEncoder.getInstance(); + public AbstractAnalyzer(UriCollector collector) { this.collector = collector; } + } diff --git a/squirrel.worker/src/main/java/org/dice_research/squirrel/analyzer/commons/FilterSinkRDF.java b/squirrel.worker/src/main/java/org/dice_research/squirrel/analyzer/commons/FilterSinkRDF.java index c61ea7ac9..29710560d 100644 --- a/squirrel.worker/src/main/java/org/dice_research/squirrel/analyzer/commons/FilterSinkRDF.java +++ b/squirrel.worker/src/main/java/org/dice_research/squirrel/analyzer/commons/FilterSinkRDF.java @@ -5,9 +5,8 @@ import org.apache.jena.sparql.core.Quad; import org.dice_research.squirrel.collect.UriCollector; import org.dice_research.squirrel.data.uri.CrawleableUri; +import org.dice_research.squirrel.encoder.TripleEncoder; import org.dice_research.squirrel.sink.Sink; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; /** * @@ -23,23 +22,27 @@ public class FilterSinkRDF extends StreamRDFBase { private CrawleableUri curi; private Sink sink; private UriCollector collector; + private TripleEncoder encoder; - public FilterSinkRDF(CrawleableUri curi, Sink sink, UriCollector collector) { + public FilterSinkRDF(CrawleableUri curi, Sink sink, UriCollector collector,TripleEncoder 
encoder) { this.curi = curi; this.sink = sink; this.collector = collector; + this.encoder = encoder; } @Override public void triple(Triple triple) { - sink.addTriple(curi, triple); - collector.addTriple(curi, triple); + Triple t = encoder.encodeTriple(triple); + sink.addTriple(curi, t); + collector.addTriple(curi, t); } @Override public void quad(Quad quad) { - sink.addTriple(curi, quad.asTriple()); - collector.addTriple(curi, quad.asTriple()); + Triple t = encoder.encodeTriple(quad.asTriple()); + sink.addTriple(curi, t); + collector.addTriple(curi, t); } } \ No newline at end of file diff --git a/squirrel.worker/src/main/java/org/dice_research/squirrel/analyzer/impl/JsonAnalyzer.java b/squirrel.worker/src/main/java/org/dice_research/squirrel/analyzer/impl/JsonAnalyzer.java index 6f63d882b..40578e959 100644 --- a/squirrel.worker/src/main/java/org/dice_research/squirrel/analyzer/impl/JsonAnalyzer.java +++ b/squirrel.worker/src/main/java/org/dice_research/squirrel/analyzer/impl/JsonAnalyzer.java @@ -25,7 +25,7 @@ public JsonAnalyzer(UriCollector collector) { @Override public Iterator analyze(CrawleableUri curi, File data, Sink sink) { try { - StreamRDF filtered = new FilterSinkRDF(curi, sink, collector); + StreamRDF filtered = new FilterSinkRDF(curi, sink, collector,tripleEncoder); RDFDataMgr.parse(filtered, data.getAbsolutePath(), Lang.JSONLD); return collector.getUris(curi); } catch (Exception e) { diff --git a/squirrel.worker/src/main/java/org/dice_research/squirrel/analyzer/impl/MicroformatMF2JAnalyzer.java b/squirrel.worker/src/main/java/org/dice_research/squirrel/analyzer/impl/MicroformatMF2JAnalyzer.java index 2eac0afd4..9562d9c34 100644 --- a/squirrel.worker/src/main/java/org/dice_research/squirrel/analyzer/impl/MicroformatMF2JAnalyzer.java +++ b/squirrel.worker/src/main/java/org/dice_research/squirrel/analyzer/impl/MicroformatMF2JAnalyzer.java @@ -73,7 +73,7 @@ public Iterator analyze(CrawleableUri curi, File data, Sink sink) { StringWriter out = new StringWriter(); model.write(out, syntax); result = out.toString(); - StreamRDF filtered = new FilterSinkRDF(curi, sink, collector); + StreamRDF filtered = new FilterSinkRDF(curi, sink, collector,tripleEncoder); RDFDataMgr.parse(filtered, new ByteArrayInputStream(result.getBytes()), Lang.NTRIPLES); } catch (Exception e) { LOGGER.warn("Could not analyze file for URI: " + curi.getUri().toString() + " :: Analyzer: " + this.getClass().getName()); diff --git a/squirrel.worker/src/main/java/org/dice_research/squirrel/analyzer/impl/RDFAnalyzer.java b/squirrel.worker/src/main/java/org/dice_research/squirrel/analyzer/impl/RDFAnalyzer.java index 27ffa59ad..7bb81616a 100644 --- a/squirrel.worker/src/main/java/org/dice_research/squirrel/analyzer/impl/RDFAnalyzer.java +++ b/squirrel.worker/src/main/java/org/dice_research/squirrel/analyzer/impl/RDFAnalyzer.java @@ -77,7 +77,7 @@ public Iterator analyze(CrawleableUri curi, File data, Sink sink) { if ((httpMimeTypeObject != null) && (!"text/plain".equals(httpMimeTypeObject.toString()))) { contentType = httpMimeTypeObject.toString(); } - StreamRDF filtered = new FilterSinkRDF(curi, sink, collector); + StreamRDF filtered = new FilterSinkRDF(curi, sink, collector,tripleEncoder); if (contentType != null) { lang = RDFLanguages.contentTypeToLang(contentType); diff --git a/squirrel.worker/src/main/java/org/dice_research/squirrel/analyzer/impl/ckan/CkanDatasetConsumer.java b/squirrel.worker/src/main/java/org/dice_research/squirrel/analyzer/impl/ckan/CkanDatasetConsumer.java index 447ec6fd0..1cf0831d9 100644 --- 
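
FilterSinkRDF, amended above, is a Jena StreamRDF callback: the parser pushes each triple or quad into it, and both paths now run through the TripleEncoder before the sink and collector see the data. A dependency-light sketch of that streaming pattern in plain Jena, without the Squirrel types or the encoder:

    import java.io.ByteArrayInputStream;
    import java.nio.charset.StandardCharsets;

    import org.apache.jena.graph.Triple;
    import org.apache.jena.riot.Lang;
    import org.apache.jena.riot.RDFDataMgr;
    import org.apache.jena.riot.system.StreamRDFBase;
    import org.apache.jena.sparql.core.Quad;

    public class StreamSinkDemo {
        public static void main(String[] args) {
            String nt = "<http://example.org/a> <http://example.org/p> <http://example.org/o> .";
            // The callbacks fire once per parsed triple/quad; no model is materialized.
            RDFDataMgr.parse(new StreamRDFBase() {
                @Override
                public void triple(Triple t) {
                    System.out.println(t);
                }

                @Override
                public void quad(Quad q) {
                    System.out.println(q.asTriple());
                }
            }, new ByteArrayInputStream(nt.getBytes(StandardCharsets.UTF_8)), Lang.NTRIPLES);
        }
    }
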
a/squirrel.worker/src/main/java/org/dice_research/squirrel/analyzer/impl/ckan/CkanDatasetConsumer.java +++ b/squirrel.worker/src/main/java/org/dice_research/squirrel/analyzer/impl/ckan/CkanDatasetConsumer.java @@ -1,7 +1,6 @@ package org.dice_research.squirrel.analyzer.impl.ckan; import java.net.MalformedURLException; -import java.net.URISyntaxException; import java.net.URL; import java.util.Arrays; import java.util.List; @@ -24,6 +23,7 @@ import org.apache.xerces.util.URI; import org.dice_research.squirrel.collect.UriCollector; import org.dice_research.squirrel.data.uri.CrawleableUri; +import org.dice_research.squirrel.encoder.TripleEncoder; import org.dice_research.squirrel.sink.Sink; import org.dice_research.squirrel.vocab.DCAT; import org.dice_research.squirrel.vocab.VCard; @@ -50,13 +50,15 @@ public class CkanDatasetConsumer implements Consumer { protected UriCollector collector; protected CrawleableUri curi; protected String curiString; + protected TripleEncoder tripleEncoder; - public CkanDatasetConsumer(Sink sink, UriCollector collector, CrawleableUri curi) { + public CkanDatasetConsumer(Sink sink, UriCollector collector, CrawleableUri curi,TripleEncoder tripleEncoder) { super(); this.sink = sink; this.collector = collector; this.curi = curi; this.curiString = curi.getUri().toString(); + this.tripleEncoder = tripleEncoder; } /** @@ -286,6 +288,9 @@ protected void store(Resource s, Property p, RDFNode o) { if (url != null) { t = new Triple(s.asNode(), p.asNode(), NodeFactory.createURI(url.toString())); } + + t = tripleEncoder.encodeTriple(t); + sink.addTriple(curi, t); // We already know most of the Resources, so make sure that they are not part of // our current dataset diff --git a/squirrel.worker/src/main/java/org/dice_research/squirrel/analyzer/impl/ckan/CkanJsonAnalyzer.java b/squirrel.worker/src/main/java/org/dice_research/squirrel/analyzer/impl/ckan/CkanJsonAnalyzer.java index 19b1b4edd..bd3bb77c5 100644 --- a/squirrel.worker/src/main/java/org/dice_research/squirrel/analyzer/impl/ckan/CkanJsonAnalyzer.java +++ b/squirrel.worker/src/main/java/org/dice_research/squirrel/analyzer/impl/ckan/CkanJsonAnalyzer.java @@ -52,10 +52,8 @@ public Iterator analyze(CrawleableUri curi, File data, Sink sink) { Stream lines = null; try { lines = Files.lines(data.toPath(), StandardCharsets.UTF_8); - sink.openSinkForUri(curi); - CkanDatasetConsumer consumer = new CkanDatasetConsumer(sink, collector, curi); + CkanDatasetConsumer consumer = new CkanDatasetConsumer(sink, collector, curi,tripleEncoder); lines.map(s -> parseDataset(s)).forEach(consumer); - sink.closeSinkForUri(curi); ActivityUtil.addStep(curi, getClass()); return collector.getUris(curi); } catch (IOException e) { diff --git a/squirrel.worker/src/main/java/org/dice_research/squirrel/encoder/TripleEncoder.java b/squirrel.worker/src/main/java/org/dice_research/squirrel/encoder/TripleEncoder.java new file mode 100644 index 000000000..2ee1e036a --- /dev/null +++ b/squirrel.worker/src/main/java/org/dice_research/squirrel/encoder/TripleEncoder.java @@ -0,0 +1,109 @@ +package org.dice_research.squirrel.encoder; + +import java.io.UnsupportedEncodingException; +import java.net.URI; +import java.net.URISyntaxException; +import java.net.URL; +import java.net.URLDecoder; +import java.nio.charset.StandardCharsets; +import java.util.LinkedHashMap; +import java.util.Map; +import java.util.Map.Entry; + +import org.apache.http.client.utils.URIBuilder; +import org.apache.jena.graph.Node; +import org.apache.jena.graph.NodeFactory; +import 
org.apache.jena.graph.Triple;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.springframework.util.MultiValueMap;
+
+/**
+ * Class that can encode triples to UTF8
+ *
+ * @author Geraldo de Souza Junior gsjunior@mail.uni-paderborn.de
+ *
+ */
+public class TripleEncoder {
+
+    private static final Logger LOGGER = LoggerFactory.getLogger(TripleEncoder.class);
+    private static TripleEncoder tripleEncoder;
+
+    private static final String DEFAULT_CHARSET = StandardCharsets.UTF_8.toString();
+
+    private TripleEncoder() {
+
+    }
+
+    public static final TripleEncoder getInstance() {
+        if (tripleEncoder == null)
+            tripleEncoder = new TripleEncoder();
+
+        return tripleEncoder;
+    }
+
+    public Triple encodeTriple(Triple t) {
+        Node s = t.getSubject();
+        Node p = t.getPredicate();
+        Node o = t.getObject();
+
+        Triple nt = null;
+
+        try {
+            s = encodeUri(s);
+            o = encodeUri(o);
+            nt = new Triple(s, p, o);
+        } catch (URISyntaxException | UnsupportedEncodingException e) {
+            LOGGER.error("Could not encode triple {}", t, e);
+            return t;
+        }
+
+        return nt;
+
+    }
+
+    protected Node encodeUri(Node n) throws URISyntaxException, UnsupportedEncodingException {
+        if (!n.isURI())
+            return n;
+
+        Map<String, String> parameters = getUriParameters(n.getURI());
+
+        if (parameters.isEmpty())
+            return n;
+
+        String baseURI = n.toString().substring(0, n.toString().indexOf("?"));
+
+
+        URIBuilder uriBuilder = new URIBuilder(baseURI);
+        for (Entry<String, String> param : parameters.entrySet())
+            uriBuilder.addParameter(param.getKey(), param.getValue());
+
+        return NodeFactory.createURI(uriBuilder.toString());
+    }
+
+
+    private Map<String, String> getUriParameters(String uri) throws UnsupportedEncodingException {
+        Map<String, String> mapParameters = new LinkedHashMap<String, String>();
+        if (uri.indexOf("?") == -1)
+            return mapParameters;
+        try {
+            String query = uri.substring(uri.indexOf("?") + 1);
+            String[] pairs = query.split("&");
+            for (String pair : pairs) {
+                int idx = pair.indexOf("=");
+                mapParameters.put(pair.substring(0, idx),
+                        pair.substring(idx + 1));
+            }
+        } catch (IndexOutOfBoundsException e) {
+            return new LinkedHashMap<String, String>();
+        }
+
+        return mapParameters;
+    }
+
+    public static void main(String[] args) throws UnsupportedEncodingException, URISyntaxException {
+        TripleEncoder te = new TripleEncoder();
+        System.out.println(te.encodeUri(NodeFactory.createURI("https://ckan.govdata.de/")).toString());
+    }
+
+}
From e060723111c1f059255d15d183561fc1ac856e7a Mon Sep 17 00:00:00 2001
From: Geraldo
Date: Mon, 25 May 2020 19:30:43 +0200
Subject: [PATCH 067/102] fixed wrong constructor call

---
 .../components/FrontierComponent.java        |  25 +-
 .../squirrel/frontier/impl/FrontierImpl.java | 710 +++++++++---------
 2 files changed, 357 insertions(+), 378 deletions(-)

diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/components/FrontierComponent.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/components/FrontierComponent.java
index 81f3d0fa5..0cba3db6f 100644
--- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/components/FrontierComponent.java
+++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/components/FrontierComponent.java
@@ -1,12 +1,11 @@
 package org.dice_research.squirrel.components;
 
-import org.apache.commons.io.FileUtils;
 import java.io.Closeable;
 import java.io.File;
 import java.io.IOException;
+import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
-import java.util.Map.Entry;
 import java.util.Timer;
 import java.util.TimerTask;
 import java.util.concurrent.Semaphore;
 import
org.dice_research.squirrel.configurator.WebConfiguration; import org.dice_research.squirrel.configurator.WhiteListConfiguration; import org.dice_research.squirrel.data.uri.CrawleableUri; -import org.dice_research.squirrel.data.uri.UriUtils; import org.dice_research.squirrel.data.uri.UriSeedReader; import org.dice_research.squirrel.data.uri.filter.InMemoryKnownUriFilter; import org.dice_research.squirrel.data.uri.filter.RegexBasedWhiteListFilter; @@ -29,7 +27,12 @@ import org.dice_research.squirrel.data.uri.serialize.java.GzipJavaUriSerializer; import org.dice_research.squirrel.frontier.ExtendedFrontier; import org.dice_research.squirrel.frontier.Frontier; -import org.dice_research.squirrel.frontier.impl.*; +import org.dice_research.squirrel.frontier.impl.ExtendedFrontierImpl; +import org.dice_research.squirrel.frontier.impl.FrontierImpl; +import org.dice_research.squirrel.frontier.impl.FrontierSenderToWebservice; +import org.dice_research.squirrel.frontier.impl.QueueBasedTerminationCheck; +import org.dice_research.squirrel.frontier.impl.TerminationCheck; +import org.dice_research.squirrel.frontier.impl.WorkerGuard; import org.dice_research.squirrel.frontier.recrawling.OutDatedUriRetriever; import org.dice_research.squirrel.queue.InMemoryQueue; import org.dice_research.squirrel.queue.UriQueue; @@ -50,13 +53,6 @@ import org.springframework.beans.factory.annotation.Qualifier; import org.springframework.stereotype.Component; -import java.io.Closeable; -import java.io.File; -import java.io.IOException; -import java.nio.charset.StandardCharsets; -import java.util.*; -import java.util.concurrent.Semaphore; - @Component @Qualifier("frontierComponent") public class FrontierComponent extends AbstractComponent implements RespondingDataHandler { @@ -68,7 +64,7 @@ public class FrontierComponent extends AbstractComponent implements RespondingDa @Qualifier("queueBean") @Autowired protected UriQueue queue; - private OutDatedUriRetriever outDatedUriRetriever; + private OutDatedUriRetriever outDatedUriRetriever; private UriFilterComposer uriFilter; private URIReferences uriReferences = null; @@ -90,11 +86,6 @@ public class FrontierComponent extends AbstractComponent implements RespondingDa @Qualifier("listUriGenerator") private List uriGenerator; - - private final Semaphore terminationMutex = new Semaphore(0); - private final WorkerGuard workerGuard = new WorkerGuard(this); - private final boolean doRecrawling = true; - private long recrawlingTime = 1000L * 60L * 60L * 24L * 30; private Timer timerTerminator; public static final boolean RECRAWLING_ACTIVE = true; diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/FrontierImpl.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/FrontierImpl.java index 39b95882d..2302428da 100644 --- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/FrontierImpl.java +++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/FrontierImpl.java @@ -16,15 +16,12 @@ import org.dice_research.squirrel.data.uri.norm.UriNormalizer; import org.dice_research.squirrel.deduplication.hashing.UriHashCustodian; import org.dice_research.squirrel.frontier.Frontier; -import org.dice_research.squirrel.frontier.recrawling.OutDatedUriRetriever; -import org.dice_research.squirrel.frontier.recrawling.SparqlBasedOutDatedUriRetriever; import org.dice_research.squirrel.graph.GraphLogger; import org.dice_research.squirrel.queue.BlockingQueue; import org.dice_research.squirrel.queue.UriQueue; 
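
Looking back at the TripleEncoder from patch 066: its core move is to rebuild only the query part of a URI through Apache HttpClient's URIBuilder, which percent-encodes each parameter it is given. A tiny usage sketch; the printed form is approximate and depends on the HttpClient version:

    import org.apache.http.client.utils.URIBuilder;

    public class EncoderDemo {
        public static void main(String[] args) throws Exception {
            URIBuilder builder = new URIBuilder("http://example.org/data");
            builder.addParameter("label", "grüße"); // non-ASCII value
            // Prints something like: http://example.org/data?label=gr%C3%BC%C3%9Fe
            System.out.println(builder.build());
        }
    }
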
import org.dice_research.squirrel.uri.processing.UriProcessor; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import org.springframework.beans.factory.annotation.Autowired; /** * Standard implementation of the {@link Frontier} interface containing a @@ -34,361 +31,352 @@ */ public class FrontierImpl implements Frontier { - private static final Logger LOGGER = LoggerFactory.getLogger(FrontierImpl.class); - - /** - * {@link UriNormalizer} used to transform given URIs into a normal form. - */ - protected UriNormalizer normalizer; - - /** - * {@link KnownUriFilter} used to identify URIs that already have been crawled. - */ - protected KnownUriFilter knownUriFilter; - - /** - * {@link OutDatedUriRetriever} used to collect all the outdated URIs (URIs crawled a week ago) to recrawl. - */ - protected OutDatedUriRetriever outDatedUriRetriever; - protected SparqlBasedOutDatedUriRetriever sparqlBasedOutDatedUriRetriever; - - /** - * {@link org.dice_research.squirrel.data.uri.info.URIReferences} used to - * identify URIs that already have been crawled. - */ - protected URIReferences uriReferences = null; - - /** - * {@link SchemeBasedUriFilter} used to identify URIs with known protocol. - */ - protected SchemeBasedUriFilter schemeUriFilter = new SchemeBasedUriFilter(); - /** - * {@link UriQueue} used to manage the URIs that should be crawled. - */ - protected UriQueue queue; - /** - * {@link UriProcessor} used to identify the type of incoming URIs: DUMP, - * SPARQL, DEREFERENCEABLE or UNKNOWN - */ - protected UriProcessor uriProcessor; - /** - * {@link GraphLogger} that can be added to log the crawled graph. - */ - protected GraphLogger graphLogger; - - /** - * Indicates whether recrawling is active. - */ - private boolean doesRecrawling; - - /** - * The timer that schedules the recrawling. - */ - private Timer timerRecrawling; - - /** - * Time (in milliseconds) after which uris will be recrawled (only used if no - * specific time is configured for a URI). - */ - private static long generalRecrawlTime; - - /** - * Time interval(in milliseconds) at which the check for outdated uris is - * performed. - */ - private long timerPeriod; - - /** - * Default value for {@link #generalRecrawlTime} (one week). - */ - public static final long DEFAULT_GENERAL_RECRAWL_TIME = 1000 * 60 * 60 * 24 * 7; - - /** - * Default value for {@link #timerPeriod}. - */ - private static final long DEFAULT_TIMER_PERIOD = 1000 * 60 * 60; - - /** - * Constructor. - * - * @param normalizer - * {@link UriNormalizer} used to transform given URIs into a normal - * form - * @param knownUriFilter - * {@link UriFilter} used to identify URIs that already have been - * crawled. - * @param queue - * {@link UriQueue} used to manage the URIs that should be crawled. - * @param graphLogger - * {@link GraphLogger} used to log graphs. - * @param doesRecrawling - * used to select if URIs should be recrawled. - * @param generalRecrawlTime - * used to select the general Time after URIs should be recrawled. If - * Value is null the default Time is used. - * @param timerPeriod - * used to select if URIs should be recrawled. - */ - public FrontierImpl(UriNormalizer normalizer, KnownUriFilter knownUriFilter, UriQueue queue, GraphLogger graphLogger, boolean doesRecrawling, - long generalRecrawlTime, long timerPeriod, OutDatedUriRetriever outDatedUriRetriever) { - this(normalizer, knownUriFilter, null, queue, graphLogger, doesRecrawling, generalRecrawlTime, timerPeriod); - } - - /** - * Constructor. 
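
This pre-patch version still wires the knownUriFilter and the SPARQL-based retriever directly into the frontier; the replacement below returns to the UriFilterComposer. Both rely on the same small contract: add(uri, timestamp) records a crawl, isUriGood rejects what is already known, and getOutdatedUris reports what is due again. An in-memory toy version of that contract, with strings standing in for CrawleableUri (the project's real filters persist this state):

    import java.util.ArrayList;
    import java.util.HashMap;
    import java.util.List;
    import java.util.Map;

    class KnownUriFilterSketch {
        private final Map<String, Long> lastCrawl = new HashMap<>();
        private final long recrawlAfterMs;

        KnownUriFilterSketch(long recrawlAfterMs) {
            this.recrawlAfterMs = recrawlAfterMs;
        }

        // A URI is "good" only if we have never crawled it.
        boolean isUriGood(String uri) {
            return !lastCrawl.containsKey(uri);
        }

        void add(String uri, long timestamp) {
            lastCrawl.put(uri, timestamp);
        }

        // URIs whose last crawl is older than the recrawl window.
        List<String> getOutdatedUris() {
            long cutoff = System.currentTimeMillis() - recrawlAfterMs;
            List<String> outdated = new ArrayList<>();
            for (Map.Entry<String, Long> e : lastCrawl.entrySet()) {
                if (e.getValue() < cutoff) {
                    outdated.add(e.getKey());
                }
            }
            return outdated;
        }
    }
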
- * - * @param normalizer - * {@link UriNormalizer} used to transform given URIs into a normal - * form - * @param knownUriFilter - * {@link UriFilter} used to identify URIs that already have been - * crawled. - * @param queue - * {@link UriQueue} used to manage the URIs that should be crawled. - * @param doesRecrawling - * used to select if URIs should be recrawled. - * @param generalRecrawlTime - * used to select the general Time after URIs should be recrawled. If - * Value is null the default Time is used. - * @param timerPeriod - * used to select if URIs should be recrawled. - */ - public FrontierImpl(UriNormalizer normalizer, KnownUriFilter knownUriFilter, UriQueue queue, boolean doesRecrawling, - long generalRecrawlTime, long timerPeriod, UriHashCustodian uriHashCustodian, OutDatedUriRetriever outDatedUriRetriever) { - this(normalizer, knownUriFilter, queue, null, doesRecrawling, generalRecrawlTime, timerPeriod, outDatedUriRetriever); - } - - /** - * Constructor. - * - * @param normalizer - * {@link UriNormalizer} used to transform given URIs into a normal - * form - * @param knownUriFilter - * {@link UriFilter} used to identify URIs that already have been - * crawled. - * @param uriReferences - * {@link URIReferences} used to manage URI references - * @param queue - * {@link UriQueue} used to manage the URIs that should be crawled. - * @param doesRecrawling - * Value for {@link #doesRecrawling}. - */ - public FrontierImpl(UriNormalizer normalizer, KnownUriFilter knownUriFilter, URIReferences uriReferences, - UriQueue queue, boolean doesRecrawling) { - this(normalizer, knownUriFilter, uriReferences, queue, null, doesRecrawling, DEFAULT_GENERAL_RECRAWL_TIME, - DEFAULT_TIMER_PERIOD); - } - - /** - * Constructor. - * - * @param normalizer - * {@link UriNormalizer} used to transform given URIs into a normal - * form - * @param knownUriFilter - * {@link UriFilter} used to identify URIs that already have been - * crawled. - * @param queue - * {@link UriQueue} used to manage the URIs that should be crawled. - * @param doesRecrawling - * Value for {@link #doesRecrawling}. - */ - public FrontierImpl(UriNormalizer normalizer, KnownUriFilter knownUriFilter, UriQueue queue, - boolean doesRecrawling, OutDatedUriRetriever outDatedUriRetriever) { - this(normalizer, knownUriFilter, queue, null, doesRecrawling, DEFAULT_GENERAL_RECRAWL_TIME, - DEFAULT_TIMER_PERIOD, outDatedUriRetriever); - } - - /** - * Constructor. - * - * @param normalizer - * {@link UriNormalizer} used to transform given URIs into a normal - * form - * @param knownUriFilter - * {@link UriFilter} used to identify URIs that already have been - * crawled. - * @param queue - * {@link UriQueue} used to manage the URIs that should be crawled. - */ - public FrontierImpl(UriNormalizer normalizer, KnownUriFilter knownUriFilter, UriQueue queue, OutDatedUriRetriever outDatedUriRetriever) { - this(normalizer, knownUriFilter, queue, null, false, DEFAULT_GENERAL_RECRAWL_TIME, DEFAULT_TIMER_PERIOD, outDatedUriRetriever); - } - - /** - * Constructor. - * - * @param normalizer - * {@link UriNormalizer} used to transform given URIs into a normal - * form - * @param knownUriFilter - * {@link UriFilter} used to identify URIs that already have been - * crawled. - * @param uriReferences - * {@link URIReferences} used to manage URI references - * @param queue - * {@link UriQueue} used to manage the URIs that should be crawled. - * @param graphLogger - * {@link GraphLogger} used to log graphs. 
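
These constructors all funnel into the body below, where addNewUri resolves the URI's host before queueing so URIs can be grouped by IP. That step is plain JDK name resolution, roughly as follows (running it requires network access):

    import java.net.InetAddress;
    import java.net.URI;
    import java.net.UnknownHostException;

    public class InetRecognitionSketch {
        public static void main(String[] args) {
            URI uri = URI.create("http://example.org/resource");
            try {
                // Resolve the host once up front, as uriProcessor.recognizeInetAddress does.
                InetAddress address = InetAddress.getByName(uri.getHost());
                System.out.println("queue under " + address.getHostAddress());
            } catch (UnknownHostException e) {
                System.err.println("Could not recognize IP for " + uri + ", unknown host");
            }
        }
    }
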
- * @param doesRecrawling - * used to select if URIs should be recrawled. - * @param generalRecrawlTime - * used to select the general Time after URIs should be recrawled. If - * Value is null the default Time is used. - * @param timerPeriod - * used to select if URIs should be recrawled. - */ - public FrontierImpl(UriNormalizer normalizer, KnownUriFilter knownUriFilter, URIReferences uriReferences, - UriQueue queue, GraphLogger graphLogger, boolean doesRecrawling, long generalRecrawlTime, - long timerPeriod) { - this.normalizer = normalizer; - this.knownUriFilter = knownUriFilter; - this.uriReferences = uriReferences; - this.queue = queue; - this.uriProcessor = new UriProcessor(); - this.graphLogger = graphLogger; - - this.queue.open(); - this.doesRecrawling = doesRecrawling; - this.timerPeriod = timerPeriod; - FrontierImpl.generalRecrawlTime = generalRecrawlTime; - - if (this.doesRecrawling) { - timerRecrawling = new Timer(); - timerRecrawling.schedule(new TimerTask() { - @Override - public void run() { - List urisToRecrawl = sparqlBasedOutDatedUriRetriever.getUriToRecrawl(); - LOGGER.info("URI to recrawl" + urisToRecrawl); - urisToRecrawl.forEach(uri -> queue.addUri(uriProcessor.recognizeUriType(uri))); - } - }, this.timerPeriod, this.timerPeriod); - } - } - - @Override - public List getNextUris() { - - // if(terminationCheck.shouldFrontierTerminate(this)) { - // LOGGER.error("FRONTIER IS TERMINATING!", new Exception()); - // } - - return queue.getNextUris(); - } - - @Override - public void addNewUris(List uris) { - for (CrawleableUri uri : uris) { - addNewUri(uri); - } - } - - @Override - public void addNewUri(CrawleableUri uri) { - // Normalize the URI - uri = normalizer.normalize(uri); - // After knownUriFilter uri should be classified according to - // UriProcessor - - if (knownUriFilter.isUriGood(uri)) { - LOGGER.debug("addNewUri(" + uri + "): URI is good [" + knownUriFilter + "]"); - if (schemeUriFilter.isUriGood(uri)) { - LOGGER.trace("addNewUri(" + uri.getUri() + "): URI schemes is OK [" + schemeUriFilter + "]"); - // Make sure that the IP is known - try { - uri = this.uriProcessor.recognizeInetAddress(uri); - - } catch (UnknownHostException e) { - LOGGER.error("Could not recognize IP for {}, unknown host", uri.getUri()); - } - if (uri.getIpAddress() != null) { - queue.addUri(this.uriProcessor.recognizeUriType(uri)); - } else { - LOGGER.error("Couldn't determine the Inet address of \"{}\". It will be ignored.", uri.getUri()); - } - knownUriFilter.add(uri, System.currentTimeMillis()); - } else { - LOGGER.warn("addNewUri(" + uri + "): " + uri.getUri().getScheme() + " is not supported, only " - + schemeUriFilter.getSchemes() + ". Will not added!"); - } - - } else { - LOGGER.debug("addNewUri(" + uri + "): URI is not good [" + knownUriFilter + "]. 
Will not be added!"); - } - } - - - @Override - public void crawlingDone(List uris) { - LOGGER.info("One worker finished his work and crawled " + uris.size() + " URIs."); - - // List newUris = new ArrayList<>(uriMap.size()); - // for (CrawleableUri uri : uriMap.keySet()) { - // newUris.addAll(uriMap.get(uri)); - // knownUriFilter.add(uri, System.currentTimeMillis(), - // uri.getTimestampNextCrawl()); - // if (uriReferences != null) { - // uriReferences.add(uri, uriMap.get(uri)); - // } - // } - - // // If there is a graph logger, log the data - // if (graphLogger != null) { - // graphLogger.log(new ArrayList<>(uriMap.keySet()), newUris); - // } - // If we should give the crawled IPs to the queue - if (queue instanceof BlockingQueue) { - ((BlockingQueue) queue).markUrisAsAccessible(uris); - } - // send list of crawled URIs to the knownUriFilter - for (CrawleableUri uri : uris) { - Long recrawlOn = (Long) uri.getData(Constants.URI_PREFERRED_RECRAWL_ON); - // If a recrawling is defined, check whether we can directly add it back to the - // queue - if ((recrawlOn != null) && (recrawlOn < System.currentTimeMillis())) { - // Create a new uri object reusing only meta data that is useful - CrawleableUri recrawlUri = new CrawleableUri(uri.getUri(), uri.getIpAddress()); - recrawlUri.addData(Constants.URI_TYPE_KEY, uri.getData(Constants.URI_TYPE_KEY)); - addNewUri(recrawlUri); - } else { - knownUriFilter.add(uri, System.currentTimeMillis()); - } - } - } - - @Override - public int getNumberOfPendingUris() { - // TODO this implementation does not fit to the semantics of the method name - // since it returns the number of URI groups instead of the number of URIs - if (queue instanceof BlockingQueue) { - return ((BlockingQueue) queue).getNumberOfBlockedKeys(); - } else { - return 0; - } - } - - @Override - public boolean doesRecrawling() { - return doesRecrawling; - } - - @Override - public void close() { - timerRecrawling.cancel(); - } - - public static long getGeneralRecrawlTime() { - return generalRecrawlTime; - } - - /** - * Getter for the {@link #queue}. - * - * @return The waiting queue for the URIs. - */ - public UriQueue getQueue() { - return queue; - } - -} + private static final Logger LOGGER = LoggerFactory.getLogger(FrontierImpl.class); + + /** + * {@link UriNormalizer} used to transform given URIs into a normal form. + */ + protected UriNormalizer normalizer; + + /** + * {@link KnownUriFilter} used to identify URIs that already have been crawled. + */ + protected UriFilterComposer uriFilter; + + /** + * {@link org.dice_research.squirrel.data.uri.info.URIReferences} used to + * identify URIs that already have been crawled. + */ + protected URIReferences uriReferences = null; + + /** + * {@link SchemeBasedUriFilter} used to identify URIs with known protocol. + */ + protected SchemeBasedUriFilter schemeUriFilter = new SchemeBasedUriFilter(); + /** + * {@link UriQueue} used to manage the URIs that should be crawled. + */ + protected UriQueue queue; + /** + * {@link UriProcessor} used to identify the type of incoming URIs: DUMP, + * SPARQL, DEREFERENCEABLE or UNKNOWN + */ + protected UriProcessor uriProcessor; + /** + * {@link UriGenerator} used to generate additional domain variants of a URI + */ + protected List uriGenerator; + /** + * {@link GraphLogger} that can be added to log the crawled graph. + */ + protected GraphLogger graphLogger; + + /** + * Indicates whether recrawling is active. + */ + private boolean doesRecrawling; + + /** + * The timer that schedules the recrawling. 
+ */ + private Timer timerRecrawling; + + /** + * Time (in milliseconds) after which uris will be recrawled (only used if no + * specific time is configured for a URI). + */ + private static long generalRecrawlTime; + + /** + * Time interval(in milliseconds) at which the check for outdated uris is + * performed. + */ + private long timerPeriod; + + /** + * Default value for {@link #generalRecrawlTime} (one week). + */ + public static final long DEFAULT_GENERAL_RECRAWL_TIME = 1000 * 60 * 60 * 24 * 7; + + /** + * Default value for {@link #timerPeriod}. + */ + private static final long DEFAULT_TIMER_PERIOD = 1000 * 60 * 60; + + /** + * Constructor. + * + * @param normalizer {@link UriNormalizer} used to transform given URIs + * into a normal form + * @param knownUriFilter {@link UriFilter} used to identify URIs that + * already have been crawled. + * @param queue {@link UriQueue} used to manage the URIs that + * should be crawled. + * @param graphLogger {@link GraphLogger} used to log graphs. + * @param doesRecrawling used to select if URIs should be recrawled. + * @param generalRecrawlTime used to select the general Time after URIs should + * be recrawled. If Value is null the default Time is + * used. + * @param timerPeriod used to select if URIs should be recrawled. + */ + public FrontierImpl(UriNormalizer normalizer, UriFilterComposer relationalUriFilter, UriQueue queue, + List uriGenerators, GraphLogger graphLogger, boolean doesRecrawling, long generalRecrawlTime, + long timerPeriod) { + this(normalizer, relationalUriFilter, null, queue, uriGenerators, graphLogger, doesRecrawling, + generalRecrawlTime, timerPeriod); + } + + /** + * Constructor. + * + * @param normalizer {@link UriNormalizer} used to transform given URIs + * into a normal form + * @param knownUriFilter {@link UriFilter} used to identify URIs that + * already have been crawled. + * @param queue {@link UriQueue} used to manage the URIs that + * should be crawled. + * @param doesRecrawling used to select if URIs should be recrawled. + * @param generalRecrawlTime used to select the general Time after URIs should + * be recrawled. If Value is null the default Time is + * used. + * @param timerPeriod used to select if URIs should be recrawled. + */ + public FrontierImpl(UriNormalizer normalizer, UriFilterComposer relationalUriFilter, UriQueue queue, + List uriGenerators, boolean doesRecrawling, long generalRecrawlTime, long timerPeriod, + UriHashCustodian uriHashCustodian) { + this(normalizer, relationalUriFilter, queue, uriGenerators, null, doesRecrawling, generalRecrawlTime, + timerPeriod); + } + + /** + * Constructor. + * + * @param normalizer {@link UriNormalizer} used to transform given URIs into + * a normal form + * @param knownUriFilter {@link UriFilter} used to identify URIs that already + * have been crawled. + * @param uriReferences {@link URIReferences} used to manage URI references + * @param queue {@link UriQueue} used to manage the URIs that should be + * crawled. + * @param doesRecrawling Value for {@link #doesRecrawling}. + */ + public FrontierImpl(UriNormalizer normalizer, UriFilterComposer relationalUriFilter, URIReferences uriReferences, + UriQueue queue, List uriGenerators, boolean doesRecrawling) { + this(normalizer, relationalUriFilter, uriReferences, queue, uriGenerators, null, doesRecrawling, + DEFAULT_GENERAL_RECRAWL_TIME, DEFAULT_TIMER_PERIOD); + } + + /** + * Constructor. 
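
On the TODO in getNumberOfPendingUris above: for a BlockingQueue the method returns the number of blocked keys, i.e. URI groups, rather than the number of URIs, so the two counts diverge as soon as a group holds more than one URI. A toy illustration of the difference, grouping by IP as the IP-based queue does (addresses are illustrative):

    import java.util.Arrays;
    import java.util.HashMap;
    import java.util.List;
    import java.util.Map;

    public class PendingCountSketch {
        public static void main(String[] args) {
            // URIs grouped by key, as the real queue groups them by IP or domain.
            Map<String, List<String>> queue = new HashMap<>();
            queue.put("93.184.216.34", Arrays.asList("http://example.org/a", "http://example.org/b"));
            queue.put("198.51.100.7", Arrays.asList("http://other.example/x"));

            int groups = queue.size();                                       // what the method returns: 2
            int uris = queue.values().stream().mapToInt(List::size).sum();   // actual pending URIs: 3
            System.out.println(groups + " groups, " + uris + " URIs");
        }
    }
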
+ * + * @param normalizer {@link UriNormalizer} used to transform given URIs into + * a normal form + * @param knownUriFilter {@link UriFilter} used to identify URIs that already + * have been crawled. + * @param queue {@link UriQueue} used to manage the URIs that should be + * crawled. + * @param doesRecrawling Value for {@link #doesRecrawling}. + */ + public FrontierImpl(UriNormalizer normalizer, UriFilterComposer relationalUriFilter, UriQueue queue, + List uriGenerators, boolean doesRecrawling) { + this(normalizer, relationalUriFilter, queue, uriGenerators, null, doesRecrawling, DEFAULT_GENERAL_RECRAWL_TIME, + DEFAULT_TIMER_PERIOD); + } + + /** + * Constructor. + * + * @param normalizer {@link UriNormalizer} used to transform given URIs into + * a normal form + * @param knownUriFilter {@link UriFilter} used to identify URIs that already + * have been crawled. + * @param queue {@link UriQueue} used to manage the URIs that should be + * crawled. + */ + public FrontierImpl(UriNormalizer normalizer, UriFilterComposer relationalUriFilter, UriQueue queue, + List uriGenerators) { + this(normalizer, relationalUriFilter, queue, uriGenerators, null, false, DEFAULT_GENERAL_RECRAWL_TIME, + DEFAULT_TIMER_PERIOD); + } + + /** + * Constructor. + * + * @param normalizer {@link UriNormalizer} used to transform given URIs + * into a normal form + * @param knownUriFilter {@link UriFilter} used to identify URIs that + * already have been crawled. + * @param uriReferences {@link URIReferences} used to manage URI references + * @param queue {@link UriQueue} used to manage the URIs that + * should be crawled. + * @param graphLogger {@link GraphLogger} used to log graphs. + * @param doesRecrawling used to select if URIs should be recrawled. + * @param generalRecrawlTime used to select the general Time after URIs should + * be recrawled. If Value is null the default Time is + * used. + * @param timerPeriod used to select if URIs should be recrawled. 
+ */ + public FrontierImpl(UriNormalizer normalizer, UriFilterComposer relationalUriFilter, URIReferences uriReferences, + UriQueue queue, List uriGenerators, GraphLogger graphLogger, boolean doesRecrawling, + long generalRecrawlTime, long timerPeriod) { + this.normalizer = normalizer; + this.uriFilter = relationalUriFilter; + this.uriReferences = uriReferences; + this.uriGenerator = uriGenerators; + this.queue = queue; + this.uriProcessor = new UriProcessor(); + this.graphLogger = graphLogger; + + this.queue.open(); + this.doesRecrawling = doesRecrawling; + this.timerPeriod = timerPeriod; + FrontierImpl.generalRecrawlTime = generalRecrawlTime; + + if (this.doesRecrawling) { + timerRecrawling = new Timer(); + timerRecrawling.schedule(new TimerTask() { + @Override + public void run() { + List urisToRecrawl = relationalUriFilter.getKnownUriFilter().getOutdatedUris(); + urisToRecrawl.forEach(uri -> queue.addUri(uriProcessor.recognizeUriType(uri))); + } + }, this.timerPeriod, this.timerPeriod); + } + } + + @Override + public List getNextUris() { + + // if(terminationCheck.shouldFrontierTerminate(this)) { + // LOGGER.error("FRONTIER IS TERMINATING!", new Exception()); + // } + + return queue.getNextUris(); + } + + @Override + public void addNewUris(List uris) { + for (CrawleableUri uri : uris) { + addNewUri(uri); + } + } + + @Override + public void addNewUri(CrawleableUri uri) { + // After knownUriFilter uri should be classified according to + // UriProcessor + uri = normalizer.normalize(uri); + addNormalizedUri(uri); + + try { + for (UriGenerator u : uriGenerator) { + if (u.getUriVariant(uri) != null) + addNormalizedUri(normalizer.normalize(u.getUriVariant(uri))); + } + } catch (Exception e) { + LOGGER.info( + "Exception happened while generating additional URI variant for URI: " + uri.getUri().toString()); + } + } + + protected void addNormalizedUri(CrawleableUri uri) { + if (uriFilter.isUriGood(uri)) { + LOGGER.debug("addNewUri(" + uri + "): URI is good [" + uriFilter + "]"); + if (schemeUriFilter.isUriGood(uri)) { + LOGGER.trace("addNewUri(" + uri.getUri() + "): URI schemes is OK [" + schemeUriFilter + "]"); + // Make sure that the IP is known + try { + uri = this.uriProcessor.recognizeInetAddress(uri); + + } catch (UnknownHostException e) { + LOGGER.error("Could not recognize IP for {}, unknown host", uri.getUri()); + } + if (uri.getIpAddress() != null) { + queue.addUri(this.uriProcessor.recognizeUriType(uri)); + } else { + LOGGER.error("Couldn't determine the Inet address of \"{}\". It will be ignored.", uri.getUri()); + } + uriFilter.getKnownUriFilter().add(uri, System.currentTimeMillis()); + } else { + LOGGER.warn("addNewUri(" + uri + "): " + uri.getUri().getScheme() + " is not supported, only " + + schemeUriFilter.getSchemes() + ". Will not added!"); + } + + } else { + LOGGER.debug("addNewUri(" + uri + "): URI is not good [" + uriFilter + "]. 
Will not be added!"); + } + } + + @Override + public void crawlingDone(List uris) { + LOGGER.info("One worker finished his work and crawled " + uris.size() + " URIs."); + + // List newUris = new ArrayList<>(uriMap.size()); + // for (CrawleableUri uri : uriMap.keySet()) { + // newUris.addAll(uriMap.get(uri)); + // knownUriFilter.add(uri, System.currentTimeMillis(), + // uri.getTimestampNextCrawl()); + // if (uriReferences != null) { + // uriReferences.add(uri, uriMap.get(uri)); + // } + // } + + // // If there is a graph logger, log the data + // if (graphLogger != null) { + // graphLogger.log(new ArrayList<>(uriMap.keySet()), newUris); + // } + // If we should give the crawled IPs to the queue + if (queue instanceof BlockingQueue) { + ((BlockingQueue) queue).markUrisAsAccessible(uris); + } + // send list of crawled URIs to the knownUriFilter + for (CrawleableUri uri : uris) { + Long recrawlOn = (Long) uri.getData(Constants.URI_PREFERRED_RECRAWL_ON); + // If a recrawling is defined, check whether we can directly add it back to the + // queue + if ((recrawlOn != null) && (recrawlOn < System.currentTimeMillis())) { + // Create a new uri object reusing only meta data that is useful + CrawleableUri recrawlUri = new CrawleableUri(uri.getUri(), uri.getIpAddress()); + recrawlUri.addData(Constants.URI_TYPE_KEY, uri.getData(Constants.URI_TYPE_KEY)); + addNewUri(recrawlUri); + } else { + uriFilter.getKnownUriFilter().add(uri, System.currentTimeMillis()); + } + } + } + + @Override + public int getNumberOfPendingUris() { + // TODO this implementation does not fit to the semantics of the method name + // since it returns the number of URI groups instead of the number of URIs + if (queue instanceof BlockingQueue) { + return ((BlockingQueue) queue).getNumberOfBlockedKeys(); + } else { + return 0; + } + } + + @Override + public boolean doesRecrawling() { + return doesRecrawling; + } + + @Override + public void close() { + timerRecrawling.cancel(); + } + + public static long getGeneralRecrawlTime() { + return generalRecrawlTime; + } + + /** + * Getter for the {@link #queue}. + * + * @return The waiting queue for the URIs. 
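The hunks above funnel generalRecrawlTime and timerPeriod into a plain java.util.Timer. A minimal, self-contained sketch of that scheduling pattern follows; OutdatedUriSource and requeue are hypothetical stand-ins for the known-URI filter and the queue wired up in the actual patch.

    import java.util.List;
    import java.util.Timer;
    import java.util.TimerTask;

    public class RecrawlSchedulerSketch {

        /** Stand-in for the component that knows which URIs are outdated. */
        interface OutdatedUriSource {
            List<String> getOutdatedUris();
        }

        /** Every period milliseconds, push all outdated URIs back into the queue. */
        public static Timer start(OutdatedUriSource source, long period) {
            Timer timer = new Timer();
            timer.schedule(new TimerTask() {
                @Override
                public void run() {
                    source.getOutdatedUris().forEach(RecrawlSchedulerSketch::requeue);
                }
            }, period, period);
            // the caller is expected to cancel() the timer on shutdown,
            // exactly as FrontierImpl.close() cancels timerRecrawling
            return timer;
        }

        private static void requeue(String uri) {
            // stand-in for queue.addUri(uriProcessor.recognizeUriType(uri))
        }
    }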
+ */ + public UriQueue getQueue() { + return queue; + } + +} \ No newline at end of file From 5dd058b62dacb52d971446e16e2a34ff9dba4fa1 Mon Sep 17 00:00:00 2001 From: Geraldo Date: Wed, 27 May 2020 18:58:01 +0200 Subject: [PATCH 068/102] bean usage fix --- spring-config/frontier-context.xml | 100 +++++++++++------- .../components/FrontierComponent.java | 20 ++-- .../frontier/impl/ExtendedFrontierImpl.java | 11 +- .../squirrel/frontier/impl/FrontierImpl.java | 20 ++-- .../recrawling/DummyUriRetriever.java | 23 ++++ .../SparqlBasedOutDatedUriRetriever.java | 17 ++- .../squirrel/encoder/TripleEncoder.java | 7 -- 7 files changed, 120 insertions(+), 78 deletions(-) create mode 100644 squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/DummyUriRetriever.java diff --git a/spring-config/frontier-context.xml b/spring-config/frontier-context.xml index 585e91950..63aa07860 100644 --- a/spring-config/frontier-context.xml +++ b/spring-config/frontier-context.xml @@ -22,39 +22,39 @@ - - + + class="org.dice_research.squirrel.data.uri.norm.NormalizerImpl"> - - sessionid - jsessionids - phpsessid - sid - - - - - - - - - - - - - - - + + sessionid + jsessionids + phpsessid + sid + + + + + + + + + + + + + + + @@ -65,38 +65,58 @@ - - - + + + - - - - - + + + + + + + + + + + + + + + + - + - - - + + + - - + + - + diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/components/FrontierComponent.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/components/FrontierComponent.java index 0cba3db6f..914f15d0c 100644 --- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/components/FrontierComponent.java +++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/components/FrontierComponent.java @@ -3,7 +3,6 @@ import java.io.Closeable; import java.io.File; import java.io.IOException; -import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Timer; @@ -64,7 +63,9 @@ public class FrontierComponent extends AbstractComponent implements RespondingDa @Qualifier("queueBean") @Autowired protected UriQueue queue; - private OutDatedUriRetriever outDatedUriRetriever; + + @Qualifier("UriFilterBean") + @Autowired private UriFilterComposer uriFilter; private URIReferences uriReferences = null; @@ -75,14 +76,18 @@ public class FrontierComponent extends AbstractComponent implements RespondingDa @Autowired private Serializer serializer; private long recrawlingTime = 1000L * 60L * 60L * 24L * 30; - @Qualifier("sparqlBean") - @Autowired - private Map hasUrisToCrawl; @Qualifier("normalizerBean") @Autowired private UriNormalizer normalizer; + /** + * {@link OutDatedUriRetriever} used to collect all the outdated URIs (URIs crawled a week ago) to recrawl. 
+ */ + @Qualifier("uriRetrieverBean") + @Autowired + protected OutDatedUriRetriever outDatedUriRetriever; + @Qualifier("listUriGenerator") private List uriGenerator; @@ -97,9 +102,6 @@ public void init() throws Exception { MongoConfiguration mongoConfiguration = MongoConfiguration.getMDBConfiguration(); WebConfiguration webConfiguration = WebConfiguration.getWebConfiguration(); - - hasUrisToCrawl = new HashMap(); - if (mongoConfiguration != null) { queue.open(); @@ -126,7 +128,7 @@ public void init() throws Exception { // Build frontier - frontier = new ExtendedFrontierImpl(normalizer, uriFilter, uriReferences, queue,uriGenerator, doRecrawling); + frontier = new ExtendedFrontierImpl(normalizer, uriFilter, uriReferences, queue,uriGenerator, doRecrawling, outDatedUriRetriever); rabbitQueue = this.incomingDataQueueFactory.createDefaultRabbitQueue(Constants.FRONTIER_QUEUE_NAME); diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/ExtendedFrontierImpl.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/ExtendedFrontierImpl.java index c7f7675f5..fff99c523 100644 --- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/ExtendedFrontierImpl.java +++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/ExtendedFrontierImpl.java @@ -6,10 +6,6 @@ import java.util.Set; import org.dice_research.squirrel.data.uri.CrawleableUri; - -import org.dice_research.squirrel.data.uri.filter.KnownUriFilter; -import org.dice_research.squirrel.frontier.recrawling.OutDatedUriRetriever; - import org.dice_research.squirrel.data.uri.filter.UriFilter; import org.dice_research.squirrel.data.uri.filter.UriFilterComposer; import org.dice_research.squirrel.data.uri.info.URIReferences; @@ -17,9 +13,11 @@ import org.dice_research.squirrel.data.uri.norm.UriNormalizer; import org.dice_research.squirrel.deduplication.hashing.UriHashCustodian; import org.dice_research.squirrel.frontier.ExtendedFrontier; +import org.dice_research.squirrel.frontier.recrawling.OutDatedUriRetriever; import org.dice_research.squirrel.queue.IpAddressBasedQueue; import org.dice_research.squirrel.queue.UriQueue; +@SuppressWarnings("deprecation") public class ExtendedFrontierImpl extends FrontierImpl implements ExtendedFrontier { /** @@ -68,8 +66,9 @@ public ExtendedFrontierImpl(UriNormalizer normalizer, UriFilterComposer relation * crawled. * @param doesRecrawling used to select if URIs should be recrawled. 
*/ - public ExtendedFrontierImpl(UriNormalizer normalizer, UriFilterComposer relationalUriFilter, URIReferences uriReferences, UriQueue queue,List uriGenerators, boolean doesRecrawling) { - super(normalizer, relationalUriFilter, uriReferences, queue,uriGenerators, doesRecrawling); + public ExtendedFrontierImpl(UriNormalizer normalizer, UriFilterComposer relationalUriFilter, URIReferences uriReferences, UriQueue queue,List uriGenerators, boolean doesRecrawling, + OutDatedUriRetriever outDatedUriRetriever) { + super(normalizer, relationalUriFilter, uriReferences, queue,uriGenerators, doesRecrawling,outDatedUriRetriever); } @Override diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/FrontierImpl.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/FrontierImpl.java index 2302428da..a4d46356a 100644 --- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/FrontierImpl.java +++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/FrontierImpl.java @@ -16,6 +16,7 @@ import org.dice_research.squirrel.data.uri.norm.UriNormalizer; import org.dice_research.squirrel.deduplication.hashing.UriHashCustodian; import org.dice_research.squirrel.frontier.Frontier; +import org.dice_research.squirrel.frontier.recrawling.OutDatedUriRetriever; import org.dice_research.squirrel.graph.GraphLogger; import org.dice_research.squirrel.queue.BlockingQueue; import org.dice_research.squirrel.queue.UriQueue; @@ -42,6 +43,11 @@ public class FrontierImpl implements Frontier { * {@link KnownUriFilter} used to identify URIs that already have been crawled. */ protected UriFilterComposer uriFilter; + + /** + * {@link OutDatedUriRetriever} used to collect all the outdated URIs (URIs crawled a week ago) to recrawl. + */ + protected OutDatedUriRetriever outDatedUriRetriever; /** * {@link org.dice_research.squirrel.data.uri.info.URIReferences} used to @@ -122,8 +128,8 @@ public class FrontierImpl implements Frontier { public FrontierImpl(UriNormalizer normalizer, UriFilterComposer relationalUriFilter, UriQueue queue, List uriGenerators, GraphLogger graphLogger, boolean doesRecrawling, long generalRecrawlTime, long timerPeriod) { - this(normalizer, relationalUriFilter, null, queue, uriGenerators, graphLogger, doesRecrawling, - generalRecrawlTime, timerPeriod); + this(normalizer, relationalUriFilter, null, queue, uriGenerators, graphLogger, false, + generalRecrawlTime, timerPeriod,null); } /** @@ -161,9 +167,9 @@ public FrontierImpl(UriNormalizer normalizer, UriFilterComposer relationalUriFil * @param doesRecrawling Value for {@link #doesRecrawling}. 
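The OutDatedUriRetriever injected here is a single-method strategy: besides the SPARQL-based and dummy variants this patch ships (see the hunks below), a custom implementation only has to supply getUriToRecrawl(). A sketch of a fixed-list variant, e.g. for tests; the List element type CrawleableUri is an assumption based on how FrontierImpl feeds the result into the queue, since the raw types in the diff do not show it.

    package org.dice_research.squirrel.frontier.recrawling;

    import java.util.ArrayList;
    import java.util.List;

    import org.dice_research.squirrel.data.uri.CrawleableUri;

    /** Hypothetical retriever that hands out a fixed set of URIs to recrawl. */
    public class FixedListUriRetriever implements OutDatedUriRetriever {

        private final List<CrawleableUri> uris = new ArrayList<>();

        public void addUri(CrawleableUri uri) {
            uris.add(uri);
        }

        @Override
        public List<CrawleableUri> getUriToRecrawl() {
            // return a copy so callers cannot mutate the internal list
            return new ArrayList<>(uris);
        }
    }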
*/ public FrontierImpl(UriNormalizer normalizer, UriFilterComposer relationalUriFilter, URIReferences uriReferences, - UriQueue queue, List uriGenerators, boolean doesRecrawling) { + UriQueue queue, List uriGenerators,boolean doesRecrawling, OutDatedUriRetriever outDatedUriRetriever) { this(normalizer, relationalUriFilter, uriReferences, queue, uriGenerators, null, doesRecrawling, - DEFAULT_GENERAL_RECRAWL_TIME, DEFAULT_TIMER_PERIOD); + DEFAULT_GENERAL_RECRAWL_TIME, DEFAULT_TIMER_PERIOD,outDatedUriRetriever); } /** @@ -218,7 +224,7 @@ public FrontierImpl(UriNormalizer normalizer, UriFilterComposer relationalUriFil */ public FrontierImpl(UriNormalizer normalizer, UriFilterComposer relationalUriFilter, URIReferences uriReferences, UriQueue queue, List uriGenerators, GraphLogger graphLogger, boolean doesRecrawling, - long generalRecrawlTime, long timerPeriod) { + long generalRecrawlTime, long timerPeriod,OutDatedUriRetriever outDatedUriRetriever) { this.normalizer = normalizer; this.uriFilter = relationalUriFilter; this.uriReferences = uriReferences; @@ -226,7 +232,7 @@ public FrontierImpl(UriNormalizer normalizer, UriFilterComposer relationalUriFil this.queue = queue; this.uriProcessor = new UriProcessor(); this.graphLogger = graphLogger; - + this.outDatedUriRetriever = outDatedUriRetriever; this.queue.open(); this.doesRecrawling = doesRecrawling; this.timerPeriod = timerPeriod; @@ -237,7 +243,7 @@ public FrontierImpl(UriNormalizer normalizer, UriFilterComposer relationalUriFil timerRecrawling.schedule(new TimerTask() { @Override public void run() { - List urisToRecrawl = relationalUriFilter.getKnownUriFilter().getOutdatedUris(); + List urisToRecrawl = outDatedUriRetriever.getUriToRecrawl(); urisToRecrawl.forEach(uri -> queue.addUri(uriProcessor.recognizeUriType(uri))); } }, this.timerPeriod, this.timerPeriod); diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/DummyUriRetriever.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/DummyUriRetriever.java new file mode 100644 index 000000000..e473a1f12 --- /dev/null +++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/DummyUriRetriever.java @@ -0,0 +1,23 @@ +package org.dice_research.squirrel.frontier.recrawling; + +import java.util.ArrayList; +import java.util.List; + +import org.dice_research.squirrel.data.uri.CrawleableUri; + +/** + * + * A dummy implementation that returns an empty list + * + * @author Geraldo de Souza Junior - gsjunior@uni-paderborn.de + * + */ +public class DummyUriRetriever implements OutDatedUriRetriever { + + @Override + public List getUriToRecrawl() { + // TODO Auto-generated method stub + return new ArrayList (); + } + +} diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/SparqlBasedOutDatedUriRetriever.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/SparqlBasedOutDatedUriRetriever.java index d2678c7de..f991c1379 100644 --- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/SparqlBasedOutDatedUriRetriever.java +++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/SparqlBasedOutDatedUriRetriever.java @@ -1,7 +1,12 @@ package org.dice_research.squirrel.frontier.recrawling; +import java.net.URI; +import java.net.URISyntaxException; +import java.util.ArrayList; +import java.util.Calendar; +import java.util.List; + import org.aksw.jena_sparql_api.core.QueryExecutionFactory; -import 
org.aksw.jena_sparql_api.core.UpdateExecutionFactory; import org.aksw.jena_sparql_api.http.QueryExecutionFactoryHttp; import org.apache.http.auth.AuthScope; import org.apache.http.auth.Credentials; @@ -20,12 +25,7 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.net.URI; -import java.net.URISyntaxException; -import java.util.ArrayList; -import java.util.Calendar; -import java.util.List; - +@SuppressWarnings("deprecation") public class SparqlBasedOutDatedUriRetriever implements OutDatedUriRetriever{ private static final Logger LOGGER = LoggerFactory.getLogger(SparqlBasedOutDatedUriRetriever.class); @@ -45,9 +45,8 @@ public SparqlBasedOutDatedUriRetriever create(String sparqlEndpointUrl) { return create(sparqlEndpointUrl, null, null); } - public SparqlBasedOutDatedUriRetriever create(String sparqlEndpointUrl, String username, String password) { + public static SparqlBasedOutDatedUriRetriever create(String sparqlEndpointUrl, String username, String password) { QueryExecutionFactory queryExecFactory; - UpdateExecutionFactory updateExecFactory; if (username != null && password != null) { // Create the factory with the credentials final Credentials credentials = new UsernamePasswordCredentials(username, password); diff --git a/squirrel.worker/src/main/java/org/dice_research/squirrel/encoder/TripleEncoder.java b/squirrel.worker/src/main/java/org/dice_research/squirrel/encoder/TripleEncoder.java index 2ee1e036a..d67bcc018 100644 --- a/squirrel.worker/src/main/java/org/dice_research/squirrel/encoder/TripleEncoder.java +++ b/squirrel.worker/src/main/java/org/dice_research/squirrel/encoder/TripleEncoder.java @@ -1,11 +1,7 @@ package org.dice_research.squirrel.encoder; import java.io.UnsupportedEncodingException; -import java.net.URI; import java.net.URISyntaxException; -import java.net.URL; -import java.net.URLDecoder; -import java.nio.charset.StandardCharsets; import java.util.LinkedHashMap; import java.util.Map; import java.util.Map.Entry; @@ -16,7 +12,6 @@ import org.apache.jena.graph.Triple; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import org.springframework.util.MultiValueMap; /** * Class that can encode triples to UTF8 @@ -29,8 +24,6 @@ public class TripleEncoder { private static final Logger LOGGER = LoggerFactory.getLogger(TripleEncoder.class); private static TripleEncoder tripleEncoder; - private static final String DEFAULT_CHARSET = StandardCharsets.UTF_8.toString(); - private TripleEncoder() { } From 7d6ee59b23ffe0ce409c49ddf0ee0c09bbb10636 Mon Sep 17 00:00:00 2001 From: Geraldo Date: Wed, 3 Jun 2020 11:47:04 +0200 Subject: [PATCH 069/102] sparqlbean constructor arguments --- spring-config/frontier-context.xml | 4 ++-- spring-config/worker-context-sparql.out.xml | 0 spring-config/worker-context-sparql.xml | 12 ++++++++++-- 3 files changed, 12 insertions(+), 4 deletions(-) create mode 100644 spring-config/worker-context-sparql.out.xml diff --git a/spring-config/frontier-context.xml b/spring-config/frontier-context.xml index 63aa07860..7d15dfee2 100644 --- a/spring-config/frontier-context.xml +++ b/spring-config/frontier-context.xml @@ -72,10 +72,10 @@ - + - + diff --git a/spring-config/worker-context-sparql.out.xml b/spring-config/worker-context-sparql.out.xml new file mode 100644 index 000000000..e69de29bb diff --git a/spring-config/worker-context-sparql.xml b/spring-config/worker-context-sparql.xml index ca925fcec..760eeec56 100644 --- a/spring-config/worker-context-sparql.xml +++ b/spring-config/worker-context-sparql.xml @@ -38,13 +38,17 @@ + + + 
+ + --> @@ -84,12 +88,14 @@ class="org.dice_research.squirrel.fetcher.manage.SimpleOrderedFetcherManager"> + + @@ -142,6 +148,8 @@ + + Date: Thu, 4 Jun 2020 13:18:59 +0200 Subject: [PATCH 070/102] using Jena Util URIref class --- .../squirrel/encoder/TripleEncoder.java | 68 +++++++++---------- 1 file changed, 34 insertions(+), 34 deletions(-) diff --git a/squirrel.worker/src/main/java/org/dice_research/squirrel/encoder/TripleEncoder.java b/squirrel.worker/src/main/java/org/dice_research/squirrel/encoder/TripleEncoder.java index d67bcc018..edac49371 100644 --- a/squirrel.worker/src/main/java/org/dice_research/squirrel/encoder/TripleEncoder.java +++ b/squirrel.worker/src/main/java/org/dice_research/squirrel/encoder/TripleEncoder.java @@ -2,14 +2,11 @@ import java.io.UnsupportedEncodingException; import java.net.URISyntaxException; -import java.util.LinkedHashMap; -import java.util.Map; -import java.util.Map.Entry; -import org.apache.http.client.utils.URIBuilder; import org.apache.jena.graph.Node; import org.apache.jena.graph.NodeFactory; import org.apache.jena.graph.Triple; +import org.apache.jena.util.URIref; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -59,44 +56,47 @@ protected Node encodeUri(Node n) throws URISyntaxException, UnsupportedEncoding if(!n.isURI()) return n; - Map parameters = getUriParameters(n.getURI()); - - if(parameters.isEmpty()) - return n; - - String baseURI = n.toString().substring(0,n.toString().indexOf("?")); + return NodeFactory.createURI(URIref.encode(n.toString())); - URIBuilder uriBuilder = new URIBuilder(baseURI); - for(Entry param: parameters.entrySet()) - uriBuilder.addParameter(param.getKey(), param.getValue()); - - return NodeFactory.createURI(uriBuilder.toString()); +// Map parameters = getUriParameters(n.getURI()); +// +// if(parameters.isEmpty()) +// return n; +// +// String baseURI = n.toString().substring(0,n.toString().indexOf("?")); +// +// +// URIBuilder uriBuilder = new URIBuilder(baseURI); +// for(Entry param: parameters.entrySet()) +// uriBuilder.addParameter(param.getKey(), param.getValue()); +// +// return NodeFactory.createURI(uriBuilder.toString()); } - private Map getUriParameters(String uri) throws UnsupportedEncodingException { - Map mapParameters = new LinkedHashMap(); - if(uri.indexOf("?") == -1) - return mapParameters; - try { - String query = uri.substring(uri.indexOf("?") + 1); - String[] pairs = query.split("&"); - for (String pair : pairs) { - int idx = pair.indexOf("="); - mapParameters.put(pair.substring(0, idx), - pair.substring(idx + 1)); - } - }catch (IndexOutOfBoundsException e) { - return new LinkedHashMap(); - } - - return mapParameters; - } +// private Map getUriParameters(String uri) throws UnsupportedEncodingException { +// Map mapParameters = new LinkedHashMap(); +// if(uri.indexOf("?") == -1) +// return mapParameters; +// try { +// String query = uri.substring(uri.indexOf("?") + 1); +// String[] pairs = query.split("&"); +// for (String pair : pairs) { +// int idx = pair.indexOf("="); +// mapParameters.put(pair.substring(0, idx), +// pair.substring(idx + 1)); +// } +// }catch (IndexOutOfBoundsException e) { +// return new LinkedHashMap(); +// } +// +// return mapParameters; +// } public static void main(String[] args) throws UnsupportedEncodingException, URISyntaxException { TripleEncoder te = new TripleEncoder(); - System.out.println(te.encodeUri(NodeFactory.createURI("https://ckan.govdata.de/")).toString()); + System.out.println(te.encodeUri(NodeFactory.createURI("https://ckan.govdata.de?alalao 
eee")).toString()); } } From 95b0d9fe02bed761065dd9ddcf717158469fe300 Mon Sep 17 00:00:00 2001 From: Geraldo Date: Wed, 10 Jun 2020 11:27:49 +0200 Subject: [PATCH 071/102] fix frontier termination --- .../components/FrontierComponent.java | 58 +-- .../data/uri/filter/MongoDBKnowUriFilter.java | 344 +++++++++--------- 2 files changed, 210 insertions(+), 192 deletions(-) diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/components/FrontierComponent.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/components/FrontierComponent.java index 914f15d0c..2d9f5017c 100644 --- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/components/FrontierComponent.java +++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/components/FrontierComponent.java @@ -5,6 +5,7 @@ import java.io.IOException; import java.util.List; import java.util.Map; +import java.util.Map.Entry; import java.util.Timer; import java.util.TimerTask; import java.util.concurrent.Semaphore; @@ -57,13 +58,10 @@ public class FrontierComponent extends AbstractComponent implements RespondingDataHandler { private static final Logger LOGGER = LoggerFactory.getLogger(FrontierComponent.class); - private final Semaphore terminationMutex = new Semaphore(0); - private final WorkerGuard workerGuard = new WorkerGuard(this); - private final boolean doRecrawling = true; + @Qualifier("queueBean") @Autowired protected UriQueue queue; - @Qualifier("UriFilterBean") @Autowired private UriFilterComposer uriFilter; @@ -75,8 +73,7 @@ public class FrontierComponent extends AbstractComponent implements RespondingDa @Qualifier("serializerBean") @Autowired private Serializer serializer; - private long recrawlingTime = 1000L * 60L * 60L * 24L * 30; - + @Qualifier("normalizerBean") @Autowired private UriNormalizer normalizer; @@ -91,6 +88,11 @@ public class FrontierComponent extends AbstractComponent implements RespondingDa @Qualifier("listUriGenerator") private List uriGenerator; + + private final Semaphore terminationMutex = new Semaphore(0); + private final WorkerGuard workerGuard = new WorkerGuard(this); + private final boolean doRecrawling = true; + private long recrawlingTime = 1000L * 60L * 60L * 24L * 30; private Timer timerTerminator; public static final boolean RECRAWLING_ACTIVE = true; @@ -101,19 +103,21 @@ public void init() throws Exception { serializer = new GzipJavaUriSerializer(); MongoConfiguration mongoConfiguration = MongoConfiguration.getMDBConfiguration(); WebConfiguration webConfiguration = WebConfiguration.getWebConfiguration(); - + if (mongoConfiguration != null) { queue.open(); uriFilter.getKnownUriFilter().open(); + WhiteListConfiguration whiteListConfiguration = WhiteListConfiguration.getWhiteListConfiguration(); if (whiteListConfiguration != null) { File whitelistFile = new File(whiteListConfiguration.getWhiteListURI()); uriFilter.setKnownUriFilter(RegexBasedWhiteListFilter.create(uriFilter.getKnownUriFilter(), whitelistFile)); } + // TODO Reactivate me but with a different configuration // if (webConfiguration.isVisualizationOfCrawledGraphEnabled()) { // uriReferences = new RDBURIReferences(rdbHostName, rdbPort); @@ -124,16 +128,16 @@ public void init() throws Exception { queue = new InMemoryQueue(); uriFilter.setKnownUriFilter(new InMemoryKnownUriFilter(doRecrawling, recrawlingTime)); } - + // Build frontier - frontier = new ExtendedFrontierImpl(normalizer, uriFilter, uriReferences, queue,uriGenerator, doRecrawling, outDatedUriRetriever); + frontier = new 
ExtendedFrontierImpl(normalizer, uriFilter, uriReferences, queue,uriGenerator, doRecrawling,outDatedUriRetriever); rabbitQueue = this.incomingDataQueueFactory.createDefaultRabbitQueue(Constants.FRONTIER_QUEUE_NAME); receiver = (new RPCServer.Builder()).responseQueueFactory(outgoingDataQueuefactory).dataHandler(this) - .maxParallelProcessedMsgs(100).queue(rabbitQueue).build(); + .maxParallelProcessedMsgs(100).queue(rabbitQueue).build(); SeedConfiguration seedConfiguration = SeedConfiguration.getSeedConfiguration(); if (seedConfiguration != null) { @@ -146,22 +150,20 @@ public void init() throws Exception { final FrontierSenderToWebservice sender = new FrontierSenderToWebservice(outgoingDataQueuefactory, workerGuard, queue, uriFilter, uriReferences); LOGGER.trace("FrontierSenderToWebservice -> sendCrawledGraph is set to " - + webConfiguration.isVisualizationOfCrawledGraphEnabled()); + + webConfiguration.isVisualizationOfCrawledGraphEnabled()); Thread senderThread = new Thread(sender); senderThread.setName("Sender to the Webservice via RabbitMQ (current information from the Frontier)"); senderThread.start(); LOGGER.info("Started thread [" + senderThread.getName() + "] "); + + senderThread.getState() + " with the priority " + senderThread.getPriority() + ">"); } else { LOGGER.info("webConfiguration.isCommunicationWithWebserviceEnabled is set to " - + webConfiguration.isCommunicationWithWebserviceEnabled() + "/" - + webConfiguration.isVisualizationOfCrawledGraphEnabled() - + ". No WebServiceSenderThread will be started!"); + + webConfiguration.isCommunicationWithWebserviceEnabled() + "/" + + webConfiguration.isVisualizationOfCrawledGraphEnabled() + + ". No WebServiceSenderThread will be started!"); } - } - @Override public void run() throws Exception { @@ -174,8 +176,8 @@ public void close() throws IOException { timerTerminator.cancel(); if (receiver != null) // Force the receiver to close - // receiver.close(); - receiver.closeWhenFinished(); + receiver.close(); +// receiver.closeWhenFinished(); if (queue != null) queue.close(); @@ -221,7 +223,6 @@ public void handleData(byte[] data, ResponseHandler handler, String responseQueu if (deserializedData instanceof UriSetRequest) { responseToUriSetRequest(handler, responseQueueName, correlId, (UriSetRequest) deserializedData); } else if (deserializedData instanceof UriSet) { - if(timerTerminator == null) { LOGGER.info("Initializing Terminator task..."); @@ -248,17 +249,17 @@ public void handleData(byte[] data, ResponseHandler handler, String responseQueu } private void responseToUriSetRequest(ResponseHandler handler, String responseQueueName, String correlId, - UriSetRequest uriSetRequest) { + UriSetRequest uriSetRequest) { if (handler != null) { // get next UriSet try { List uris = frontier.getNextUris(); LOGGER.trace("Responding with a list of {} uris.", - uris == null ? "null" : Integer.toString(uris.size())); + uris == null ? 
"null" : Integer.toString(uris.size())); handler.sendResponse(serializer.serialize(new UriSet(uris)), responseQueueName, correlId); if (uris != null && uris.size() > 0) { workerGuard.putUrisForWorker(uriSetRequest.getWorkerId(), uriSetRequest.workerSendsAliveMessages(), - uris); + uris); } } catch (IOException e) { LOGGER.error("Couldn't serialize new URI set.", e); @@ -273,10 +274,8 @@ private List initializeDepth(List listUris){ return listUris; } - protected void processSeedFile(String seedFile) { try { - List listSeeds = initializeDepth(new UriSeedReader(seedFile).getUris()); if (!listSeeds.isEmpty()) frontier.addNewUris(listSeeds); @@ -305,7 +304,7 @@ public WorkerGuard getWorkerGuard() { return workerGuard; } - private class TerminatorTask extends TimerTask { + private static class TerminatorTask extends TimerTask { private UriQueue queue; private TerminationCheck terminationCheck = new QueueBasedTerminationCheck(); @@ -324,16 +323,17 @@ public void run() { Map mapWorkers = this.workerGuard.getMapWorkerInfo(); boolean stillHasUris = false; - for (Map.Entry entry : mapWorkers.entrySet()) { + for (Entry entry : mapWorkers.entrySet()) { if (entry.getValue().getUrisCrawling().size() > 0) { stillHasUris = true; break; } } - if(!stillHasUris && terminationCheck.shouldFrontierTerminate(queue)) { terminationMutex.release(); } } - }} + + } +} \ No newline at end of file diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/data/uri/filter/MongoDBKnowUriFilter.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/data/uri/filter/MongoDBKnowUriFilter.java index dec482436..4d14e8ebc 100644 --- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/data/uri/filter/MongoDBKnowUriFilter.java +++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/data/uri/filter/MongoDBKnowUriFilter.java @@ -22,6 +22,7 @@ import org.dice_research.squirrel.frontier.impl.FrontierImpl; import org.slf4j.Logger; import org.slf4j.LoggerFactory; + import com.mongodb.BasicDBObject; import com.mongodb.MongoClient; import com.mongodb.MongoClientOptions; @@ -33,171 +34,188 @@ import com.mongodb.client.model.Indexes; /** + * * Filter implementation for use with MongoDB - *

+ * * * @author Geraldo Souza Junior (gsjunior@mail.uni-paderborn.de) + * */ @SuppressWarnings("deprecation") public class MongoDBKnowUriFilter implements KnownUriFilter, Cloneable, Closeable, UriHashCustodian { - private static final Logger LOGGER = LoggerFactory.getLogger(MongoDBKnowUriFilter.class); - FrontierImpl frontierImpl; - - private MongoClient client; - private MongoDatabase mongoDB; - public static final String DB_NAME = "squirrel"; - private Integer recrawlEveryWeek = 60 * 60 * 24 * 7 * 1000; // in miiliseconds - public static final String COLLECTION_NAME = "knownurifilter"; - public static final String COLUMN_TIMESTAMP_LAST_CRAWL = "timestampLastCrawl"; - public static final String COLUMN_URI = "uri"; - public static final String COLUMN_CRAWLING_IN_PROCESS = "crawlingInProcess"; - public static final String COLUMN_TIMESTAMP_NEXT_CRAWL = "timestampNextCrawl"; - public static final String COLUMN_IP = "ipAddress"; - public static final String COLUMN_TYPE = "type"; - public static final String COLUMN_HASH_VALUE = "hashValue"; - private static final boolean PERSIST = System.getenv("QUEUE_FILTER_PERSIST") == null ? false : Boolean.parseBoolean(System.getenv("QUEUE_FILTER_PERSIST")); - /** - * Used as a default hash value for URIS, will be replaced by real hash value as soon as it has been computed. - */ - private static final String DUMMY_HASH_VALUE = "dummyValue"; - - public MongoDBKnowUriFilter(String hostName, Integer port) { - LOGGER.info("Filter Persistance: " + PERSIST); - MongoClientOptions.Builder optionsBuilder = MongoClientOptions.builder(); - MongoConfiguration mongoConfiguration = MongoConfiguration.getMDBConfiguration(); - if (mongoConfiguration != null && (mongoConfiguration.getConnectionTimeout() != null && mongoConfiguration.getSocketTimeout() != null && mongoConfiguration.getServerTimeout() != null)) { - optionsBuilder.connectTimeout(mongoConfiguration.getConnectionTimeout()); - optionsBuilder.socketTimeout(mongoConfiguration.getSocketTimeout()); - optionsBuilder.serverSelectionTimeout(mongoConfiguration.getServerTimeout()); - MongoClientOptions options = optionsBuilder.build(); - client = new MongoClient(new ServerAddress(hostName, port), options); - } else { - client = new MongoClient(hostName, port); - } - } - - @Override - public boolean isUriGood(CrawleableUri uri) { - MongoCursor cursor = mongoDB.getCollection(COLLECTION_NAME) - .find(new Document("uri", uri.getUri().toString())).iterator(); - if (cursor.hasNext()) { - LOGGER.debug("URI {} is not good", uri.toString()); - Document doc = cursor.next(); - Long timestampRetrieved = Long.parseLong(doc.get(COLUMN_TIMESTAMP_LAST_CRAWL).toString()); - cursor.close(); - if ((System.currentTimeMillis() - timestampRetrieved) < recrawlEveryWeek) { - return false; - } else { - return true; - } - } else { - LOGGER.debug("URI {} is good", uri.toString()); - cursor.close(); - return true; - } - - } - - @Override - public void add(CrawleableUri uri, long nextCrawlTimestamp) { - add(uri, System.currentTimeMillis(), nextCrawlTimestamp); - } - - public Document crawleableUriToMongoDocument(CrawleableUri uri) { - UriType uriType = uri.getType(); - return new Document("uri", uri.getUri().toString()).append("type", uriType.toString()); - - } - - @Override - public void close() throws IOException { - if (!PERSIST) { - mongoDB.getCollection(COLLECTION_NAME).drop(); - - } - client.close(); - } - - public void open() { - mongoDB = client.getDatabase(DB_NAME); - if (!knowUriTableExists()) { - mongoDB.createCollection(COLLECTION_NAME); 
- MongoCollection mongoCollection = mongoDB.getCollection(COLLECTION_NAME); - mongoCollection.createIndex(Indexes.compoundIndex(Indexes.ascending("uri"))); - } - } - - public boolean knowUriTableExists() { - for (String collection : mongoDB.listCollectionNames()) { - if (collection.toLowerCase().equals(COLLECTION_NAME.toLowerCase())) { - return true; - } else { - return false; - } - } - return false; - } - - @Override - public void add(CrawleableUri uri, long lastCrawlTimestamp, long nextCrawlTimestamp) { - mongoDB.getCollection(COLLECTION_NAME) - .insertOne(crawleableUriToMongoDocument(uri) - .append(COLUMN_TIMESTAMP_LAST_CRAWL, lastCrawlTimestamp) - .append(COLUMN_TIMESTAMP_NEXT_CRAWL, nextCrawlTimestamp) - .append(COLUMN_CRAWLING_IN_PROCESS, false) - .append(COLUMN_HASH_VALUE, DUMMY_HASH_VALUE) - ); - LOGGER.debug("Adding URI {} to the known uri filter list", uri.toString()); - } - - @Override - public void addHashValuesForUris(List uris) { - - } - - public void purge() { - mongoDB.getCollection(COLLECTION_NAME).drop(); - } - - @Override - public List getOutdatedUris() { - // get all uris with the following property: - // (nextCrawlTimestamp has passed) AND (crawlingInProcess==false OR lastCrawlTimestamp is 3 times older than generalRecrawlTime) - - long generalRecrawlTime = Math.max(frontierImpl.DEFAULT_GENERAL_RECRAWL_TIME, frontierImpl.getGeneralRecrawlTime()); - - Bson filter = Filters.and(Filters.eq("COLUMN_TIMESTAMP_NEXT_CRAWL", System.currentTimeMillis()), - Filters.or( - Filters.eq("COLUMN_CRAWLING_IN_PROCESS", false), - Filters.eq("COLUMN_TIMESTAMP_LAST_CRAWL", System.currentTimeMillis() - generalRecrawlTime * 3) - )); - - Iterator uriDocs = mongoDB.getCollection(COLLECTION_NAME).find(filter).iterator(); - - List urisToRecrawl = new ArrayList<>(); - while (uriDocs.hasNext()) { - try { - Document doc = uriDocs.next(); - String ipString = (String) doc.get(COLUMN_IP); - if (ipString.contains("/")) { - ipString = ipString.split("/")[1]; - } - urisToRecrawl.add(new CrawleableUri(new URI((String) doc.get(COLUMN_URI)), InetAddress.getByName(ipString))); - } catch (URISyntaxException | UnknownHostException e) { - LOGGER.warn(e.toString()); - } - } - - // mark that the uris are in process now - for (CrawleableUri uri : urisToRecrawl) { - BasicDBObject newDocument = new BasicDBObject(); - newDocument.append("$set", new BasicDBObject().append(COLUMN_CRAWLING_IN_PROCESS, true)); - BasicDBObject searchQuery = new BasicDBObject().append(COLUMN_URI, uri.getUri().toString()); - mongoDB.getCollection(COLLECTION_NAME).updateMany(searchQuery, newDocument); - - } + private static final Logger LOGGER = LoggerFactory.getLogger(MongoDBKnowUriFilter.class); + + private MongoClient client; + private MongoDatabase mongoDB; + private int max_depth; + public static final String DB_NAME = "squirrel"; + private Integer recrawlEveryWeek = 60 * 60 * 24 * 7 * 1000; // in miiliseconds + public static final String COLLECTION_NAME = "knownurifilter"; + + public static final String COLUMN_TIMESTAMP_LAST_CRAWL = "timestampLastCrawl"; + public static final String COLUMN_URI = "uri"; + public static final String COLUMN_CRAWLING_IN_PROCESS = "crawlingInProcess"; + public static final String COLUMN_TIMESTAMP_NEXT_CRAWL = "timestampNextCrawl"; + public static final String COLUMN_IP = "ipAddress"; + public static final String COLUMN_TYPE = "type"; + public static final String COLUMN_HASH_VALUE = "hashValue"; + private static final boolean PERSIST = System.getenv("QUEUE_FILTER_PERSIST") == null ? 
false + : Boolean.parseBoolean(System.getenv("QUEUE_FILTER_PERSIST")); + /** + * Used as a default hash value for URIS, will be replaced by real hash value as + * soon as it has been computed. + */ + private static final String DUMMY_HASH_VALUE = "dummyValue"; + + public MongoDBKnowUriFilter(String hostName, Integer port) { + + LOGGER.info("Filter Persistance: " + PERSIST); + + MongoClientOptions.Builder optionsBuilder = MongoClientOptions.builder(); + MongoConfiguration mongoConfiguration = MongoConfiguration.getMDBConfiguration(); + + if (mongoConfiguration != null && (mongoConfiguration.getConnectionTimeout() != null + && mongoConfiguration.getSocketTimeout() != null && mongoConfiguration.getServerTimeout() != null)) { + optionsBuilder.connectTimeout(mongoConfiguration.getConnectionTimeout()); + optionsBuilder.socketTimeout(mongoConfiguration.getSocketTimeout()); + optionsBuilder.serverSelectionTimeout(mongoConfiguration.getServerTimeout()); + + MongoClientOptions options = optionsBuilder.build(); + + client = new MongoClient(new ServerAddress(hostName, port), options); + + } else { + client = new MongoClient(hostName, port); + } + } + + @Override + public boolean isUriGood(CrawleableUri uri) { + MongoCursor cursor = mongoDB.getCollection(COLLECTION_NAME) + .find(new Document("uri", uri.getUri().toString())).iterator(); + + if (cursor.hasNext()) { + LOGGER.debug("URI {} is not good", uri.toString()); + Document doc = cursor.next(); + Long timestampRetrieved = Long.parseLong(doc.get(COLUMN_TIMESTAMP_LAST_CRAWL).toString()); + cursor.close(); + if ((System.currentTimeMillis() - timestampRetrieved) < recrawlEveryWeek) { + return false; + } else { + return true; + } + } else { + + LOGGER.debug("URI {} is good", uri.toString()); + cursor.close(); + return true; + } + + } + + @Override + public void add(CrawleableUri uri, long nextCrawlTimestamp) { + add(uri, System.currentTimeMillis(), nextCrawlTimestamp); + } + + public Document crawleableUriToMongoDocument(CrawleableUri uri) { + + UriType uriType = uri.getType(); + + return new Document("uri", uri.getUri().toString()).append("type", uriType.toString()); + + } + + @Override + public void close() throws IOException { + if (!PERSIST) { + mongoDB.getCollection(COLLECTION_NAME).drop(); + + } + client.close(); + } + + public void open() { + mongoDB = client.getDatabase(DB_NAME); + if (!knowUriTableExists()) { + mongoDB.createCollection(COLLECTION_NAME); + MongoCollection mongoCollection = mongoDB.getCollection(COLLECTION_NAME); + mongoCollection.createIndex(Indexes.compoundIndex(Indexes.ascending("uri"))); + } + } + + public boolean knowUriTableExists() { + for (String collection : mongoDB.listCollectionNames()) { + if (collection.toLowerCase().equals(COLLECTION_NAME.toLowerCase())) { + return true; + } else { + return false; + } + } + return false; + } + + @Override + public void add(CrawleableUri uri, long lastCrawlTimestamp, long nextCrawlTimestamp) { + mongoDB.getCollection(COLLECTION_NAME) + .insertOne(crawleableUriToMongoDocument(uri).append(COLUMN_TIMESTAMP_LAST_CRAWL, lastCrawlTimestamp) + .append(COLUMN_TIMESTAMP_NEXT_CRAWL, nextCrawlTimestamp) + .append(COLUMN_CRAWLING_IN_PROCESS, false).append(COLUMN_HASH_VALUE, DUMMY_HASH_VALUE)); + LOGGER.debug("Adding URI {} to the known uri filter list", uri.toString()); + } + + @Override + public void addHashValuesForUris(List uris) { + + } + + public void purge() { + mongoDB.getCollection(COLLECTION_NAME).drop(); + } + + @Override + public List getOutdatedUris() { + // get all uris with the 
following property: + // (nextCrawlTimestamp has passed) AND (crawlingInProcess==false OR + // lastCrawlTimestamp is 3 times older than generalRecrawlTime) + + long generalRecrawlTime = Math.max(FrontierImpl.DEFAULT_GENERAL_RECRAWL_TIME, + FrontierImpl.getGeneralRecrawlTime()); + + Bson filter = Filters.and(Filters.eq("COLUMN_TIMESTAMP_NEXT_CRAWL", System.currentTimeMillis()), Filters.or( + Filters.eq("COLUMN_CRAWLING_IN_PROCESS", false), + Filters.eq("COLUMN_TIMESTAMP_LAST_CRAWL", System.currentTimeMillis() - generalRecrawlTime * 3))); + + Iterator uriDocs = mongoDB.getCollection(COLLECTION_NAME).find(filter).iterator(); + + List urisToRecrawl = new ArrayList<>(); + while (uriDocs.hasNext()) { + try { + Document doc = uriDocs.next(); + String ipString = (String) doc.get(COLUMN_IP); + if (ipString.contains("/")) { + ipString = ipString.split("/")[1]; + } + urisToRecrawl + .add(new CrawleableUri(new URI((String) doc.get(COLUMN_URI)), InetAddress.getByName(ipString))); + } catch (URISyntaxException | UnknownHostException e) { + LOGGER.warn(e.toString()); + } + } + + // mark that the uris are in process now + for (CrawleableUri uri : urisToRecrawl) { + + BasicDBObject newDocument = new BasicDBObject(); + newDocument.append("$set", new BasicDBObject().append(COLUMN_CRAWLING_IN_PROCESS, true)); + + BasicDBObject searchQuery = new BasicDBObject().append(COLUMN_URI, uri.getUri().toString()); + + mongoDB.getCollection(COLLECTION_NAME).updateMany(searchQuery, newDocument); + + } // cursor.close(); return urisToRecrawl; @@ -209,10 +227,10 @@ public long count() { return 0; } - @Override - public Set getUrisWithSameHashValues(Set hashValuesForComparison) { - // TODO Auto-generated method stub - return null; - } + @Override + public Set getUrisWithSameHashValues(Set hashValuesForComparison) { + // TODO Auto-generated method stub + return null; + } -} +} \ No newline at end of file From fa52133e7f0f38ce2a3a6b5ec053851e852a753b Mon Sep 17 00:00:00 2001 From: Geraldo Date: Mon, 29 Jun 2020 10:50:08 +0200 Subject: [PATCH 072/102] close version 0.4 --- pom.xml | 2 +- squirrel.api/pom.xml | 2 +- squirrel.deduplication/pom.xml | 2 +- squirrel.frontier/pom.xml | 2 +- squirrel.mockup/pom.xml | 2 +- squirrel.web-api/pom.xml | 2 +- squirrel.web/pom.xml | 2 +- squirrel.worker/pom.xml | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/pom.xml b/pom.xml index 33016377b..c120a82a1 100644 --- a/pom.xml +++ b/pom.xml @@ -5,7 +5,7 @@ 4.0.0 org.dice-research squirrel - 0.5.0 + 0.4.0 pom 2017 Squirrel diff --git a/squirrel.api/pom.xml b/squirrel.api/pom.xml index 7d3f9f061..cb1c2263f 100644 --- a/squirrel.api/pom.xml +++ b/squirrel.api/pom.xml @@ -6,7 +6,7 @@ org.dice-research squirrel - 0.5.0 + 0.4.0 squirrel.api jar diff --git a/squirrel.deduplication/pom.xml b/squirrel.deduplication/pom.xml index 87ac2fd69..6ed00c730 100644 --- a/squirrel.deduplication/pom.xml +++ b/squirrel.deduplication/pom.xml @@ -6,7 +6,7 @@ org.dice-research squirrel - 0.5.0 + 0.4.0 squirrel.deduplication jar diff --git a/squirrel.frontier/pom.xml b/squirrel.frontier/pom.xml index 147e1ac18..de96f883d 100644 --- a/squirrel.frontier/pom.xml +++ b/squirrel.frontier/pom.xml @@ -6,7 +6,7 @@ org.dice-research squirrel - 0.5.0 + 0.4.0 squirrel.frontier jar diff --git a/squirrel.mockup/pom.xml b/squirrel.mockup/pom.xml index 781985e14..756e3e0d0 100644 --- a/squirrel.mockup/pom.xml +++ b/squirrel.mockup/pom.xml @@ -6,7 +6,7 @@ org.dice-research squirrel - 0.5.0 + 0.4.0 squirrel.mockup jar diff --git a/squirrel.web-api/pom.xml 
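The comment in this hunk describes the selection as: the next-crawl timestamp has passed, AND (the URI is not currently being crawled OR its last crawl is more than three generalRecrawlTime periods old). A sketch of that condition expressed with the column constants and range operators of the MongoDB driver follows; note this is an assumed reading of the comment, since the code in the hunk itself matches the literal strings "COLUMN_..." with Filters.eq.

    import org.bson.conversions.Bson;

    import com.mongodb.client.model.Filters;

    public class OutdatedUriFilterSketch {

        // field names as defined by the constants in MongoDBKnowUriFilter
        static final String TIMESTAMP_NEXT_CRAWL = "timestampNextCrawl";
        static final String TIMESTAMP_LAST_CRAWL = "timestampLastCrawl";
        static final String CRAWLING_IN_PROCESS = "crawlingInProcess";

        public static Bson outdatedUris(long now, long generalRecrawlTime) {
            return Filters.and(
                    // nextCrawlTimestamp has passed
                    Filters.lte(TIMESTAMP_NEXT_CRAWL, now),
                    Filters.or(
                            // not currently being crawled ...
                            Filters.eq(CRAWLING_IN_PROCESS, false),
                            // ... or the last crawl is at least 3 * generalRecrawlTime ago
                            Filters.lte(TIMESTAMP_LAST_CRAWL, now - 3 * generalRecrawlTime)));
        }
    }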
b/squirrel.web-api/pom.xml index 30ccf64e5..e19483e9e 100644 --- a/squirrel.web-api/pom.xml +++ b/squirrel.web-api/pom.xml @@ -6,7 +6,7 @@ org.dice-research squirrel - 0.5.0 + 0.4.0 squirrel.web-api diff --git a/squirrel.web/pom.xml b/squirrel.web/pom.xml index 9b4375395..3f9b53b79 100644 --- a/squirrel.web/pom.xml +++ b/squirrel.web/pom.xml @@ -6,7 +6,7 @@ org.dice-research squirrel - 0.5.0 + 0.4.0 squirrel.web jar diff --git a/squirrel.worker/pom.xml b/squirrel.worker/pom.xml index ff0431b0a..7fd7eca72 100644 --- a/squirrel.worker/pom.xml +++ b/squirrel.worker/pom.xml @@ -7,7 +7,7 @@ org.dice-research squirrel - 0.5.0 + 0.4.0 squirrel.worker jar From f93f1761ae4dd7f8c350b5dbb8958edcd2360dc1 Mon Sep 17 00:00:00 2001 From: Geraldo Date: Mon, 29 Jun 2020 17:19:56 +0200 Subject: [PATCH 073/102] triple encoder test --- .../squirrel/encoder/TripleEncoderTest.java | 66 +++++++++++++++++++ 1 file changed, 66 insertions(+) create mode 100644 squirrel.worker/src/test/java/org/dice_research/squirrel/encoder/TripleEncoderTest.java diff --git a/squirrel.worker/src/test/java/org/dice_research/squirrel/encoder/TripleEncoderTest.java b/squirrel.worker/src/test/java/org/dice_research/squirrel/encoder/TripleEncoderTest.java new file mode 100644 index 000000000..53cbddc2c --- /dev/null +++ b/squirrel.worker/src/test/java/org/dice_research/squirrel/encoder/TripleEncoderTest.java @@ -0,0 +1,66 @@ +package org.dice_research.squirrel.encoder; + +import java.util.ArrayList; +import java.util.List; + +import org.apache.jena.graph.NodeFactory; +import org.apache.jena.graph.Triple; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; + + +/** + * + * Test for Triple Encoder, available on the Abstract Analyzer + * + * @author Geraldo de Souza Junior - gsjunior@mail.uni-paderborn.de + * + */ + +public class TripleEncoderTest { + + private List listUncodedTriples; + private List listExpectedTriples; + + @Before + public void createUris() { + listUncodedTriples = new ArrayList(); + listExpectedTriples = new ArrayList(); + + for (int i = 0; i < 10; i++) { + String s = "http://dice-research.org/Squirrel/?triple= " + i + " statement " + i; + String p = "http://dice-research.org/Squirrel/predicate" + i; + String o = "http://dice-research.org/Squirrel/?triple= " + i + " object " + i; + + Triple tu = new Triple(NodeFactory.createURI(s), NodeFactory.createURI(p), NodeFactory.createURI(o)); + + listUncodedTriples.add(tu); + + s = "http://dice-research.org/Squirrel/?triple=%20" + i + "%20statement%20" + i; + p = "http://dice-research.org/Squirrel/predicate" + i; + o = "http://dice-research.org/Squirrel/?triple=%20" + i + "%20object%20" + i; + + Triple te = new Triple(NodeFactory.createURI(s), NodeFactory.createURI(p), NodeFactory.createURI(o)); + + listExpectedTriples.add(te); + + } + + } + + @Test + public void testEncoding() { + + TripleEncoder encoder = TripleEncoder.getInstance(); + + for (int i = 0; i < 10; i++) { + System.out.println(encoder.encodeTriple(listUncodedTriples.get(i))); + System.out.println(listExpectedTriples.get(i)); + + Assert.assertEquals(listExpectedTriples.get(i), encoder.encodeTriple(listUncodedTriples.get(i))); + } + + } + +} From 914558ed732f2ce06564eb04cf3264186fe4fa32 Mon Sep 17 00:00:00 2001 From: Geraldo Date: Mon, 29 Jun 2020 17:35:29 +0200 Subject: [PATCH 074/102] commented outputs print --- .../org/dice_research/squirrel/encoder/TripleEncoderTest.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/squirrel.worker/src/test/java/org/dice_research/squirrel/encoder/TripleEncoderTest.java b/squirrel.worker/src/test/java/org/dice_research/squirrel/encoder/TripleEncoderTest.java index 53cbddc2c..6ec010a1a 100644 --- a/squirrel.worker/src/test/java/org/dice_research/squirrel/encoder/TripleEncoderTest.java +++ b/squirrel.worker/src/test/java/org/dice_research/squirrel/encoder/TripleEncoderTest.java @@ -55,8 +55,8 @@ public void testEncoding() { TripleEncoder encoder = TripleEncoder.getInstance(); for (int i = 0; i < 10; i++) { - System.out.println(encoder.encodeTriple(listUncodedTriples.get(i))); - System.out.println(listExpectedTriples.get(i)); +// System.out.println(encoder.encodeTriple(listUncodedTriples.get(i))); +// System.out.println(listExpectedTriples.get(i)); Assert.assertEquals(listExpectedTriples.get(i), encoder.encodeTriple(listUncodedTriples.get(i))); } From 69417231d8e647bade61de7fa17ef422db6eef64 Mon Sep 17 00:00:00 2001 From: Geraldo S Jr Date: Mon, 29 Jun 2020 17:39:29 +0200 Subject: [PATCH 075/102] Update pom.xml reduced version --- squirrel.reports/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/squirrel.reports/pom.xml b/squirrel.reports/pom.xml index 2e4486f0f..6450a0ebe 100644 --- a/squirrel.reports/pom.xml +++ b/squirrel.reports/pom.xml @@ -6,7 +6,7 @@ org.dice-research squirrel - 0.5.0 + 0.4.0 squirrel.reports From b3e7004d3fbb174dd5bfa2776dbfca828dae5c52 Mon Sep 17 00:00:00 2001 From: Geraldo Date: Wed, 8 Jul 2020 16:38:45 +0200 Subject: [PATCH 076/102] removed unused import --- .../java/org/dice_research/squirrel/data/uri/CrawleableUri.java | 1 - 1 file changed, 1 deletion(-) diff --git a/squirrel.api/src/main/java/org/dice_research/squirrel/data/uri/CrawleableUri.java b/squirrel.api/src/main/java/org/dice_research/squirrel/data/uri/CrawleableUri.java index a20148f7b..e9af597de 100644 --- a/squirrel.api/src/main/java/org/dice_research/squirrel/data/uri/CrawleableUri.java +++ b/squirrel.api/src/main/java/org/dice_research/squirrel/data/uri/CrawleableUri.java @@ -1,6 +1,5 @@ package org.dice_research.squirrel.data.uri; -import org.apache.jena.rdf.model.Property; import org.slf4j.Logger; import org.slf4j.LoggerFactory; From 11b7af1a33b2dab89c36a861d915ff1e3f66cb89 Mon Sep 17 00:00:00 2001 From: Geraldo Date: Wed, 8 Jul 2020 17:39:23 +0200 Subject: [PATCH 077/102] updated javadoc --- .../squirrel/data/uri/filter/UriFilterComposer.java | 9 +++++++++ .../data/uri/filter/UriFilterConfigurator.java | 13 ++++++++++++- 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/squirrel.api/src/main/java/org/dice_research/squirrel/data/uri/filter/UriFilterComposer.java b/squirrel.api/src/main/java/org/dice_research/squirrel/data/uri/filter/UriFilterComposer.java index 0c358c81a..b2a6774bc 100644 --- a/squirrel.api/src/main/java/org/dice_research/squirrel/data/uri/filter/UriFilterComposer.java +++ b/squirrel.api/src/main/java/org/dice_research/squirrel/data/uri/filter/UriFilterComposer.java @@ -1,5 +1,14 @@ package org.dice_research.squirrel.data.uri.filter; +/** + * + * This class represents a composition of two or more filters, + * requiring at least one @link {org.dice_research.squirrel.data.uri.filter.KnownUriFilter} + * + * * @author Geraldo de Souza Junior (gsjunior@mail.uni-paderborn.de) + * + */ + public interface UriFilterComposer extends UriFilter { public KnownUriFilter getKnownUriFilter(); diff --git a/squirrel.api/src/main/java/org/dice_research/squirrel/data/uri/filter/UriFilterConfigurator.java 
b/squirrel.api/src/main/java/org/dice_research/squirrel/data/uri/filter/UriFilterConfigurator.java index 086140070..994190abc 100644 --- a/squirrel.api/src/main/java/org/dice_research/squirrel/data/uri/filter/UriFilterConfigurator.java +++ b/squirrel.api/src/main/java/org/dice_research/squirrel/data/uri/filter/UriFilterConfigurator.java @@ -7,7 +7,18 @@ /** * - * Relational Uri Filter for the AND and OR operators + * This class represents a relational URI filter for the AND and OR operators. + * It is possible to compose two or more filters and use them with relational operators. + * + * The constructor requires at least one {@link org.dice_research.squirrel.data.uri.filter.KnownUriFilter}, + * a list of {@link org.dice_research.squirrel.data.uri.filter.UriFilter} + * and the operator. The operator can be AND or OR. + * + * If the operator is AND, the isUriGood(CrawleableUri) method will return true only if + * all the UriFilters return true in their respective isUriGood(CrawleableUri) methods. + * + * If the operator is OR, the isUriGood(CrawleableUri) method will return true if at least one of them returns true. + * * @author Geraldo de Souza Junior (gsjunior@mail.uni-paderborn.de) * From 5a79b24ffbdcef9f2b80fc816eb002b4af69a901 Mon Sep 17 00:00:00 2001 From: Geraldo Date: Wed, 8 Jul 2020 17:42:20 +0200 Subject: [PATCH 078/102] updated javadoc --- .../squirrel/data/uri/filter/UriFilterComposer.java | 11 +++++++++ 1 file changed, 11 insertions(+) diff --git a/squirrel.api/src/main/java/org/dice_research/squirrel/data/uri/filter/UriFilterComposer.java b/squirrel.api/src/main/java/org/dice_research/squirrel/data/uri/filter/UriFilterComposer.java index b2a6774bc..25344a135 100644 --- a/squirrel.api/src/main/java/org/dice_research/squirrel/data/uri/filter/UriFilterComposer.java +++ b/squirrel.api/src/main/java/org/dice_research/squirrel/data/uri/filter/UriFilterComposer.java @@ -11,8 +11,19 @@ public interface UriFilterComposer extends UriFilter { + /** + * Returns the KnownUriFilter of this {@link UriFilterComposer}. + * + * @return KnownUriFilter + */ public KnownUriFilter getKnownUriFilter(); + + /** + * Sets the KnownUriFilter for this {@link UriFilterComposer}. + * + * @param knownUriFilter the KnownUriFilter to use. + */ public void setKnownUriFilter(KnownUriFilter knownUriFilter); } From 37fff06523f32cbd8940e29de9c3b19dceffbdff Mon Sep 17 00:00:00 2001 From: Geraldo Date: Wed, 8 Jul 2020 17:43:53 +0200 Subject: [PATCH 079/102] removed unused import --- .../squirrel/queue/domainbased/MongoDBDomainBasedQueue.java | 1 - 1 file changed, 1 deletion(-) diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/queue/domainbased/MongoDBDomainBasedQueue.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/queue/domainbased/MongoDBDomainBasedQueue.java index 82df936a4..f024bf83f 100644 --- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/queue/domainbased/MongoDBDomainBasedQueue.java +++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/queue/domainbased/MongoDBDomainBasedQueue.java @@ -12,7 +12,6 @@ import org.dice_research.squirrel.data.uri.serialize.Serializer; import org.dice_research.squirrel.data.uri.serialize.java.SnappyJavaUriSerializer; import org.dice_research.squirrel.queue.AbstractDomainBasedQueue; -import org.rdfhdt.hdt.util.Histogram; import org.slf4j.Logger; import org.slf4j.LoggerFactory; From ed4057fecc7c4c483630d9ae107d81e6956d8169 Mon Sep 17 00:00:00 2001 From: Geraldo Date: Wed, 8 Jul 2020 18:15:37 +0200 Subject: [PATCH 080/102] javadoc added --- 
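To make the AND/OR semantics documented above concrete, here is a minimal sketch of how a composed filter evaluates its parts; the types are reduced stand-ins, not the real Squirrel interfaces, which operate on CrawleableUri instead of plain strings.

    import java.util.List;

    public class UriFilterCompositionSketch {

        /** Reduced stand-in for org.dice_research.squirrel.data.uri.filter.UriFilter. */
        interface UriFilter {
            boolean isUriGood(String uri);
        }

        enum Operator { AND, OR }

        static boolean isUriGood(List<UriFilter> filters, Operator operator, String uri) {
            if (operator == Operator.AND) {
                // AND: every filter has to accept the URI
                return filters.stream().allMatch(f -> f.isUriGood(uri));
            }
            // OR: a single accepting filter is enough
            return filters.stream().anyMatch(f -> f.isUriGood(uri));
        }
    }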
 .../squirrel/encoder/TripleEncoder.java | 21 +++++++++++++------
 1 file changed, 15 insertions(+), 6 deletions(-)

diff --git a/squirrel.worker/src/main/java/org/dice_research/squirrel/encoder/TripleEncoder.java b/squirrel.worker/src/main/java/org/dice_research/squirrel/encoder/TripleEncoder.java
index edac49371..a12fc6764 100644
--- a/squirrel.worker/src/main/java/org/dice_research/squirrel/encoder/TripleEncoder.java
+++ b/squirrel.worker/src/main/java/org/dice_research/squirrel/encoder/TripleEncoder.java
@@ -11,7 +11,10 @@ import org.slf4j.LoggerFactory;

 /**
- * Class that can encode triples to UTF8
+ * Class that can encode triples.
+ *
+ * The encodeTriple method escapes the triple's resource and object following the
+ * Jena escaping rules present in {@link NodeFactory}.
  *
  * @author Geraldo de Souza Junior gsjunior@mail.uni-paderborn.de
  *
@@ -31,6 +34,15 @@ public static final TripleEncoder getInstance() {
         return tripleEncoder;
     }

+    /**
+     * Method that encodes a triple based on the Jena escaping rules.
+     *
+     * @param t
+     *            the triple that will be encoded
+     * @return the escaped triple
+     */
     public Triple encodeTriple(Triple t) {
         Node s = t.getSubject();
@@ -52,7 +64,7 @@ public Triple encodeTriple(Triple t) {

     }

-    protected Node encodeUri(Node n) throws URISyntaxException, UnsupportedEncodingException {
+    private Node encodeUri(Node n) throws URISyntaxException, UnsupportedEncodingException {
         if(!n.isURI())
             return n;
@@ -94,9 +106,6 @@ protected Node encodeUri(Node n) throws URISyntaxException, UnsupportedEncoding
 //        return mapParameters;
 //    }

-    public static void main(String[] args) throws UnsupportedEncodingException, URISyntaxException {
-        TripleEncoder te = new TripleEncoder();
-        System.out.println(te.encodeUri(NodeFactory.createURI("https://ckan.govdata.de?alalao eee")).toString());
-    }
+
 }

From 26e2eeeaecd4d3ca0e7cab90e4734f5ffc48c318 Mon Sep 17 00:00:00 2001
From: Stefan Birkner
Date: Thu, 9 Jul 2020 21:42:55 +0200
Subject: [PATCH 081/102] Fix SimpleHttpServerComponentStarter

The EnvironmentVariables rule can only be used within a JUnit 4 test
class because it needs JUnit 4 for managing its lifecycle. System Lambda
is a replacement for System Rules that does not need JUnit 4 anymore.
Therefore we can use it in SimpleHttpServerComponentStarter.
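For context, the System Rules API being replaced relies on JUnit 4 running the rule, roughly like this (a minimal sketch of the old pattern, assuming a JUnit 4 test class; the class and test names are invented):

import org.junit.Rule;
import org.junit.Test;
import org.junit.contrib.java.lang.system.EnvironmentVariables;

public class EnvironmentVariablesRuleSketch {

    // JUnit 4 runs the rule around each test method and restores the
    // original environment afterwards; outside a test class (e.g. in a
    // plain main method) nothing manages this lifecycle.
    @Rule
    public final EnvironmentVariables environmentVariables = new EnvironmentVariables();

    @Test
    public void readsVariable() {
        environmentVariables.set("PORT", "8080");
        // code under test can now read PORT from the environment
    }
}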
---
 pom.xml                                       |  6 ++---
 squirrel.mockup/pom.xml                       |  5 ++---
 .../SimpleHttpServerComponentStarter.java     | 22 +++++++++----------
 3 files changed, 16 insertions(+), 17 deletions(-)

diff --git a/pom.xml b/pom.xml
index a88909b56..514118a97 100644
--- a/pom.xml
+++ b/pom.xml
@@ -345,11 +345,11 @@
       <version>4.12</version>
       <scope>test</scope>
-
+
       <groupId>com.github.stefanbirkner</groupId>
-      <artifactId>system-rules</artifactId>
-      <scope>test</scope>
+      <artifactId>system-lambda</artifactId>
+      <version>1.0.0</version>

diff --git a/squirrel.mockup/pom.xml b/squirrel.mockup/pom.xml
index 781985e14..e1db724cc 100644
--- a/squirrel.mockup/pom.xml
+++ b/squirrel.mockup/pom.xml
@@ -25,11 +25,10 @@
       <scope>compile</scope>
-
+
       <groupId>com.github.stefanbirkner</groupId>
-      <artifactId>system-rules</artifactId>
-      <version>1.17.0</version>
+      <artifactId>system-lambda</artifactId>
       <scope>test</scope>

diff --git a/squirrel.mockup/src/test/java/org/dice_research/squirrel/configurator/SimpleHttpServerComponentStarter.java b/squirrel.mockup/src/test/java/org/dice_research/squirrel/configurator/SimpleHttpServerComponentStarter.java
index 71c9f9e44..fd10e40c5 100644
--- a/squirrel.mockup/src/test/java/org/dice_research/squirrel/configurator/SimpleHttpServerComponentStarter.java
+++ b/squirrel.mockup/src/test/java/org/dice_research/squirrel/configurator/SimpleHttpServerComponentStarter.java
@@ -2,20 +2,20 @@
 import org.hobbit.core.run.ComponentStarter;
 import org.junit.Ignore;
-import org.junit.contrib.java.lang.system.EnvironmentVariables;
+
+import static com.github.stefanbirkner.systemlambda.SystemLambda.withEnvironmentVariable;

 @Ignore
 public class SimpleHttpServerComponentStarter {
-    public static final EnvironmentVariables environmentVariables = new EnvironmentVariables();
-
-    public static void main(String[] args) {
-        environmentVariables.set("RESOURCE_MODEL", "../deployment/scenarios/2/nodeA.ttl");
-        environmentVariables.set("ROBOTS_TXT", "../deployment/scenarios/2/robotsA.txt");
-        environmentVariables.set("RESOURCE_MODEL_LANG", "N3");
-        environmentVariables.set("PORT", "8080");
-        environmentVariables.set("USE_DEREF", "true");
-
-        ComponentStarter.main(new String[] { "org.dice_research.squirrel.components.SimpleHttpServerComponent" });
+    public static void main(String[] args) throws Exception {
+        withEnvironmentVariable("RESOURCE_MODEL", "../deployment/scenarios/2/nodeA.ttl")
+            .and("ROBOTS_TXT", "../deployment/scenarios/2/robotsA.txt")
+            .and("RESOURCE_MODEL_LANG", "N3")
+            .and("PORT", "8080")
+            .and("USE_DEREF", "true")
+            .execute(() ->
+                ComponentStarter.main(new String[] { "org.dice_research.squirrel.components.SimpleHttpServerComponent" })
+            );
     }
 }

From 68841c0a6eb17216c4c541c36aa73a6949ab54a5 Mon Sep 17 00:00:00 2001
From: Micha
Date: Fri, 10 Jul 2020 13:49:37 +0200
Subject: [PATCH 082/102] Replaced static import.
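A minimal sketch of the difference (hypothetical call site; both forms are legal Java, the qualified one keeps the origin of the method visible at every call):

import com.github.stefanbirkner.systemlambda.SystemLambda;

public class ImportStyleSketch {
    public static void main(String[] args) throws Exception {
        // With a static import the call would read withEnvironmentVariable(...),
        // hiding which class it comes from. The qualified form used below names it.
        SystemLambda.withEnvironmentVariable("PORT", "8080")
            .execute(() -> System.out.println(System.getenv("PORT")));
    }
}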
--- pom.xml | 2 +- .../configurator/SimpleHttpServerComponentStarter.java | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pom.xml b/pom.xml index 244fcbb25..6a643a3ba 100644 --- a/pom.xml +++ b/pom.xml @@ -349,7 +349,7 @@ com.github.stefanbirkner system-lambda - 1.0.0 + 1.0.0 diff --git a/squirrel.mockup/src/test/java/org/dice_research/squirrel/configurator/SimpleHttpServerComponentStarter.java b/squirrel.mockup/src/test/java/org/dice_research/squirrel/configurator/SimpleHttpServerComponentStarter.java index fd10e40c5..b8c154563 100644 --- a/squirrel.mockup/src/test/java/org/dice_research/squirrel/configurator/SimpleHttpServerComponentStarter.java +++ b/squirrel.mockup/src/test/java/org/dice_research/squirrel/configurator/SimpleHttpServerComponentStarter.java @@ -3,13 +3,13 @@ import org.hobbit.core.run.ComponentStarter; import org.junit.Ignore; -import static com.github.stefanbirkner.systemlambda.SystemLambda.withEnvironmentVariable; +import com.github.stefanbirkner.systemlambda.SystemLambda; @Ignore public class SimpleHttpServerComponentStarter { public static void main(String[] args) throws Exception { - withEnvironmentVariable("RESOURCE_MODEL", "../deployment/scenarios/2/nodeA.ttl") + SystemLambda.withEnvironmentVariable("RESOURCE_MODEL", "../deployment/scenarios/2/nodeA.ttl") .and("ROBOTS_TXT", "../deployment/scenarios/2/robotsA.txt") .and("RESOURCE_MODEL_LANG", "N3") .and("PORT", "8080") From 0629a891bd5237f1ef4b2a07e03fbbfdeae14601 Mon Sep 17 00:00:00 2001 From: Geraldo Date: Tue, 14 Jul 2020 11:59:36 +0200 Subject: [PATCH 083/102] code quality review --- .../squirrel/vocab/Squirrel.java | 20 ++++++++++--------- .../squirrel/data/uri/filter/DepthFilter.java | 11 +++++----- .../data/uri/filter/MongoDBKnowUriFilter.java | 2 -- .../squirrel/frontier/impl/FrontierImpl.java | 2 +- .../recrawling/OutDatedUriRetriever.java | 4 ---- .../SparqlBasedOutDatedUriRetriever.java | 4 ++-- .../frontier/impl/FrontierImplTest.java | 10 +--------- 7 files changed, 20 insertions(+), 33 deletions(-) diff --git a/squirrel.api/src/main/java/org/dice_research/squirrel/vocab/Squirrel.java b/squirrel.api/src/main/java/org/dice_research/squirrel/vocab/Squirrel.java index 4810edfa2..4e8dfb38d 100644 --- a/squirrel.api/src/main/java/org/dice_research/squirrel/vocab/Squirrel.java +++ b/squirrel.api/src/main/java/org/dice_research/squirrel/vocab/Squirrel.java @@ -11,6 +11,16 @@ public class Squirrel { * The namespace of the vocabulary as a string */ public static final String uri = Constants.SQUIRREL_URI_PREFIX + "/vocab#"; + + public static final Resource ResultGraph = resource("ResultGraph"); + public static final Resource ResultFile = resource("ResultFile"); + + public static final Property approxNumberOfTriples = property("approxNumberOfTriples"); + public static final Property depth = property("depth"); + public static final Property crawled = property("crawled"); + public static final Property uriHostedOn = property("uriHostedOn"); + public static final Property status = property("status"); + public static final Property containsDataOf = property("containsDataOf"); /** * returns the URI for this schema @@ -29,14 +39,6 @@ protected static final Property property(String local) { return ResourceFactory.createProperty(uri, local); } - public static final Resource ResultGraph = resource("ResultGraph"); - public static final Resource ResultFile = resource("ResultFile"); - - public static final Property approxNumberOfTriples = property("approxNumberOfTriples"); - public static final Property 
depth = property("depth"); - public static final Property crawled = property("crawled"); - public static final Property uriHostedOn = property("uriHostedOn"); - public static final Property status = property("status"); - public static final Property containsDataOf = property("containsDataOf"); + } diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/data/uri/filter/DepthFilter.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/data/uri/filter/DepthFilter.java index 4e145231a..beed26f5f 100644 --- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/data/uri/filter/DepthFilter.java +++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/data/uri/filter/DepthFilter.java @@ -9,6 +9,8 @@ * * Depth Filter implementation * + * Check if the uriIsGood based on the current depth level + * * * @author Geraldo Souza Junior (gsjunior@mail.uni-paderborn.de) * */ @@ -17,12 +19,12 @@ public class DepthFilter implements UriFilter { private static final Logger LOGGER = LoggerFactory.getLogger(MongoDBKnowUriFilter.class); - + /** + * The maximum depth allowed for the filter + */ private int max_depth; - - public DepthFilter(int max_depth) { this.max_depth = max_depth; } @@ -48,9 +50,6 @@ public boolean isUriGood(CrawleableUri uri) { } - public void purge() { - - } @Override public void add(CrawleableUri uri) { diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/data/uri/filter/MongoDBKnowUriFilter.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/data/uri/filter/MongoDBKnowUriFilter.java index 4d14e8ebc..db2d2a272 100644 --- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/data/uri/filter/MongoDBKnowUriFilter.java +++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/data/uri/filter/MongoDBKnowUriFilter.java @@ -13,7 +13,6 @@ import org.bson.Document; import org.bson.conversions.Bson; -import org.dice_research.squirrel.Constants; import org.dice_research.squirrel.configurator.MongoConfiguration; import org.dice_research.squirrel.data.uri.CrawleableUri; import org.dice_research.squirrel.data.uri.UriType; @@ -48,7 +47,6 @@ public class MongoDBKnowUriFilter implements KnownUriFilter, Cloneable, Closeabl private MongoClient client; private MongoDatabase mongoDB; - private int max_depth; public static final String DB_NAME = "squirrel"; private Integer recrawlEveryWeek = 60 * 60 * 24 * 7 * 1000; // in miiliseconds public static final String COLLECTION_NAME = "knownurifilter"; diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/FrontierImpl.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/FrontierImpl.java index a4d46356a..3af84e371 100644 --- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/FrontierImpl.java +++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/FrontierImpl.java @@ -128,7 +128,7 @@ public class FrontierImpl implements Frontier { public FrontierImpl(UriNormalizer normalizer, UriFilterComposer relationalUriFilter, UriQueue queue, List uriGenerators, GraphLogger graphLogger, boolean doesRecrawling, long generalRecrawlTime, long timerPeriod) { - this(normalizer, relationalUriFilter, null, queue, uriGenerators, graphLogger, false, + this(normalizer, relationalUriFilter, null, queue, uriGenerators, graphLogger, doesRecrawling, generalRecrawlTime, timerPeriod,null); } diff --git 
a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/OutDatedUriRetriever.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/OutDatedUriRetriever.java index 3ccdcbb90..639689cb1 100644 --- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/OutDatedUriRetriever.java +++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/OutDatedUriRetriever.java @@ -1,11 +1,7 @@ package org.dice_research.squirrel.frontier.recrawling; -import java.io.Closeable; -import java.io.IOException; -import java.net.URI; import java.util.List; -import com.sun.jndi.toolkit.url.Uri; import org.dice_research.squirrel.data.uri.CrawleableUri; public interface OutDatedUriRetriever{ diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/SparqlBasedOutDatedUriRetriever.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/SparqlBasedOutDatedUriRetriever.java index f991c1379..d8ea2eb1c 100644 --- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/SparqlBasedOutDatedUriRetriever.java +++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/SparqlBasedOutDatedUriRetriever.java @@ -33,8 +33,8 @@ public class SparqlBasedOutDatedUriRetriever implements OutDatedUriRetriever{ /** * SparqlBasedOutDatedUriRetriever creates a connection to the SPARQL endpoint and Query factory used to generate a query. */ - QueryExecutionFactory queryExecFactory; - List urisToRecrawl = new ArrayList<>(); + private QueryExecutionFactory queryExecFactory; + private List urisToRecrawl = new ArrayList<>(); public SparqlBasedOutDatedUriRetriever(QueryExecutionFactory queryExecFactory) { this.queryExecFactory = queryExecFactory; diff --git a/squirrel.frontier/src/test/java/org/dice_research/squirrel/frontier/impl/FrontierImplTest.java b/squirrel.frontier/src/test/java/org/dice_research/squirrel/frontier/impl/FrontierImplTest.java index a5fbd0956..aaf9f5088 100644 --- a/squirrel.frontier/src/test/java/org/dice_research/squirrel/frontier/impl/FrontierImplTest.java +++ b/squirrel.frontier/src/test/java/org/dice_research/squirrel/frontier/impl/FrontierImplTest.java @@ -18,24 +18,16 @@ import org.dice_research.squirrel.data.uri.CrawleableUriFactory4Tests; import org.dice_research.squirrel.data.uri.UriType; import org.dice_research.squirrel.data.uri.filter.MongoDBKnowUriFilter; -import org.dice_research.squirrel.data.uri.filter.UriFilterConfigurator; import org.dice_research.squirrel.data.uri.filter.UriFilterComposer; +import org.dice_research.squirrel.data.uri.filter.UriFilterConfigurator; import org.dice_research.squirrel.data.uri.norm.NormalizerImpl; import org.dice_research.squirrel.data.uri.norm.UriGenerator; -import org.dice_research.squirrel.frontier.impl.FrontierImpl; import org.dice_research.squirrel.queue.ipbased.MongoDBIpBasedQueue; import org.junit.After; import org.junit.Assert; import org.junit.Before; import org.junit.Test; -import java.net.InetAddress; -import java.net.URI; -import java.util.ArrayList; -import java.util.List; - -import static org.junit.Assert.*; - @SuppressWarnings("deprecation") public class FrontierImplTest { From ab41ee058904ddf88590598d5318cae749101f0a Mon Sep 17 00:00:00 2001 From: Geraldo Date: Tue, 14 Jul 2020 12:18:45 +0200 Subject: [PATCH 084/102] codacy fix --- .../frontier/impl/RecrawlingTest.java | 19 +++++++++++++------ .../domainbased/MongoDBDomainQueueTest.java | 1 - 
.../squirrel/collect/UriCollector.java | 1 - .../analyzer/impl/RDFAnalyzerTest.java | 6 +++--- 4 files changed, 16 insertions(+), 11 deletions(-) diff --git a/squirrel.frontier/src/test/java/org/dice_research/squirrel/frontier/impl/RecrawlingTest.java b/squirrel.frontier/src/test/java/org/dice_research/squirrel/frontier/impl/RecrawlingTest.java index 1043a1932..3955bf904 100644 --- a/squirrel.frontier/src/test/java/org/dice_research/squirrel/frontier/impl/RecrawlingTest.java +++ b/squirrel.frontier/src/test/java/org/dice_research/squirrel/frontier/impl/RecrawlingTest.java @@ -1,8 +1,19 @@ package org.dice_research.squirrel.frontier.impl; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; + +import java.util.Calendar; + import org.aksw.jena_sparql_api.core.QueryExecutionFactory; import org.aksw.jena_sparql_api.core.QueryExecutionFactoryDataset; -import org.apache.jena.query.*; +import org.apache.jena.query.Dataset; +import org.apache.jena.query.DatasetFactory; +import org.apache.jena.query.Query; +import org.apache.jena.query.QueryExecution; +import org.apache.jena.query.QuerySolution; +import org.apache.jena.query.ResultSet; import org.apache.jena.rdf.model.ModelFactory; import org.apache.jena.rdf.model.RDFNode; import org.dice_research.squirrel.frontier.recrawling.FrontierQueryGenerator; @@ -10,15 +21,11 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.util.Calendar; - -import static org.junit.Assert.*; - public class RecrawlingTest { private static final Logger LOGGER = LoggerFactory.getLogger(RecrawlingTest.class); @Test - public void Recrawling(){ + public void recrawling(){ Dataset dataset = DatasetFactory.create(); dataset.setDefaultModel(ModelFactory.createDefaultModel().read("test.ttl")); QueryExecutionFactory queryExecFactory = new QueryExecutionFactoryDataset(dataset); diff --git a/squirrel.frontier/src/test/java/org/dice_research/squirrel/queue/domainbased/MongoDBDomainQueueTest.java b/squirrel.frontier/src/test/java/org/dice_research/squirrel/queue/domainbased/MongoDBDomainQueueTest.java index c3d7a816c..d99bd21d3 100644 --- a/squirrel.frontier/src/test/java/org/dice_research/squirrel/queue/domainbased/MongoDBDomainQueueTest.java +++ b/squirrel.frontier/src/test/java/org/dice_research/squirrel/queue/domainbased/MongoDBDomainQueueTest.java @@ -122,7 +122,6 @@ public void getUris() throws Exception { mongodbQueue.addUri(uri); } - List listUris = mongodbQueue.getNextUris(); Iterator iter = mongodbQueue.getGroupIterator(); int count = 0; while (iter.hasNext()) { diff --git a/squirrel.worker/src/main/java/org/dice_research/squirrel/collect/UriCollector.java b/squirrel.worker/src/main/java/org/dice_research/squirrel/collect/UriCollector.java index a54b71d6d..05fa7a9b7 100644 --- a/squirrel.worker/src/main/java/org/dice_research/squirrel/collect/UriCollector.java +++ b/squirrel.worker/src/main/java/org/dice_research/squirrel/collect/UriCollector.java @@ -5,7 +5,6 @@ import org.apache.jena.graph.Node; import org.apache.jena.graph.Triple; -import org.dice_research.squirrel.Constants; import org.dice_research.squirrel.data.uri.CrawleableUri; import org.dice_research.squirrel.sink.SinkBase; diff --git a/squirrel.worker/src/test/java/org/dice_research/squirrel/analyzer/impl/RDFAnalyzerTest.java b/squirrel.worker/src/test/java/org/dice_research/squirrel/analyzer/impl/RDFAnalyzerTest.java index 70eaab0a0..9b683b830 100644 --- 
a/squirrel.worker/src/test/java/org/dice_research/squirrel/analyzer/impl/RDFAnalyzerTest.java +++ b/squirrel.worker/src/test/java/org/dice_research/squirrel/analyzer/impl/RDFAnalyzerTest.java @@ -32,9 +32,9 @@ @RunWith(Parameterized.class) public class RDFAnalyzerTest { - long startTime; - long endTime; - static long totalTime; + private long startTime; + private long endTime; + private static long totalTime; @Rule public Stopwatch stopwatch = new Stopwatch() { From 8daff3e8657c3afd9245d1c15c038a2cb5d13106 Mon Sep 17 00:00:00 2001 From: Geraldo Date: Tue, 14 Jul 2020 12:24:56 +0200 Subject: [PATCH 085/102] codacy fix --- .../dice_research/squirrel/frontier/impl/FrontierImpl.java | 3 +-- .../squirrel/frontier/impl/FrontierSenderToWebservice.java | 1 - .../squirrel/analyzer/impl/RDFAnalyzerTest.java | 5 +++-- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/FrontierImpl.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/FrontierImpl.java index 3af84e371..67d2e38c6 100644 --- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/FrontierImpl.java +++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/FrontierImpl.java @@ -148,8 +148,7 @@ public FrontierImpl(UriNormalizer normalizer, UriFilterComposer relationalUriFil * @param timerPeriod used to select if URIs should be recrawled. */ public FrontierImpl(UriNormalizer normalizer, UriFilterComposer relationalUriFilter, UriQueue queue, - List uriGenerators, boolean doesRecrawling, long generalRecrawlTime, long timerPeriod, - UriHashCustodian uriHashCustodian) { + List uriGenerators, boolean doesRecrawling, long generalRecrawlTime, long timerPeriod) { this(normalizer, relationalUriFilter, queue, uriGenerators, null, doesRecrawling, generalRecrawlTime, timerPeriod); } diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/FrontierSenderToWebservice.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/FrontierSenderToWebservice.java index 9ab91efc6..a116a7298 100644 --- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/FrontierSenderToWebservice.java +++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/FrontierSenderToWebservice.java @@ -18,7 +18,6 @@ import org.apache.commons.io.IOUtils; import org.dice_research.squirrel.data.uri.CrawleableUri; -import org.dice_research.squirrel.data.uri.filter.KnownUriFilter; import org.dice_research.squirrel.data.uri.filter.UriFilterComposer; import org.dice_research.squirrel.data.uri.info.URIReferences; import org.dice_research.squirrel.data.uri.serialize.Serializer; diff --git a/squirrel.worker/src/test/java/org/dice_research/squirrel/analyzer/impl/RDFAnalyzerTest.java b/squirrel.worker/src/test/java/org/dice_research/squirrel/analyzer/impl/RDFAnalyzerTest.java index 9b683b830..d544de8ab 100644 --- a/squirrel.worker/src/test/java/org/dice_research/squirrel/analyzer/impl/RDFAnalyzerTest.java +++ b/squirrel.worker/src/test/java/org/dice_research/squirrel/analyzer/impl/RDFAnalyzerTest.java @@ -35,6 +35,8 @@ public class RDFAnalyzerTest { private long startTime; private long endTime; private static long totalTime; + private String resourceName; + private int expectedNumberOfTriples; @Rule public Stopwatch stopwatch = new Stopwatch() { @@ -49,8 +51,7 @@ public static void afterClass() { System.err.println(String.format("RDFAnalyzerTest total time: 
%d", totalTime)); } - private String resourceName; - private int expectedNumberOfTriples; + public RDFAnalyzerTest(String resourceName, int expectedNumberOfTriples) { this.resourceName = resourceName; From 5a2e4316da879cbe66170c73865bde34da2899af Mon Sep 17 00:00:00 2001 From: Geraldo Date: Tue, 14 Jul 2020 12:34:56 +0200 Subject: [PATCH 086/102] constructor fix --- .../squirrel/frontier/impl/ExtendedFrontierImpl.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/ExtendedFrontierImpl.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/ExtendedFrontierImpl.java index fff99c523..09c88affb 100644 --- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/ExtendedFrontierImpl.java +++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/ExtendedFrontierImpl.java @@ -37,7 +37,7 @@ public class ExtendedFrontierImpl extends FrontierImpl implements ExtendedFronti public ExtendedFrontierImpl(UriNormalizer normalizer, UriFilterComposer relationalUriFilter, UriQueue queue,List uriGenerators, boolean doesRecrawling, long generalRecrawlTime, long timerPeriod, UriHashCustodian uriHashCustodian) { - super(normalizer, relationalUriFilter, queue, uriGenerators,doesRecrawling, generalRecrawlTime, timerPeriod, uriHashCustodian); + super(normalizer, relationalUriFilter, queue, uriGenerators,doesRecrawling, generalRecrawlTime, timerPeriod); } /** From 444bce33f4db6c68277fa7f36ea79b58360d5397 Mon Sep 17 00:00:00 2001 From: Geraldo Date: Tue, 14 Jul 2020 12:46:56 +0200 Subject: [PATCH 087/102] dummy method fix --- .../frontier/recrawling/SparqlBasedOutDatedUriRetriever.java | 3 +++ 1 file changed, 3 insertions(+) diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/SparqlBasedOutDatedUriRetriever.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/SparqlBasedOutDatedUriRetriever.java index d8ea2eb1c..2fb595e0b 100644 --- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/SparqlBasedOutDatedUriRetriever.java +++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/SparqlBasedOutDatedUriRetriever.java @@ -53,6 +53,7 @@ public static SparqlBasedOutDatedUriRetriever create(String sparqlEndpointUrl, S HttpAuthenticator authenticator = new HttpAuthenticator() { @Override public void invalidate() { + //TODO dummy method } @Override @@ -60,6 +61,8 @@ public void apply(AbstractHttpClient client, HttpContext httpContext, URI target client.setCredentialsProvider(new CredentialsProvider() { @Override public void clear() { + //TODO dummy method + } @Override From 3efe75083acde4b3b43d15f2a50ca2b221ba2ed9 Mon Sep 17 00:00:00 2001 From: Geraldo Date: Tue, 14 Jul 2020 12:50:25 +0200 Subject: [PATCH 088/102] removed unused import --- .../org/dice_research/squirrel/frontier/impl/FrontierImpl.java | 1 - 1 file changed, 1 deletion(-) diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/FrontierImpl.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/FrontierImpl.java index 67d2e38c6..a927fe802 100644 --- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/FrontierImpl.java +++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/FrontierImpl.java @@ -14,7 +14,6 @@ import org.dice_research.squirrel.data.uri.info.URIReferences; import 
org.dice_research.squirrel.data.uri.norm.UriGenerator; import org.dice_research.squirrel.data.uri.norm.UriNormalizer; -import org.dice_research.squirrel.deduplication.hashing.UriHashCustodian; import org.dice_research.squirrel.frontier.Frontier; import org.dice_research.squirrel.frontier.recrawling.OutDatedUriRetriever; import org.dice_research.squirrel.graph.GraphLogger; From d7cd329b8bc7b765e5462b5410d49325db7cb737 Mon Sep 17 00:00:00 2001 From: Geraldo Date: Tue, 14 Jul 2020 13:14:09 +0200 Subject: [PATCH 089/102] created a mongoDBConnectionFactory to avoid duplicated code --- .../data/uri/filter/MongoDBKnowUriFilter.java | 20 +-------- .../mongodb/MongoDBConnectionFactory.java | 44 +++++++++++++++++++ .../queue/ipbased/MongoDBIpBasedQueue.java | 21 +-------- 3 files changed, 48 insertions(+), 37 deletions(-) create mode 100644 squirrel.frontier/src/main/java/org/dice_research/squirrel/mongodb/MongoDBConnectionFactory.java diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/data/uri/filter/MongoDBKnowUriFilter.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/data/uri/filter/MongoDBKnowUriFilter.java index db2d2a272..04d561452 100644 --- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/data/uri/filter/MongoDBKnowUriFilter.java +++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/data/uri/filter/MongoDBKnowUriFilter.java @@ -13,19 +13,17 @@ import org.bson.Document; import org.bson.conversions.Bson; -import org.dice_research.squirrel.configurator.MongoConfiguration; import org.dice_research.squirrel.data.uri.CrawleableUri; import org.dice_research.squirrel.data.uri.UriType; import org.dice_research.squirrel.deduplication.hashing.HashValue; import org.dice_research.squirrel.deduplication.hashing.UriHashCustodian; import org.dice_research.squirrel.frontier.impl.FrontierImpl; +import org.dice_research.squirrel.mongodb.MongoDBConnectionFactory; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.mongodb.BasicDBObject; import com.mongodb.MongoClient; -import com.mongodb.MongoClientOptions; -import com.mongodb.ServerAddress; import com.mongodb.client.MongoCollection; import com.mongodb.client.MongoCursor; import com.mongodb.client.MongoDatabase; @@ -70,22 +68,8 @@ public MongoDBKnowUriFilter(String hostName, Integer port) { LOGGER.info("Filter Persistance: " + PERSIST); - MongoClientOptions.Builder optionsBuilder = MongoClientOptions.builder(); - MongoConfiguration mongoConfiguration = MongoConfiguration.getMDBConfiguration(); + this.client = MongoDBConnectionFactory.getConnection(hostName, port); - if (mongoConfiguration != null && (mongoConfiguration.getConnectionTimeout() != null - && mongoConfiguration.getSocketTimeout() != null && mongoConfiguration.getServerTimeout() != null)) { - optionsBuilder.connectTimeout(mongoConfiguration.getConnectionTimeout()); - optionsBuilder.socketTimeout(mongoConfiguration.getSocketTimeout()); - optionsBuilder.serverSelectionTimeout(mongoConfiguration.getServerTimeout()); - - MongoClientOptions options = optionsBuilder.build(); - - client = new MongoClient(new ServerAddress(hostName, port), options); - - } else { - client = new MongoClient(hostName, port); - } } @Override diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/mongodb/MongoDBConnectionFactory.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/mongodb/MongoDBConnectionFactory.java new file mode 100644 index 000000000..87afceea4 --- /dev/null +++ 
b/squirrel.frontier/src/main/java/org/dice_research/squirrel/mongodb/MongoDBConnectionFactory.java
@@ -0,0 +1,44 @@
+package org.dice_research.squirrel.mongodb;
+
+import org.dice_research.squirrel.configurator.MongoConfiguration;
+
+import com.mongodb.MongoClient;
+import com.mongodb.MongoClientOptions;
+import com.mongodb.ServerAddress;
+
+/**
+ * A class that returns a connection to MongoDB.
+ *
+ * @author Geraldo de Souza Junior (gsjunior@mail.uni-paderborn.de)
+ *
+ */
+
+public class MongoDBConnectionFactory {
+
+    /**
+     * Returns a MongoClient based on host and port.
+     *
+     * @param hostName the MongoDB host
+     * @param port the MongoDB port
+     * @return a new MongoClient
+     */
+    public static MongoClient getConnection(String hostName, Integer port) {
+        MongoClientOptions.Builder optionsBuilder = MongoClientOptions.builder();
+        MongoConfiguration mongoConfiguration = MongoConfiguration.getMDBConfiguration();
+
+        if (mongoConfiguration != null && (mongoConfiguration.getConnectionTimeout() != null && mongoConfiguration.getSocketTimeout() != null
+            && mongoConfiguration.getServerTimeout() != null)) {
+            optionsBuilder.connectTimeout(mongoConfiguration.getConnectionTimeout());
+            optionsBuilder.socketTimeout(mongoConfiguration.getSocketTimeout());
+            optionsBuilder.serverSelectionTimeout(mongoConfiguration.getServerTimeout());
+
+            MongoClientOptions options = optionsBuilder.build();
+
+            return new MongoClient(new ServerAddress(hostName, port), options);
+
+        } else {
+            return new MongoClient(hostName, port);
+        }
+    }
+
+}

diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/queue/ipbased/MongoDBIpBasedQueue.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/queue/ipbased/MongoDBIpBasedQueue.java
index c3fedf869..4dc388032 100644
--- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/queue/ipbased/MongoDBIpBasedQueue.java
+++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/queue/ipbased/MongoDBIpBasedQueue.java
@@ -11,18 +11,16 @@ import org.bson.Document;
 import org.bson.types.Binary;
 import org.dice_research.squirrel.Constants;
-import org.dice_research.squirrel.configurator.MongoConfiguration;
 import org.dice_research.squirrel.data.uri.CrawleableUri;
 import org.dice_research.squirrel.data.uri.serialize.Serializer;
 import org.dice_research.squirrel.data.uri.serialize.java.SnappyJavaUriSerializer;
+import org.dice_research.squirrel.mongodb.MongoDBConnectionFactory;
 import org.dice_research.squirrel.queue.AbstractIpAddressBasedQueue;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

 import com.mongodb.MongoClient;
-import com.mongodb.MongoClientOptions;
 import com.mongodb.MongoWriteException;
-import com.mongodb.ServerAddress;
 import com.mongodb.client.MongoCollection;
 import com.mongodb.client.MongoCursor;
 import com.mongodb.client.MongoDatabase;
@@ -65,22 +63,7 @@ public MongoDBIpBasedQueue(String hostName, Integer port, Serializer serializer,

         this.serializer = serializer;

-        MongoClientOptions.Builder optionsBuilder = MongoClientOptions.builder();
-        MongoConfiguration mongoConfiguration = MongoConfiguration.getMDBConfiguration();
-
-        if (mongoConfiguration != null && (mongoConfiguration.getConnectionTimeout() != null && mongoConfiguration.getSocketTimeout() != null
-            && mongoConfiguration.getServerTimeout() != null)) {
-            optionsBuilder.connectTimeout(mongoConfiguration.getConnectionTimeout());
-            optionsBuilder.socketTimeout(mongoConfiguration.getSocketTimeout());
-
optionsBuilder.serverSelectionTimeout(mongoConfiguration.getServerTimeout()); - - MongoClientOptions options = optionsBuilder.build(); - - client = new MongoClient(new ServerAddress(hostName, port), options); - - } else { - client = new MongoClient(hostName, port); - } + this.client = MongoDBConnectionFactory.getConnection(hostName, port); } From 8f2ee44838522f82160c1ee56adf878550fb3d3d Mon Sep 17 00:00:00 2001 From: Geraldo Date: Tue, 14 Jul 2020 15:25:04 +0200 Subject: [PATCH 090/102] codacy quality fixes --- .../uri/CrawleableUriSerializationTest.java | 12 +- .../data/uri/info/RDBURIReferences.java | 8 +- .../data/uri/norm/NormalizerImpl.java | 399 +++++++++--------- .../squirrel/log/DomainLogger.java | 10 +- .../generator/impl/CkanSeedGeneratorImpl.java | 17 +- .../squirrel/model/RDBConnectorTest.java | 25 +- .../simulation/ScenarioBasedTest.java | 4 +- .../src/main/java/com/SquirrelWebObject.java | 26 +- .../impl/html/scraper/HtmlScraper.java | 4 +- .../analyzer/impl/html/scraper/YamlFile.java | 3 - .../RobotsManagerConfiguration.java | 1 + .../fetcher/ftp/FTPRecursiveFetcher.java | 10 +- .../analyzer/impl/MicrodataParserTest.java | 12 +- .../analyzer/impl/RDFaParserTest.java | 18 +- 14 files changed, 260 insertions(+), 289 deletions(-) diff --git a/squirrel.api/src/test/java/org/aksw/simba/squirrel/data/uri/CrawleableUriSerializationTest.java b/squirrel.api/src/test/java/org/aksw/simba/squirrel/data/uri/CrawleableUriSerializationTest.java index fc0f8437d..1626cca24 100644 --- a/squirrel.api/src/test/java/org/aksw/simba/squirrel/data/uri/CrawleableUriSerializationTest.java +++ b/squirrel.api/src/test/java/org/aksw/simba/squirrel/data/uri/CrawleableUriSerializationTest.java @@ -1,19 +1,11 @@ package org.aksw.simba.squirrel.data.uri; -import java.io.ByteArrayInputStream; -import java.io.ByteArrayOutputStream; import java.io.IOException; -import java.io.ObjectInputStream; -import java.io.ObjectOutputStream; -import java.io.Serializable; import java.net.InetAddress; import java.net.URI; import java.net.URISyntaxException; import java.util.Arrays; -import java.util.Base64; import java.util.Collection; -import java.util.zip.GZIPInputStream; -import java.util.zip.GZIPOutputStream; import org.dice_research.squirrel.Constants; import org.dice_research.squirrel.data.uri.CrawleableUri; @@ -27,10 +19,8 @@ import org.junit.runner.RunWith; import org.junit.runners.Parameterized; import org.junit.runners.Parameterized.Parameters; -import org.xerial.snappy.Snappy; - -import com.google.gson.Gson; +@SuppressWarnings("deprecation") @RunWith(Parameterized.class) public class CrawleableUriSerializationTest { diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/data/uri/info/RDBURIReferences.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/data/uri/info/RDBURIReferences.java index 55f164689..7cae49e2e 100644 --- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/data/uri/info/RDBURIReferences.java +++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/data/uri/info/RDBURIReferences.java @@ -30,10 +30,10 @@ public class RDBURIReferences implements URIReferences, Closeable { private URIReferencesUtils utils; // Some constants for the rethinkDB - static final String DATABASE_NAME = "squirrel"; - static final String TABLE_NAME = "uriReferences"; - static final String COLUMN_URI = "uri"; - static final String COLUMN_FOUNDURIS = "foundUris"; + public static final String DATABASE_NAME = "squirrel"; + public static final String TABLE_NAME = "uriReferences"; + 
public static final String COLUMN_URI = "uri"; + public static final String COLUMN_FOUNDURIS = "foundUris"; /** * Constructor. diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/data/uri/norm/NormalizerImpl.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/data/uri/norm/NormalizerImpl.java index 7503ba5f5..71bf759bc 100644 --- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/data/uri/norm/NormalizerImpl.java +++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/data/uri/norm/NormalizerImpl.java @@ -35,207 +35,202 @@ public class NormalizerImpl implements UriNormalizer { - private static final Logger LOGGER = LoggerFactory.getLogger(NormalizerImpl.class); - - /** - * Nutch 1098 - finds URL encoded parts of the URL - */ - private final static Pattern UNESCAPE_RULE_PATTERN = Pattern.compile("%([0-9A-Fa-f]{2})"); - /** - * look-up table for characters which should not be escaped in URL paths - */ - private final static BitSet UNESCAPED_CHARS = new BitSet(0x7F); - private final List sessionIDs; - private final Map defaultPortMap; - - public NormalizerImpl(List sessionIDs, Map defaultPortMap) { - this.sessionIDs = sessionIDs; - this.defaultPortMap = defaultPortMap; - } - - static { - /* - * https://tools.ietf.org/html/rfc3986#section-2.2 For consistency, - * percent-encoded octets in the ranges of ALPHA (%41-%5A and %61-%7A), DIGIT - * (%30-%39), hyphen (%2D), period (%2E), underscore (%5F), or tilde (%7E) - * should not be created by URI producers and, when found in a URI, should be - * decoded to their corresponding unreserved characters by URI normalizers. - */ - UNESCAPED_CHARS.set(0x2D, 0x2E); - UNESCAPED_CHARS.set(0x30, 0x39); - UNESCAPED_CHARS.set(0x41, 0x5A); - UNESCAPED_CHARS.set(0x61, 0x7A); - UNESCAPED_CHARS.set(0x5F); - UNESCAPED_CHARS.set(0x7E); - } - - - @Override - public CrawleableUri normalize(CrawleableUri uri) { - URI uriObject = uri.getUri(); - boolean changed = false; - // normalize path - String path = uriObject.getRawPath(); - if (path != null) { - String temp = normalizePath(path); - if (temp != path) { - path = temp; - changed = true; - } - } - - - // Copy Normalization from - // https://github.com/crawler-commons/crawler-commons/blob/master/src/main/java/crawlercommons/filters/basic/BasicURLNormalizer.java - // OR use URI.normalize() - - // Check whether the query part of a URI has to be sorted - String query = uriObject.getQuery(); - if(query != null){ - if(query.length() > 0) { - String[] queryList = query.split("&"); - Arrays.sort(queryList); - List queries = new ArrayList<>(Arrays.asList(queryList)); - List toRemove = new ArrayList<>(); - for(String queryParameter : queries){ - //removing session ids - if(sessionIDs.contains(queryParameter.split("=")[0].toLowerCase())){ - toRemove.add(queryParameter); - } - } - queries.removeAll(toRemove); - String newQuery = String.join("&", queries); - if(!query.equals(newQuery)) { - query = newQuery; - changed = true; - - } - } - else{ - query = null; - changed = true; - } - } - - //Remove default ports - int port = uriObject.getPort(); - String scheme = uriObject.getScheme() != null ? 
uriObject.getScheme() : ""; - - if(port != -1){ - if(defaultPortMap.containsKey(scheme)){ - if(port == defaultPortMap.get(scheme)){ - port = -1; - changed = true; - } - } - } - // Filter fragments (i.e., delete them) - String fragment = uriObject.getFragment(); - if ((fragment != null) && (fragment.length() > 0)) { - changed = true; - } - - // convert host and scheme to lower case - String host = uriObject.getHost() != null ? uriObject.getHost() : ""; - String lowerCaseHost = host != null ? host.toLowerCase() : ""; - String lowerCaseScheme = scheme != null ? scheme.toLowerCase() : ""; - - if(!scheme.equals(lowerCaseScheme) || !host.equals(lowerCaseHost)){ - scheme = lowerCaseScheme; - host = lowerCaseHost; - changed = true; - } - - // Filter attributes of the URI - // uriObject.getQuery(); - - if (changed) { - // create new URI object; - URIBuilder builder = new URIBuilder(uriObject); - builder.setFragment(null); - builder.setPath(path); - builder.setCustomQuery(query); - builder.setPort(port); - builder.setHost(host); - builder.setScheme(scheme); - CrawleableUri normalizedUri = null; - - try { - normalizedUri = new CrawleableUri(builder.build()); - normalizedUri.setData(uri.getData()); - } catch (URISyntaxException e) { - LOGGER.error("Exception while normalizing URI. Returning original URI.", e); - return uri; - } - return normalizedUri; - } - else - return uri; - - } - - /** - * Path normalization adapted from the {@link URI} class (which is based upon - * src/solaris/native/java/io/canonicalize_md.c) and the Crawler - * Commons project. - * - * @param path - * @return the normalized path or the given path object if no changes have been - * made. - */ - public String normalizePath(String path) { - // Check for empty paths - if(path.equals("")){ - path = "/"; - return path; - } - // Check for encoded parts - Matcher matcher = UNESCAPE_RULE_PATTERN.matcher(path); - StringBuffer changedPath = null; - if (matcher.find()) { - changedPath = new StringBuffer(""); - int hex, pos = 0; - do { - changedPath.append(path.substring(pos, matcher.start())); - pos = matcher.start(); - hex = getHexValue(path.charAt(pos + 1), path.charAt(pos + 2)); - // If this character shouldn't be escaped - if (UNESCAPED_CHARS.get(hex)) { - changedPath.append((char) hex); - } else { - changedPath.append(path.substring(pos, pos + 3)); - } - pos += 3; - } while (matcher.find()); - if (pos < path.length()) { - changedPath.append(path.substring(pos)); - } - } - if (changedPath == null) { - return PathNormalization.normalize(path); - } else { - String newPath = changedPath.toString(); - return PathNormalization.normalize(newPath.equals(path) ? path : newPath); - } - } - - protected static int getHexValue(char c1, char c2) { - int hex; - if (c1 <= 0x39) { - hex = c1 - 0x30; - } else { - // Check whether it is A-F or a-f - hex = (c1 <= 0x46) ? (c1 - 0x37) : (c1 - 0x57); - } - hex <<= 4; - if (c2 <= 0x39) { - hex |= c2 - 0x30; - } else { - // Check whether it is A-F or a-f - hex |= (c2 <= 0x46) ? 
(c2 - 0x37) : (c2 - 0x57); - } - return hex; - } + private static final Logger LOGGER = LoggerFactory.getLogger(NormalizerImpl.class); + + /** + * Nutch 1098 - finds URL encoded parts of the URL + */ + private final static Pattern UNESCAPE_RULE_PATTERN = Pattern.compile("%([0-9A-Fa-f]{2})"); + /** + * look-up table for characters which should not be escaped in URL paths + */ + private final static BitSet UNESCAPED_CHARS = new BitSet(0x7F); + private final List sessionIDs; + private final Map defaultPortMap; + + public NormalizerImpl(List sessionIDs, Map defaultPortMap) { + this.sessionIDs = sessionIDs; + this.defaultPortMap = defaultPortMap; + } + + static { + /* + * https://tools.ietf.org/html/rfc3986#section-2.2 For consistency, + * percent-encoded octets in the ranges of ALPHA (%41-%5A and %61-%7A), DIGIT + * (%30-%39), hyphen (%2D), period (%2E), underscore (%5F), or tilde (%7E) + * should not be created by URI producers and, when found in a URI, should be + * decoded to their corresponding unreserved characters by URI normalizers. + */ + UNESCAPED_CHARS.set(0x2D, 0x2E); + UNESCAPED_CHARS.set(0x30, 0x39); + UNESCAPED_CHARS.set(0x41, 0x5A); + UNESCAPED_CHARS.set(0x61, 0x7A); + UNESCAPED_CHARS.set(0x5F); + UNESCAPED_CHARS.set(0x7E); + } + + @Override + public CrawleableUri normalize(CrawleableUri uri) { + URI uriObject = uri.getUri(); + boolean changed = false; + // normalize path + String path = uriObject.getRawPath(); + if (path != null) { + String temp = normalizePath(path); + if (temp != path) { + path = temp; + changed = true; + } + } + + // Copy Normalization from + // https://github.com/crawler-commons/crawler-commons/blob/master/src/main/java/crawlercommons/filters/basic/BasicURLNormalizer.java + // OR use URI.normalize() + + // Check whether the query part of a URI has to be sorted + String query = uriObject.getQuery(); + if (query != null) { + if (query.length() > 0) { + String[] queryList = query.split("&"); + Arrays.sort(queryList); + List queries = new ArrayList<>(Arrays.asList(queryList)); + List toRemove = new ArrayList<>(); + for (String queryParameter : queries) { + // removing session ids + if (sessionIDs.contains(queryParameter.split("=")[0].toLowerCase())) { + toRemove.add(queryParameter); + } + } + queries.removeAll(toRemove); + String newQuery = String.join("&", queries); + if (!query.equals(newQuery)) { + query = newQuery; + changed = true; + + } + } else { + query = null; + changed = true; + } + } + + // Remove default ports + int port = uriObject.getPort(); + String scheme = uriObject.getScheme() != null ? uriObject.getScheme() : ""; + + if (port != -1 && defaultPortMap.containsKey(scheme) && port == defaultPortMap.get(scheme)) { + + port = -1; + changed = true; + + } + // Filter fragments (i.e., delete them) + String fragment = uriObject.getFragment(); + if ((fragment != null) && (fragment.length() > 0)) { + changed = true; + } + + // convert host and scheme to lower case + String host = uriObject.getHost() != null ? uriObject.getHost() : ""; + String lowerCaseHost = host != null ? host.toLowerCase() : ""; + String lowerCaseScheme = scheme != null ? 
scheme.toLowerCase() : ""; + + if (!scheme.equals(lowerCaseScheme) || !host.equals(lowerCaseHost)) { + scheme = lowerCaseScheme; + host = lowerCaseHost; + changed = true; + } + + // Filter attributes of the URI + // uriObject.getQuery(); + + if (changed) { + // create new URI object; + URIBuilder builder = new URIBuilder(uriObject); + builder.setFragment(null); + builder.setPath(path); + builder.setCustomQuery(query); + builder.setPort(port); + builder.setHost(host); + builder.setScheme(scheme); + CrawleableUri normalizedUri = null; + + try { + normalizedUri = new CrawleableUri(builder.build()); + normalizedUri.setData(uri.getData()); + } catch (URISyntaxException e) { + LOGGER.error("Exception while normalizing URI. Returning original URI.", e); + return uri; + } + return normalizedUri; + } else + return uri; + + } + + /** + * Path normalization adapted from the {@link URI} class (which is based upon + * src/solaris/native/java/io/canonicalize_md.c) and the Crawler + * Commons project. + * + * @param path + * @return the normalized path or the given path object if no changes have been + * made. + */ + public String normalizePath(String path) { + // Check for empty paths + if (path.equals("")) { + path = "/"; + return path; + } + // Check for encoded parts + Matcher matcher = UNESCAPE_RULE_PATTERN.matcher(path); + StringBuffer changedPath = null; + if (matcher.find()) { + changedPath = new StringBuffer(""); + int hex = 0; + int pos = 0; + do { + changedPath.append(path.substring(pos, matcher.start())); + pos = matcher.start(); + hex = getHexValue(path.charAt(pos + 1), path.charAt(pos + 2)); + // If this character shouldn't be escaped + if (UNESCAPED_CHARS.get(hex)) { + changedPath.append((char) hex); + } else { + changedPath.append(path.substring(pos, pos + 3)); + } + pos += 3; + } while (matcher.find()); + if (pos < path.length()) { + changedPath.append(path.substring(pos)); + } + } + if (changedPath == null) { + return PathNormalization.normalize(path); + } else { + String newPath = changedPath.toString(); + return PathNormalization.normalize(newPath.equals(path) ? path : newPath); + } + } + + protected static int getHexValue(char c1, char c2) { + int hex; + if (c1 <= 0x39) { + hex = c1 - 0x30; + } else { + // Check whether it is A-F or a-f + hex = (c1 <= 0x46) ? (c1 - 0x37) : (c1 - 0x57); + } + hex <<= 4; + if (c2 <= 0x39) { + hex |= c2 - 0x30; + } else { + // Check whether it is A-F or a-f + hex |= (c2 <= 0x46) ? 
(c2 - 0x37) : (c2 - 0x57); + } + return hex; + } } diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/log/DomainLogger.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/log/DomainLogger.java index 4cb4b2479..5d8fdbb22 100644 --- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/log/DomainLogger.java +++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/log/DomainLogger.java @@ -17,6 +17,11 @@ public class DomainLogger implements Iterator, Closeable { private static final Logger LOGGER = LoggerFactory.getLogger(DomainLogger.class); + + protected String domain; + protected FileWriter writer; + protected Set domainCache = new HashSet(); + protected Iterator iterator; public static Iterator createIfPossible(CrawleableUri uri, String logFile, Iterator iterator) { if(logFile == null) { @@ -39,10 +44,7 @@ public static DomainLogger create(CrawleableUri uri, String logFile, Iterator domainCache = new HashSet(); - protected Iterator iterator; + public DomainLogger(CrawleableUri uri, FileWriter writer, Iterator iterator) { this.domain = uri.getUri().getAuthority(); diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/seed/generator/impl/CkanSeedGeneratorImpl.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/seed/generator/impl/CkanSeedGeneratorImpl.java index 309df9125..9f07af704 100644 --- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/seed/generator/impl/CkanSeedGeneratorImpl.java +++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/seed/generator/impl/CkanSeedGeneratorImpl.java @@ -1,5 +1,9 @@ package org.dice_research.squirrel.seed.generator.impl; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + import org.apache.http.HttpEntity; import org.apache.http.HttpResponse; import org.apache.http.client.ClientProtocolException; @@ -14,21 +18,16 @@ import org.json.JSONObject; import org.slf4j.LoggerFactory; -import java.io.IOException; -import java.lang.reflect.Array; -import java.util.ArrayList; -import java.util.List; - /** * Created by ivan on 25.02.16. 
*/ public class CkanSeedGeneratorImpl extends AbstractSeedGenerator { private static final org.slf4j.Logger LOGGER = LoggerFactory.getLogger(AbstractSeedGenerator.class); - static final String CkanApiEndpoint = "http://datahub.io/api/3"; - static final String RdfSearchAction = "/action/package_search?q=rdf"; - static final String RdfSearchQuery = CkanApiEndpoint + RdfSearchAction; - static final String RdfCountQuery = RdfSearchQuery + "&rows=1"; + public static final String CkanApiEndpoint = "http://datahub.io/api/3"; + public static final String RdfSearchAction = "/action/package_search?q=rdf"; + public static final String RdfSearchQuery = CkanApiEndpoint + RdfSearchAction; + public static final String RdfCountQuery = RdfSearchQuery + "&rows=1"; public CkanSeedGeneratorImpl(Frontier frontier) { super(frontier); diff --git a/squirrel.frontier/src/test/java/org/dice_research/squirrel/model/RDBConnectorTest.java b/squirrel.frontier/src/test/java/org/dice_research/squirrel/model/RDBConnectorTest.java index 4ddb27b91..cc2c5bc69 100644 --- a/squirrel.frontier/src/test/java/org/dice_research/squirrel/model/RDBConnectorTest.java +++ b/squirrel.frontier/src/test/java/org/dice_research/squirrel/model/RDBConnectorTest.java @@ -1,31 +1,24 @@ package org.dice_research.squirrel.model; -import com.rethinkdb.RethinkDB; -import com.rethinkdb.gen.ast.Map; -import com.rethinkdb.gen.exc.ReqlDriverError; -import com.rethinkdb.model.MapObject; -import com.rethinkdb.net.Connection; -import com.rethinkdb.net.Cursor; +import java.io.IOException; +import java.net.InetAddress; +import java.net.URI; +import java.net.UnknownHostException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; import org.dice_research.squirrel.RethinkDBBasedTest; import org.dice_research.squirrel.data.uri.CrawleableUri; import org.dice_research.squirrel.data.uri.CrawleableUriFactory4Tests; import org.dice_research.squirrel.data.uri.UriType; import org.dice_research.squirrel.data.uri.UriUtils; -import org.dice_research.squirrel.model.RDBConnector; import org.junit.After; import org.junit.Before; import org.junit.Test; -import java.io.BufferedReader; -import java.io.IOException; -import java.io.InputStreamReader; -import java.net.InetAddress; -import java.net.URI; -import java.net.UnknownHostException; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.List; +import com.rethinkdb.model.MapObject; +import com.rethinkdb.net.Cursor; /** * Created by ivan on 8/18/16. diff --git a/squirrel.mockup/src/main/java/org/dice_research/squirrel/simulation/ScenarioBasedTest.java b/squirrel.mockup/src/main/java/org/dice_research/squirrel/simulation/ScenarioBasedTest.java index 2bee4be89..30d31024f 100644 --- a/squirrel.mockup/src/main/java/org/dice_research/squirrel/simulation/ScenarioBasedTest.java +++ b/squirrel.mockup/src/main/java/org/dice_research/squirrel/simulation/ScenarioBasedTest.java @@ -45,7 +45,9 @@ public static Collection data() throws Exception { String server2Url = "http://127.0.0.2:" + SERVER_PORT; String server3Url = "http://127.0.0.3:" + SERVER_PORT; - Model model1, model2, model3; + Model model1; + Model model2; + Model model3; /* * Simple scenario in which resource1 is the seed and points to resource2 which * points to resource3. 
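As a note on the scenario described in the comment above, such a chain of models can be sketched with Jena as follows (a hedged illustration: the URIs, the seeAlso property and the class name are invented, and the actual test builds its models differently):

import org.apache.jena.rdf.model.Model;
import org.apache.jena.rdf.model.ModelFactory;
import org.apache.jena.vocabulary.RDFS;

public class ScenarioChainSketch {
    public static void main(String[] args) {
        // resource1 on server 1 points to resource2 on server 2
        Model model1 = ModelFactory.createDefaultModel();
        model1.add(model1.createResource("http://127.0.0.1:8080/resource1"),
                RDFS.seeAlso,
                model1.createResource("http://127.0.0.2:8080/resource2"));

        // resource2 points on to resource3 on server 3, completing the chain
        Model model2 = ModelFactory.createDefaultModel();
        model2.add(model2.createResource("http://127.0.0.2:8080/resource2"),
                RDFS.seeAlso,
                model2.createResource("http://127.0.0.3:8080/resource3"));

        // print the first model for inspection
        model1.write(System.out, "N3");
    }
}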
diff --git a/squirrel.web-api/src/main/java/com/SquirrelWebObject.java b/squirrel.web-api/src/main/java/com/SquirrelWebObject.java index 7392be6ce..a02557f08 100644 --- a/squirrel.web-api/src/main/java/com/SquirrelWebObject.java +++ b/squirrel.web-api/src/main/java/com/SquirrelWebObject.java @@ -55,7 +55,7 @@ public SquirrelWebObject() { SquirrelWebObject.IDCOUNTER++; } - private String ListToString(List list) { + private String listToString(List list) { final StringBuilder ret = new StringBuilder(""); if (list != null) list.forEach(e -> ret.append("").append(e).append("")); @@ -63,10 +63,10 @@ private String ListToString(List list) { return ret.toString(); } - private String MapToString(Map> map) { + private String mapToString(Map> map) { StringBuilder ret = new StringBuilder(""); if (map != null) { - map.forEach((k, v) -> ret.append("").append(k).append("").append(ListToString(v)).append("")); + map.forEach((k, v) -> ret.append("").append(k).append("").append(listToString(v)).append("")); } ret.append(""); @@ -74,7 +74,7 @@ private String MapToString(Map> map) { } @SuppressWarnings("unchecked") - private List StringToList(String string) { + private List stringToList(String string) { if (string == null) return Collections.EMPTY_LIST; @@ -106,7 +106,7 @@ private List StringToList(String string) { } @SuppressWarnings("unchecked") - private Map> StringToMap(String string) { + private Map> stringToMap(String string) { if (string == null) return Collections.EMPTY_MAP; @@ -125,7 +125,7 @@ private Map> StringToMap(String string) { for (int i = 0; i < string.length(); i++) { if (string.charAt(i) == '>' && i > 3) { if (string.startsWith(" getPendingURIs() { - List ret = StringToList(pendingURIs); + List ret = stringToList(pendingURIs); List error = isReadable(ret); if (error == null) { return ret; @@ -239,7 +239,7 @@ public List getPendingURIs() { } public int getCountOfPendingURIs() { - List ret = StringToList(pendingURIs); + List ret = stringToList(pendingURIs); List error = isReadable(ret); if (error == null) { return ret.size(); @@ -249,7 +249,7 @@ public int getCountOfPendingURIs() { } public List getNextCrawledURIs() { - List ret = StringToList(nextCrawledURIs); + List ret = stringToList(nextCrawledURIs); List error = isReadable(ret); if (error == null) { return ret; @@ -269,7 +269,7 @@ public List getNextCrawledURIs() { // } public Map> getIpStringListMap() { - Map> ret = StringToMap(IPMapPendingURis); + Map> ret = stringToMap(IPMapPendingURis); Map> error = isReadable(ret); if (error == null) { @@ -339,12 +339,12 @@ private void isWritable() throws IllegalAccessException { public void setPendingURIs(List pendingURIs) throws IllegalAccessException { isWritable(); - this.pendingURIs = ListToString(pendingURIs); + this.pendingURIs = listToString(pendingURIs); } public void setIPMapPendingURis(Map> IPMapPendingURis) throws IllegalAccessException { isWritable(); - this.IPMapPendingURis = MapToString(IPMapPendingURis); + this.IPMapPendingURis = mapToString(IPMapPendingURis); } // public void setCrawledURIs(List crawledURIs) throws IllegalAccessException { @@ -369,7 +369,7 @@ public void setCountOfCrawledURIs(int countOfCrawledURIs) throws IllegalAccessEx public void setNextCrawledURIs(List nextCrawledURIs) throws IllegalAccessException { isWritable(); - this.nextCrawledURIs = ListToString(nextCrawledURIs); + this.nextCrawledURIs = listToString(nextCrawledURIs); } public void setRuntimeInSeconds(long runtimeInSeconds) throws IllegalAccessException { diff --git 
a/squirrel.worker/src/main/java/org/dice_research/squirrel/analyzer/impl/html/scraper/HtmlScraper.java b/squirrel.worker/src/main/java/org/dice_research/squirrel/analyzer/impl/html/scraper/HtmlScraper.java index 5b17736e8..3d02beae6 100644 --- a/squirrel.worker/src/main/java/org/dice_research/squirrel/analyzer/impl/html/scraper/HtmlScraper.java +++ b/squirrel.worker/src/main/java/org/dice_research/squirrel/analyzer/impl/html/scraper/HtmlScraper.java @@ -142,7 +142,7 @@ public List scrape(String uri, File filetToScrape) throws Exception { if (m.find()) { Map resources = (Map) entry.getValue() .get(YamlFileAtributes.RESOURCES); - listTriples.addAll(scrapeDownloadLink(resources, filetToScrape, uri)); + listTriples.addAll(scrapeDownloadLink(resources, filetToScrape)); break; } } catch (Exception e) { @@ -202,7 +202,7 @@ private List updateRelationship(List listTriples) { } @SuppressWarnings("unchecked") - private Set scrapeDownloadLink(Map resources, File htmlFile, String uri) throws Exception { + private Set scrapeDownloadLink(Map resources, File htmlFile) throws Exception { this.doc = Jsoup.parse(htmlFile, "UTF-8"); Set triples = new LinkedHashSet(); diff --git a/squirrel.worker/src/main/java/org/dice_research/squirrel/analyzer/impl/html/scraper/YamlFile.java b/squirrel.worker/src/main/java/org/dice_research/squirrel/analyzer/impl/html/scraper/YamlFile.java index 656174734..aac350e52 100644 --- a/squirrel.worker/src/main/java/org/dice_research/squirrel/analyzer/impl/html/scraper/YamlFile.java +++ b/squirrel.worker/src/main/java/org/dice_research/squirrel/analyzer/impl/html/scraper/YamlFile.java @@ -7,9 +7,6 @@ */ public class YamlFile implements Cloneable { - protected YamlFile() { - - } private Map> file_descriptor; diff --git a/squirrel.worker/src/main/java/org/dice_research/squirrel/configurator/RobotsManagerConfiguration.java b/squirrel.worker/src/main/java/org/dice_research/squirrel/configurator/RobotsManagerConfiguration.java index dcfdbaee6..c98a36468 100644 --- a/squirrel.worker/src/main/java/org/dice_research/squirrel/configurator/RobotsManagerConfiguration.java +++ b/squirrel.worker/src/main/java/org/dice_research/squirrel/configurator/RobotsManagerConfiguration.java @@ -22,6 +22,7 @@ public static RobotsManagerConfiguration getRobotsManagerConfiguration() throws } else { String msg = "Couldn't get " + MIN_DELAY_KEY + " from the environment. 
" + "The RobotsManager will use default minimum delay parameter."; + LOGGER.info(msg); return null; } } diff --git a/squirrel.worker/src/main/java/org/dice_research/squirrel/fetcher/ftp/FTPRecursiveFetcher.java b/squirrel.worker/src/main/java/org/dice_research/squirrel/fetcher/ftp/FTPRecursiveFetcher.java index 261d17102..934d802e7 100644 --- a/squirrel.worker/src/main/java/org/dice_research/squirrel/fetcher/ftp/FTPRecursiveFetcher.java +++ b/squirrel.worker/src/main/java/org/dice_research/squirrel/fetcher/ftp/FTPRecursiveFetcher.java @@ -1,23 +1,21 @@ package org.dice_research.squirrel.fetcher.ftp; -import org.apache.commons.net.ftp.FTPClient; -import org.apache.commons.net.ftp.FTPFile; - import java.io.File; import java.io.FileOutputStream; import java.io.IOException; -import java.io.OutputStream; import java.nio.file.Path; +import org.apache.commons.net.ftp.FTPClient; +import org.apache.commons.net.ftp.FTPFile; + public class FTPRecursiveFetcher { - private OutputStream output; private Path path; protected FTPRecursiveFetcher(Path path) throws IOException { this.path = path; - output = new FileOutputStream(File.createTempFile("fetched_", "", path.toFile())); + //OutputStream output = new FileOutputStream(File.createTempFile("fetched_", "", path.toFile())); } diff --git a/squirrel.worker/src/test/java/org/dice_research/squirrel/analyzer/impl/MicrodataParserTest.java b/squirrel.worker/src/test/java/org/dice_research/squirrel/analyzer/impl/MicrodataParserTest.java index 7dacbed4d..6ff50b203 100644 --- a/squirrel.worker/src/test/java/org/dice_research/squirrel/analyzer/impl/MicrodataParserTest.java +++ b/squirrel.worker/src/test/java/org/dice_research/squirrel/analyzer/impl/MicrodataParserTest.java @@ -49,6 +49,12 @@ public class MicrodataParserTest extends RDFParserTest { ClassLoader classLoader = getClass().getClassLoader(); public static Map> testresults = new HashMap>(); + @Parameter(0) + public String testData; + @Parameter(1) + public String resultData; + @Rule public TestName test = new TestName(); + // static double[] truepositiv = new double[data().size()]; // static double[] falsenegativ = new double[data().size()]; // static double[] falsepositiv = new double[data().size()]; @@ -57,11 +63,7 @@ public class MicrodataParserTest extends RDFParserTest { public static void initialization () throws URISyntaxException { } - @Parameter(0) - public String testData; - @Parameter(1) - public String resultData; - @Rule public TestName test = new TestName(); + @Parameters(name = "{index},{0},{1}") diff --git a/squirrel.worker/src/test/java/org/dice_research/squirrel/analyzer/impl/RDFaParserTest.java b/squirrel.worker/src/test/java/org/dice_research/squirrel/analyzer/impl/RDFaParserTest.java index a14d7cf1a..193a3a6b1 100644 --- a/squirrel.worker/src/test/java/org/dice_research/squirrel/analyzer/impl/RDFaParserTest.java +++ b/squirrel.worker/src/test/java/org/dice_research/squirrel/analyzer/impl/RDFaParserTest.java @@ -25,7 +25,6 @@ import org.dice_research.squirrel.data.uri.serialize.java.GzipJavaUriSerializer; import org.dice_research.squirrel.sink.impl.mem.InMemorySink; import org.junit.AfterClass; -import org.junit.BeforeClass; import org.junit.Ignore; import org.junit.Rule; import org.junit.Test; @@ -40,8 +39,6 @@ public class RDFaParserTest extends RDFParserTest { private static String context = "http://rdfa.info/test-suite/test-cases/rdfa1.1/"; - private static String pathextension = "html_scraper_analyzer\\RDFaParserTestResources\\"; - private static String pathextensiontestsuit = 
"html_scraper_analyzer\\RDFaParserTestResources\\TestSuit\\"; private static String pathextensionsuit = "html_scraper_analyzer/RDFaParserTestResources/rdfa1.1/"; private static String pathextensionhtml4 = "html4/"; private static String pathextensionhtml5 = "html5/"; @@ -53,27 +50,22 @@ public class RDFaParserTest extends RDFParserTest { private static String pathextensionxml = "xml/"; private static Analyzer analyzer1; - private static Analyzer analyzer2; private UriCollector collector = new SimpleUriCollector(new GzipJavaUriSerializer()); private CrawleableUri curi; private static InMemorySink sink; ClassLoader classLoader = getClass().getClassLoader(); public static Map> testresults = new HashMap>(); -// static double[] truepositiv = new double[data().size()]; -// static double[] falsenegativ = new double[data().size()]; -// static double[] falsepositiv = new double[data().size()]; - - @BeforeClass - public static void initialization () throws URISyntaxException { - } - @Parameter(0) public String testData; @Parameter(1) public String resultData; @Rule public TestName test = new TestName(); - + +// static double[] truepositiv = new double[data().size()]; +// static double[] falsenegativ = new double[data().size()]; +// static double[] falsepositiv = new double[data().size()]; + @Parameters(name = "{index},{0},{1}") public static Collection data() { From 4c51893f0468632aa4048d09a1ceca1a7d4d761b Mon Sep 17 00:00:00 2001 From: Geraldo Date: Tue, 14 Jul 2020 15:36:41 +0200 Subject: [PATCH 091/102] codacy fix --- .../hashing/impl/ArrayHashValue.java | 160 +++++++++--------- .../data/uri/filter/MongoDBKnowUriFilter.java | 4 +- ...rator.java => Frontierquerygenerator.java} | 2 +- .../SparqlBasedOutDatedUriRetriever.java | 2 +- ...ory.java => Mongodbconnectionfactory.java} | 2 +- .../queue/ipbased/MongoDBIpBasedQueue.java | 4 +- .../frontier/impl/RecrawlingTest.java | 4 +- 7 files changed, 89 insertions(+), 89 deletions(-) rename squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/{FrontierQueryGenerator.java => Frontierquerygenerator.java} (97%) rename squirrel.frontier/src/main/java/org/dice_research/squirrel/mongodb/{MongoDBConnectionFactory.java => Mongodbconnectionfactory.java} (97%) diff --git a/squirrel.api/src/main/java/org/dice_research/squirrel/deduplication/hashing/impl/ArrayHashValue.java b/squirrel.api/src/main/java/org/dice_research/squirrel/deduplication/hashing/impl/ArrayHashValue.java index 837b4c3a3..3f5b6b81f 100644 --- a/squirrel.api/src/main/java/org/dice_research/squirrel/deduplication/hashing/impl/ArrayHashValue.java +++ b/squirrel.api/src/main/java/org/dice_research/squirrel/deduplication/hashing/impl/ArrayHashValue.java @@ -6,94 +6,94 @@ * A hash value as a Array of integers. */ public class ArrayHashValue implements HashValue { - /** - * - */ - private static final long serialVersionUID = 1L; + /** + * + */ + private static final long serialVersionUID = 1L; - /** - * The Array of HashValues. - */ - private Integer[] hashValues; + /** + * The Array of HashValues. + */ + private Integer[] hashValues; - /** - * The delimeter between the individual HashValues - */ - private static final String DELIMETER = ","; + /** + * The delimeter between the individual HashValues + */ + private static final String DELIMETER = ","; - /** - * Constructor. - */ - public ArrayHashValue() { + /** + * Constructor. + */ + public ArrayHashValue() { - } + } - /** - * Constructor. - * - * @param hashValues The Array of Hash values. 
- */ - public ArrayHashValue(Integer[] hashValues) { - this.hashValues = hashValues; - } + /** + * Constructor. + * + * @param hashValues The Array of Hash values. + */ + public ArrayHashValue(Integer[] hashValues) { + this.hashValues = hashValues; + } - @Override - public String encodeToString() { - StringBuilder sb = new StringBuilder(); - for (int i = 0; i < hashValues.length; i++) { - sb.append(hashValues[i]); - if (i < hashValues.length - 1) { - sb.append(DELIMETER); - } - } - return sb.toString(); - } + @Override + public String encodeToString() { + StringBuilder sb = new StringBuilder(); + for (int i = 0; i < hashValues.length; i++) { + sb.append(hashValues[i]); + if (i < hashValues.length - 1) { + sb.append(DELIMETER); + } + } + return sb.toString(); + } - @Override - public HashValue decodeFromString(String s) { - String[] array = s.split(DELIMETER); - Integer[] hashValues = new Integer[array.length]; - for (int i = 0; i < array.length; i++) { - hashValues[i] = Integer.parseInt(array[i]); - } - return new ArrayHashValue(hashValues); - } + @Override + public HashValue decodeFromString(String s) { + String[] array = s.split(DELIMETER); + Integer[] hashValues = new Integer[array.length]; + for (int i = 0; i < array.length; i++) { + hashValues[i] = Integer.parseInt(array[i]); + } + return new ArrayHashValue(hashValues); + } - @Override - public boolean equals(Object obj) { - if (obj instanceof ArrayHashValue) { - ArrayHashValue arrayHashValue = (ArrayHashValue) obj; - if (hashValues.length != arrayHashValue.hashValues.length) { - return false; - } - for (int i = 0; i < hashValues.length; i++) { - if (hashValues[i] == null ^ arrayHashValue.hashValues[i] == null) { - return false; - } else if (!(hashValues[i] == null && arrayHashValue.hashValues[i] == null)) { - if (!hashValues[i].equals(arrayHashValue.hashValues[i])) { - // in this case they are equal - return false; - } - } - } + @Override + public boolean equals(Object obj) { + if (obj instanceof ArrayHashValue) { + ArrayHashValue arrayHashValue = (ArrayHashValue) obj; + if (hashValues.length != arrayHashValue.hashValues.length) { + return false; + } + for (int i = 0; i < hashValues.length; i++) { + if (hashValues[i] == null ^ arrayHashValue.hashValues[i] == null) { + return false; + } else if (!(hashValues[i] == null && arrayHashValue.hashValues[i] == null) + && !hashValues[i].equals(arrayHashValue.hashValues[i])) { + // in this case they are equal + return false; - return true; - } - return false; - } + } + } - @Override - public String toString() { - StringBuilder stringBuilder = new StringBuilder(); - stringBuilder.append("ArrayHashValue ["); - for (int i = 0; i < hashValues.length; i++) { - if (hashValues[i] != null) { - stringBuilder.append(hashValues[i] + DELIMETER); - } else { - stringBuilder.append("null" + DELIMETER); - } - } - stringBuilder.append("]"); - return stringBuilder.toString(); - } + return true; + } + return false; + } + + @Override + public String toString() { + StringBuilder stringBuilder = new StringBuilder(); + stringBuilder.append("ArrayHashValue ["); + for (int i = 0; i < hashValues.length; i++) { + if (hashValues[i] != null) { + stringBuilder.append(hashValues[i] + DELIMETER); + } else { + stringBuilder.append("null" + DELIMETER); + } + } + stringBuilder.append("]"); + return stringBuilder.toString(); + } } diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/data/uri/filter/MongoDBKnowUriFilter.java 
b/squirrel.frontier/src/main/java/org/dice_research/squirrel/data/uri/filter/MongoDBKnowUriFilter.java index 04d561452..103841ba1 100644 --- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/data/uri/filter/MongoDBKnowUriFilter.java +++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/data/uri/filter/MongoDBKnowUriFilter.java @@ -18,7 +18,7 @@ import org.dice_research.squirrel.deduplication.hashing.HashValue; import org.dice_research.squirrel.deduplication.hashing.UriHashCustodian; import org.dice_research.squirrel.frontier.impl.FrontierImpl; -import org.dice_research.squirrel.mongodb.MongoDBConnectionFactory; +import org.dice_research.squirrel.mongodb.Mongodbconnectionfactory; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -68,7 +68,7 @@ public MongoDBKnowUriFilter(String hostName, Integer port) { LOGGER.info("Filter Persistance: " + PERSIST); - this.client = MongoDBConnectionFactory.getConnection(hostName, port); + this.client = Mongodbconnectionfactory.getConnection(hostName, port); } diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/FrontierQueryGenerator.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/Frontierquerygenerator.java similarity index 97% rename from squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/FrontierQueryGenerator.java rename to squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/Frontierquerygenerator.java index cc5727af1..5c9eb3b81 100644 --- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/FrontierQueryGenerator.java +++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/Frontierquerygenerator.java @@ -6,7 +6,7 @@ import java.text.SimpleDateFormat; import java.util.Calendar; -public class FrontierQueryGenerator { +public class Frontierquerygenerator { /** * Return outdated uris by comparing their endtime stamps. * @return All triples with time stamp in the default graph. 
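The getOutdatedUrisQuery method provided by the renamed class above (and used by SparqlBasedOutDatedUriRetriever below) builds a SPARQL SELECT query that filters stored end timestamps against a cut-off date. The patch does not show the query text itself, so the following is a minimal sketch, assuming Jena's ParameterizedSparqlString; the property URI and the timestamp format are assumptions for illustration, not the vocabulary the project actually uses.

import java.text.SimpleDateFormat;
import java.util.Calendar;

import org.apache.jena.datatypes.xsd.XSDDatatype;
import org.apache.jena.query.ParameterizedSparqlString;
import org.apache.jena.query.Query;

public class OutdatedUriQuerySketch {

    // Hypothetical stand-in for Frontierquerygenerator.getOutdatedUrisQuery(Calendar).
    public static Query getOutdatedUrisQuery(Calendar date) {
        // xsd:dateTime-shaped cut-off; the real implementation may format dates differently.
        String cutoff = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss").format(date.getTime());
        ParameterizedSparqlString pss = new ParameterizedSparqlString();
        pss.setCommandText("SELECT DISTINCT ?uri WHERE { "
                + "?uri <http://example.org/squirrel/endedAtTime> ?endtime . "
                + "FILTER(?endtime < ?cutoff) }");
        // Bind the ?cutoff variable to a typed literal before parsing the query.
        pss.setLiteral("cutoff", cutoff, XSDDatatype.XSDdateTime);
        return pss.asQuery();
    }
}

A caller such as SparqlBasedOutDatedUriRetriever would then hand the returned Query to queryExecFactory.createQueryExecution(...) and iterate over the ?uri bindings of execSelect(), as the hunks below do.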
diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/SparqlBasedOutDatedUriRetriever.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/SparqlBasedOutDatedUriRetriever.java index 2fb595e0b..a898ae79f 100644 --- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/SparqlBasedOutDatedUriRetriever.java +++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/SparqlBasedOutDatedUriRetriever.java @@ -93,7 +93,7 @@ public void setCredentials(AuthScope arg0, Credentials arg1) { public List getUriToRecrawl() { Calendar date = Calendar.getInstance(); date.add(Calendar.DAY_OF_YEAR, 7); - Query getOutdatedUrisQuery = FrontierQueryGenerator.getOutdatedUrisQuery(date); + Query getOutdatedUrisQuery = Frontierquerygenerator.getOutdatedUrisQuery(date); QueryExecution qe = queryExecFactory.createQueryExecution(getOutdatedUrisQuery); ResultSet rs = qe.execSelect(); while (rs.hasNext()) { diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/mongodb/MongoDBConnectionFactory.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/mongodb/Mongodbconnectionfactory.java similarity index 97% rename from squirrel.frontier/src/main/java/org/dice_research/squirrel/mongodb/MongoDBConnectionFactory.java rename to squirrel.frontier/src/main/java/org/dice_research/squirrel/mongodb/Mongodbconnectionfactory.java index 87afceea4..cb1f97c26 100644 --- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/mongodb/MongoDBConnectionFactory.java +++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/mongodb/Mongodbconnectionfactory.java @@ -13,7 +13,7 @@ * */ -public class MongoDBConnectionFactory { +public class Mongodbconnectionfactory { /** * Returns a MongoClient based on host and port diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/queue/ipbased/MongoDBIpBasedQueue.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/queue/ipbased/MongoDBIpBasedQueue.java index 4dc388032..df0cea0f8 100644 --- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/queue/ipbased/MongoDBIpBasedQueue.java +++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/queue/ipbased/MongoDBIpBasedQueue.java @@ -14,7 +14,7 @@ import org.dice_research.squirrel.data.uri.CrawleableUri; import org.dice_research.squirrel.data.uri.serialize.Serializer; import org.dice_research.squirrel.data.uri.serialize.java.SnappyJavaUriSerializer; -import org.dice_research.squirrel.mongodb.MongoDBConnectionFactory; +import org.dice_research.squirrel.mongodb.Mongodbconnectionfactory; import org.dice_research.squirrel.queue.AbstractIpAddressBasedQueue; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -63,7 +63,7 @@ public MongoDBIpBasedQueue(String hostName, Integer port, Serializer serializer, this.serializer = serializer; - this.client = MongoDBConnectionFactory.getConnection(hostName, port); + this.client = Mongodbconnectionfactory.getConnection(hostName, port); } diff --git a/squirrel.frontier/src/test/java/org/dice_research/squirrel/frontier/impl/RecrawlingTest.java b/squirrel.frontier/src/test/java/org/dice_research/squirrel/frontier/impl/RecrawlingTest.java index 3955bf904..587064259 100644 --- a/squirrel.frontier/src/test/java/org/dice_research/squirrel/frontier/impl/RecrawlingTest.java +++ b/squirrel.frontier/src/test/java/org/dice_research/squirrel/frontier/impl/RecrawlingTest.java @@ -16,7 +16,7 @@ import 
org.apache.jena.query.ResultSet; import org.apache.jena.rdf.model.ModelFactory; import org.apache.jena.rdf.model.RDFNode; -import org.dice_research.squirrel.frontier.recrawling.FrontierQueryGenerator; +import org.dice_research.squirrel.frontier.recrawling.Frontierquerygenerator; import org.junit.Test; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -37,7 +37,7 @@ public void recrawling(){ date.set(Calendar.MONTH, Calendar.JANUARY); date.set(Calendar.DAY_OF_MONTH, 3); date.set(Calendar.YEAR, 2020); - Query getOutdatedUrisQuery = FrontierQueryGenerator.getOutdatedUrisQuery(date); + Query getOutdatedUrisQuery = Frontierquerygenerator.getOutdatedUrisQuery(date); QueryExecution qe = queryExecFactory.createQueryExecution(getOutdatedUrisQuery); ResultSet rs = qe.execSelect(); assertTrue("There should be at least one result", rs.hasNext()); From 458c306b68c776778d2bdaac36aafb18b46dbd3e Mon Sep 17 00:00:00 2001 From: Geraldo Date: Tue, 14 Jul 2020 15:45:04 +0200 Subject: [PATCH 092/102] removed unused imports --- .../squirrel/sink/impl/mem/InMemorySink.java | 21 ++++++++++----- .../squirrel/uri/processing/UriProcessor.java | 2 +- .../uri/filter/MongoDBKnownUriFilterTest.java | 27 ++++++------------- .../sink/impl/hdt/HdtBasedSinkTest.java | 4 +-- 4 files changed, 25 insertions(+), 29 deletions(-) diff --git a/squirrel.api/src/main/java/org/dice_research/squirrel/sink/impl/mem/InMemorySink.java b/squirrel.api/src/main/java/org/dice_research/squirrel/sink/impl/mem/InMemorySink.java index ab6770ffd..67743948d 100644 --- a/squirrel.api/src/main/java/org/dice_research/squirrel/sink/impl/mem/InMemorySink.java +++ b/squirrel.api/src/main/java/org/dice_research/squirrel/sink/impl/mem/InMemorySink.java @@ -1,12 +1,22 @@ package org.dice_research.squirrel.sink.impl.mem; -import com.google.common.collect.Sets; -import com.google.common.collect.Sets.SetView; +import java.io.IOException; +import java.io.InputStream; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; import org.apache.commons.io.IOUtils; import org.apache.jena.graph.Node; import org.apache.jena.graph.Triple; -import org.apache.jena.rdf.model.*; +import org.apache.jena.rdf.model.AnonId; +import org.apache.jena.rdf.model.Model; +import org.apache.jena.rdf.model.ModelFactory; +import org.apache.jena.rdf.model.Property; +import org.apache.jena.rdf.model.Resource; import org.apache.jena.sparql.core.Quad; import org.dice_research.squirrel.Constants; import org.dice_research.squirrel.data.uri.CrawleableUri; @@ -14,9 +24,8 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.io.IOException; -import java.io.InputStream; -import java.util.*; +import com.google.common.collect.Sets; +import com.google.common.collect.Sets.SetView; /** * This is a simple in-memory implementation of a sink that can be used for diff --git a/squirrel.api/src/main/java/org/dice_research/squirrel/uri/processing/UriProcessor.java b/squirrel.api/src/main/java/org/dice_research/squirrel/uri/processing/UriProcessor.java index ed5083af2..c2addb39d 100644 --- a/squirrel.api/src/main/java/org/dice_research/squirrel/uri/processing/UriProcessor.java +++ b/squirrel.api/src/main/java/org/dice_research/squirrel/uri/processing/UriProcessor.java @@ -61,7 +61,7 @@ private boolean isStringMatchRegexps(String string, String[] regexs) { public CrawleableUri recognizeInetAddress(CrawleableUri uri) throws UnknownHostException { String host; InetAddress ipAddress; - if 
(!(uri.getUri() == null)) { + if (uri.getUri() != null) { host = uri.getUri().getHost(); ipAddress = InetAddress.getByName(host); uri.setIpAddress(ipAddress); diff --git a/squirrel.frontier/src/test/java/org/dice_research/squirrel/data/uri/filter/MongoDBKnownUriFilterTest.java b/squirrel.frontier/src/test/java/org/dice_research/squirrel/data/uri/filter/MongoDBKnownUriFilterTest.java index 8b613158e..f0cf361b9 100644 --- a/squirrel.frontier/src/test/java/org/dice_research/squirrel/data/uri/filter/MongoDBKnownUriFilterTest.java +++ b/squirrel.frontier/src/test/java/org/dice_research/squirrel/data/uri/filter/MongoDBKnownUriFilterTest.java @@ -1,31 +1,21 @@ package org.dice_research.squirrel.data.uri.filter; -import com.mongodb.MongoClient; -import com.mongodb.client.MongoDatabase; -import com.mongodb.client.model.Indexes; -import com.rethinkdb.RethinkDB; -import com.rethinkdb.net.Cursor; +import java.net.InetAddress; +import java.net.URI; +import java.net.URISyntaxException; +import java.net.UnknownHostException; +import java.util.List; import org.dice_research.squirrel.MongoDBBasedTest; -import org.dice_research.squirrel.MongoDBMockTest; -import org.dice_research.squirrel.RethinkDBBasedTest; -import org.dice_research.squirrel.RethinkDBMockTest; import org.dice_research.squirrel.data.uri.CrawleableUri; -import org.dice_research.squirrel.data.uri.filter.RDBKnownUriFilter; -import org.dice_research.squirrel.frontier.impl.FrontierImpl; -import org.dice_research.squirrel.model.RDBConnector; -import org.junit.After; import org.junit.Assert; import org.junit.Before; import org.junit.Ignore; import org.junit.Test; -import java.io.IOException; -import java.net.InetAddress; -import java.net.URI; -import java.net.URISyntaxException; -import java.net.UnknownHostException; -import java.util.List; +import com.mongodb.MongoClient; +import com.mongodb.client.MongoDatabase; +import com.mongodb.client.model.Indexes; /** @@ -43,7 +33,6 @@ public class MongoDBKnownUriFilterTest { /** * For functionality regarding the starting of rethinkdb container. 
*/ - private MongoDBBasedTest mongoDBBasedTest; private MongoClient client; @Before diff --git a/squirrel.worker/src/test/java/org/dice_research/squirrel/sink/impl/hdt/HdtBasedSinkTest.java b/squirrel.worker/src/test/java/org/dice_research/squirrel/sink/impl/hdt/HdtBasedSinkTest.java index b0e616158..3fc05965f 100644 --- a/squirrel.worker/src/test/java/org/dice_research/squirrel/sink/impl/hdt/HdtBasedSinkTest.java +++ b/squirrel.worker/src/test/java/org/dice_research/squirrel/sink/impl/hdt/HdtBasedSinkTest.java @@ -25,11 +25,9 @@ import org.dice_research.squirrel.data.uri.CrawleableUri; import org.dice_research.squirrel.data.uri.UriUtils; import org.dice_research.squirrel.sink.Sink; -import org.dice_research.squirrel.sink.impl.hdt.HdtBasedSink; import org.junit.After; import org.junit.Assert; import org.junit.Before; -import org.junit.Ignore; import org.junit.Test; import org.rdfhdt.hdt.exceptions.NotFoundException; import org.rdfhdt.hdt.hdt.HDT; @@ -52,7 +50,7 @@ public class HdtBasedSinkTest { protected File tempDirectory = null; private Model[] models; private URI[] modelUris; - + @Before public void findTempDir() throws IOException, URISyntaxException { tempDirectory = File.createTempFile("HdtBasedSinkTest", ".tmp"); From 44085fa6857156cc587b38c7dce2a00d93d89d7f Mon Sep 17 00:00:00 2001 From: Geraldo Date: Tue, 14 Jul 2020 16:07:23 +0200 Subject: [PATCH 093/102] removed unused classes and imports --- .../uri/filter/InMemoryKnownUriFilter.java | 6 +- .../uri/serialize/gson/GsonUriSerializer.java | 408 +++++++++--------- 2 files changed, 198 insertions(+), 216 deletions(-) diff --git a/squirrel.api/src/main/java/org/dice_research/squirrel/data/uri/filter/InMemoryKnownUriFilter.java b/squirrel.api/src/main/java/org/dice_research/squirrel/data/uri/filter/InMemoryKnownUriFilter.java index 00dc10f37..2edad6c44 100644 --- a/squirrel.api/src/main/java/org/dice_research/squirrel/data/uri/filter/InMemoryKnownUriFilter.java +++ b/squirrel.api/src/main/java/org/dice_research/squirrel/data/uri/filter/InMemoryKnownUriFilter.java @@ -99,9 +99,9 @@ public long count() { private class UriInfo { - long lastCrawlTimestamp; - long nextCrawlTimestamp; - boolean crawlingInProcess; + private long lastCrawlTimestamp; + private long nextCrawlTimestamp; + private boolean crawlingInProcess; UriInfo(long lastCrawlTimestamp, long nextCrawlTimestamp, boolean crawlingInProcess) { this.lastCrawlTimestamp = lastCrawlTimestamp; diff --git a/squirrel.api/src/main/java/org/dice_research/squirrel/data/uri/serialize/gson/GsonUriSerializer.java b/squirrel.api/src/main/java/org/dice_research/squirrel/data/uri/serialize/gson/GsonUriSerializer.java index dfea4f9d1..e5d6ce49f 100644 --- a/squirrel.api/src/main/java/org/dice_research/squirrel/data/uri/serialize/gson/GsonUriSerializer.java +++ b/squirrel.api/src/main/java/org/dice_research/squirrel/data/uri/serialize/gson/GsonUriSerializer.java @@ -1,26 +1,11 @@ package org.dice_research.squirrel.data.uri.serialize.gson; -import java.io.IOException; -import java.net.InetAddress; -import java.net.URI; -import java.net.URISyntaxException; -import java.util.HashMap; -import java.util.Map; - import org.dice_research.squirrel.Constants; import org.dice_research.squirrel.data.uri.CrawleableUri; -import org.dice_research.squirrel.data.uri.UriType; import org.dice_research.squirrel.data.uri.serialize.Serializer; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import com.carrotsearch.hppc.ByteArrayList; import com.google.gson.Gson; import com.google.gson.GsonBuilder; -import 
com.google.gson.TypeAdapter; -import com.google.gson.stream.JsonReader; -import com.google.gson.stream.JsonToken; -import com.google.gson.stream.JsonWriter; /** * A serializer that uses {@link Gson} to serialize URIs. Kept for backwards @@ -31,9 +16,8 @@ */ public class GsonUriSerializer implements Serializer { - private static final Logger LOGGER = LoggerFactory.getLogger(GsonUriSerializer.class); +// private static final Logger LOGGER = LoggerFactory.getLogger(GsonUriSerializer.class); - private GsonBuilder builder; private Gson gson; public GsonUriSerializer() { @@ -41,9 +25,7 @@ public GsonUriSerializer() { } public GsonUriSerializer(GsonBuilder builder) { - this.builder = builder; - this.builder.registerTypeAdapter(CrawleableUri.class, new CrawleableUriAdapter()); - gson = this.builder.create(); + gson = builder.create(); } @Override @@ -57,197 +39,197 @@ public T deserialize(byte[] data) { return (T) gson.fromJson(new String(data, Constants.DEFAULT_CHARSET), CrawleableUri.class); } - private class CrawleableUriAdapter extends TypeAdapter { - - private static final String URI_KEY = "uri"; - private static final String ADDRESS_KEY = "address"; - private static final String ADDRESS_IP_KEY = "ip"; - private static final String ADDRESS_HOST_KEY = "host"; - private static final String URI_TYPE_KEY = "type"; - private static final String DATA_KEY = "data"; - private static final String DATA_NAME_KEY = "name"; - private static final String DATA_VALUE_KEY = "value"; - private static final String DATA_VALUE_TYPE_KEY = "type"; - - @SuppressWarnings("deprecation") - @Override - public void write(JsonWriter out, CrawleableUri uri) throws IOException { - out.beginObject(); - out.name(URI_KEY); - out.value(uri.getUri().toString()); - out.name(URI_TYPE_KEY); - out.value(uri.getType().name()); - if (uri.getIpAddress() != null) { - out.name(ADDRESS_KEY); - writeInetAddress(out, uri.getIpAddress()); - } - out.name(DATA_KEY); - out.beginArray(); - Map data = uri.getData(); - for (String key : data.keySet()) { - writeDataEntry(out, key, data.get(key)); - } - out.endArray(); - out.endObject(); - } - - private void writeInetAddress(JsonWriter out, InetAddress ipAddress) throws IOException { - out.beginObject(); - if (ipAddress.getHostName() != null) { - out.name(ADDRESS_HOST_KEY); - out.value(ipAddress.getHostName()); - } - out.name(ADDRESS_IP_KEY); - byte ip[] = ipAddress.getAddress(); - out.beginArray(); - for (int i = 0; i < ip.length; ++i) { - out.value(ip[i]); - } - out.endArray(); - out.endObject(); - } - - private void writeDataEntry(JsonWriter out, String key, Object value) throws IOException { - out.beginObject(); - out.name(DATA_NAME_KEY); - out.value(key); - out.name(DATA_VALUE_TYPE_KEY); - out.value(value.getClass().getCanonicalName()); - out.name(DATA_VALUE_KEY); - // use the gson instance of our surrounding class to serialize the - // object - out.jsonValue(gson.toJson(value)); - out.endObject(); - } - - @SuppressWarnings("deprecation") - @Override - public CrawleableUri read(JsonReader in) throws IOException { - in.beginObject(); - String uri = null; - String key; - InetAddress inetAddress = null; - UriType type = UriType.UNKNOWN; - Map data = new HashMap(); - while (in.peek() != JsonToken.END_OBJECT) { - key = in.nextName(); - switch (key) { - case URI_KEY: { - uri = in.nextString(); - break; - } - case URI_TYPE_KEY: { - type = UriType.valueOf(in.nextString()); - break; - } - case ADDRESS_KEY: { - inetAddress = readInetAddress(in); - break; - } - case DATA_KEY: { - in.beginArray(); - while 
(in.hasNext()) { - readDataObject(in, data); - } - in.endArray(); - break; - } - default: { - LOGGER.error( - "Got an unknown attribute name \"{}\" while parsing an CrawleableUri object. It will be ignored.", - key); - } - } - } - in.endObject(); - CrawleableUri result; - try { - result = new CrawleableUri(new URI(uri), inetAddress, type); - } catch (URISyntaxException e) { - throw new IOException(e); - } - result.setData(data); - return result; - } - - private InetAddress readInetAddress(JsonReader in) throws IOException { - in.beginObject(); - String host = null; - String key; - ByteArrayList ip = new ByteArrayList(); - while (in.peek() != JsonToken.END_OBJECT) { - key = in.nextName(); - switch (key) { - case ADDRESS_HOST_KEY: { - host = in.nextString(); - break; - } - case ADDRESS_IP_KEY: { - in.beginArray(); - while (in.hasNext()) { - ip.add((byte) in.nextLong()); - } - in.endArray(); - break; - } - default: { - LOGGER.error( - "Got an unknown attribute name \"{}\" while parsing an InetAddress object. It will be ignored.", - key); - } - } - } - in.endObject(); - if (host != null) { - return InetAddress.getByAddress(host, ip.toArray()); - } else { - return InetAddress.getByAddress(ip.toArray()); - } - } - - private void readDataObject(JsonReader in, Map data) throws IOException { - in.beginObject(); - String key; - String name = null; - String valueType = null; - Object value = null; - while (in.peek() != JsonToken.END_OBJECT) { - key = in.nextName(); - switch (key) { - case DATA_NAME_KEY: { - name = in.nextString(); - break; - } - case DATA_VALUE_TYPE_KEY: { - valueType = in.nextString(); - break; - } - case DATA_VALUE_KEY: { - if (valueType != null) { - try { - value = gson.fromJson(in, Class.forName(valueType)); - } catch (ClassNotFoundException e) { - throw new IOException(e); - } - } else { - LOGGER.error( - "Couldn't read Object of {} because the value type was not defined before reading the value. It will be skipped.", - name); - in.skipValue(); - } - break; - } - default: { - LOGGER.error("Got an unknown attribute name \"{}\" while parsing an object. 
It will be ignored.", - key); - } - } - } - if ((name != null) && (value != null)) { - data.put(name, value); - } - in.endObject(); - } - - } +// private class CrawleableUriAdapter extends TypeAdapter { +// +// private static final String URI_KEY = "uri"; +// private static final String ADDRESS_KEY = "address"; +// private static final String ADDRESS_IP_KEY = "ip"; +// private static final String ADDRESS_HOST_KEY = "host"; +// private static final String URI_TYPE_KEY = "type"; +// private static final String DATA_KEY = "data"; +// private static final String DATA_NAME_KEY = "name"; +// private static final String DATA_VALUE_KEY = "value"; +// private static final String DATA_VALUE_TYPE_KEY = "type"; +// +// @SuppressWarnings("deprecation") +// @Override +// public void write(JsonWriter out, CrawleableUri uri) throws IOException { +// out.beginObject(); +// out.name(URI_KEY); +// out.value(uri.getUri().toString()); +// out.name(URI_TYPE_KEY); +// out.value(uri.getType().name()); +// if (uri.getIpAddress() != null) { +// out.name(ADDRESS_KEY); +// writeInetAddress(out, uri.getIpAddress()); +// } +// out.name(DATA_KEY); +// out.beginArray(); +// Map data = uri.getData(); +// for (String key : data.keySet()) { +// writeDataEntry(out, key, data.get(key)); +// } +// out.endArray(); +// out.endObject(); +// } +// +// private void writeInetAddress(JsonWriter out, InetAddress ipAddress) throws IOException { +// out.beginObject(); +// if (ipAddress.getHostName() != null) { +// out.name(ADDRESS_HOST_KEY); +// out.value(ipAddress.getHostName()); +// } +// out.name(ADDRESS_IP_KEY); +// byte ip[] = ipAddress.getAddress(); +// out.beginArray(); +// for (int i = 0; i < ip.length; ++i) { +// out.value(ip[i]); +// } +// out.endArray(); +// out.endObject(); +// } +// +// private void writeDataEntry(JsonWriter out, String key, Object value) throws IOException { +// out.beginObject(); +// out.name(DATA_NAME_KEY); +// out.value(key); +// out.name(DATA_VALUE_TYPE_KEY); +// out.value(value.getClass().getCanonicalName()); +// out.name(DATA_VALUE_KEY); +// // use the gson instance of our surrounding class to serialize the +// // object +// out.jsonValue(gson.toJson(value)); +// out.endObject(); +// } +// +// @SuppressWarnings("deprecation") +// @Override +// public CrawleableUri read(JsonReader in) throws IOException { +// in.beginObject(); +// String uri = null; +// String key; +// InetAddress inetAddress = null; +// UriType type = UriType.UNKNOWN; +// Map data = new HashMap(); +// while (in.peek() != JsonToken.END_OBJECT) { +// key = in.nextName(); +// switch (key) { +// case URI_KEY: { +// uri = in.nextString(); +// break; +// } +// case URI_TYPE_KEY: { +// type = UriType.valueOf(in.nextString()); +// break; +// } +// case ADDRESS_KEY: { +// inetAddress = readInetAddress(in); +// break; +// } +// case DATA_KEY: { +// in.beginArray(); +// while (in.hasNext()) { +// readDataObject(in, data); +// } +// in.endArray(); +// break; +// } +// default: { +// LOGGER.error( +// "Got an unknown attribute name \"{}\" while parsing an CrawleableUri object. 
It will be ignored.", +// key); +// } +// } +// } +// in.endObject(); +// CrawleableUri result; +// try { +// result = new CrawleableUri(new URI(uri), inetAddress, type); +// } catch (URISyntaxException e) { +// throw new IOException(e); +// } +// result.setData(data); +// return result; +// } +// +// private InetAddress readInetAddress(JsonReader in) throws IOException { +// in.beginObject(); +// String host = null; +// String key; +// ByteArrayList ip = new ByteArrayList(); +// while (in.peek() != JsonToken.END_OBJECT) { +// key = in.nextName(); +// switch (key) { +// case ADDRESS_HOST_KEY: { +// host = in.nextString(); +// break; +// } +// case ADDRESS_IP_KEY: { +// in.beginArray(); +// while (in.hasNext()) { +// ip.add((byte) in.nextLong()); +// } +// in.endArray(); +// break; +// } +// default: { +// LOGGER.error( +// "Got an unknown attribute name \"{}\" while parsing an InetAddress object. It will be ignored.", +// key); +// } +// } +// } +// in.endObject(); +// if (host != null) { +// return InetAddress.getByAddress(host, ip.toArray()); +// } else { +// return InetAddress.getByAddress(ip.toArray()); +// } +// } +// +// private void readDataObject(JsonReader in, Map data) throws IOException { +// in.beginObject(); +// String key; +// String name = null; +// String valueType = null; +// Object value = null; +// while (in.peek() != JsonToken.END_OBJECT) { +// key = in.nextName(); +// switch (key) { +// case DATA_NAME_KEY: { +// name = in.nextString(); +// break; +// } +// case DATA_VALUE_TYPE_KEY: { +// valueType = in.nextString(); +// break; +// } +// case DATA_VALUE_KEY: { +// if (valueType != null) { +// try { +// value = gson.fromJson(in, Class.forName(valueType)); +// } catch (ClassNotFoundException e) { +// throw new IOException(e); +// } +// } else { +// LOGGER.error( +// "Couldn't read Object of {} because the value type was not defined before reading the value. It will be skipped.", +// name); +// in.skipValue(); +// } +// break; +// } +// default: { +// LOGGER.error("Got an unknown attribute name \"{}\" while parsing an object. 
It will be ignored.", +// key); +// } +// } +// } +// if ((name != null) && (value != null)) { +// data.put(name, value); +// } +// in.endObject(); +// } +// +// } } From 8971a2251007643da12c492ac032b319fc704b2b Mon Sep 17 00:00:00 2001 From: Geraldo Date: Tue, 14 Jul 2020 16:33:07 +0200 Subject: [PATCH 094/102] codacy quality review --- .../uri/serialize/gson/GsonUriSerializer.java | 404 +++++++++--------- .../hashing/UriHashCustodian.java | 2 +- .../squirrel/queue/DomainUriTypePair.java | 5 +- .../org/dice_research/squirrel/sink/Sink.java | 4 +- .../QuadBasedSink.java | 2 +- .../AdvancedTripleBasedSink.java | 2 +- .../TripleBasedSink.java | 2 +- .../dice_research/squirrel/vocab/VCard.java | 7 +- .../components/DeduplicatorComponent.java | 2 +- .../data/uri/filter/MongoDBKnowUriFilter.java | 6 +- .../impl/QueueBasedTerminationCheck.java | 7 +- .../frontier/utils/SimpleDomainExtractor.java | 152 +++---- .../graph/impl/TabSeparatedGraphLogger.java | 15 +- .../norm/WellKnownPathUriGeneratorTest.java | 2 +- .../impl/CkanSeedGeneratorImplTest.java | 24 +- .../impl/MicroformatMF2JAnalyzer.java | 10 +- .../impl/html/scraper/YamlFileAtributes.java | 10 +- .../impl/sparql/AbstractBufferingSink.java | 4 +- .../sink/impl/sparql/SparqlBasedSink.java | 2 +- .../squirrel/sink/impl/sparql/TDBSink.java | 2 +- 20 files changed, 332 insertions(+), 332 deletions(-) rename squirrel.api/src/main/java/org/dice_research/squirrel/sink/{quadBased => quadbased}/QuadBasedSink.java (90%) rename squirrel.api/src/main/java/org/dice_research/squirrel/sink/{tripleBased => triplebased}/AdvancedTripleBasedSink.java (91%) rename squirrel.api/src/main/java/org/dice_research/squirrel/sink/{tripleBased => triplebased}/TripleBasedSink.java (90%) diff --git a/squirrel.api/src/main/java/org/dice_research/squirrel/data/uri/serialize/gson/GsonUriSerializer.java b/squirrel.api/src/main/java/org/dice_research/squirrel/data/uri/serialize/gson/GsonUriSerializer.java index e5d6ce49f..65fe9e85a 100644 --- a/squirrel.api/src/main/java/org/dice_research/squirrel/data/uri/serialize/gson/GsonUriSerializer.java +++ b/squirrel.api/src/main/java/org/dice_research/squirrel/data/uri/serialize/gson/GsonUriSerializer.java @@ -1,11 +1,26 @@ package org.dice_research.squirrel.data.uri.serialize.gson; +import java.io.IOException; +import java.net.InetAddress; +import java.net.URI; +import java.net.URISyntaxException; +import java.util.HashMap; +import java.util.Map; + import org.dice_research.squirrel.Constants; import org.dice_research.squirrel.data.uri.CrawleableUri; +import org.dice_research.squirrel.data.uri.UriType; import org.dice_research.squirrel.data.uri.serialize.Serializer; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import com.carrotsearch.hppc.ByteArrayList; import com.google.gson.Gson; import com.google.gson.GsonBuilder; +import com.google.gson.TypeAdapter; +import com.google.gson.stream.JsonReader; +import com.google.gson.stream.JsonToken; +import com.google.gson.stream.JsonWriter; /** * A serializer that uses {@link Gson} to serialize URIs. 
Kept for backwards @@ -14,9 +29,10 @@ * @author Michael Röder (michael.roeder@uni-paderborn.de) * */ +@SuppressWarnings("deprecation") public class GsonUriSerializer implements Serializer { -// private static final Logger LOGGER = LoggerFactory.getLogger(GsonUriSerializer.class); + private static final Logger LOGGER = LoggerFactory.getLogger(GsonUriSerializer.class); private Gson gson; @@ -39,197 +55,197 @@ public T deserialize(byte[] data) { return (T) gson.fromJson(new String(data, Constants.DEFAULT_CHARSET), CrawleableUri.class); } -// private class CrawleableUriAdapter extends TypeAdapter { -// -// private static final String URI_KEY = "uri"; -// private static final String ADDRESS_KEY = "address"; -// private static final String ADDRESS_IP_KEY = "ip"; -// private static final String ADDRESS_HOST_KEY = "host"; -// private static final String URI_TYPE_KEY = "type"; -// private static final String DATA_KEY = "data"; -// private static final String DATA_NAME_KEY = "name"; -// private static final String DATA_VALUE_KEY = "value"; -// private static final String DATA_VALUE_TYPE_KEY = "type"; -// -// @SuppressWarnings("deprecation") -// @Override -// public void write(JsonWriter out, CrawleableUri uri) throws IOException { -// out.beginObject(); -// out.name(URI_KEY); -// out.value(uri.getUri().toString()); -// out.name(URI_TYPE_KEY); -// out.value(uri.getType().name()); -// if (uri.getIpAddress() != null) { -// out.name(ADDRESS_KEY); -// writeInetAddress(out, uri.getIpAddress()); -// } -// out.name(DATA_KEY); -// out.beginArray(); -// Map data = uri.getData(); -// for (String key : data.keySet()) { -// writeDataEntry(out, key, data.get(key)); -// } -// out.endArray(); -// out.endObject(); -// } -// -// private void writeInetAddress(JsonWriter out, InetAddress ipAddress) throws IOException { -// out.beginObject(); -// if (ipAddress.getHostName() != null) { -// out.name(ADDRESS_HOST_KEY); -// out.value(ipAddress.getHostName()); -// } -// out.name(ADDRESS_IP_KEY); -// byte ip[] = ipAddress.getAddress(); -// out.beginArray(); -// for (int i = 0; i < ip.length; ++i) { -// out.value(ip[i]); -// } -// out.endArray(); -// out.endObject(); -// } -// -// private void writeDataEntry(JsonWriter out, String key, Object value) throws IOException { -// out.beginObject(); -// out.name(DATA_NAME_KEY); -// out.value(key); -// out.name(DATA_VALUE_TYPE_KEY); -// out.value(value.getClass().getCanonicalName()); -// out.name(DATA_VALUE_KEY); -// // use the gson instance of our surrounding class to serialize the -// // object -// out.jsonValue(gson.toJson(value)); -// out.endObject(); -// } -// -// @SuppressWarnings("deprecation") -// @Override -// public CrawleableUri read(JsonReader in) throws IOException { -// in.beginObject(); -// String uri = null; -// String key; -// InetAddress inetAddress = null; -// UriType type = UriType.UNKNOWN; -// Map data = new HashMap(); -// while (in.peek() != JsonToken.END_OBJECT) { -// key = in.nextName(); -// switch (key) { -// case URI_KEY: { -// uri = in.nextString(); -// break; -// } -// case URI_TYPE_KEY: { -// type = UriType.valueOf(in.nextString()); -// break; -// } -// case ADDRESS_KEY: { -// inetAddress = readInetAddress(in); -// break; -// } -// case DATA_KEY: { -// in.beginArray(); -// while (in.hasNext()) { -// readDataObject(in, data); -// } -// in.endArray(); -// break; -// } -// default: { -// LOGGER.error( -// "Got an unknown attribute name \"{}\" while parsing an CrawleableUri object. 
It will be ignored.", -// key); -// } -// } -// } -// in.endObject(); -// CrawleableUri result; -// try { -// result = new CrawleableUri(new URI(uri), inetAddress, type); -// } catch (URISyntaxException e) { -// throw new IOException(e); -// } -// result.setData(data); -// return result; -// } -// -// private InetAddress readInetAddress(JsonReader in) throws IOException { -// in.beginObject(); -// String host = null; -// String key; -// ByteArrayList ip = new ByteArrayList(); -// while (in.peek() != JsonToken.END_OBJECT) { -// key = in.nextName(); -// switch (key) { -// case ADDRESS_HOST_KEY: { -// host = in.nextString(); -// break; -// } -// case ADDRESS_IP_KEY: { -// in.beginArray(); -// while (in.hasNext()) { -// ip.add((byte) in.nextLong()); -// } -// in.endArray(); -// break; -// } -// default: { -// LOGGER.error( -// "Got an unknown attribute name \"{}\" while parsing an InetAddress object. It will be ignored.", -// key); -// } -// } -// } -// in.endObject(); -// if (host != null) { -// return InetAddress.getByAddress(host, ip.toArray()); -// } else { -// return InetAddress.getByAddress(ip.toArray()); -// } -// } -// -// private void readDataObject(JsonReader in, Map data) throws IOException { -// in.beginObject(); -// String key; -// String name = null; -// String valueType = null; -// Object value = null; -// while (in.peek() != JsonToken.END_OBJECT) { -// key = in.nextName(); -// switch (key) { -// case DATA_NAME_KEY: { -// name = in.nextString(); -// break; -// } -// case DATA_VALUE_TYPE_KEY: { -// valueType = in.nextString(); -// break; -// } -// case DATA_VALUE_KEY: { -// if (valueType != null) { -// try { -// value = gson.fromJson(in, Class.forName(valueType)); -// } catch (ClassNotFoundException e) { -// throw new IOException(e); -// } -// } else { -// LOGGER.error( -// "Couldn't read Object of {} because the value type was not defined before reading the value. It will be skipped.", -// name); -// in.skipValue(); -// } -// break; -// } -// default: { -// LOGGER.error("Got an unknown attribute name \"{}\" while parsing an object. 
It will be ignored.", -// key); -// } -// } -// } -// if ((name != null) && (value != null)) { -// data.put(name, value); -// } -// in.endObject(); -// } -// -// } + private class CrawleableUriAdapter extends TypeAdapter { + + private static final String URI_KEY = "uri"; + private static final String ADDRESS_KEY = "address"; + private static final String ADDRESS_IP_KEY = "ip"; + private static final String ADDRESS_HOST_KEY = "host"; + private static final String URI_TYPE_KEY = "type"; + private static final String DATA_KEY = "data"; + private static final String DATA_NAME_KEY = "name"; + private static final String DATA_VALUE_KEY = "value"; + private static final String DATA_VALUE_TYPE_KEY = "type"; + + @SuppressWarnings("deprecation") + @Override + public void write(JsonWriter out, CrawleableUri uri) throws IOException { + out.beginObject(); + out.name(URI_KEY); + out.value(uri.getUri().toString()); + out.name(URI_TYPE_KEY); + out.value(uri.getType().name()); + if (uri.getIpAddress() != null) { + out.name(ADDRESS_KEY); + writeInetAddress(out, uri.getIpAddress()); + } + out.name(DATA_KEY); + out.beginArray(); + Map data = uri.getData(); + for (String key : data.keySet()) { + writeDataEntry(out, key, data.get(key)); + } + out.endArray(); + out.endObject(); + } + + private void writeInetAddress(JsonWriter out, InetAddress ipAddress) throws IOException { + out.beginObject(); + if (ipAddress.getHostName() != null) { + out.name(ADDRESS_HOST_KEY); + out.value(ipAddress.getHostName()); + } + out.name(ADDRESS_IP_KEY); + byte ip[] = ipAddress.getAddress(); + out.beginArray(); + for (int i = 0; i < ip.length; ++i) { + out.value(ip[i]); + } + out.endArray(); + out.endObject(); + } + + private void writeDataEntry(JsonWriter out, String key, Object value) throws IOException { + out.beginObject(); + out.name(DATA_NAME_KEY); + out.value(key); + out.name(DATA_VALUE_TYPE_KEY); + out.value(value.getClass().getCanonicalName()); + out.name(DATA_VALUE_KEY); + // use the gson instance of our surrounding class to serialize the + // object + out.jsonValue(gson.toJson(value)); + out.endObject(); + } + + @SuppressWarnings("deprecation") + @Override + public CrawleableUri read(JsonReader in) throws IOException { + in.beginObject(); + String uri = null; + String key; + InetAddress inetAddress = null; + UriType type = UriType.UNKNOWN; + Map data = new HashMap(); + while (in.peek() != JsonToken.END_OBJECT) { + key = in.nextName(); + switch (key) { + case URI_KEY: { + uri = in.nextString(); + break; + } + case URI_TYPE_KEY: { + type = UriType.valueOf(in.nextString()); + break; + } + case ADDRESS_KEY: { + inetAddress = readInetAddress(in); + break; + } + case DATA_KEY: { + in.beginArray(); + while (in.hasNext()) { + readDataObject(in, data); + } + in.endArray(); + break; + } + default: { + LOGGER.error( + "Got an unknown attribute name \"{}\" while parsing an CrawleableUri object. 
It will be ignored.", + key); + } + } + } + in.endObject(); + CrawleableUri result; + try { + result = new CrawleableUri(new URI(uri), inetAddress, type); + } catch (URISyntaxException e) { + throw new IOException(e); + } + result.setData(data); + return result; + } + + private InetAddress readInetAddress(JsonReader in) throws IOException { + in.beginObject(); + String host = null; + String key; + ByteArrayList ip = new ByteArrayList(); + while (in.peek() != JsonToken.END_OBJECT) { + key = in.nextName(); + switch (key) { + case ADDRESS_HOST_KEY: { + host = in.nextString(); + break; + } + case ADDRESS_IP_KEY: { + in.beginArray(); + while (in.hasNext()) { + ip.add((byte) in.nextLong()); + } + in.endArray(); + break; + } + default: { + LOGGER.error( + "Got an unknown attribute name \"{}\" while parsing an InetAddress object. It will be ignored.", + key); + } + } + } + in.endObject(); + if (host != null) { + return InetAddress.getByAddress(host, ip.toArray()); + } else { + return InetAddress.getByAddress(ip.toArray()); + } + } + + private void readDataObject(JsonReader in, Map data) throws IOException { + in.beginObject(); + String key; + String name = null; + String valueType = null; + Object value = null; + while (in.peek() != JsonToken.END_OBJECT) { + key = in.nextName(); + switch (key) { + case DATA_NAME_KEY: { + name = in.nextString(); + break; + } + case DATA_VALUE_TYPE_KEY: { + valueType = in.nextString(); + break; + } + case DATA_VALUE_KEY: { + if (valueType != null) { + try { + value = gson.fromJson(in, Class.forName(valueType)); + } catch (ClassNotFoundException e) { + throw new IOException(e); + } + } else { + LOGGER.error( + "Couldn't read Object of {} because the value type was not defined before reading the value. It will be skipped.", + name); + in.skipValue(); + } + break; + } + default: { + LOGGER.error("Got an unknown attribute name \"{}\" while parsing an object. It will be ignored.", + key); + } + } + } + if ((name != null) && (value != null)) { + data.put(name, value); + } + in.endObject(); + } + + } } diff --git a/squirrel.api/src/main/java/org/dice_research/squirrel/deduplication/hashing/UriHashCustodian.java b/squirrel.api/src/main/java/org/dice_research/squirrel/deduplication/hashing/UriHashCustodian.java index a2b8f67df..ad0ac03bf 100644 --- a/squirrel.api/src/main/java/org/dice_research/squirrel/deduplication/hashing/UriHashCustodian.java +++ b/squirrel.api/src/main/java/org/dice_research/squirrel/deduplication/hashing/UriHashCustodian.java @@ -9,7 +9,7 @@ * This component maintains {@link HashValue}s for uris. It provides methods for getting uris with some desired hash values, * and also for adding hash values for given uris. * The idea is that hash values could be stored in the {@link org.dice_research.squirrel.data.uri.filter.KnownUriFilter}, - * as well as in the {@link org.dice_research.squirrel.sink.tripleBased.TripleBasedSink}. + * as well as in the {@link org.dice_research.squirrel.sink.triplebased.TripleBasedSink}. 
*/ public interface UriHashCustodian { diff --git a/squirrel.api/src/main/java/org/dice_research/squirrel/queue/DomainUriTypePair.java b/squirrel.api/src/main/java/org/dice_research/squirrel/queue/DomainUriTypePair.java index 1780f5a91..1cb66dd9f 100644 --- a/squirrel.api/src/main/java/org/dice_research/squirrel/queue/DomainUriTypePair.java +++ b/squirrel.api/src/main/java/org/dice_research/squirrel/queue/DomainUriTypePair.java @@ -42,9 +42,8 @@ public boolean equals(Object obj) { return false; } else if (!domain.equals(other.domain)) return false; - if (type != other.type) - return false; - return true; + + return (type == other.type); @Override diff --git a/squirrel.api/src/main/java/org/dice_research/squirrel/sink/Sink.java b/squirrel.api/src/main/java/org/dice_research/squirrel/sink/Sink.java index 27aad2086..2ecf8be50 100644 --- a/squirrel.api/src/main/java/org/dice_research/squirrel/sink/Sink.java +++ b/squirrel.api/src/main/java/org/dice_research/squirrel/sink/Sink.java @@ -4,8 +4,8 @@ import org.apache.jena.rdf.model.StmtIterator; import org.dice_research.squirrel.Constants; import org.dice_research.squirrel.data.uri.CrawleableUri; -import org.dice_research.squirrel.sink.quadBased.QuadBasedSink; -import org.dice_research.squirrel.sink.tripleBased.TripleBasedSink; +import org.dice_research.squirrel.sink.quadbased.QuadBasedSink; +import org.dice_research.squirrel.sink.triplebased.TripleBasedSink; import org.springframework.stereotype.Component; import java.io.Closeable; diff --git a/squirrel.api/src/main/java/org/dice_research/squirrel/sink/quadBased/QuadBasedSink.java b/squirrel.api/src/main/java/org/dice_research/squirrel/sink/quadbased/QuadBasedSink.java similarity index 90% rename from squirrel.api/src/main/java/org/dice_research/squirrel/sink/quadBased/QuadBasedSink.java rename to squirrel.api/src/main/java/org/dice_research/squirrel/sink/quadbased/QuadBasedSink.java index 8020b739d..1d24334e6 100644 --- a/squirrel.api/src/main/java/org/dice_research/squirrel/sink/quadBased/QuadBasedSink.java +++ b/squirrel.api/src/main/java/org/dice_research/squirrel/sink/quadbased/QuadBasedSink.java @@ -1,4 +1,4 @@ -package org.dice_research.squirrel.sink.quadBased; +package org.dice_research.squirrel.sink.quadbased; import org.apache.jena.sparql.core.Quad; import org.dice_research.squirrel.data.uri.CrawleableUri; diff --git a/squirrel.api/src/main/java/org/dice_research/squirrel/sink/tripleBased/AdvancedTripleBasedSink.java b/squirrel.api/src/main/java/org/dice_research/squirrel/sink/triplebased/AdvancedTripleBasedSink.java similarity index 91% rename from squirrel.api/src/main/java/org/dice_research/squirrel/sink/tripleBased/AdvancedTripleBasedSink.java rename to squirrel.api/src/main/java/org/dice_research/squirrel/sink/triplebased/AdvancedTripleBasedSink.java index 7b3c2ff33..7a7e9f54e 100644 --- a/squirrel.api/src/main/java/org/dice_research/squirrel/sink/tripleBased/AdvancedTripleBasedSink.java +++ b/squirrel.api/src/main/java/org/dice_research/squirrel/sink/triplebased/AdvancedTripleBasedSink.java @@ -1,4 +1,4 @@ -package org.dice_research.squirrel.sink.tripleBased; +package org.dice_research.squirrel.sink.triplebased; import org.apache.jena.graph.Triple; import org.dice_research.squirrel.data.uri.CrawleableUri; diff --git a/squirrel.api/src/main/java/org/dice_research/squirrel/sink/tripleBased/TripleBasedSink.java b/squirrel.api/src/main/java/org/dice_research/squirrel/sink/triplebased/TripleBasedSink.java similarity index 90% rename from
squirrel.api/src/main/java/org/dice_research/squirrel/sink/tripleBased/TripleBasedSink.java rename to squirrel.api/src/main/java/org/dice_research/squirrel/sink/triplebased/TripleBasedSink.java index ca8214b78..848e02d75 100644 --- a/squirrel.api/src/main/java/org/dice_research/squirrel/sink/tripleBased/TripleBasedSink.java +++ b/squirrel.api/src/main/java/org/dice_research/squirrel/sink/triplebased/TripleBasedSink.java @@ -1,4 +1,4 @@ -package org.dice_research.squirrel.sink.tripleBased; +package org.dice_research.squirrel.sink.triplebased; import org.apache.jena.graph.Triple; import org.dice_research.squirrel.data.uri.CrawleableUri; diff --git a/squirrel.api/src/main/java/org/dice_research/squirrel/vocab/VCard.java b/squirrel.api/src/main/java/org/dice_research/squirrel/vocab/VCard.java index 692b6e276..3b957e6e5 100644 --- a/squirrel.api/src/main/java/org/dice_research/squirrel/vocab/VCard.java +++ b/squirrel.api/src/main/java/org/dice_research/squirrel/vocab/VCard.java @@ -10,6 +10,10 @@ public class VCard { * The namespace of the vocabulary as a string */ public static final String uri = "https://www.w3.org/2006/vcard/ns#"; + + public static final Resource Kind = resource("Kind"); + public static final Property fn = property("fn"); + public static final Property hasEmail = property("hasEmail"); /** * returns the URI for this schema @@ -28,8 +32,5 @@ protected static final Property property(String local) { return ResourceFactory.createProperty(uri, local); } - public static final Resource Kind = resource("Kind"); - public static final Property fn = property("fn"); - public static final Property hasEmail = property("hasEmail"); } diff --git a/squirrel.deduplication/src/main/java/org/dice_research/squirrel/components/DeduplicatorComponent.java b/squirrel.deduplication/src/main/java/org/dice_research/squirrel/components/DeduplicatorComponent.java index 5ced06de4..106fb7e4a 100644 --- a/squirrel.deduplication/src/main/java/org/dice_research/squirrel/components/DeduplicatorComponent.java +++ b/squirrel.deduplication/src/main/java/org/dice_research/squirrel/components/DeduplicatorComponent.java @@ -25,7 +25,7 @@ import org.dice_research.squirrel.rabbit.RespondingDataHandler; import org.dice_research.squirrel.rabbit.ResponseHandler; import org.dice_research.squirrel.rabbit.msgs.UriSet; -import org.dice_research.squirrel.sink.tripleBased.AdvancedTripleBasedSink; +import org.dice_research.squirrel.sink.triplebased.AdvancedTripleBasedSink; import org.hobbit.core.components.AbstractComponent; import org.hobbit.core.data.RabbitQueue; import org.hobbit.core.rabbit.DataReceiverImpl; diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/data/uri/filter/MongoDBKnowUriFilter.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/data/uri/filter/MongoDBKnowUriFilter.java index 103841ba1..4d4557c99 100644 --- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/data/uri/filter/MongoDBKnowUriFilter.java +++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/data/uri/filter/MongoDBKnowUriFilter.java @@ -130,11 +130,9 @@ public void open() { public boolean knowUriTableExists() { for (String collection : mongoDB.listCollectionNames()) { - if (collection.toLowerCase().equals(COLLECTION_NAME.toLowerCase())) { - return true; - } else { - return false; - } + if (collection.toLowerCase().equals(COLLECTION_NAME.toLowerCase())) { + return true; + } } return false; } diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/QueueBasedTerminationCheck.java
b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/QueueBasedTerminationCheck.java index 677ada8d7..2369de0af 100644 --- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/QueueBasedTerminationCheck.java +++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/QueueBasedTerminationCheck.java @@ -7,12 +7,9 @@ public class QueueBasedTerminationCheck implements TerminationCheck { @Override public boolean shouldFrontierTerminate(UriQueue queue) { + + return (queue.isEmpty()); - if(queue.isEmpty()) { - return true; - } else { - return false; - } } } diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/utils/SimpleDomainExtractor.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/utils/SimpleDomainExtractor.java index 7f31ed72b..42a208c84 100644 --- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/utils/SimpleDomainExtractor.java +++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/utils/SimpleDomainExtractor.java @@ -24,82 +24,80 @@ */ public class SimpleDomainExtractor { - private static final String DOMAIN_PREFIX = "://"; - private static final int DOMAIN_PREFIX_LENGTH = DOMAIN_PREFIX.length(); + private static final String DOMAIN_PREFIX = "://"; + private static final int DOMAIN_PREFIX_LENGTH = DOMAIN_PREFIX.length(); - /** - * Extracts domain and path of a URI without adding the name of the - * resource. If "http://example.org/resource/test" is used as input this - * method will return "example.org". - * - * @param uri - * the URI as String - * @return the extracted domain and path - */ - public static String extractDomain(String uri) { - if (uri == null) { - return null; - } - // get the start position of the domain - int startPos = uri.indexOf(DOMAIN_PREFIX); - if (startPos < 0) { - startPos = 0; - } else { - startPos += DOMAIN_PREFIX_LENGTH; - } - // find the end position of the String - char chars[] = uri.toCharArray(); - for (int i = startPos; i < chars.length; ++i) { - switch (chars[i]) { - // if this is a character that is not part of the domain, anymore - case '/': - case ':': { - return uri.substring(startPos, i); - } - default: { - // nothing to do - } - } - } - // we couldn't find the end, but maybe we have found a start - return uri.substring(startPos); - } + /** + * Extracts domain and path of a URI without adding the name of the resource. If + * "http://example.org/resource/test" is used as input this method will return + * "example.org". + * + * @param uri the URI as String + * @return the extracted domain and path + */ + public static String extractDomain(String uri) { + if (uri == null) { + return null; + } + // get the start position of the domain + int startPos = uri.indexOf(DOMAIN_PREFIX); + if (startPos < 0) { + startPos = 0; + } else { + startPos += DOMAIN_PREFIX_LENGTH; + } + // find the end position of the String + char chars[] = uri.toCharArray(); + for (int i = startPos; i < chars.length; ++i) { + switch (chars[i]) { + // if this is a character that is not part of the domain, anymore + case '/': + case ':': { + return uri.substring(startPos, i); + } + default: { + // nothing to do + } + } + } + // we couldn't find the end, but maybe we have found a start + return uri.substring(startPos); + } - /** - * Extracts domain and path of a URI without adding the name of the - * resource. If "http://example.org/resource/test" is used as input this - * method will return "example.org/resource/".
- * - * @param uri - * the URI as String - * @return the extracted domain and path - */ - public static String extractDomainAndPath(String uri) { - if (uri == null) { - return null; - } - // get the start position of the domain - int startPos = uri.indexOf(DOMAIN_PREFIX); - if (startPos < 0) { - startPos = 0; - } else { - startPos += DOMAIN_PREFIX_LENGTH; - } - // find the end position of the String - char chars[] = uri.toCharArray(); - for (int i = (chars.length - 1); i > startPos; --i) { - switch (chars[i]) { - // if this is a character that is not part of the resource, anymore - case '/': - case '#': { - return uri.substring(startPos, i); - } - default: { - // nothing to do - } - } - } - // we couldn't find the end, but maybe we have found a start - return uri.substring(startPos); - } + /** + * Extracts domain and path of a URI without adding the name of the resource. If + * "http://example.org/resource/test" is used as input this method will return + * "example.org/resource/". + * + * @param uri the URI as String + * @return the extracted domain and path + */ + public static String extractDomainAndPath(String uri) { + if (uri == null) { + return null; + } + // get the start position of the domain + int startPos = uri.indexOf(DOMAIN_PREFIX); + if (startPos < 0) { + startPos = 0; + } else { + startPos += DOMAIN_PREFIX_LENGTH; + } + // find the end position of the String + char chars[] = uri.toCharArray(); + for (int i = (chars.length - 1); i > startPos; --i) { + switch (chars[i]) { + // if this is a character that is not part of the resource, anymore + case '/': + case '#': { + return uri.substring(startPos, i); + } + default: { + // nothing to do + } + } + } + // we couldn't find the end, but maybe we have found a start + return uri.substring(startPos); + } } diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/graph/impl/TabSeparatedGraphLogger.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/graph/impl/TabSeparatedGraphLogger.java index 8a1c99877..b7a3ca589 100644 --- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/graph/impl/TabSeparatedGraphLogger.java +++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/graph/impl/TabSeparatedGraphLogger.java @@ -26,13 +26,6 @@ public class TabSeparatedGraphLogger implements GraphLogger, Closeable { public static final char DEFAULT_URI_SEPARATOR = '|'; public static final char DEFAULT_QUOTE_CHAR = '"'; public static final char DEFAULT_ESCAPE_CHAR = '\\'; - - private static final Charset CHARSET = Charset.forName("UTF-8"); - - public static TabSeparatedGraphLogger create(File logFile) throws FileNotFoundException { - return new TabSeparatedGraphLogger(new FileOutputStream(logFile)); - } - protected OutputStream logStream; protected char sourceTargetSeperator = DEFAULT_SOURCE_TARGET_SEPARATOR; protected char uriSeparator = DEFAULT_URI_SEPARATOR; @@ -42,6 +35,14 @@ public static TabSeparatedGraphLogger create(File logFile) throws FileNotFoundEx protected String escapedQuote = new String(new char[] { escapeChar, quoteChar }); protected String linebreak = String.format("%n"); + private static final Charset CHARSET = Charset.forName("UTF-8"); + + public static TabSeparatedGraphLogger create(File logFile) throws FileNotFoundException { + return new TabSeparatedGraphLogger(new FileOutputStream(logFile)); + } + + + public TabSeparatedGraphLogger(OutputStream logStream) { this.logStream = logStream; } diff --git 
a/squirrel.frontier/src/test/java/org/dice_research/squirrel/data/uri/norm/WellKnownPathUriGeneratorTest.java b/squirrel.frontier/src/test/java/org/dice_research/squirrel/data/uri/norm/WellKnownPathUriGeneratorTest.java index 1e0ebe8df..7831e941a 100644 --- a/squirrel.frontier/src/test/java/org/dice_research/squirrel/data/uri/norm/WellKnownPathUriGeneratorTest.java +++ b/squirrel.frontier/src/test/java/org/dice_research/squirrel/data/uri/norm/WellKnownPathUriGeneratorTest.java @@ -8,7 +8,7 @@ import java.net.URISyntaxException; public class WellKnownPathUriGeneratorTest { - UriGenerator variantUriObject = new WellKnownPathUriGenerator(); + private UriGenerator variantUriObject = new WellKnownPathUriGenerator(); @Test public void getVariant(){ diff --git a/squirrel.frontier/src/test/java/org/dice_research/squirrel/seed/generator/impl/CkanSeedGeneratorImplTest.java b/squirrel.frontier/src/test/java/org/dice_research/squirrel/seed/generator/impl/CkanSeedGeneratorImplTest.java index 8532a7e6e..6a317aea5 100644 --- a/squirrel.frontier/src/test/java/org/dice_research/squirrel/seed/generator/impl/CkanSeedGeneratorImplTest.java +++ b/squirrel.frontier/src/test/java/org/dice_research/squirrel/seed/generator/impl/CkanSeedGeneratorImplTest.java @@ -7,8 +7,8 @@ import org.apache.commons.collections15.map.HashedMap; import org.dice_research.squirrel.data.uri.filter.InMemoryKnownUriFilter; -import org.dice_research.squirrel.data.uri.filter.UriFilterConfigurator; import org.dice_research.squirrel.data.uri.filter.UriFilterComposer; +import org.dice_research.squirrel.data.uri.filter.UriFilterConfigurator; import org.dice_research.squirrel.data.uri.norm.DomainBasedUriGenerator; import org.dice_research.squirrel.data.uri.norm.NormalizerImpl; import org.dice_research.squirrel.data.uri.norm.UriGenerator; @@ -17,8 +17,6 @@ import org.dice_research.squirrel.frontier.impl.FrontierImpl; import org.dice_research.squirrel.queue.InMemoryQueue; import org.dice_research.squirrel.queue.IpAddressBasedQueue; -import org.junit.Ignore; -import org.junit.Test; import junit.framework.TestCase; @@ -27,14 +25,13 @@ * * Created by ivan on 04.03.16. 
*/ +@SuppressWarnings("deprecation") public class CkanSeedGeneratorImplTest extends TestCase { - private CkanSeedGeneratorImpl ckanSeedGenerator; - private IpAddressBasedQueue queue; private Frontier frontier; public void setUp() { - queue = new InMemoryQueue(); + IpAddressBasedQueue queue = new InMemoryQueue(); //frontier = new FrontierImpl(new NormalizerImpl() , new InMemoryKnownUriFilter(false, -1), queue,null); @@ -48,18 +45,9 @@ public void setUp() { UriFilterComposer relationalUriFilter = new UriFilterConfigurator(new InMemoryKnownUriFilter(false, -1),""); frontier = new FrontierImpl(new NormalizerImpl(sessionIDs,mapDefaultPort), relationalUriFilter, queue,uriGenerators); - ckanSeedGenerator = new CkanSeedGeneratorImpl(frontier); + CkanSeedGeneratorImpl ckanSeedGenerator = new CkanSeedGeneratorImpl(frontier); + ckanSeedGenerator.toString(); } - /** - * Get list of URIs to crawl from datahub.io - * Should be more than 100 URIs - * Actual size is a bit more than 5000 URIs - */ - @Ignore - @Test - public void testGetSeed() { - //List seedUris = ckanSeedGenerator.getSeed(); - //assertTrue(seedUris.size() > 100); - } + } diff --git a/squirrel.worker/src/main/java/org/dice_research/squirrel/analyzer/impl/MicroformatMF2JAnalyzer.java b/squirrel.worker/src/main/java/org/dice_research/squirrel/analyzer/impl/MicroformatMF2JAnalyzer.java index 9562d9c34..fdae7583c 100644 --- a/squirrel.worker/src/main/java/org/dice_research/squirrel/analyzer/impl/MicroformatMF2JAnalyzer.java +++ b/squirrel.worker/src/main/java/org/dice_research/squirrel/analyzer/impl/MicroformatMF2JAnalyzer.java @@ -84,11 +84,11 @@ public Iterator analyze(CrawleableUri curi, File data, Sink sink) { } public static String addContextToJSON(String data) { - data = data.trim(); - data = data.substring(1); - data = "{\r\n" + - "\"@context\": {\"@vocab\": \"http://www.dummy.org/#\"},\n"+data; - return data; + String dt = data.trim(); + dt = dt.substring(1); + dt = "{\r\n" + + "\"@context\": {\"@vocab\": \"http://www.dummy.org/#\"},\n"+dt; + return dt; } public static String replaceVocab(String data) { diff --git a/squirrel.worker/src/main/java/org/dice_research/squirrel/analyzer/impl/html/scraper/YamlFileAtributes.java b/squirrel.worker/src/main/java/org/dice_research/squirrel/analyzer/impl/html/scraper/YamlFileAtributes.java index 3a52bd686..4edd6f7d6 100644 --- a/squirrel.worker/src/main/java/org/dice_research/squirrel/analyzer/impl/html/scraper/YamlFileAtributes.java +++ b/squirrel.worker/src/main/java/org/dice_research/squirrel/analyzer/impl/html/scraper/YamlFileAtributes.java @@ -1,15 +1,17 @@ package org.dice_research.squirrel.analyzer.impl.html.scraper; public class YamlFileAtributes { + + protected static final String SEARCH_CHECK = "check"; + protected static final String SEARCH_DOMAIN = "domain"; + protected static final String REGEX = "regex"; + protected static final String RESOURCES = "resources"; private YamlFileAtributes() { } - protected static final String SEARCH_CHECK = "check"; - protected static final String SEARCH_DOMAIN = "domain"; - protected static final String REGEX = "regex"; - protected static final String RESOURCES = "resources"; + } diff --git a/squirrel.worker/src/main/java/org/dice_research/squirrel/sink/impl/sparql/AbstractBufferingSink.java b/squirrel.worker/src/main/java/org/dice_research/squirrel/sink/impl/sparql/AbstractBufferingSink.java index 9db053115..cefe50fd9 100644 --- a/squirrel.worker/src/main/java/org/dice_research/squirrel/sink/impl/sparql/AbstractBufferingSink.java +++ 
b/squirrel.worker/src/main/java/org/dice_research/squirrel/sink/impl/sparql/AbstractBufferingSink.java @@ -10,8 +10,8 @@ import org.dice_research.squirrel.Constants; import org.dice_research.squirrel.data.uri.CrawleableUri; import org.dice_research.squirrel.metadata.CrawlingActivity; -import org.dice_research.squirrel.sink.quadBased.QuadBasedSink; -import org.dice_research.squirrel.sink.tripleBased.TripleBasedSink; +import org.dice_research.squirrel.sink.quadbased.QuadBasedSink; +import org.dice_research.squirrel.sink.triplebased.TripleBasedSink; import org.slf4j.Logger; import org.slf4j.LoggerFactory; diff --git a/squirrel.worker/src/main/java/org/dice_research/squirrel/sink/impl/sparql/SparqlBasedSink.java b/squirrel.worker/src/main/java/org/dice_research/squirrel/sink/impl/sparql/SparqlBasedSink.java index 1f84cb205..ec888b7f3 100644 --- a/squirrel.worker/src/main/java/org/dice_research/squirrel/sink/impl/sparql/SparqlBasedSink.java +++ b/squirrel.worker/src/main/java/org/dice_research/squirrel/sink/impl/sparql/SparqlBasedSink.java @@ -36,7 +36,7 @@ import org.dice_research.squirrel.data.uri.CrawleableUri; import org.dice_research.squirrel.metadata.CrawlingActivity; import org.dice_research.squirrel.sink.Sink; -import org.dice_research.squirrel.sink.tripleBased.AdvancedTripleBasedSink; +import org.dice_research.squirrel.sink.triplebased.AdvancedTripleBasedSink; import org.dice_research.squirrel.vocab.Squirrel; import org.slf4j.Logger; import org.slf4j.LoggerFactory; diff --git a/squirrel.worker/src/main/java/org/dice_research/squirrel/sink/impl/sparql/TDBSink.java b/squirrel.worker/src/main/java/org/dice_research/squirrel/sink/impl/sparql/TDBSink.java index 96a2aca3a..f3cd78a14 100644 --- a/squirrel.worker/src/main/java/org/dice_research/squirrel/sink/impl/sparql/TDBSink.java +++ b/squirrel.worker/src/main/java/org/dice_research/squirrel/sink/impl/sparql/TDBSink.java @@ -23,7 +23,7 @@ import org.dice_research.squirrel.Constants; import org.dice_research.squirrel.data.uri.CrawleableUri; import org.dice_research.squirrel.sink.Sink; -import org.dice_research.squirrel.sink.tripleBased.AdvancedTripleBasedSink; +import org.dice_research.squirrel.sink.triplebased.AdvancedTripleBasedSink; import org.slf4j.Logger; import org.slf4j.LoggerFactory; From 8850df67df087e2eb8f5e7eef4767fe94577c213 Mon Sep 17 00:00:00 2001 From: Geraldo Date: Tue, 14 Jul 2020 16:56:55 +0200 Subject: [PATCH 095/102] fixed serialization test --- .../data/uri/serialize/gson/GsonUriSerializer.java | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/squirrel.api/src/main/java/org/dice_research/squirrel/data/uri/serialize/gson/GsonUriSerializer.java b/squirrel.api/src/main/java/org/dice_research/squirrel/data/uri/serialize/gson/GsonUriSerializer.java index 65fe9e85a..5977496ff 100644 --- a/squirrel.api/src/main/java/org/dice_research/squirrel/data/uri/serialize/gson/GsonUriSerializer.java +++ b/squirrel.api/src/main/java/org/dice_research/squirrel/data/uri/serialize/gson/GsonUriSerializer.java @@ -29,11 +29,11 @@ * @author Michael Röder (michael.roeder@uni-paderborn.de) * */ -@SuppressWarnings("deprecation") public class GsonUriSerializer implements Serializer { private static final Logger LOGGER = LoggerFactory.getLogger(GsonUriSerializer.class); + private GsonBuilder builder; private Gson gson; public GsonUriSerializer() { @@ -41,7 +41,9 @@ public GsonUriSerializer() { } public GsonUriSerializer(GsonBuilder builder) { - gson = builder.create(); + this.builder = builder; + 
this.builder.registerTypeAdapter(CrawleableUri.class, new CrawleableUriAdapter()); + gson = this.builder.create(); } @Override @@ -248,4 +250,4 @@ private void readDataObject(JsonReader in, Map data) throws IOEx } } -} +} \ No newline at end of file From 55339cbc6f77b60840880f2a7f377591962c335d Mon Sep 17 00:00:00 2001 From: Geraldo Date: Tue, 14 Jul 2020 17:07:11 +0200 Subject: [PATCH 096/102] removed branching statement as the last in a loop --- .../squirrel/data/uri/filter/MongoDBKnowUriFilter.java | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/data/uri/filter/MongoDBKnowUriFilter.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/data/uri/filter/MongoDBKnowUriFilter.java index 4d4557c99..4057a939d 100644 --- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/data/uri/filter/MongoDBKnowUriFilter.java +++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/data/uri/filter/MongoDBKnowUriFilter.java @@ -129,10 +129,12 @@ public void open() { } public boolean knowUriTableExists() { + boolean ret = false; for (String collection : mongoDB.listCollectionNames()) { - return (collection.toLowerCase().equals(COLLECTION_NAME.toLowerCase())); + ret = collection.toLowerCase().equals(COLLECTION_NAME.toLowerCase()); + break; } - return false; + return ret; } @Override From 384ad7afb881399aadb14db981999937c463d7df Mon Sep 17 00:00:00 2001 From: Geraldo Date: Tue, 14 Jul 2020 17:41:53 +0200 Subject: [PATCH 097/102] codacy quality fixes --- .../squirrel/data/uri/CrawleableUri.java | 41 ++++++++++--------- .../uri/filter/RegexBasedWhiteListFilter.java | 3 +- .../squirrel/frontier/impl/FrontierImpl.java | 25 +++++------ .../impl/FrontierSenderToWebservice.java | 3 +- .../frontier/impl/FrontierImplTest.java | 6 +-- .../impl/CkanSeedGeneratorImplTest.java | 5 ++- .../simulation/ScenarioBasedTest.java | 7 ++-- .../src/main/java/com/SquirrelWebObject.java | 3 +- .../test/java/com/SquirrelWebObjectTest.java | 6 +-- .../com/squirrel/rabbit/RabbitMQListener.java | 8 ++-- .../impl/html/scraper/HtmlScraper.java | 3 +- .../squirrel/fetcher/dump/DumpFetcher.java | 2 +- .../squirrel/sink/impl/hdt/HdtBasedSink.java | 3 -- .../analyzer/impl/MicrodataParserTest.java | 6 --- 14 files changed, 60 insertions(+), 61 deletions(-) diff --git a/squirrel.api/src/main/java/org/dice_research/squirrel/data/uri/CrawleableUri.java b/squirrel.api/src/main/java/org/dice_research/squirrel/data/uri/CrawleableUri.java index e9af597de..e31703bc3 100644 --- a/squirrel.api/src/main/java/org/dice_research/squirrel/data/uri/CrawleableUri.java +++ b/squirrel.api/src/main/java/org/dice_research/squirrel/data/uri/CrawleableUri.java @@ -30,6 +30,27 @@ public class CrawleableUri implements Serializable { private static final String CHARSET_NAME = "UTF-8"; private static final Charset ENCODING_CHARSET = Charset.forName(CHARSET_NAME); private static final int URI_START_INDEX = 5; + /** + * The URI. + */ + private final URI uri; + /** + * The IP address of the URI. + */ + private InetAddress ipAddress; + @Deprecated + private UriType type = UriType.UNKNOWN; + /** + * The data attached to this URI. + */ + private Map data = new TreeMap<>(); + /** + * Timestamp at which this URI should be crawled next time. + * + * @deprecated The timestamp should be added to the {@link #data} map. + */ + private long timestampNextCrawl; + /** * Creates a CrawleableUri object from the given byte array. 
@@ -95,26 +116,6 @@ public static CrawleableUri fromByteBuffer(ByteBuffer buffer) { return new CrawleableUri(uri, ipAddress, UriType.values()[typeId]); } - /** - * The URI. - */ - private final URI uri; - /** - * The IP address of the URI. - */ - private InetAddress ipAddress; - @Deprecated - private UriType type = UriType.UNKNOWN; - /** - * The data attached to this URI. - */ - private Map data = new TreeMap<>(); - /** - * Timestamp at which this URI should be crawled next time. - * - * @deprecated The timestamp should be added to the {@link #data} map. - */ - private long timestampNextCrawl; /** * Constructor. diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/data/uri/filter/RegexBasedWhiteListFilter.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/data/uri/filter/RegexBasedWhiteListFilter.java index 0a02b9ecd..45450774a 100644 --- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/data/uri/filter/RegexBasedWhiteListFilter.java +++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/data/uri/filter/RegexBasedWhiteListFilter.java @@ -18,6 +18,8 @@ public class RegexBasedWhiteListFilter extends AbstractKnownUriFilterDecorator { private static final Logger LOGGER = LoggerFactory.getLogger(RegexBasedWhiteListFilter.class); + private Set whiteList; + public static RegexBasedWhiteListFilter create(KnownUriFilter decorated, File whitelistfile) { try { @@ -41,7 +43,6 @@ public static RegexBasedWhiteListFilter create(KnownUriFilter decorated, File wh return null; } - private Set whiteList; public RegexBasedWhiteListFilter(KnownUriFilter decorated, Set whiteList) { super(decorated); diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/FrontierImpl.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/FrontierImpl.java index a927fe802..e35dd293e 100644 --- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/FrontierImpl.java +++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/FrontierImpl.java @@ -284,30 +284,31 @@ public void addNewUri(CrawleableUri uri) { } protected void addNormalizedUri(CrawleableUri uri) { - if (uriFilter.isUriGood(uri)) { - LOGGER.debug("addNewUri(" + uri + "): URI is good [" + uriFilter + "]"); - if (schemeUriFilter.isUriGood(uri)) { - LOGGER.trace("addNewUri(" + uri.getUri() + "): URI schemes is OK [" + schemeUriFilter + "]"); + CrawleableUri curi = uri; + if (uriFilter.isUriGood(curi)) { + LOGGER.debug("addNewUri(" + curi + "): URI is good [" + uriFilter + "]"); + if (schemeUriFilter.isUriGood(curi)) { + LOGGER.trace("addNewUri(" + curi.getUri() + "): URI schemes is OK [" + schemeUriFilter + "]"); // Make sure that the IP is known try { - uri = this.uriProcessor.recognizeInetAddress(uri); + curi = this.uriProcessor.recognizeInetAddress(curi); } catch (UnknownHostException e) { - LOGGER.error("Could not recognize IP for {}, unknown host", uri.getUri()); + LOGGER.error("Could not recognize IP for {}, unknown host", curi.getUri()); } - if (uri.getIpAddress() != null) { - queue.addUri(this.uriProcessor.recognizeUriType(uri)); + if (curi.getIpAddress() != null) { + queue.addUri(this.uriProcessor.recognizeUriType(curi)); } else { - LOGGER.error("Couldn't determine the Inet address of \"{}\". It will be ignored.", uri.getUri()); + LOGGER.error("Couldn't determine the Inet address of \"{}\". 
It will be ignored.", curi.getUri()); } - uriFilter.getKnownUriFilter().add(uri, System.currentTimeMillis()); + uriFilter.getKnownUriFilter().add(curi, System.currentTimeMillis()); } else { - LOGGER.warn("addNewUri(" + uri + "): " + uri.getUri().getScheme() + " is not supported, only " + LOGGER.warn("addNewUri(" + curi + "): " + curi.getUri().getScheme() + " is not supported, only " + schemeUriFilter.getSchemes() + ". Will not added!"); } } else { - LOGGER.debug("addNewUri(" + uri + "): URI is not good [" + uriFilter + "]. Will not be added!"); + LOGGER.debug("addNewUri(" + curi + "): URI is not good [" + uriFilter + "]. Will not be added!"); } } diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/FrontierSenderToWebservice.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/FrontierSenderToWebservice.java index a116a7298..68224c768 100644 --- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/FrontierSenderToWebservice.java +++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/FrontierSenderToWebservice.java @@ -45,7 +45,6 @@ public class FrontierSenderToWebservice implements Runnable, Closeable { private URIReferences uriReferences; private final static String WEB_QUEUE_GENERAL_NAME = "squirrel.web.in"; private RabbitQueueFactory factory; - private Channel webQueue = null; private DataSender sender; private boolean run; @@ -91,7 +90,7 @@ private boolean init() { @SuppressWarnings("unused") private boolean establishChannel(int triesLeft) { try { - webQueue = factory.createChannel(); + Channel webQueue = factory.createChannel(); return true; } catch (IOException e) { LOGGER.warn("Connection to rabbit is stable, but there was an error while creating a channel/ queue: " + e.getMessage() + ". 
There are " + triesLeft + " tries left, try it again in 3s!"); diff --git a/squirrel.frontier/src/test/java/org/dice_research/squirrel/frontier/impl/FrontierImplTest.java b/squirrel.frontier/src/test/java/org/dice_research/squirrel/frontier/impl/FrontierImplTest.java index aaf9f5088..41f75a217 100644 --- a/squirrel.frontier/src/test/java/org/dice_research/squirrel/frontier/impl/FrontierImplTest.java +++ b/squirrel.frontier/src/test/java/org/dice_research/squirrel/frontier/impl/FrontierImplTest.java @@ -152,10 +152,10 @@ public void simlpeRecrawling() throws Exception { frontier.addNewUris(uris); List nextUris = frontier.getNextUris(); for (CrawleableUri uri : nextUris) { - Assert.assertTrue(uris.contains(uri)); + assertTrue(uris.contains(uri)); } for (CrawleableUri uri : uris) { - Assert.assertTrue(nextUris.contains(uri)); + assertTrue(nextUris.contains(uri)); } // Set the first URI as recrawlable for (CrawleableUri uri : nextUris) { @@ -169,7 +169,7 @@ public void simlpeRecrawling() throws Exception { nextUris = frontier.getNextUris(); Assert.assertNotNull(nextUris); assertTrue("uri_1 has been expected but couldn't be found", nextUris.contains(uri_1)); - Assert.assertEquals(1, nextUris.size()); + assertEquals(1, nextUris.size()); assertFalse("uri_2 has been found but was not expected", nextUris.contains(uri_2)); } diff --git a/squirrel.frontier/src/test/java/org/dice_research/squirrel/seed/generator/impl/CkanSeedGeneratorImplTest.java b/squirrel.frontier/src/test/java/org/dice_research/squirrel/seed/generator/impl/CkanSeedGeneratorImplTest.java index 6a317aea5..bf00b1957 100644 --- a/squirrel.frontier/src/test/java/org/dice_research/squirrel/seed/generator/impl/CkanSeedGeneratorImplTest.java +++ b/squirrel.frontier/src/test/java/org/dice_research/squirrel/seed/generator/impl/CkanSeedGeneratorImplTest.java @@ -17,6 +17,8 @@ import org.dice_research.squirrel.frontier.impl.FrontierImpl; import org.dice_research.squirrel.queue.InMemoryQueue; import org.dice_research.squirrel.queue.IpAddressBasedQueue; +import org.junit.Ignore; +import org.junit.Test; import junit.framework.TestCase; @@ -26,10 +28,11 @@ * Created by ivan on 04.03.16. 
*/ @SuppressWarnings("deprecation") +@Ignore public class CkanSeedGeneratorImplTest extends TestCase { private Frontier frontier; - + @Test public void setUp() { IpAddressBasedQueue queue = new InMemoryQueue(); //frontier = new FrontierImpl(new NormalizerImpl() , new InMemoryKnownUriFilter(false, -1), queue,null); diff --git a/squirrel.mockup/src/main/java/org/dice_research/squirrel/simulation/ScenarioBasedTest.java b/squirrel.mockup/src/main/java/org/dice_research/squirrel/simulation/ScenarioBasedTest.java index 30d31024f..d30be4584 100644 --- a/squirrel.mockup/src/main/java/org/dice_research/squirrel/simulation/ScenarioBasedTest.java +++ b/squirrel.mockup/src/main/java/org/dice_research/squirrel/simulation/ScenarioBasedTest.java @@ -29,13 +29,15 @@ import org.junit.runners.Parameterized.Parameters; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import org.dice_research.squirrel.simulation.AbstractServerMockUsingTest; import org.springframework.context.support.FileSystemXmlApplicationContext; +@SuppressWarnings("deprecation") @RunWith(Parameterized.class) public class ScenarioBasedTest extends AbstractServerMockUsingTest { private static final Logger LOGGER = LoggerFactory.getLogger(ScenarioBasedTest.class); + private CrawleableUri[] seeds; + private CrawleableResource[] resources; @Parameters public static Collection data() throws Exception { @@ -107,8 +109,7 @@ public static Collection data() throws Exception { return scenarios; } - private CrawleableUri[] seeds; - private CrawleableResource[] resources; + public ScenarioBasedTest(CrawleableUri[] seeds, CrawleableResource[] resources) { super(new CrawleableResourceContainer(resources)); diff --git a/squirrel.web-api/src/main/java/com/SquirrelWebObject.java b/squirrel.web-api/src/main/java/com/SquirrelWebObject.java index a02557f08..2323a5096 100644 --- a/squirrel.web-api/src/main/java/com/SquirrelWebObject.java +++ b/squirrel.web-api/src/main/java/com/SquirrelWebObject.java @@ -119,7 +119,8 @@ private Map> stringToMap(String string) { return ret; } - StringBuilder bufferKey = new StringBuilder(), bufferValue = new StringBuilder(); + StringBuilder bufferKey = new StringBuilder(); + StringBuilder bufferValue = new StringBuilder(); boolean readKey = false; boolean readValue = false; for (int i = 0; i < string.length(); i++) { diff --git a/squirrel.web-api/src/test/java/com/SquirrelWebObjectTest.java b/squirrel.web-api/src/test/java/com/SquirrelWebObjectTest.java index b082ff0b9..c047b6044 100644 --- a/squirrel.web-api/src/test/java/com/SquirrelWebObjectTest.java +++ b/squirrel.web-api/src/test/java/com/SquirrelWebObjectTest.java @@ -10,9 +10,9 @@ public class SquirrelWebObjectTest { - SquirrelWebObject o; - List pendingURIlist; - Map> IPMapList; + private SquirrelWebObject o; + private List pendingURIlist; + private Map> IPMapList; @Before public void setUp() throws Exception { diff --git a/squirrel.web/src/main/java/com/squirrel/rabbit/RabbitMQListener.java b/squirrel.web/src/main/java/com/squirrel/rabbit/RabbitMQListener.java index 8aeced89c..bd3313982 100644 --- a/squirrel.web/src/main/java/com/squirrel/rabbit/RabbitMQListener.java +++ b/squirrel.web/src/main/java/com/squirrel/rabbit/RabbitMQListener.java @@ -162,7 +162,7 @@ public SquirrelWebObject getSquirrel() { * @param index All received {@link SquirrelWebObject} are stored in a list. 
Index {@code 0} is the oldest entry, Index {@code size-1} is the latest one * @return the {@link SquirrelWebObject} */ - SquirrelWebObject getSquirrel(int index) { + public SquirrelWebObject getSquirrel(int index) { SquirrelWebObject ret = getObject(dataQueue, index); return (ret == null) ? new SquirrelWebObject() : ret; } @@ -171,7 +171,7 @@ SquirrelWebObject getSquirrel(int index) { * Gets the fected crawled graph from Frontier. * @return the latest {@link VisualisationGraph} */ - VisualisationGraph getCrawledGraph() { + public VisualisationGraph getCrawledGraph() { return getCrawledGraph(dataQueue.size() - 1); } @@ -180,7 +180,7 @@ VisualisationGraph getCrawledGraph() { * @param index All received {@link VisualisationGraph} are stored in a list. Index {@code 0} is the oldest entry, Index {@code size-1} is the latest one * @return the {@link VisualisationGraph} */ - VisualisationGraph getCrawledGraph(int index) { + public VisualisationGraph getCrawledGraph(int index) { SquirrelWebObject preRet = getObject(dataQueue, index); VisualisationGraph ret; if (preRet == null || preRet.getGraph() == null) { @@ -210,7 +210,7 @@ private T getObject(List list, int index) { * * @return the number of {@link SquirrelWebObject}-objects, that were received from the WebService */ - int countSquirrelWebObjects() { + public int countSquirrelWebObjects() { return dataQueue.size(); } diff --git a/squirrel.worker/src/main/java/org/dice_research/squirrel/analyzer/impl/html/scraper/HtmlScraper.java b/squirrel.worker/src/main/java/org/dice_research/squirrel/analyzer/impl/html/scraper/HtmlScraper.java index 3d02beae6..a5f9f69e6 100644 --- a/squirrel.worker/src/main/java/org/dice_research/squirrel/analyzer/impl/html/scraper/HtmlScraper.java +++ b/squirrel.worker/src/main/java/org/dice_research/squirrel/analyzer/impl/html/scraper/HtmlScraper.java @@ -337,7 +337,8 @@ private Set scrapeTree(Map mapEntry, Set triples return triples; } - private List jsoupQuery(String cssQuery) throws Exception { + private List jsoupQuery(String cssQ) throws Exception { + String cssQuery = cssQ; List listNodes = new ArrayList(); String prefix = ""; String attribute = ""; diff --git a/squirrel.worker/src/main/java/org/dice_research/squirrel/fetcher/dump/DumpFetcher.java b/squirrel.worker/src/main/java/org/dice_research/squirrel/fetcher/dump/DumpFetcher.java index f43a4a520..642722fdd 100644 --- a/squirrel.worker/src/main/java/org/dice_research/squirrel/fetcher/dump/DumpFetcher.java +++ b/squirrel.worker/src/main/java/org/dice_research/squirrel/fetcher/dump/DumpFetcher.java @@ -95,7 +95,7 @@ public void run() { // LOGGER.debug("Opening sink for {}", filePath); // sink.openSinkForUri(uri); while (iterator.hasNext()) { - Triple next = iterator.next(); + iterator.next(); // sink.addTriple(uri, next); ++tripleCount; } diff --git a/squirrel.worker/src/main/java/org/dice_research/squirrel/sink/impl/hdt/HdtBasedSink.java b/squirrel.worker/src/main/java/org/dice_research/squirrel/sink/impl/hdt/HdtBasedSink.java index f37530220..a4211fb66 100644 --- a/squirrel.worker/src/main/java/org/dice_research/squirrel/sink/impl/hdt/HdtBasedSink.java +++ b/squirrel.worker/src/main/java/org/dice_research/squirrel/sink/impl/hdt/HdtBasedSink.java @@ -4,10 +4,7 @@ import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; -import java.net.URI; import java.nio.file.Files; -import java.nio.file.Path; -import java.nio.file.Paths; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import 
java.util.concurrent.TimeUnit; diff --git a/squirrel.worker/src/test/java/org/dice_research/squirrel/analyzer/impl/MicrodataParserTest.java b/squirrel.worker/src/test/java/org/dice_research/squirrel/analyzer/impl/MicrodataParserTest.java index 6ff50b203..d6e9c216a 100644 --- a/squirrel.worker/src/test/java/org/dice_research/squirrel/analyzer/impl/MicrodataParserTest.java +++ b/squirrel.worker/src/test/java/org/dice_research/squirrel/analyzer/impl/MicrodataParserTest.java @@ -59,13 +59,7 @@ public class MicrodataParserTest extends RDFParserTest { // static double[] falsenegativ = new double[data().size()]; // static double[] falsepositiv = new double[data().size()]; - @BeforeClass - public static void initialization () throws URISyntaxException { - } - - - @Parameters(name = "{index},{0},{1}") public static Collection data() { Object[][] data = new Object[][] { //Test+73 = Der jeweilige Test From 5fa672e42f5daa22358406ae580218b99985aac9 Mon Sep 17 00:00:00 2001 From: Geraldo Date: Tue, 14 Jul 2020 17:56:49 +0200 Subject: [PATCH 098/102] improved codacy quality --- .../data/uri/filter/MongoDBKnowUriFilter.java | 4 ++-- .../squirrel/analyzer/impl/JsonAnalyzer.java | 5 +---- .../squirrel/sink/impl/sparql/TDBSink.java | 10 +++++----- .../squirrel/analyzer/impl/DecompressionTest.java | 4 +--- .../analyzer/impl/MicroformatParserTest.java | 12 +----------- 5 files changed, 10 insertions(+), 25 deletions(-) diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/data/uri/filter/MongoDBKnowUriFilter.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/data/uri/filter/MongoDBKnowUriFilter.java index 4057a939d..3502d4b2e 100644 --- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/data/uri/filter/MongoDBKnowUriFilter.java +++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/data/uri/filter/MongoDBKnowUriFilter.java @@ -131,7 +131,7 @@ public void open() { public boolean knowUriTableExists() { boolean ret = false; for (String collection : mongoDB.listCollectionNames()) { - ret = collection.toLowerCase().equals(COLLECTION_NAME.toLowerCase()); + ret = collection.equalsIgnoreCase(COLLECTION_NAME); break; } return ret; @@ -148,7 +148,7 @@ public void add(CrawleableUri uri, long lastCrawlTimestamp, long nextCrawlTimest @Override public void addHashValuesForUris(List uris) { - + // nothing to do } public void purge() { diff --git a/squirrel.worker/src/main/java/org/dice_research/squirrel/analyzer/impl/JsonAnalyzer.java b/squirrel.worker/src/main/java/org/dice_research/squirrel/analyzer/impl/JsonAnalyzer.java index 40578e959..fec484c5b 100644 --- a/squirrel.worker/src/main/java/org/dice_research/squirrel/analyzer/impl/JsonAnalyzer.java +++ b/squirrel.worker/src/main/java/org/dice_research/squirrel/analyzer/impl/JsonAnalyzer.java @@ -36,10 +36,7 @@ public Iterator analyze(CrawleableUri curi, File data, Sink sink) { @Override public boolean isElegible(CrawleableUri curi, File data) { - if (curi.getData("type").equals("json")) { - return true; - } - return false; + return curi.getData("type").equals("json"); } } diff --git a/squirrel.worker/src/main/java/org/dice_research/squirrel/sink/impl/sparql/TDBSink.java b/squirrel.worker/src/main/java/org/dice_research/squirrel/sink/impl/sparql/TDBSink.java index f3cd78a14..42b35e30a 100644 --- a/squirrel.worker/src/main/java/org/dice_research/squirrel/sink/impl/sparql/TDBSink.java +++ b/squirrel.worker/src/main/java/org/dice_research/squirrel/sink/impl/sparql/TDBSink.java @@ -37,10 +37,10 @@ public class TDBSink
extends AbstractBufferingSink implements AdvancedTripleBase * The URI to the metadata DB in which updates can be performed. */ private final String updateMetaDataUri; - /** - * The URI to the metadata DB in which querys can be performed. - */ - private final String queryMetaDataUri; +// /** +// * The URI to the metadata DB in which querys can be performed. +// */ +// private final String queryMetaDataUri; /** * The URI of the DB in which updates can be performed. @@ -79,7 +79,7 @@ public TDBSink(String host, String port, String updateAppendix, String queryAppe updateDatasetURI = prefix + updateAppendix; queryDatasetURI = prefix + queryAppendix; updateMetaDataUri = prefix + updateMetaDataAppendix; - queryMetaDataUri = prefix + queryMetaDataAppendix; +// queryMetaDataUri = prefix + queryMetaDataAppendix; } @Override diff --git a/squirrel.worker/src/test/java/org/dice_research/squirrel/analyzer/impl/DecompressionTest.java b/squirrel.worker/src/test/java/org/dice_research/squirrel/analyzer/impl/DecompressionTest.java index fa7976fa8..b3daf7b46 100644 --- a/squirrel.worker/src/test/java/org/dice_research/squirrel/analyzer/impl/DecompressionTest.java +++ b/squirrel.worker/src/test/java/org/dice_research/squirrel/analyzer/impl/DecompressionTest.java @@ -38,7 +38,6 @@ public class DecompressionTest { private FileManager fm = new FileManager(); - private List rdfFiles; private Model model; private CrawleableUri testUri; @@ -57,8 +56,7 @@ public class DecompressionTest { */ @Before public void initiateEnvironmentTest() throws IOException, URISyntaxException { - rdfFiles = createRdfFiles(); - tarFile = generateTar(rdfFiles); + tarFile = generateTar(createRdfFiles()); bzipFile = generateBzip(tarFile); gzFile = generateGZ(tarFile); testUri = new CrawleableUri(new URI("http://dice-research.org/squirrel/test")); diff --git a/squirrel.worker/src/test/java/org/dice_research/squirrel/analyzer/impl/MicroformatParserTest.java b/squirrel.worker/src/test/java/org/dice_research/squirrel/analyzer/impl/MicroformatParserTest.java index b867b8e67..02883f34c 100644 --- a/squirrel.worker/src/test/java/org/dice_research/squirrel/analyzer/impl/MicroformatParserTest.java +++ b/squirrel.worker/src/test/java/org/dice_research/squirrel/analyzer/impl/MicroformatParserTest.java @@ -50,24 +50,14 @@ public class MicroformatParserTest extends RDFParserTest { private static UriCollector collector = new SimpleUriCollector(new GzipJavaUriSerializer()); private CrawleableUri curi; private static InMemorySink sink; - ClassLoader classLoader = getClass().getClassLoader(); public static Map> testresults = new HashMap>(); - -// static double[] truepositiv = new double[data().size()]; -// static double[] falsenegativ = new double[data().size()]; -// static double[] falsepositiv = new double[data().size()]; - - @BeforeClass - public static void initialization () throws URISyntaxException { - } - @Parameter(0) public String testData; @Parameter(1) public String resultData; @Rule public TestName test = new TestName(); - + @Parameters(name = "{index},{0},{1}") public static Collection data() { Object[][] data = new Object[][] { // "@context": {"@vocab": "http://www.w3.org/2006/vcard/ns#"}, From 740531cec68a39ee7a2e1b192d624df6ee47ca9e Mon Sep 17 00:00:00 2001 From: Geraldo Date: Thu, 24 Sep 2020 15:11:35 +0200 Subject: [PATCH 099/102] removed Singleton Pattern --- .../squirrel/analyzer/AbstractAnalyzer.java | 2 +- .../dice_research/squirrel/encoder/TripleEncoder.java | 11 ----------- .../squirrel/encoder/TripleEncoderTest.java | 2 +- 3 files 
changed, 2 insertions(+), 13 deletions(-) diff --git a/squirrel.worker/src/main/java/org/dice_research/squirrel/analyzer/AbstractAnalyzer.java b/squirrel.worker/src/main/java/org/dice_research/squirrel/analyzer/AbstractAnalyzer.java index 20d1a1b8b..6da7bcbde 100644 --- a/squirrel.worker/src/main/java/org/dice_research/squirrel/analyzer/AbstractAnalyzer.java +++ b/squirrel.worker/src/main/java/org/dice_research/squirrel/analyzer/AbstractAnalyzer.java @@ -14,7 +14,7 @@ public abstract class AbstractAnalyzer implements Analyzer{ protected UriCollector collector; - protected TripleEncoder tripleEncoder = TripleEncoder.getInstance(); + protected TripleEncoder tripleEncoder = new TripleEncoder(); public AbstractAnalyzer(UriCollector collector) { this.collector = collector; diff --git a/squirrel.worker/src/main/java/org/dice_research/squirrel/encoder/TripleEncoder.java b/squirrel.worker/src/main/java/org/dice_research/squirrel/encoder/TripleEncoder.java index a12fc6764..967a85bb4 100644 --- a/squirrel.worker/src/main/java/org/dice_research/squirrel/encoder/TripleEncoder.java +++ b/squirrel.worker/src/main/java/org/dice_research/squirrel/encoder/TripleEncoder.java @@ -22,18 +22,7 @@ public class TripleEncoder { private static final Logger LOGGER = LoggerFactory.getLogger(TripleEncoder.class); - private static TripleEncoder tripleEncoder; - private TripleEncoder() { - - } - - public static final TripleEncoder getInstance() { - if (tripleEncoder == null) - tripleEncoder = new TripleEncoder(); - - return tripleEncoder; - } /** * diff --git a/squirrel.worker/src/test/java/org/dice_research/squirrel/encoder/TripleEncoderTest.java b/squirrel.worker/src/test/java/org/dice_research/squirrel/encoder/TripleEncoderTest.java index 6ec010a1a..9bd974121 100644 --- a/squirrel.worker/src/test/java/org/dice_research/squirrel/encoder/TripleEncoderTest.java +++ b/squirrel.worker/src/test/java/org/dice_research/squirrel/encoder/TripleEncoderTest.java @@ -52,7 +52,7 @@ public void createUris() { @Test public void testEncoding() { - TripleEncoder encoder = TripleEncoder.getInstance(); + TripleEncoder encoder = new TripleEncoder(); for (int i = 0; i < 10; i++) { // System.out.println(encoder.encodeTriple(listUncodedTriples.get(i))); From 39847959a4c8a6f106199abc2c3d301e61bbb968 Mon Sep 17 00:00:00 2001 From: Geraldo Date: Thu, 24 Sep 2020 15:13:26 +0200 Subject: [PATCH 100/102] Changed class name to CamelCase --- ...rontierquerygenerator.java => FrontierQueryGenerator.java} | 2 +- .../frontier/recrawling/SparqlBasedOutDatedUriRetriever.java | 2 +- .../dice_research/squirrel/frontier/impl/RecrawlingTest.java | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) rename squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/{Frontierquerygenerator.java => FrontierQueryGenerator.java} (97%) diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/Frontierquerygenerator.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/FrontierQueryGenerator.java similarity index 97% rename from squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/Frontierquerygenerator.java rename to squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/FrontierQueryGenerator.java index 5c9eb3b81..cc5727af1 100644 --- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/Frontierquerygenerator.java +++ 
b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/FrontierQueryGenerator.java @@ -6,7 +6,7 @@ import java.text.SimpleDateFormat; import java.util.Calendar; -public class Frontierquerygenerator { +public class FrontierQueryGenerator { /** * Return outdated uris by comparing their endtime stamps. * @return All triples with time stamp in the default graph. diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/SparqlBasedOutDatedUriRetriever.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/SparqlBasedOutDatedUriRetriever.java index a898ae79f..2fb595e0b 100644 --- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/SparqlBasedOutDatedUriRetriever.java +++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/recrawling/SparqlBasedOutDatedUriRetriever.java @@ -93,7 +93,7 @@ public void setCredentials(AuthScope arg0, Credentials arg1) { public List getUriToRecrawl() { Calendar date = Calendar.getInstance(); date.add(Calendar.DAY_OF_YEAR, 7); - Query getOutdatedUrisQuery = Frontierquerygenerator.getOutdatedUrisQuery(date); + Query getOutdatedUrisQuery = FrontierQueryGenerator.getOutdatedUrisQuery(date); QueryExecution qe = queryExecFactory.createQueryExecution(getOutdatedUrisQuery); ResultSet rs = qe.execSelect(); while (rs.hasNext()) { diff --git a/squirrel.frontier/src/test/java/org/dice_research/squirrel/frontier/impl/RecrawlingTest.java b/squirrel.frontier/src/test/java/org/dice_research/squirrel/frontier/impl/RecrawlingTest.java index 587064259..3955bf904 100644 --- a/squirrel.frontier/src/test/java/org/dice_research/squirrel/frontier/impl/RecrawlingTest.java +++ b/squirrel.frontier/src/test/java/org/dice_research/squirrel/frontier/impl/RecrawlingTest.java @@ -16,7 +16,7 @@ import org.apache.jena.query.ResultSet; import org.apache.jena.rdf.model.ModelFactory; import org.apache.jena.rdf.model.RDFNode; -import org.dice_research.squirrel.frontier.recrawling.Frontierquerygenerator; +import org.dice_research.squirrel.frontier.recrawling.FrontierQueryGenerator; import org.junit.Test; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -37,7 +37,7 @@ public void recrawling(){ date.set(Calendar.MONTH, Calendar.JANUARY); date.set(Calendar.DAY_OF_MONTH, 3); date.set(Calendar.YEAR, 2020); - Query getOutdatedUrisQuery = Frontierquerygenerator.getOutdatedUrisQuery(date); + Query getOutdatedUrisQuery = FrontierQueryGenerator.getOutdatedUrisQuery(date); QueryExecution qe = queryExecFactory.createQueryExecution(getOutdatedUrisQuery); ResultSet rs = qe.execSelect(); assertTrue("There should be at least one result", rs.hasNext()); From bef9b45142489a8a8636d8319e5e234fa4bb9ce7 Mon Sep 17 00:00:00 2001 From: Geraldo Date: Thu, 24 Sep 2020 15:39:47 +0200 Subject: [PATCH 101/102] camel case change --- .../squirrel/data/uri/filter/MongoDBKnowUriFilter.java | 4 ++-- ...dbconnectionfactory.java => MongodbConnectionFactory.java} | 2 +- .../squirrel/queue/ipbased/MongoDBIpBasedQueue.java | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) rename squirrel.frontier/src/main/java/org/dice_research/squirrel/mongodb/{Mongodbconnectionfactory.java => MongodbConnectionFactory.java} (97%) diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/data/uri/filter/MongoDBKnowUriFilter.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/data/uri/filter/MongoDBKnowUriFilter.java index 3502d4b2e..0b79bf5ed 100644 --- 
a/squirrel.frontier/src/main/java/org/dice_research/squirrel/data/uri/filter/MongoDBKnowUriFilter.java +++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/data/uri/filter/MongoDBKnowUriFilter.java @@ -18,7 +18,7 @@ import org.dice_research.squirrel.deduplication.hashing.HashValue; import org.dice_research.squirrel.deduplication.hashing.UriHashCustodian; import org.dice_research.squirrel.frontier.impl.FrontierImpl; -import org.dice_research.squirrel.mongodb.Mongodbconnectionfactory; +import org.dice_research.squirrel.mongodb.MongodbConnectionFactory; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -68,7 +68,7 @@ public MongoDBKnowUriFilter(String hostName, Integer port) { LOGGER.info("Filter Persistance: " + PERSIST); - this.client = Mongodbconnectionfactory.getConnection(hostName, port); + this.client = MongodbConnectionFactory.getConnection(hostName, port); } diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/mongodb/Mongodbconnectionfactory.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/mongodb/MongodbConnectionFactory.java similarity index 97% rename from squirrel.frontier/src/main/java/org/dice_research/squirrel/mongodb/Mongodbconnectionfactory.java rename to squirrel.frontier/src/main/java/org/dice_research/squirrel/mongodb/MongodbConnectionFactory.java index cb1f97c26..7bfc2ecf7 100644 --- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/mongodb/Mongodbconnectionfactory.java +++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/mongodb/MongodbConnectionFactory.java @@ -13,7 +13,7 @@ * */ -public class Mongodbconnectionfactory { +public class MongodbConnectionFactory { /** * Returns a MongoClient based on host and port diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/queue/ipbased/MongoDBIpBasedQueue.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/queue/ipbased/MongoDBIpBasedQueue.java index df0cea0f8..e05e26ea2 100644 --- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/queue/ipbased/MongoDBIpBasedQueue.java +++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/queue/ipbased/MongoDBIpBasedQueue.java @@ -14,7 +14,7 @@ import org.dice_research.squirrel.data.uri.CrawleableUri; import org.dice_research.squirrel.data.uri.serialize.Serializer; import org.dice_research.squirrel.data.uri.serialize.java.SnappyJavaUriSerializer; -import org.dice_research.squirrel.mongodb.Mongodbconnectionfactory; +import org.dice_research.squirrel.mongodb.MongodbConnectionFactory; import org.dice_research.squirrel.queue.AbstractIpAddressBasedQueue; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -63,7 +63,7 @@ public MongoDBIpBasedQueue(String hostName, Integer port, Serializer serializer, this.serializer = serializer; - this.client = Mongodbconnectionfactory.getConnection(hostName, port); + this.client = MongodbConnectionFactory.getConnection(hostName, port); } From 5719c58587dff9333d639d979347eb947ef00790 Mon Sep 17 00:00:00 2001 From: Geraldo Date: Fri, 25 Sep 2020 18:18:23 +0200 Subject: [PATCH 102/102] changed filter --- .../squirrel/components/FrontierComponent.java | 2 +- .../frontier/impl/FrontierSenderToWebservice.java | 9 +++++---- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/components/FrontierComponent.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/components/FrontierComponent.java index 2d9f5017c..71e22611f 100644 --- 
a/squirrel.frontier/src/main/java/org/dice_research/squirrel/components/FrontierComponent.java +++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/components/FrontierComponent.java @@ -148,7 +148,7 @@ public void init() throws Exception { if (webConfiguration.isCommunicationWithWebserviceEnabled()) { final FrontierSenderToWebservice sender = new FrontierSenderToWebservice(outgoingDataQueuefactory, - workerGuard, queue, uriFilter, uriReferences); + workerGuard, queue, uriFilter.getKnownUriFilter(), uriReferences); LOGGER.trace("FrontierSenderToWebservice -> sendCrawledGraph is set to " + webConfiguration.isVisualizationOfCrawledGraphEnabled()); Thread senderThread = new Thread(sender); diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/FrontierSenderToWebservice.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/FrontierSenderToWebservice.java index 68224c768..e57317f22 100644 --- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/FrontierSenderToWebservice.java +++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/FrontierSenderToWebservice.java @@ -18,6 +18,7 @@ import org.apache.commons.io.IOUtils; import org.dice_research.squirrel.data.uri.CrawleableUri; +import org.dice_research.squirrel.data.uri.filter.KnownUriFilter; import org.dice_research.squirrel.data.uri.filter.UriFilterComposer; import org.dice_research.squirrel.data.uri.info.URIReferences; import org.dice_research.squirrel.data.uri.serialize.Serializer; @@ -41,7 +42,7 @@ public class FrontierSenderToWebservice implements Runnable, Closeable { private final long startRunTime = System.currentTimeMillis(); private WorkerGuard workerGuard; private UriQueue queue; - private UriFilterComposer relationalUriFilter; + private KnownUriFilter knownUriFilter; private URIReferences uriReferences; private final static String WEB_QUEUE_GENERAL_NAME = "squirrel.web.in"; private RabbitQueueFactory factory; @@ -61,11 +62,11 @@ public class FrontierSenderToWebservice implements Runnable, Closeable { * @param knownUriFilter has information about the crawled URIs * @param uriReferences has information for the crawled graph. if it is {@code null}, the feature of creating a crawled graph is disabled */ - public FrontierSenderToWebservice(RabbitQueueFactory factory, WorkerGuard workerGuard, UriQueue queue, UriFilterComposer relationalUriFilter, URIReferences uriReferences) { + public FrontierSenderToWebservice(RabbitQueueFactory factory, WorkerGuard workerGuard, UriQueue queue, KnownUriFilter knownUriFilter, URIReferences uriReferences) { this.factory = factory; this.workerGuard = workerGuard; this.queue = queue; - this.relationalUriFilter = relationalUriFilter; + this.knownUriFilter = knownUriFilter; this.uriReferences = uriReferences; } @@ -190,7 +191,7 @@ private SquirrelWebObject generateSquirrelWebObject() throws IllegalAccessExcept //Michael remarks, that's not a good idea to pass all crawled URIs, because that takes to much time... //newObject.setCrawledURIs(Collections.EMPTY_LIST); - newObject.setCountOfCrawledURIs((int) relationalUriFilter.getKnownUriFilter().count()); + newObject.setCountOfCrawledURIs((int) knownUriFilter.count()); return newObject; }
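
The last patch narrows FrontierSenderToWebservice from the composite UriFilterComposer down to the KnownUriFilter interface it actually uses. The sketch below illustrates, outside of the patch series, what this buys at test time: the sender's counting logic can now be exercised with a simple stub. It is a minimal, hypothetical example, assuming Mockito is available on the test classpath (none of these patches reference Mockito); the commented-out constructor call only mirrors the new signature, and its other collaborators are placeholders.

import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.when;

import org.dice_research.squirrel.data.uri.filter.KnownUriFilter;

public class FrontierSenderWiringSketch {
    public static void main(String[] args) {
        // A stubbed KnownUriFilter is now sufficient; no full UriFilterComposer
        // has to be wired up just to report the crawled-URI count.
        KnownUriFilter knownUriFilter = mock(KnownUriFilter.class);
        when(knownUriFilter.count()).thenReturn(42L);

        // Mirrors the constructor introduced by the patch; factory, workerGuard,
        // queue and uriReferences would be wired as in FrontierComponent.init():
        // FrontierSenderToWebservice sender = new FrontierSenderToWebservice(
        //         factory, workerGuard, queue, knownUriFilter, uriReferences);

        System.out.println("count = " + knownUriFilter.count());
    }
}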