Skip to content

Commit

Permalink
Merge pull request #145 from dice-group/develop
Browse files Browse the repository at this point in the history
Develop
  • Loading branch information
MichaelRoeder authored Sep 25, 2020
2 parents 7c0c787 + 5719c58 commit 324a1a3
Show file tree
Hide file tree
Showing 120 changed files with 2,507 additions and 1,646 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ target
*.log
*.class
.classpath
.factorypath
.project
.settings
.idea
Expand Down
79 changes: 44 additions & 35 deletions docker-compose-sparql.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,11 @@ services:
environment:
- HOBBIT_RABBIT_HOST=rabbit
- URI_WHITELIST_FILE=/var/squirrel/whitelist.txt
- FRONTIER_CONTEXT_CONFIG_FILE=/var/squirrel/spring-config/frontier-context.xml
- SEED_FILE=/var/squirrel/seeds.csv
- FRONTIER_CONTEXT_CONFIG_FILE=/var/squirrel/spring-config/frontier-context-sparql.xml
- SEED_FILE=/var/squirrel/seeds.txt
- SPARQL_URL=http://virtuosohost:8890/sparql-auth/
- SPARQL_HOST_USER=dba
- SPARQL_HOST_PASSWD=pw123
- MDB_HOST_NAME=mongodb
- MDB_PORT=27017
- MDB_CONNECTION_TIME_OUT=5000
Expand All @@ -42,14 +45,14 @@ services:
- Driver=/usr/local/lib/virtodbc_32.so
- DBA_PASSWORD=pw123

# sparqlhost:
# image: stain/jena-fuseki
# container_name: sparqlhost
# ports:
# - "3030:3030"
# environment:
# - ADMIN_PASSWORD=pw123
# - JVM_ARGS=-Xmx2g
# sparqlhost:
# image: stain/jena-fuseki
# container_name: sparqlhost
# ports:
# - "3030:3030"
# environment:
# - ADMIN_PASSWORD=pw123
# - JVM_ARGS=-Xmx2g

mongodb:
image: mongo:4.0.0
Expand All @@ -75,7 +78,7 @@ services:
- "8081:15672"
# Forwarding the port for testing
- "5672:5672"

worker1:
image: dicegroup/squirrel.worker:latest
container_name: worker1
Expand All @@ -91,9 +94,11 @@ services:
- JVM_ARGS=-Xmx8g
- STORE_METADATA=true
volumes:
- ./data/worker1:/var/squirrel/data
- ./yaml:/var/squirrel/yaml
- ./spring-config:/var/squirrel/spring-config
- ./data/worker1:/var/squirrel/data
- ./yaml:/var/squirrel/yaml
- ./spring-config:/var/squirrel/spring-config
command: java -cp squirrel.jar org.dice_research.squirrel.components.WorkerComponentStarter


worker2:
image: dicegroup/squirrel.worker:latest
Expand All @@ -110,9 +115,11 @@ services:
- JVM_ARGS=-Xmx8g
- STORE_METADATA=true
volumes:
- ./data/worker2:/var/squirrel/data
- ./yaml:/var/squirrel/yaml
- ./spring-config:/var/squirrel/spring-config
- ./data/worker2:/var/squirrel/data
- ./yaml:/var/squirrel/yaml
- ./spring-config:/var/squirrel/spring-config
command: java -cp squirrel.jar org.dice_research.squirrel.components.WorkerComponentStarter


worker3:
image: dicegroup/squirrel.worker:latest
Expand All @@ -129,22 +136,24 @@ services:
- JVM_ARGS=-Xmx8g
- STORE_METADATA=true
volumes:
- ./data/worker3:/var/squirrel/data
- ./yaml:/var/squirrel/yaml
- ./spring-config:/var/squirrel/spring-config
- ./data/worker3:/var/squirrel/data
- ./yaml:/var/squirrel/yaml
- ./spring-config:/var/squirrel/spring-config
command: java -cp squirrel.jar org.dice_research.squirrel.components.WorkerComponentStarter

deduplicator:
image: squirrel
container_name: deduplicator
environment:
DEDUPLICATION_ACTIVE: "true"
HOBBIT_RABBIT_HOST: rabbit
OUTPUT_FOLDER: /var/squirrel/data
MDB_HOST_NAME: mongodb
MDB_PORT: 27017
SPARQL_HOST_NAME: sparqlhost
SPARQL_HOST_PORT: 3030
SERVICE_PRECONDITION: "rethinkdb:28015 rabbit:5672"
volumes:
- ./data/deduplicator:/var/squirrel/data
command: java -cp squirrel.jar org.hobbit.core.run.ComponentStarter org.aksw.simba.squirrel.components.DeduplicatorComponent

# deduplicator:
# image: squirrel.deduplication
# container_name: deduplicator
# environment:
# DEDUPLICATION_ACTIVE: "true"
# HOBBIT_RABBIT_HOST: rabbit
# OUTPUT_FOLDER: /var/squirrel/data
# CONTEXT_CONFIG_FILE: /var/squirrel/spring-config/context-deduplicator.xml
# MDB_HOST_NAME: mongodb
# MDB_PORT: 27017
# SPARQL_HOST_NAME: sparqlhost
# SPARQL_HOST_PORT: 3030
# SERVICE_PRECONDITION: "mongodb:27017 rabbit:5672"
# volumes:
# - ./data/deduplicator:/var/squirrel/data
30 changes: 15 additions & 15 deletions docker-compose-web.yml
Original file line number Diff line number Diff line change
Expand Up @@ -149,18 +149,18 @@ services:
#- ./whitelist/ckanwhitelist.txt:/var/squirrel/ckanwhitelist.txt:ro
command: java -cp squirrel.jar org.dice_research.squirrel.components.WorkerComponentStarter

deduplicator:
image: squirrel
container_name: deduplicator
environment:
DEDUPLICATION_ACTIVE: "true"
HOBBIT_RABBIT_HOST: rabbit
OUTPUT_FOLDER: /var/squirrel/data
MDB_HOST_NAME: mongodb
MDB_PORT: 27017
SPARQL_HOST_NAME: sparqlhost
SPARQL_HOST_PORT: 3030
SERVICE_PRECONDITION: "rethinkdb:28015 rabbit:5672"
volumes:
- ./data/deduplicator:/var/squirrel/data
command: java -cp squirrel.jar org.hobbit.core.run.ComponentStarter org.dice_research.squirrel.components.DeduplicatorComponent
# deduplicator:
# image: squirrel
# container_name: deduplicator
# environment:
# DEDUPLICATION_ACTIVE: "true"
# HOBBIT_RABBIT_HOST: rabbit
# OUTPUT_FOLDER: /var/squirrel/data
# MDB_HOST_NAME: mongodb
# MDB_PORT: 27017
# SPARQL_HOST_NAME: sparqlhost
# SPARQL_HOST_PORT: 3030
# SERVICE_PRECONDITION: "rethinkdb:28015 rabbit:5672"
# volumes:
# - ./data/deduplicator:/var/squirrel/data
# command: java -cp squirrel.jar org.hobbit.core.run.ComponentStarter org.dice_research.squirrel.components.DeduplicatorComponent
32 changes: 17 additions & 15 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@ services:
- ./seed/seeds.csv:/var/squirrel/seeds.csv:ro
- ./seed/seeds.txt:/var/squirrel/seeds.txt:ro
- ./whitelist/whitelist.txt:/var/squirrel/whitelist.txt:ro
command: java -cp squirrel.jar org.dice_research.squirrel.components.FrontierComponentStarter


mongodb:
image: mongo:4.0.0
Expand Down Expand Up @@ -108,18 +110,18 @@ services:
- ./yaml:/var/squirrel/yaml
- ./spring-config:/var/squirrel/spring-config

deduplicator:
image: squirrel
container_name: deduplicator
environment:
DEDUPLICATION_ACTIVE: "true"
HOBBIT_RABBIT_HOST: rabbit
OUTPUT_FOLDER: /var/squirrel/data
MDB_HOST_NAME: mongodb
MDB_PORT: 27017
SPARQL_HOST_NAME: sparqlhost
SPARQL_HOST_PORT: 3030
SERVICE_PRECONDITION: "rethinkdb:28015 rabbit:5672"
volumes:
- ./data/deduplicator:/var/squirrel/data
command: java -cp squirrel.jar org.hobbit.core.run.ComponentStarter org.aksw.simba.squirrel.components.DeduplicatorComponent
# deduplicator:
# image: squirrel
# container_name: deduplicator
# environment:
# DEDUPLICATION_ACTIVE: "true"
# HOBBIT_RABBIT_HOST: rabbit
# OUTPUT_FOLDER: /var/squirrel/data
# MDB_HOST_NAME: mongodb
# MDB_PORT: 27017
# SPARQL_HOST_NAME: sparqlhost
# SPARQL_HOST_PORT: 3030
# SERVICE_PRECONDITION: "rethinkdb:28015 rabbit:5672"
# volumes:
# - ./data/deduplicator:/var/squirrel/data
# command: java -cp squirrel.jar org.hobbit.core.run.ComponentStarter org.aksw.simba.squirrel.components.DeduplicatorComponent
12 changes: 6 additions & 6 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
<modelVersion>4.0.0</modelVersion>
<groupId>org.dice-research</groupId>
<artifactId>squirrel</artifactId>
<version>0.5.0</version>
<version>0.4.0</version>
<packaging>pom</packaging>
<inceptionYear>2017</inceptionYear>
<name>Squirrel</name>
Expand Down Expand Up @@ -60,12 +60,12 @@
<repository>
<id>maven.aksw.internal</id>
<name>AKSW Internal Release Repository</name>
<url>http://maven.aksw.org/repository/internal/</url>
<url>https://maven.aksw.org/repository/internal/</url>
</repository>
<repository>
<id>maven.aksw.snapshots</id>
<name>University Leipzig, AKSW Maven2 Repository</name>
<url>http://maven.aksw.org/repository/snapshots</url>
<url>https://maven.aksw.org/repository/snapshots</url>
</repository>
<repository>
<id>spring-releases</id>
Expand Down Expand Up @@ -345,11 +345,11 @@
<version>4.12</version>
<scope>test</scope>
</dependency>
<!-- System rules for setting environment variables -->
<!-- System Lambda for setting environment variables -->
<dependency>
<groupId>com.github.stefanbirkner</groupId>
<artifactId>system-rules</artifactId>
<scope>test</scope>
<artifactId>system-lambda</artifactId>
<version>1.0.0</version>
</dependency>
<!-- ~~~~~~~~~~~~~~~~~~~ End Testing ~~~~~~~~~~~~~~~~~~~~~~ -->

Expand Down
56 changes: 56 additions & 0 deletions spring-config/frontier-context-sparql.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<beans xmlns:context="http://www.springframework.org/schema/context"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xmlns="http://www.springframework.org/schema/beans"
xsi:schemaLocation="http://www.springframework.org/schema/beans
http://www.springframework.org/schema/beans/spring-beans.xsd
http://www.springframework.org/schema/context
http://www.springframework.org/schema/context/spring-context.xsd">



<!-- Scan the org.dice_research.squirrel package for annotated Spring
components so that they are registered in this application context in
addition to the beans declared explicitly below. -->
<context:component-scan
base-package="org.dice_research.squirrel" />

<!-- NOTE(review): this comment describes automatic loading of property files
that are on the classpath under the META-INF directory, whose parsed values
can then be used within application context files in the form of
${propertyKey}. This file declares no property placeholder element; the beans
below read their settings from environment variables via
#{systemEnvironment[...]} instead. -->



<!-- <bean id="workerImpl" class="org.aksw.simba.squirrel.worker.impl.WorkerImpl"></bean> -->
<!--
<bean id="frontierBean"
class="org.dice_research.squirrel.frontier.impl.ExtendedFrontierImpl">
<constructor-arg index="0" ref="normalizerImpl" />
<constructor-arg index="1" ref="knowUriFilterBean" />
<constructor-arg index="2">
<null />
</constructor-arg>
<constructor-arg index="3" ref="queueBean" />
<constructor-arg index="4" value="true" />
</bean> -->

<!-- URI serializer: an instance of GzipJavaUriSerializer. It is handed to
queueBean as its third constructor argument (index 2). -->
<bean id="serializerBean"
class="org.dice_research.squirrel.data.uri.serialize.java.GzipJavaUriSerializer"/>

<!-- Queue bean: a MongoDBIpBasedQueue. Constructor arguments 0 and 1 are
resolved at context start-up from the MDB_HOST_NAME and MDB_PORT environment
variables (SpEL systemEnvironment lookup); argument 2 is the serializerBean
declared in this file. -->
<bean id = "queueBean" class="org.dice_research.squirrel.queue.ipbased.MongoDBIpBasedQueue">
<constructor-arg index="0" value="#{systemEnvironment['MDB_HOST_NAME']}" />
<constructor-arg index="1" value="#{systemEnvironment['MDB_PORT']}" />
<constructor-arg index="2" ref="serializerBean" />
</bean>

<!-- URI filter bean: a MongoDBKnowUriFilter. Both constructor arguments come
from the same MDB_HOST_NAME and MDB_PORT environment variables that queueBean
uses.
NOTE(review): the id is spelled "knowUriFilterBean" (not "known"), matching
the class name; the commented-out frontierBean in this file refers to it by
this id, so do not rename it without checking all references. -->
<bean id = "knowUriFilterBean" class="org.dice_research.squirrel.data.uri.filter.MongoDBKnowUriFilter">
<constructor-arg index="0" value="#{systemEnvironment['MDB_HOST_NAME']}" />
<constructor-arg index="1" value="#{systemEnvironment['MDB_PORT']}" />
</bean>
<!-- Triple store SPARQL implementation: a SparqlBasedOutDatedUriRetriever.
Its three constructor arguments are resolved from the SPARQL_URL,
SPARQL_HOST_USER and SPARQL_HOST_PASSWD environment variables, in that order.
These variables must be set in the environment of the process that loads this
context (docker-compose-sparql.yml defines them).
NOTE(review): presumably the endpoint URL, user name and password of the
SPARQL endpoint; confirm against the constructor of the class. -->
<bean id="sparqlBean" class="org.dice_research.squirrel.frontier.recrawling.SparqlBasedOutDatedUriRetriever">
<constructor-arg index="0" value="#{systemEnvironment['SPARQL_URL']}"/>
<constructor-arg index="1" value="#{systemEnvironment['SPARQL_HOST_USER']}"/>
<constructor-arg index="2" value="#{systemEnvironment['SPARQL_HOST_PASSWD']}"/>
</bean>


</beans>
Loading

0 comments on commit 324a1a3

Please sign in to comment.