From 51d1c9cb62bba23310abdcf3fe0b52256784718c Mon Sep 17 00:00:00 2001
From: Luca Foppiano
Date: Fri, 22 Apr 2022 16:20:43 +0900
Subject: [PATCH 1/7] add env variable replacements for grobid and
 elasticsearch

---
 config/glutton.yml                           | 13 ++++++++-----
 .../lookup/web/LookupServiceApplication.java |  5 +++++
 2 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/config/glutton.yml b/config/glutton.yml
index 60a25381..644a45f2 100644
--- a/config/glutton.yml
+++ b/config/glutton.yml
@@ -18,7 +18,7 @@ indexBatchSize: 500
 blockSize: 4

 # Grobid server URL
-grobidHost: http://localhost:8070/api
+grobidHost: ${GROBID_URL:- http://localhost:8070/api}

 timeZone: UTC
 # the day hour for launching the automatic daily incremental update, format is HH:MM
@@ -27,7 +27,7 @@ dailyUpdateTime: 03:00

 # a node of the elasticsearch cluster to be used and a name for the index
 elastic:
   #host: localhost:9200
-  host: 0.0.0.0:9200
+  host: ${ELASTIC_URL:- localhost:9200}
   index: crossref
   maxConnections: 20
@@ -39,7 +39,7 @@ proxy:
 crossref:
   # a directory where the crossref incremental update files (gap or daily update) will be located
   # to be changed according to your storage
-  dumpPath: /media/lopez/data2/crossref
+  dumpPath: ${DUMP_PATH:- /media/lopez/data2/crossref}

   # indicate if we remove the incremental files after they have been processed (value true) or if we
   # keep them in the above dumpPath (careful, the volume of files can be huge after months of daily updates!)
@@ -55,7 +55,7 @@ crossref:
   token:

 unpaywall:
-  dumpPath:
+  dumpPath: ${DUMP_PATH:- /media/lopez/data2/unpaywall}
   # a directory where the unpaywall update data feed change files will be located
   API_key:
   # API Key for the Unpaywall subscription is necessary to get the data feed change files for daily update
@@ -88,13 +88,16 @@ logging:
   - type: console
     threshold: INFO
     timeZone: UTC
+#Docker-ignore-log-start
   - type: file
     currentLogFilename: logs/lookup-service.log
     threshold: INFO
     archive: true
-    archivedLogFilenamePattern: logs/lookup-service-%d.log
+    archivedLogFilenamePattern: logs/lookup-service-%d-%i.log
     archivedFileCount: 5
     timeZone: UTC
+    maxFileSize: 50MB
+#Docker-ignore-log-end

 # the following is used only for pubmed related enrichments and extractions
 pubmed:
diff --git a/lookup/src/main/java/com/scienceminer/lookup/web/LookupServiceApplication.java b/lookup/src/main/java/com/scienceminer/lookup/web/LookupServiceApplication.java
index ad72cfc9..04fd4e50 100644
--- a/lookup/src/main/java/com/scienceminer/lookup/web/LookupServiceApplication.java
+++ b/lookup/src/main/java/com/scienceminer/lookup/web/LookupServiceApplication.java
@@ -19,6 +19,8 @@ import com.scienceminer.lookup.storage.StorageEnvFactory;

 import io.dropwizard.Application;
+import io.dropwizard.configuration.EnvironmentVariableSubstitutor;
+import io.dropwizard.configuration.SubstitutingSourceProvider;
 import io.dropwizard.forms.MultiPartBundle;
 import io.dropwizard.setup.Bootstrap;
 import io.dropwizard.setup.Environment;
@@ -149,6 +151,9 @@ private List<Module> getGuiceModules() {

     @Override
     public void initialize(Bootstrap<LookupConfiguration> bootstrap) {
+        bootstrap.setConfigurationSourceProvider(new SubstitutingSourceProvider(
+                bootstrap.getConfigurationSourceProvider(), new EnvironmentVariableSubstitutor(false)));
+
         GuiceBundle guiceBundle = GuiceBundle.defaultBuilder(LookupConfiguration.class)
                 .modules(getGuiceModules())
                 .build();

From 5900885ee47ecc307268cf9371c60bd68a2d676b Mon Sep 17 00:00:00 2001
From: Luca Foppiano
Date: Fri, 22 Apr 2022 16:20:59 +0900
Subject: [PATCH 2/7] fixes in the Dockerfile
---
 Dockerfile | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/Dockerfile b/Dockerfile
index 46d390a5..5cdc1c41 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -19,6 +19,7 @@ VOLUME /app/glutton-source/.gradle
 # source
 COPY lookup/ ./lookup/
 COPY indexing/ ./indexing/
+RUN mkdir config

 RUN cd /app/glutton-source/lookup && ./gradlew clean assemble --no-daemon

@@ -33,6 +34,7 @@ WORKDIR /app
 RUN apt-get update -qq && apt-get -y install nodejs npm

 COPY --from=builder /app/glutton-source/indexing /app/indexing
+COPY --from=builder /app/glutton-source/config /app/lookup/config
 RUN cd indexing; npm install

 COPY --from=builder /app/glutton-source/lookup/build/distributions/lookup-service-shadow-*.zip ./lookup-service.zip
@@ -44,6 +46,9 @@ RUN rm *.zip

 WORKDIR /app/lookup/lookup-service

+RUN #sed -i '/#Docker-ignore-log-start/,/#Docker-ignore-log-end/d' data/config/config.yml
+
 ENV JAVA_OPTS=-Xmx4g
-CMD java -jar lib/lookup-service-0.2-SNAPSHOT-onejar.jar server data/config/config.yml
+CMD ["./bin/lookup-service"]
+#CMD java -jar lib/lookup-service-0.2-SNAPSHOT-onejar.jar server data/config/config.yml

From b84ce9d04277d9fd4f08a1dc29facda341df0c00 Mon Sep 17 00:00:00 2001
From: Luca Foppiano
Date: Fri, 22 Apr 2022 16:23:27 +0900
Subject: [PATCH 3/7] some documentation

---
 Readme.md | 24 ++++++++++++++++++------
 1 file changed, 18 insertions(+), 6 deletions(-)

diff --git a/Readme.md b/Readme.md
index e8e3a3bc..1156d8d6 100644
--- a/Readme.md
+++ b/Readme.md
@@ -81,13 +81,26 @@ biblio-glutton takes advantage of GROBID for parsing raw bibliographical referen
 While GROBID is not required for running biblio-glutton, in particular if it is used only for bibliographical look-up, it is recommended for performing bibliographical record matching.

-
 ### REST API

From 1611458ca885d23cfc81e78178a93f23fb8aa304 Mon Sep 17 00:00:00 2001
From: Luca Foppiano
Date: Tue, 26 Apr 2022 09:24:42 +0900
Subject: [PATCH 4/7] update docker-compose and documentation including data
 loading

---
 Dockerfile                        |   1 -
 Readme.md                         |  49 ++++++++++---
 config/glutton-docker-compose.yml | 111 ++++++++++++++++++++++++++++++
 docker-compose.yml                |  29 ++++++--
 4 files changed, 175 insertions(+), 15 deletions(-)
 create mode 100644 config/glutton-docker-compose.yml

diff --git a/Dockerfile b/Dockerfile
index 5cdc1c41..8308f610 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -51,4 +51,3 @@ RUN #sed -i '/#Docker-ignore-log-start/,/#Docker-ignore-log-end/d' data/config/

 ENV JAVA_OPTS=-Xmx4g
 CMD ["./bin/lookup-service"]
-#CMD java -jar lib/lookup-service-0.2-SNAPSHOT-onejar.jar server data/config/config.yml

diff --git a/Readme.md b/Readme.md
index 1156d8d6..eb70929e 100644
--- a/Readme.md
+++ b/Readme.md
@@ -98,6 +98,32 @@ The docker image does not start without a valid configuration file, this is done
 explicitly to avoid starting it without having a configuration file specific for docker
 ```
 docker run -v `pwd`/config:/app/lookup/config -it lfoppiano/biblio-glutton-lookup:0.2
 ```

+If Elasticsearch (and perhaps GROBID) runs on the same host machine, you can reach it from within Docker by adding the parameter `--add-host=host.docker.internal:host-gateway` and setting `host.docker.internal` in the configuration file.
+
+##### Data load
+
+Elasticsearch can be loaded by pointing the loading process directly at the host where it is deployed.
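As an aside on the mechanism: the `${GROBID_URL:-…}` and `${ELASTIC_URL:-…}` placeholders introduced in patch 1 are resolved at startup by Dropwizard's `EnvironmentVariableSubstitutor`, so both hosts can also be overridden per container rather than by editing the mounted configuration. An untested sketch combining this with the `--add-host` parameter described above:

```
docker run --add-host=host.docker.internal:host-gateway \
    -e GROBID_URL=http://host.docker.internal:8070/api \
    -e ELASTIC_URL=host.docker.internal:9200 \
    -v `pwd`/config:/app/lookup/config \
    -it lfoppiano/biblio-glutton-lookup:0.2
```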
+To load LMDB data:
+
+Run the service by mounting the `data/` directory as a volume:
+```
+docker run -v `pwd`/config:/app/lookup/config -v `pwd`/data:/app/data -it lfoppiano/biblio-glutton-lookup:0.2
+```
+
+Find the hash of the running container:
+
+```
+docker ps
+```
+
+Execute the loading process:
+```
+docker exec edfd57a6a7cf java -jar lib/lookup-service-0.2-onejar.jar crossref --input /app/data/crossref-works.2018-09-05.json.xz /app/lookup/config/glutton.yml
+```
+
+You will similarly need to load the other resources, as detailed [here](https://github.com/kermitt2/biblio-glutton#resources).

 #### Docker compose

 A Docker Compose file is included to make it easier to spin up biblio-glutton, Elasticsearch, and GROBID.
@@ -112,13 +138,16 @@ You can run this command to see aggregated log output:

 Once everything has booted up, biblio-glutton will be running at http://localhost:8080 and GROBID will be at http://localhost:8070.

-To load data, you can use the `docker-compose run` command. The `data/` directory is mounted inside the container.
-For example, this command will load Crossref data (as described in more detail [below](https://github.com/kermitt2/biblio-glutton#resources)):
+##### Data load
+
+Elasticsearch can be loaded by pointing directly to `localhost:9200`

-    $ docker-compose run biblio java -jar lib/lookup-service-0.2-onejar.jar crossref --input ../../data/crossref-works.2018-09-05.json.xz config/glutton.yml
+To load LMDB data, you can use the `docker-compose run` command. The `data/` directory is mounted inside the container.
+For example, this command will load Crossref data (as described in more detail [below](https://github.com/kermitt2/biblio-glutton#resources)):

-You will need to load similarly the other resources, as detailed [here](https://github.com/kermitt2/biblio-glutton#resources).
+    $ docker-compose run biblio java -jar lib/lookup-service-0.2-onejar.jar crossref --input /app/data/crossref-works.2018-09-05.json.xz /app/lookup/config/glutton.yml

+You will similarly need to load the other resources, as detailed [here](https://github.com/kermitt2/biblio-glutton#resources).

 ### REST API
@@ -276,11 +305,11 @@ One glutton instance: 19,792,280 DOI lookup in 3156 seconds, ~ 6270 queries per

 Processing time for matching 17,015 raw bibliographical reference strings to DOI:

-| number of ES cluster nodes | comment | total runtime (second) | runtime per bib. ref. (second) | queries per second |
-|----|---|---|---|---|
-| 1 | glutton and Elasticsearch node share the same machine | 2625 | 0.154 | 6.5 |
-| 1 | glutton and Elasticsearch node on two separate machines | 1990 | 0.117 | 8.5 |
-| 2 | glutton and one of the Elasticsearch node sharing the same machine | 1347 | 0.079 | 12.6 |
+| number of ES cluster nodes | comment                                                            | total runtime (second) | runtime per bib. ref. (second) | queries per second |
+|----------------------------|--------------------------------------------------------------------|------------------------|--------------------------------|--------------------|
+| 1                          | glutton and Elasticsearch node share the same machine              | 2625                   | 0.154                          | 6.5                |
+| 1                          | glutton and Elasticsearch node on two separate machines            | 1990                   | 0.117                          | 8.5                |
+| 2                          | glutton and one of the Elasticsearch node sharing the same machine | 1347                   | 0.079                          | 12.6               |

 Machines have the same configuration: Intel i7 4-cores, 8 threads, 16GB memory, SSD, on Ubuntu 16.04.
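To make the benchmarked workload concrete: each query counted above is a single HTTP request against the lookup service. As a hedged illustration only (treat the route as an assumption and check the REST API section of the Readme for the authoritative parameters), a DOI lookup looks roughly like this, reusing a DOI from the evaluation dataset below:

```
curl "http://localhost:8080/service/lookup?doi=10.1055/s-0028-1107790"
```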
 We created a dataset of [17,015 bibliographical reference/DOI pairs](doc/references-doi-matching.json.gz)

 Example of the first two of the 17,015 entries:

-```json
+```
 {"reference": "Classen M, Demling L. Endoskopishe shinkterotomie der papilla \nVateri und Stein extraction aus dem Duktus Choledochus [Ger-\nman]. Dtsch Med Wochenschr. 1974;99:496-7.", "doi": "10.1055/s-0028-1107790", "pmid": "4835515", "atitle": "Endoskopishe shinkterotomie der papilla Vateri und Stein extraction aus dem Duktus Choledochus [German]", "firstAuthor": "Classen", "jtitle": "Dtsch Med Wochenschr", "volume": "99", "firstPage": "496"},
 {"reference": "Kawai K, Akasaka Y, Murakami K. Endoscopic sphincterotomy \nof the ampulla of Vater. Gastrointest Endosc. 1974;20:148-51.", "doi": "10.1016/S0016-5107(74)73914-1", "pmid": "4825160", "atitle": "Endoscopic sphincterotomy of the ampulla of Vater", "firstAuthor": "Kawai", "jtitle": "Gastrointest Endosc", "volume": "20", "firstPage": "148"},
 ```

diff --git a/config/glutton-docker-compose.yml b/config/glutton-docker-compose.yml
new file mode 100644
index 00000000..65986204
--- /dev/null
+++ b/config/glutton-docker-compose.yml
@@ -0,0 +1,111 @@
+version: 0.2
+
+# where the metadata are stored, it takes more than 200GB for all Crossref, Unpaywall, PubMed and ISTEX mappings
+storage: data/db
+#storage: /media/lopez/T5/data/db
+
+# Crossref fields to be ignored when storing metadata, the reference field in particular takes a lot of space
+ignoreCrossRefFields:
+  - reference
+  - abstract
+  - indexed
+
+# batch size for preparing the data
+loadingBatchSize: 10000
+indexBatchSize: 500
+
+# max blocking size (number of candidates considered for pairwise matching)
+blockSize: 4
+
+# Grobid server URL
+grobidHost: ${GROBID_URL:- http://grobid:8070/api}
+
+timeZone: UTC
+# the day hour for launching the automatic daily incremental update, format is HH:MM
+dailyUpdateTime: 03:00
+
+# a node of the elasticsearch cluster to be used and a name for the index
+elastic:
+  #host: localhost:9200
+  host: ${ELASTIC_URL:- elasticsearch:9200}
+  index: crossref
+  maxConnections: 20
+
+proxy:
+  # proxy to be used when doing external calls to crossref or unpaywall
+  host:
+  port:
+
+crossref:
+  # a directory where the crossref incremental update files (gap or daily update) will be located
+  # to be changed according to your storage
+  dumpPath: ${DUMP_PATH:- /media/lopez/data2/crossref}
+
+  # indicate if we remove the incremental files after they have been processed (value true) or if we
+  # keep them in the above dumpPath (careful, the volume of files can be huge after months of daily updates!)
+  cleanProcessFiles: true
+
+  # for the crossref REST API and daily update, you normally need to use it politely and to indicate an email
+  # address here, e.g.
+  #mailto: "toto@titi.tutu"
+  mailto:
+
+  # to use Crossref metadata plus service (available by subscription)
+  #token: "yourmysteriouscrossrefmetadataplusauthorizationtokentobeputhere"
+  token:
+
+unpaywall:
+  dumpPath: ${DUMP_PATH:- /media/lopez/data2/unpaywall}
+  # a directory where the unpaywall update data feed change files will be located
+  API_key:
+  # API Key for the Unpaywall subscription is necessary to get the data feed change files for daily update
+
+# CORS parameters
+corsAllowedOrigins: "*"
+corsAllowedMethods: "OPTIONS,GET,PUT,POST,DELETE,HEAD"
+corsAllowedHeaders: "X-Requested-With,Content-Type,Accept,Origin"
+
+# beyond the following number of requests, a 503 status will be returned (service unavailable) until enough
+# requests are processed to go back below the max
+maxAcceptedRequests: 2048
+
+server:
+  type: custom
+  applicationConnectors:
+  - type: http
+    port: 8080
+  adminConnectors:
+  - type: http
+    port: 8081
+  registerDefaultExceptionMappers: false
+  maxThreads: 2048
+  maxQueuedRequests: 2048
+  acceptQueueSize: 2048
+
+logging:
+  level: INFO
+  appenders:
+  - type: console
+    threshold: INFO
+    timeZone: UTC
+#Docker-ignore-log-start
+  - type: file
+    currentLogFilename: logs/lookup-service.log
+    threshold: INFO
+    archive: true
+    archivedLogFilenamePattern: logs/lookup-service-%d-%i.log
+    archivedFileCount: 5
+    timeZone: UTC
+    maxFileSize: 50MB
+#Docker-ignore-log-end
+
+# the following is used only for pubmed related enrichments and extractions
+pubmed:
+  # path to the medline resources, to be changed according to your storage
+  pubmedDirectory: /media/lopez/data/biblio/medline2021/
+  # path to PMC mapping data
+  pmcDirectory: data/pmc
+  # elasticsearch index for pubmed, used to create extraction based on MeSH terms
+  index: pubmed
+  # path to the working pubmed databases, to be changed according to your storage
+  dbDirectory: /media/lopez/T5/data2/db

diff --git a/docker-compose.yml b/docker-compose.yml
index 8252c57f..90b4e811 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -1,24 +1,45 @@
-version: "2"
+version: "3.9"
 services:
   biblio:
-    build: .
+    image: lfoppiano/biblio-glutton-lookup:0.2
     restart: always
     ports:
       - 8080:8080
     volumes:
       - ./data:/app/data
+      - ./config/glutton-docker-compose.yml:/app/lookup/config/glutton.yml:ro
+    networks:
+      - net1
+
   grobid:
-    image: lfoppiano/grobid:0.5.5
+    image: lfoppiano/grobid:0.7.1
     restart: always
     ports:
       - 8070:8070
+    networks:
+      net1:
+        aliases:
+          - "grobid.local"
+
   elasticsearch:
-    image: elasticsearch:6.7.1
+    image: elasticsearch:7.17.3
     environment:
       - "discovery.type=single-node"
+    ports:
+      - 9200:9200
+      - 9300:9300
     volumes:
       - elasticsearch-data:/usr/share/elasticsearch/data
     restart: always
+    networks:
+      net1:
+        aliases:
+          - "elasticsearch.local"
+
 volumes:
   elasticsearch-data:
     driver: local
+
+
+networks:
+  net1:
\ No newline at end of file

From 513fcf0d5edb1e133df2450992704f8b4fd60dfd Mon Sep 17 00:00:00 2001
From: Luca Foppiano
Date: Wed, 27 Apr 2022 09:45:20 +0900
Subject: [PATCH 5/7] update documentation with the latest news

---
 Readme.md | 21 ++++++++++++++++++---
 1 file changed, 18 insertions(+), 3 deletions(-)

diff --git a/Readme.md b/Readme.md
index eb70929e..41632b2b 100644
--- a/Readme.md
+++ b/Readme.md
@@ -95,15 +95,25 @@ The docker image does not start without a valid configuration file, this is done
 explicitly to avoid starting it without having a configuration file specific for docker
 ```
-docker run -v `pwd`/config:/app/lookup/config -it lfoppiano/biblio-glutton-lookup:0.2
+docker run -v /my/disk/path/config:/app/lookup/config -v /my/disk/path/data:/app/data -it lfoppiano/biblio-glutton-lookup:0.2
 ```

-If Elasticsearch (and perhaps GROBID) runs on the same host machine, you can reach it from within Docker by adding the parameter `--add-host=host.docker.internal:host-gateway` and setting `host.docker.internal` in the configuration file.
+If Elasticsearch (and perhaps GROBID) runs on the same host machine, you can reach it from within Docker by adding the parameter `--add-host=host.docker.internal:host-gateway` and setting `host.docker.internal:9200` in the configuration file.
+
+**NOTE**: make sure Elasticsearch is allowed to listen on the interface `172.17.0.1`, which is the address `host.docker.internal` resolves to in a standard Docker installation.
+At the time of writing, I was using version 7.x, which needs the following parameters:
+```
+discovery.seed_hosts: ["localhost","172.17.0.1"]
+cluster.initial_master_nodes: ['my_cluster']
+```
+
+Check the manual of the version you are deploying for more up-to-date information.

 ##### Data load

 Elasticsearch can be loaded by pointing the loading process directly at the host where it is deployed.

+**To be tested**
+
 To load LMDB data:

 Run the service by mounting the `data/` directory as a volume:
 ```
@@ -138,9 +148,14 @@ You can run this command to see aggregated log output:

 Once everything has booted up, biblio-glutton will be running at http://localhost:8080 and GROBID will be at http://localhost:8070.

+**NOTE**: The docker-compose.yml file contains `*.local` aliases.
+These are meant for the unfortunate people who are behind a proxy.
+You can simply exclude the `*.local` hosts from the proxy's wrath in the Docker configuration.
+
 ##### Data load

-Elasticsearch can be loaded by pointing directly to `localhost:9200`
+**To be tested**
+Elasticsearch can be loaded by pointing directly to `localhost:9200`, which docker-compose binds on the host machine.

 To load LMDB data, you can use the `docker-compose run` command. The `data/` directory is mounted inside the container.
 For example, this command will load Crossref data (as described in more detail [below](https://github.com/kermitt2/biblio-glutton#resources)):

From 4a8bdc76815cd5ac1d61e42824329b5ecef78e70 Mon Sep 17 00:00:00 2001
From: steppo83
Date: Wed, 28 Sep 2022 09:23:05 +0200
Subject: [PATCH 6/7] Readme updated adding git command

The arg --config core.autocrlf=input is needed for Windows users, otherwise
they'll get strange errors during the docker-compose up phase.
---
 Readme.md | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/Readme.md b/Readme.md
index 41632b2b..377dae05 100644
--- a/Readme.md
+++ b/Readme.md
@@ -81,6 +81,11 @@ biblio-glutton takes advantage of GROBID for parsing raw bibliographical referen
 While GROBID is not required for running biblio-glutton, in particular if it is used only for bibliographical look-up, it is recommended for performing bibliographical record matching.

+### Windows users
+
+When you clone the repo, remember to add `--config core.autocrlf=input` to the git command, otherwise you'll get strange errors during the `docker-compose up` phase, so it will be:
+
+git clone https://github.com/kermitt2/biblio-glutton.git --config core.autocrlf=input

 ### Running with Docker

From 6b8dd1d15d05ff6b1110561b41704d3a15a3d2db Mon Sep 17 00:00:00 2001
From: Luca Foppiano
Date: Thu, 29 Sep 2022 15:25:30 +0900
Subject: [PATCH 7/7] small cosmetics

---
 Readme.md | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/Readme.md b/Readme.md
index 377dae05..a63bbf87 100644
--- a/Readme.md
+++ b/Readme.md
@@ -33,6 +33,12 @@ Once the databases and index are built, the bibliographical REST API can be star

 You need Java JDK 1.8 installed for building and running the tool.

+**NOTE**: Windows users should add `--config core.autocrlf=input` when cloning, or configure it at the system level:
+```
+git clone https://github.com/kermitt2/biblio-glutton.git --config core.autocrlf=input
+```
+
+
 ```sh
 cd lookup
 ./gradlew clean build
 ```
@@ -81,12 +87,6 @@ biblio-glutton takes advantage of GROBID for parsing raw bibliographical referen
 While GROBID is not required for running biblio-glutton, in particular if it is used only for bibliographical look-up, it is recommended for performing bibliographical record matching.

-### Windows users
-
-When you clone the repo, remember to add `--config core.autocrlf=input` to the git command, otherwise you'll get strange errors during the `docker-compose up` phase, so it will be:
-
-git clone https://github.com/kermitt2/biblio-glutton.git --config core.autocrlf=input

 ### Running with Docker
@@ -134,7 +134,7 @@ docker ps

 Execute the loading process:
 ```
-docker exec edfd57a6a7cf java -jar lib/lookup-service-0.2-onejar.jar crossref --input /app/data/crossref-works.2018-09-05.json.xz /app/lookup/config/glutton.yml
+docker exec CONTAINER_HASH java -jar lib/lookup-service-0.2-onejar.jar crossref --input /app/data/crossref-works.2018-09-05.json.xz /app/lookup/config/glutton.yml
 ```

 You will similarly need to load the other resources, as detailed [here](https://github.com/kermitt2/biblio-glutton#resources).
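Taken together, the series leaves the documented docker-compose bring-up looking roughly like this (an untested end-to-end sketch assembled only from commands quoted in the patches above, using the example Crossref dump name from the Readme):

```
git clone https://github.com/kermitt2/biblio-glutton.git --config core.autocrlf=input
cd biblio-glutton
docker-compose up -d
docker-compose run biblio java -jar lib/lookup-service-0.2-onejar.jar crossref \
    --input /app/data/crossref-works.2018-09-05.json.xz /app/lookup/config/glutton.yml
```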