Docker and docker-compose #73

Open · wants to merge 8 commits into base: master
6 changes: 5 additions & 1 deletion Dockerfile
@@ -19,6 +19,7 @@ VOLUME /app/glutton-source/.gradle
# source
COPY lookup/ ./lookup/
COPY indexing/ ./indexing/
RUN mkdir config

RUN cd /app/glutton-source/lookup && ./gradlew clean assemble --no-daemon

@@ -33,6 +34,7 @@ WORKDIR /app

RUN apt-get update -qq && apt-get -y install nodejs npm
COPY --from=builder /app/glutton-source/indexing /app/indexing
COPY --from=builder /app/glutton-source/config /app/lookup/config
RUN cd indexing; npm install

COPY --from=builder /app/glutton-source/lookup/build/distributions/lookup-service-shadow-*.zip ./lookup-service.zip
@@ -44,6 +46,8 @@ RUN rm *.zip

WORKDIR /app/lookup/lookup-service

# RUN sed -i '/#Docker-ignore-log-start/,/#Docker-ignore-log-end/d' data/config/config.yml

ENV JAVA_OPTS=-Xmx4g

CMD java -jar lib/lookup-service-0.2-SNAPSHOT-onejar.jar server data/config/config.yml
CMD ["./bin/lookup-service"]
91 changes: 76 additions & 15 deletions Readme.md
@@ -33,6 +33,12 @@ Once the databases and index are built, the bibliographical REST API can be star

You need Java JDK 1.8 installed for building and running the tool.

**NOTE**: Windows users should clone with `--config core.autocrlf=input` or configure the setting globally (see below):
```
git clone https://github.com/kermitt2/biblio-glutton.git --config core.autocrlf=input
```
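For example, to set it once at the global level instead of per clone:

```sh
git config --global core.autocrlf input
```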


```sh
cd lookup
./gradlew clean build
@@ -81,13 +87,61 @@ biblio-glutton takes advantage of GROBID for parsing raw bibliographical referen

While GROBID is not required for running biblio-glutton, in particular if it is used only for bibliographical look-up, it is recommended for performing bibliographical record matching.

<!---

### Running with Docker

A Docker Compose file is included to make it easier to spin up biblio-glutton, Elasticsearch, and GROBID.
Biblio-glutton provides a [Docker](https://docs.docker.com/install/) image and a docker-compose file.
We recommend using docker-compose to test and play with the biblio-glutton service, because all the service components are bundled together. It might also fit simple needs.
However, it is not a solution for scaling or for deploying a service that requires high-performance bibliographic matching; see [this section](https://github.com/kermitt2/biblio-glutton#building-the-bibliographical-data-look-up-and-matching-databases) for more information.

#### Docker image

The Docker image can be deployed against instances of GROBID and Elasticsearch running either on the local machine or elsewhere.
The configuration file may require many changes, so we recommend mounting a volume that points to a locally modified version.
The Docker image does not start without a valid configuration file; this is intentional, to avoid starting it without a configuration file tailored to Docker.

```
docker run -v /my/disk/path/config:/app/lookup/config -v /my/disk/path/data:/app/data -it lfoppiano/biblio-glutton-lookup:0.2
```
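The configuration files in this PR also support environment variable substitution (`ELASTIC_URL`, `GROBID_URL`, `DUMP_PATH`), so, as a sketch, the endpoints can be overridden without editing the mounted file — the host names here are placeholders:

```sh
docker run -v /my/disk/path/config:/app/lookup/config -v /my/disk/path/data:/app/data \
    -e ELASTIC_URL=my-elastic-host:9200 \
    -e GROBID_URL=http://my-grobid-host:8070/api \
    -it lfoppiano/biblio-glutton-lookup:0.2
```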

If Elasticsearch (and perhaps GROBID) runs on the same host machine, you can reach it from within Docker by adding the parameter `--add-host=host.docker.internal:host-gateway` and setting `host.docker.internal:9200` in the configuration file (a sketch is given below).
**NOTE**: make sure Elasticsearch is configured to listen on interface `172.17.0.1`, which is the one resolving `host.docker.internal` in a standard Docker installation.
At the time of writing, Elasticsearch 7.x was used, which needs the following parameters:
```
discovery.seed_hosts: ["localhost","172.17.0.1"]
cluster.initial_master_nodes: ['my_cluster']
```

Check the manual of the version you are deploying for up-to-date information.
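A minimal sketch of the resulting command, assuming Elasticsearch runs on the host and the mounted configuration points to `host.docker.internal:9200` (paths are placeholders):

```sh
docker run --add-host=host.docker.internal:host-gateway \
    -v /my/disk/path/config:/app/lookup/config \
    -v /my/disk/path/data:/app/data \
    -it lfoppiano/biblio-glutton-lookup:0.2
```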


##### Data load

Elasticsearch can be loaded by pointing the loading scripts directly to wherever it is deployed.

**To be tested**
To load the LMDB data:

First, [install Docker](https://docs.docker.com/install/).
Then run the service, mounting the `data/` directory as a volume:
```
docker run -v `pwd`/config:/app/lookup/config -v `pwd`/data:/app/data -it lfoppiano/biblio-glutton-lookup:0.2
```

Find the hash corresponding to the container:

```
docker ps
```

Execute the loading process:
```
docker exec CONTAINER_HASH java -jar lib/lookup-service-0.2-onejar.jar crossref --input /app/data/crossref-works.2018-09-05.json.xz /app/lookup/config/glutton.yml
```

You will similarly need to load the other resources, as detailed [here](https://github.com/kermitt2/biblio-glutton#resources).
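For instance, a sketch of loading an Unpaywall snapshot the same way — the snapshot file name is a placeholder and the exact loading commands are those documented in the resources section:

```sh
docker exec CONTAINER_HASH java -jar lib/lookup-service-0.2-onejar.jar unpaywall --input /app/data/unpaywall_snapshot.jsonl.gz /app/lookup/config/glutton.yml
```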

#### Docker compose

A Docker Compose file is included to make it easier to spin up biblio-glutton, Elasticsearch, and GROBID.

Run this command to spin everything up:

@@ -99,15 +153,22 @@ You can run this command to see aggregated log output:

Once everything has booted up, biblio-glutton will be running at http://localhost:8080 and GROBID will be at http://localhost:8070.
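Once it is up, a quick way to check the lookup service is a simple DOI query — a sketch, assuming the standard `service/lookup` route and a DOI already present in the loaded data:

```sh
curl "http://localhost:8080/service/lookup?doi=10.1484/J.QUAESTIO.1.103624"
```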

To load data, you can use the `docker-compose run` command. The `data/` directory is mounted inside the container. For example, this command will load Crossref data (as described in more detail [below](https://github.com/kermitt2/biblio-glutton#resources)):
**NOTE**: The docker-compose.yml file defines `*.local` aliases for the services.
These are provided for users who are stuck behind a proxy.
You can simply exclude the `*.local` hosts from the proxy settings in the Docker configuration, as sketched below.
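One way to do this is through the Docker client proxy configuration in `~/.docker/config.json` — a sketch, where the proxy host and port are placeholders:

```json
{
  "proxies": {
    "default": {
      "httpProxy": "http://proxy.example.com:3128",
      "httpsProxy": "http://proxy.example.com:3128",
      "noProxy": "*.local,localhost,127.0.0.1"
    }
  }
}
```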

$ docker-compose run biblio java -jar lib/lookup-service-0.2-onejar.jar crossref --input ../../data/crossref-works.2018-09-05.json.xz config/glutton.yml
##### Data load

You will need to load similarly the other resources, as detailed [here](https://github.com/kermitt2/biblio-glutton#resources).
**To be tested**
Elasticsearch can be loaded by pointing the loading scripts directly to `localhost:9200`, which is bound on the host machine on port 9200.
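As a quick sanity check before indexing, you can confirm from the host that Elasticsearch answers on that port:

```sh
curl http://localhost:9200
```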

__Important Note__: this Docker is a way to test and play with the biblio-glutton service, because all the service components are bundled into one container. It might also fit simple needs. However, it is not a solution for scaling and deploying a service requiring high performance bibliographic matching, see [this section](https://github.com/kermitt2/biblio-glutton#building-the-bibliographical-data-look-up-and-matching-databases) for more information.
To load LMDB data, you can use the `docker-compose run` command. The `data/` directory is mounted inside the container.
For example, this command will load Crossref data (as described in more detail [below](https://github.com/kermitt2/biblio-glutton#resources)):

$ docker-compose run biblio java -jar lib/lookup-service-0.2-onejar.jar crossref --input /app/data/crossref-works.2018-09-05.json.xz /app/lookup/config/glutton.yml

You will similarly need to load the other resources, as detailed [here](https://github.com/kermitt2/biblio-glutton#resources).

-->

### REST API

@@ -264,11 +325,11 @@ One glutton instance: 19,792,280 DOI lookup in 3156 seconds, ~ 6270 queries per

Processing time for matching 17,015 raw bibliographical reference strings to DOI:

| number of ES cluster nodes | comment | total runtime (second) | runtime per bib. ref. (second) | queries per second |
|----|---|---|---|---|
| 1 | glutton and Elasticsearch node share the same machine | 2625 | 0.154 | 6.5 |
| 1 | glutton and Elasticsearch node on two separate machines | 1990 | 0.117 | 8.5 |
| 2 | glutton and one of the Elasticsearch node sharing the same machine | 1347 | 0.079 | 12.6 |
| number of ES cluster nodes | comment | total runtime (second) | runtime per bib. ref. (second) | queries per second |
|----------------------------|--------------------------------------------------------------------|------------------------|--------------------------------|--------------------|
| 1 | glutton and Elasticsearch node share the same machine | 2625 | 0.154 | 6.5 |
| 1 | glutton and Elasticsearch node on two separate machines | 1990 | 0.117 | 8.5 |
| 2 | glutton and one of the Elasticsearch node sharing the same machine | 1347 | 0.079 | 12.6 |

Machines have the same configuration: Intel i7 4-cores, 8 threads, 16GB memory, SSD, running Ubuntu 16.04.

@@ -528,7 +589,7 @@ We created a dataset of [17,015 bibliographical reference/DOI pairs](doc/referen

Example of the first two of the 17,015 entries:

```json
```
{"reference": "Classen M, Demling L. Endoskopishe shinkterotomie der papilla \nVateri und Stein extraction aus dem Duktus Choledochus [Ger-\nman]. Dtsch Med Wochenschr. 1974;99:496-7.", "doi": "10.1055/s-0028-1107790", "pmid": "4835515", "atitle": "Endoskopishe shinkterotomie der papilla Vateri und Stein extraction aus dem Duktus Choledochus [German]", "firstAuthor": "Classen", "jtitle": "Dtsch Med Wochenschr", "volume": "99", "firstPage": "496"},
{"reference": "Kawai K, Akasaka Y, Murakami K. Endoscopic sphincterotomy \nof the ampulla of Vater. Gastrointest Endosc. 1974;20:148-51.", "doi": "10.1016/S0016-5107(74)73914-1", "pmid": "4825160", "atitle": "Endoscopic sphincterotomy of the ampulla of Vater", "firstAuthor": "Kawai", "jtitle": "Gastrointest Endosc", "volume": "20", "firstPage": "148"},
```
111 changes: 111 additions & 0 deletions config/glutton-docker-compose.yml
@@ -0,0 +1,111 @@
version: 0.2

# where the metadata are stored, it takes more than 200GB for all Crossref, Unpaywall, PubMed and ISTEX mappings
storage: data/db
#storage: /media/lopez/T5/data/db

# Crossref fields to be ignored when storing metadata; the reference field in particular takes a lot of space
ignoreCrossRefFields:
- reference
- abstract
- indexed

# batch size for preparing the data
loadingBatchSize: 10000
indexBatchSize: 500

# max blocking size (number of candidates considered for pairwise matching)
blockSize: 4

# Grobid server URL
grobidHost: ${GROBID_URL:- http://grobid:8070/api}

timeZone: UTC
# the day hour for launching the automatic daily incremental update, format is HH:MM
dailyUpdateTime: 03:00

# a node of the elasticsearch cluster to be used and a name for the index
elastic:
  #host: localhost:9200
  host: ${ELASTIC_URL:- elasticsearch:9200}
  index: crossref
  maxConnections: 20

proxy:
  # proxy to be used when doing external call to crossref or unpaywall
  host:
  port:

crossref:
  # a directory where the crossref incremental update files (gap or daily update) will be located
  # to be changed according to your storage
  dumpPath: ${DUMP_PATH:- /media/lopez/data2/crossref}

  # indicate if we remove the incremental files after they have been processed (value true) or if
  # keep them in the above dumpPath (careful the volume of files can be huge after months of daily update!)
  cleanProcessFiles: true

  # for the crossref REST API and daily update, you need normally to use it politely and to indicate an email
  # address here, e.g.
  #mailto: "[email protected]"
  mailto:

  # to use Crossref metadata plus service (available by subscription)
  #token: "yourmysteriouscrossrefmetadataplusauthorizationtokentobeputhere"
  token:

unpaywall:
  dumpPath: ${DUMP_PATH:- /media/lopez/data2/unpaywall}
  # a directory where the unpaywall update data feed change files will be located
  API_key:
  # API Key for the Unpaywall subscription is necessary to get the data feed change files for daily update

# CORS parameters
corsAllowedOrigins: "*"
corsAllowedMethods: "OPTIONS,GET,PUT,POST,DELETE,HEAD"
corsAllowedHeaders: "X-Requested-With,Content-Type,Accept,Origin"

# beyond the following number of requests, a 503 status will be returned (service unavailable) until enough
# requests are processed to go beyond the max
maxAcceptedRequests: 2048

server:
  type: custom
  applicationConnectors:
    - type: http
      port: 8080
  adminConnectors:
    - type: http
      port: 8081
  registerDefaultExceptionMappers: false
  maxThreads: 2048
  maxQueuedRequests: 2048
  acceptQueueSize: 2048

logging:
  level: INFO
  appenders:
    - type: console
      threshold: INFO
      timeZone: UTC
    #Docker-ignore-log-start
    - type: file
      currentLogFilename: logs/lookup-service.log
      threshold: INFO
      archive: true
      archivedLogFilenamePattern: logs/lookup-service-%d-%i.log
      archivedFileCount: 5
      timeZone: UTC
      maxFileSize: 50MB
    #Docker-ignore-log-end

# the following is used only for pubmed related enrichments and extractions
pubmed:
  # path to the medline resources, to be changed according to your storage
  pubmedDirectory: /media/lopez/data/biblio/medline2021/
  # path to PMC mapping data
  pmcDirectory: data/pmc
  # elasticsearch index for pubmed, used to create extraction based on MeSH terms
  index: pubmed
  # path to the working pubmed databases, to be changed according to your storage
  dbDirectory: /media/lopez/T5/data2/db
13 changes: 8 additions & 5 deletions config/glutton.yml
@@ -18,7 +18,7 @@ indexBatchSize: 500
blockSize: 4

# Grobid server URL
grobidHost: http://localhost:8070/api
grobidHost: ${GROBID_URL:- http://localhost:8070/api}

timeZone: UTC
# the day hour for lauching the automatic daily incremental update, format is HH:MM
@@ -27,7 +27,7 @@ dailyUpdateTime: 03:00
# a node of the elasticsearch cluster to be used and a name for the index
elastic:
  #host: localhost:9200
  host: 0.0.0.0:9200
  host: ${ELASTIC_URL:- localhost:9200}
  index: crossref
  maxConnections: 20

@@ -39,7 +39,7 @@ proxy:
crossref:
  # a directory where the crossref incremental update files (gap or daily update) will be located
  # to be changed according to your storage
  dumpPath: /media/lopez/data2/crossref
  dumpPath: ${DUMP_PATH:- /media/lopez/data2/crossref}

  # indicate if we remove the incremental files after they have been processed (value true) or if
  # keep them in the above dumpPath (careful the volume of files can be huge after months of daily update!)
@@ -55,7 +55,7 @@ crossref:
  token:

unpaywall:
  dumpPath:
  dumpPath: ${DUMP_PATH:- /media/lopez/data2/unpaywall}
  # a directory where the unpaywall update data feed change files will be located
  API_key:
  # API Key for the Unpaywall subscription is necessary to get the data feed change files for daily update
@@ -88,13 +88,16 @@ logging:
    - type: console
      threshold: INFO
      timeZone: UTC
    #Docker-ignore-log-start
    - type: file
      currentLogFilename: logs/lookup-service.log
      threshold: INFO
      archive: true
      archivedLogFilenamePattern: logs/lookup-service-%d.log
      archivedLogFilenamePattern: logs/lookup-service-%d-%i.log
      archivedFileCount: 5
      timeZone: UTC
      maxFileSize: 50MB
    #Docker-ignore-log-end

# the following is used only for pubmed related enrichments and extractions
pubmed:
29 changes: 25 additions & 4 deletions docker-compose.yml
@@ -1,24 +1,45 @@
version: "2"
version: "3.9"
services:
  biblio:
    build: .
    image: lfoppiano/biblio-glutton-lookup:0.2
    restart: always
    ports:
      - 8080:8080
    volumes:
      - ./data:/app/data
      - ./config/glutton-docker-compose.yml:/app/lookup/config/glutton.yml:ro
    networks:
      - net1

  grobid:
    image: lfoppiano/grobid:0.5.5
    image: lfoppiano/grobid:0.7.1
    restart: always
    ports:
      - 8070:8070
    networks:
      net1:
        aliases:
          - "grobid.local"

  elasticsearch:
    image: elasticsearch:6.7.1
    image: elasticsearch:7.17.3
    environment:
      - "discovery.type=single-node"
    ports:
      - 9200:9200
      - 9300:9300
    volumes:
      - elasticsearch-data:/usr/share/elasticsearch/data
    restart: always
    networks:
      net1:
        aliases:
          - "elasticsearch.local"

volumes:
  elasticsearch-data:
    driver: local


networks:
  net1:
@@ -19,6 +19,8 @@
import com.scienceminer.lookup.storage.StorageEnvFactory;

import io.dropwizard.Application;
import io.dropwizard.configuration.EnvironmentVariableSubstitutor;
import io.dropwizard.configuration.SubstitutingSourceProvider;
import io.dropwizard.forms.MultiPartBundle;
import io.dropwizard.setup.Bootstrap;
import io.dropwizard.setup.Environment;
@@ -149,6 +151,9 @@ private List<? extends Module> getGuiceModules() {

    @Override
    public void initialize(Bootstrap<LookupConfiguration> bootstrap) {
        bootstrap.setConfigurationSourceProvider(new SubstitutingSourceProvider(
                bootstrap.getConfigurationSourceProvider(), new EnvironmentVariableSubstitutor(false)));

        GuiceBundle<LookupConfiguration> guiceBundle = GuiceBundle.defaultBuilder(LookupConfiguration.class)
                .modules(getGuiceModules())
                .build();