
Commit

update
nianlonggu committed Jul 28, 2023
1 parent 1fa9cc1 commit d91dc79
Showing 36 changed files with 1,074 additions and 86 deletions.
28 changes: 27 additions & 1 deletion README.md
@@ -115,7 +115,32 @@ node -v
```

## Prepare Raw Corpus
Here we demonstrated building the search engine on papers from PubMed Open Access (PMCOA) and arXiv.
Here we demonstrate building the search engine on papers from S2ORC, PubMed Open Access (PMCOA), and arXiv.
### S2ORC
S2ORC can be accessed via the [Semantic Scholar API](https://www.semanticscholar.org/product/api). Here we download only a tiny subset to demonstrate the pipeline.

```bash
mkdir -p backend/1_document_prefetch/data/S2ORC/raw/metadata
mkdir -p backend/1_document_prefetch/data/S2ORC/raw/pdf_parses
wget -P backend/1_document_prefetch/data/S2ORC/raw/metadata https://huggingface.co/scieditor/example_data_S2ORC/raw/main/metadata/metadata_0.jsonl
wget -P backend/1_document_prefetch/data/S2ORC/raw/metadata https://huggingface.co/scieditor/example_data_S2ORC/raw/main/metadata/metadata_1.jsonl
wget -P backend/1_document_prefetch/data/S2ORC/raw/pdf_parses https://huggingface.co/scieditor/example_data_S2ORC/resolve/main/pdf_parses/pdf_parses_0.jsonl
wget -P backend/1_document_prefetch/data/S2ORC/raw/pdf_parses https://huggingface.co/scieditor/example_data_S2ORC/resolve/main/pdf_parses/pdf_parses_1.jsonl

```
This is how the files are organized:
```
backend/1_document_prefetch/data/S2ORC/
└── raw
├── metadata
│ ├── metadata_0.jsonl
│ └── metadata_1.jsonl
└── pdf_parses
├── pdf_parses_0.jsonl
└── pdf_parses_1.jsonl
```
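Each `.jsonl` file contains one JSON record per line. To sanity-check the download, you can peek at the first metadata record (a minimal sketch; only the `paper_id` field is relied on by the build scripts here, the remaining field names follow the public S2ORC release):
```python
import json

# Peek at the first record of the example metadata shard.
path = "backend/1_document_prefetch/data/S2ORC/raw/metadata/metadata_0.jsonl"
with open(path) as f:
    record = json.loads(f.readline())

print(record["paper_id"])     # key used when building the raw sqlite databases
print(sorted(record.keys()))  # remaining fields follow the S2ORC metadata release
```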


### PMCOA
We can download the .tar.gz files from the official FTP service at https://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_bulk/oa_comm/xml/ and put them into the folder:
```
@@ -208,6 +233,7 @@ cd $BASE_DIR/backend/5_title_generic_search && docker-compose up --build -d
cd $BASE_DIR/backend/final_api_gateway && docker-compose up --build -d

cd $BASE_DIR

```

By default, port 8060 is used by the final API gateway to communicate with the frontend or API developers.
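Once all services are up, a quick way to check that the gateway is reachable (a minimal sketch; the root path and response format are assumptions, so consult the frontend or API documentation for the actual routes):
```python
import requests  # assumes the requests package is installed

# The final API gateway listens on port 8060 by default; any HTTP response
# (even a 404 for an unknown route) confirms that the gateway container is up.
resp = requests.get("http://localhost:8060/", timeout=10)
print(resp.status_code)
```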
@@ -12,5 +12,6 @@ services:
    image: document_prefetch_build_database_pmcoa
    environment:
      NUM_PROCESSES: 100
      COLLECTION: PMCOA
    volumes:
      - $PWD/data/PMCOA:/app/data
@@ -0,0 +1,17 @@
version: '3'

services:

  paper_database_manager:
    build: ./src/modules/paper_database
    image: paper_database_manager
    command: ["echo","hello"]

  document_prefetch_build_database_s2orc:
    build: ./src/build_database/S2ORC
    image: document_prefetch_build_database_s2orc
    environment:
      NUM_PROCESSES: 100
      COLLECTION: S2ORC
    volumes:
      - $PWD/data/S2ORC:/app/data
@@ -17,5 +17,6 @@ services:
    image: document_prefetch_build_database_arxiv
    environment:
      NUM_PROCESSES: 100
      COLLECTION: arXiv
    volumes:
      - $PWD/data/arXiv:/app/data
@@ -2,9 +2,11 @@ version: '3'

### environment variables
# NUM_PROCESSES
# NUM_EMBEDDING_INDEX_SHARDS
# DATA_PATH
# PRIVATE
# USE_GPU
# EMBEDDING_INDEX_PRECISION
# SERVICE_SUFFIX

services:
@@ -21,7 +23,8 @@ services:
    depends_on:
      - document_prefetch_base
    environment:
      NUM_EMBEDDING_INDEX_SHARDS: ${NUM_PROCESSES}
      NUM_PROCESSES: ${NUM_PROCESSES}
      NUM_EMBEDDING_INDEX_SHARDS: ${NUM_EMBEDDING_INDEX_SHARDS}
      NUM_INVERTED_INDEX_SHARDS: 10
      SERVICE_SUFFIX: ${SERVICE_SUFFIX}
    volumes:
@@ -40,6 +43,7 @@ services:
      NVIDIA_VISIBLE_DEVICES: all
      IS_PRIVATE_SERVER: ${PRIVATE}
      USE_GPU: ${USE_GPU}
      EMBEDDING_INDEX_PRECISION: ${EMBEDDING_INDEX_PRECISION}
      SERVICE_SUFFIX: ${SERVICE_SUFFIX}
    volumes:
      - ${DATA_PATH}:/app/data
@@ -89,8 +93,8 @@ services:
    image: document_prefetch_service_overall
    environment:
      SERVICE_SUFFIX: ${SERVICE_SUFFIX}
    ports:
      - ${PORT}:8060
    # ports:
    #   - ${PORT}:8060
    networks:
      - common_network
    hostname: document_prefetch_service_overall_${SERVICE_SUFFIX}
2 changes: 2 additions & 0 deletions backend/1_document_prefetch/script_build_all_databases.sh
@@ -4,3 +4,5 @@ docker-compose -f docker-compose-build-database-arXiv.yaml up --build

docker-compose -f docker-compose-build-database-PMCOA.yaml up --build

docker-compose -f docker-compose-build-database-S2ORC.yaml up --build

13 changes: 11 additions & 2 deletions backend/1_document_prefetch/script_start_all_services.sh
@@ -1,9 +1,18 @@
#!/bin/bash

#######
# NUM_PROCESSES: number of processes used to build the inverted index and the embedding index. To fully utilize the CPU cores on a large corpus such as S2ORC, set NUM_PROCESSES to 2x the number of CPU cores.
# NUM_EMBEDDING_INDEX_SHARDS: number of sharded embedding-index files. For CPU approximate nearest-neighbor search (USE_GPU=0), set NUM_EMBEDDING_INDEX_SHARDS to a large value (e.g., 2x the number of CPU cores). For GPU brute-force nearest-neighbor search (USE_GPU=1), set NUM_EMBEDDING_INDEX_SHARDS to the number of available GPUs.
# EMBEDDING_INDEX_PRECISION: precision used for low-precision GPU brute-force nearest-neighbor search (BFNN). Available choices: bool, int4, int8, float32. When USE_GPU=0 or no GPU is available, EMBEDDING_INDEX_PRECISION falls back to float32 automatically.
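## Example of a CPU-only configuration (a sketch, not part of this script's defaults): with USE_GPU=0,
## EMBEDDING_INDEX_PRECISION is forced to float32 and NUM_EMBEDDING_INDEX_SHARDS should be large, e.g.:
# DATA_PATH=$PWD/data/arXiv NUM_PROCESSES=10 NUM_EMBEDDING_INDEX_SHARDS=20 PRIVATE=0 USE_GPU=0 EMBEDDING_INDEX_PRECISION=float32 SERVICE_SUFFIX=arxiv docker-compose -f docker-compose-document-prefetch.yaml -p document_prefetch_arxiv up --build -d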


#### prefetch server on arXiv
DATA_PATH=$PWD/data/arXiv NUM_PROCESSES=10 PRIVATE=0 USE_GPU=0 PORT=8021 SERVICE_SUFFIX=arxiv docker-compose -f docker-compose-document-prefetch.yaml -p document_prefetch_arxiv up --build -d
DATA_PATH=$PWD/data/arXiv NUM_PROCESSES=10 NUM_EMBEDDING_INDEX_SHARDS=1 PRIVATE=0 USE_GPU=1 EMBEDDING_INDEX_PRECISION=int4 SERVICE_SUFFIX=arxiv docker-compose -f docker-compose-document-prefetch.yaml -p document_prefetch_arxiv up --build -d


#### prefetch server on PMCOA
DATA_PATH=$PWD/data/PMCOA NUM_PROCESSES=10 PRIVATE=0 USE_GPU=0 PORT=8022 SERVICE_SUFFIX=pmcoa docker-compose -f docker-compose-document-prefetch.yaml -p document_prefetch_pmcoa up --build -d
DATA_PATH=$PWD/data/PMCOA NUM_PROCESSES=10 NUM_EMBEDDING_INDEX_SHARDS=1 PRIVATE=0 USE_GPU=1 EMBEDDING_INDEX_PRECISION=int4 SERVICE_SUFFIX=pmcoa docker-compose -f docker-compose-document-prefetch.yaml -p document_prefetch_pmcoa up --build -d


#### prefetch server on S2ORC
DATA_PATH=$PWD/data/S2ORC NUM_PROCESSES=10 NUM_EMBEDDING_INDEX_SHARDS=1 PRIVATE=0 USE_GPU=1 EMBEDDING_INDEX_PRECISION=int4 SERVICE_SUFFIX=s2orc docker-compose -f docker-compose-document-prefetch.yaml -p document_prefetch_s2orc up --build -d
3 changes: 3 additions & 0 deletions backend/1_document_prefetch/script_stop_all_services.sh
@@ -0,0 +1,3 @@
docker-compose -f docker-compose-document-prefetch.yaml -p document_prefetch_arxiv down
docker-compose -f docker-compose-document-prefetch.yaml -p document_prefetch_pmcoa down
docker-compose -f docker-compose-document-prefetch.yaml -p document_prefetch_s2orc down
@@ -1,9 +1,6 @@
FROM paper_database_manager as base

ENV COLLECTION="PMCOA"
ENV ROOT_DATA_PATH=/app/data
## Set the default number of processes to 16; this can be overridden when calling docker run with -e (or --env)
ENV NUM_PROCESSES=16

SHELL ["conda", "run", "-n", "my_env", "/bin/bash", "-c"]

14 changes: 14 additions & 0 deletions backend/1_document_prefetch/src/build_database/S2ORC/Dockerfile
@@ -0,0 +1,14 @@
FROM paper_database_manager as base

ENV ROOT_DATA_PATH=/app/data

SHELL ["conda", "run", "-n", "my_env", "/bin/bash", "-c"]

WORKDIR /app/src
COPY . .

RUN pip install -r requirements.txt

## Note: when calling docker run, one must map the host machine's volume to /app/data
## The host volume is expected to contain all the data needed for the search engine
CMD [ "bash", "run.sh" ]
@@ -0,0 +1,124 @@
import subprocess
import threading
from tqdm import tqdm
import os
import numpy as np
from raw_sqlite_utils import SqliteClient as RawSqliteClient
import time
import json
from modules.paper_database.database_managers import SqliteClient
import shutil


# import os,sys,inspect
# current_dir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
# root_dir = os.path.dirname(os.path.dirname(current_dir))
# sys.path.insert(0, root_dir)
# sys.path.insert(0, current_dir)


import argparse


### get all needed environment variables
ROOT_DATA_PATH = os.getenv("ROOT_DATA_PATH")
COLLECTION = os.getenv("COLLECTION")
NUM_PROCESSES = int(os.getenv("NUM_PROCESSES"))


if __name__ == "__main__":

    parser = argparse.ArgumentParser()
    parser.add_argument("-metadata_raw_sql_path", default = ROOT_DATA_PATH + "/raw/sqliteDB/metadata.db" )
    parser.add_argument("-pdf_parses_raw_sql_path", default = ROOT_DATA_PATH + "/raw/sqliteDB/pdf_parses.db" )
    parser.add_argument("-json_schema_path", default = "json_schema.json" )
    parser.add_argument("-output_file_name", default = ROOT_DATA_PATH + "/sqlite_database_buffer/normalized_data.jsonl" )
    parser.add_argument("-start", type = int, default = None )
    parser.add_argument("-size", type = int, default = None )
    parser.add_argument("-collection", default = COLLECTION )
    parser.add_argument("-batch_size", type = int, default = 5000 )
    parser.add_argument("-n_processes", type = int, default = NUM_PROCESSES )
    parser.add_argument("-output_sqlite_database_name", default = ROOT_DATA_PATH + "/sqlite_database_buffer/DB.db" )

    args = parser.parse_args()

    metadata_sql = RawSqliteClient( args.metadata_raw_sql_path )
    # If no explicit range is given, process every row of the raw metadata database.
    if args.start is None or args.size is None:
        print("No valid start and size values were specified; processing the whole collection ...")
        print("Counting the total number of examples ...")
        args.start = 0
        args.size = metadata_sql.get_max_rowid(args.collection)
    else:
        try:
            assert args.start is not None and args.size is not None
            assert args.start >= 0 and args.size >= 0
        except:
            print("Error: invalid start and size values were provided!")
            os.sys.exit(1)

    # Recreate the buffer folders for the normalized jsonl shards and the final sqlite database.
    output_folder = os.path.dirname( args.output_file_name )
    try:
        shutil.rmtree( output_folder )
    except:
        pass
    os.makedirs( output_folder )

    output_sqlite_database_folder = os.path.dirname( args.output_sqlite_database_name )
    try:
        shutil.rmtree( output_sqlite_database_folder )
    except:
        pass
    os.makedirs( output_sqlite_database_folder )

    num_of_examples_per_process = int( np.ceil( args.size / args.n_processes ) )
    print("Starting multiple subprocesses ...")

    # Launch one normalize_raw_sqlite.py subprocess per chunk of rows; each subprocess
    # writes its own shard of normalized jsonl records.
    threads = []
    for offset in range( args.start, args.start + args.size, num_of_examples_per_process ):
        t = threading.Thread( target = subprocess.run, args = (
            list(map( str, [
                "python",
                "normalize_raw_sqlite.py",
                "-metadata_raw_sql_path", args.metadata_raw_sql_path,
                "-pdf_parses_raw_sql_path", args.pdf_parses_raw_sql_path,
                "-json_schema_path", args.json_schema_path,
                "-output_file_name", args.output_file_name,
                "-output_file_name_suffix", "_%d"%( offset ),
                "-start", offset,
                "-size", min(num_of_examples_per_process, args.start + args.size - offset ),
                "-collection", args.collection,
                "-batch_size", args.batch_size
            ] ) ),
        ) )
        threads.append(t)
        t.start()
    for t in threads:
        t.join()

    print("Dumping to the final sqlite database; this may take a while ...")

    final_sql = SqliteClient( args.output_sqlite_database_name )

    # Merge the per-process shard files into the final sqlite database in batches.
    output_base_name = os.path.basename( args.output_file_name )
    flist = [ output_folder + "/" + fname for fname in os.listdir( output_folder ) if fname.startswith(output_base_name+"_") ]
    flist.sort( key = lambda x: int(x.split("_")[-1]) )

    paper_buffer = []
    for fname in flist:
        print(fname)
        with open( fname, "r" ) as f:
            for line in f:
                line_data = json.loads(line)
                paper_buffer.append(line_data)

                if len(paper_buffer) >= args.batch_size:
                    final_sql.insert_papers( paper_buffer, args.collection )
                    paper_buffer = []
        os.remove( fname )

    if len(paper_buffer) > 0:
        final_sql.insert_papers( paper_buffer, args.collection )
        paper_buffer = []

    print("All Done!")
@@ -0,0 +1,58 @@
from raw_sqlite_utils import SqliteClient
import numpy as np
import json
import time
import os
from tqdm import tqdm
import re
import argparse

def dump_to_sqlite( folder, db_path, buffer_size, paper_id_matcher, collection ):
    db_path_dir_name = os.path.dirname(db_path)
    if not os.path.exists( db_path_dir_name ):
        os.makedirs( db_path_dir_name )

    flist = [ folder + "/" + _ for _ in os.listdir(folder) if _.endswith(".jsonl") ]
    flist.sort( key = lambda x: int( x.split("_")[-1].split(".")[0] ) )

    sql_client = SqliteClient(db_path)

    paper_list_buffer = []
    for fname in flist:
        print(fname)
        with open( fname, "r" ) as f:
            for line in tqdm(f):
                # The "paper_id" field appears near the start of each raw jsonl line,
                # so it is extracted from the first 50 characters without parsing the full JSON.
                paper_id = int(paper_id_matcher.findall(line[:50])[0])
                paper_list_buffer.append( { "paper_id":paper_id, "Text":line } )
                if len(paper_list_buffer) >= buffer_size:
                    sql_client.insert_papers( collection, paper_list_buffer )
                    paper_list_buffer = []
    if len( paper_list_buffer ) > 0:
        sql_client.insert_papers( collection, paper_list_buffer )
        paper_list_buffer = []

ROOT_DATA_PATH = os.getenv("ROOT_DATA_PATH")
COLLECTION = os.getenv("COLLECTION")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("-metadata_jsonl_folder", default = ROOT_DATA_PATH + "/raw/metadata/" )
    parser.add_argument("-metadata_db_path", default = ROOT_DATA_PATH + "/raw/sqliteDB/metadata.db")
    parser.add_argument("-pdf_parses_jsonl_folder", default = ROOT_DATA_PATH + "/raw/pdf_parses/")
    parser.add_argument("-pdf_parses_db_path", default = ROOT_DATA_PATH + "/raw/sqliteDB/pdf_parses.db")
    parser.add_argument("-buffer_size", type = int, default = 1000 )
    parser.add_argument("-collection", default = COLLECTION)
    args = parser.parse_args()

    # Matches the value of the "paper_id" field in a raw S2ORC jsonl line.
    paper_id_matcher = re.compile(r'(?<="paper_id": ")\d*(?=")')

    print("Converting metadata raw jsonl files to a single metadata sqlite ...")
    dump_to_sqlite( args.metadata_jsonl_folder, args.metadata_db_path, args.buffer_size, paper_id_matcher, args.collection )

    print("Converting pdf_parses raw jsonl files to a single pdf_parses sqlite ...")
    dump_to_sqlite( args.pdf_parses_jsonl_folder, args.pdf_parses_db_path, args.buffer_size, paper_id_matcher, args.collection )

    print("All Done!")

@@ -0,0 +1 @@
{"type": "object", "properties": {"Author": {"type": "array", "items": {"type": "object", "properties": {"FamilyName": {"type": "string"}, "GivenName": {"type": "string"}}, "required": ["FamilyName", "GivenName"]}}, "Title": {"type": "string"}, "Abstract": {"type": "string"}, "Venue": {"type": "string"}, "DOI": {"type": "string"}, "URL": {"type": "string"}, "PublicationDate": {"type": "object", "properties": {"Year": {"type": "string"}, "Month": {"type": "string"}, "Day": {"type": "string"}}}, "Content": {"type": "object", "properties": {"Abstract": {"type": "string"}, "Abstract_Parsed": {"type": "array", "items": {"type": "object", "properties": {"section_id": {"type": "string"}, "section_title": {"type": "string"}, "section_text": {"type": "array", "items": {"type": "object", "properties": {"paragraph_id": {"type": "string"}, "paragraph_text": {"type": "array", "items": {"type": "object", "properties": {"sentence_id": {"type": "string"}, "sentence_text": {"type": "string"},"cite_spans":{"type":"array","items":{"type":"object","properties":{"start":{"type":"string"}, "end":{"type":"string"},"text":{"type":"string"},"ref_id":{"type":"string"}} } } }}}}}}}}}, "Fullbody": {"type": "string"}, "Fullbody_Parsed": {"type": "array", "items": {"type": "object", "properties": {"section_id": {"type": "string"}, "section_title": {"type": "string"}, "section_text": {"type": "array", "items": {"type": "object", "properties": {"paragraph_id": {"type": "string"}, "paragraph_text": {"type": "array", "items": {"type": "object", "properties": {"sentence_id": {"type": "string"}, "sentence_text": {"type": "string"}, "cite_spans":{"type":"array","items":{"type":"object","properties":{"start":{"type":"string"}, "end":{"type":"string"},"text":{"type":"string"},"ref_id":{"type":"string"}} } } }}}}}}}}}}, "required": ["Abstract", "Abstract_Parsed", "Fullbody", "Fullbody_Parsed"]}, "Reference": {"type": "array", "items": {"type": "object", "properties": {"Title": {"type": "string"}, "Author": {"type": "array", "items": {"type": "object", "properties": {"FamilyName": {"type": "string"}, "GivenName": {"type": "string"}}, "required": ["FamilyName", "GivenName"]}}, "PublicationDate": {"type": "object", "properties": {"Year": {"type": "string"}, "Month": {"type": "string"}, "Day": {"type": "string"}}}, "Venue": {"type": "string"}, "ReferenceText": {"type": "string"}, "PaperID": {"type": "object", "properties": {"collection": {"type": "string"}, "id_field": {"type": "string"}, "id_type": {"type": "string"}, "id_value": {"type": "string"}}}}, "required": ["Title", "Author", "PublicationDate", "Venue", "ReferenceText"]}}, "S2CID": {"type": "string"}, "PMID": {"type": "string"}, "PMCID": {"type": "string"}, "ArxivId": {"type": "string"}, "ACLId": {"type": "string"}, "MAGId": {"type": "string"}, "Abstract_in_metadata": {"type": "boolean"}, "Last_update_unixtime": {"type": "number"}, "isDuplicated": {"type": "boolean"}}, "required": ["Author", "Title", "Abstract", "Venue", "DOI", "URL", "PublicationDate", "Content","Reference"]}