Commit d91dc79
nianlonggu committed Jul 28, 2023
1 parent: 1fa9cc1
Showing 36 changed files with 1,074 additions and 86 deletions.
backend/1_document_prefetch/docker-compose-build-database-S2ORC.yaml (17 additions & 0 deletions)
@@ -0,0 +1,17 @@
version: '3'

services:

  paper_database_manager:
    build: ./src/modules/paper_database
    image: paper_database_manager
    command: ["echo","hello"]

  document_prefetch_build_database_s2orc:
    build: ./src/build_database/S2ORC
    image: document_prefetch_build_database_s2orc
    environment:
      NUM_PROCESSES: 100
      COLLECTION: S2ORC
    volumes:
      - $PWD/data/S2ORC:/app/data
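A plausible way to launch this build (a sketch, not from the repository: it assumes the raw S2ORC dump has already been placed under data/S2ORC on the host, matching the volume mapping above):

    # Build both images and run the S2ORC database builder in the foreground.
    docker-compose -f docker-compose-build-database-S2ORC.yaml up --build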
@@ -1,9 +1,18 @@
 #!/bin/bash
 
+#######
+# NUM_PROCESSES: number of processes used to build the inverted index and the embedding index. To fully utilize the CPU cores on a large corpus like S2ORC, set NUM_PROCESSES to 2 times the number of CPU cores.
+# NUM_EMBEDDING_INDEX_SHARDS: number of sharded embedding index files. When using CPU-approximate nearest neighbor search (USE_GPU=0), set NUM_EMBEDDING_INDEX_SHARDS to a large value (e.g., 2 times the number of CPU cores). When using GPU brute-force nearest neighbor search (USE_GPU=1), set NUM_EMBEDDING_INDEX_SHARDS to the number of available GPUs.
+# EMBEDDING_INDEX_PRECISION: precision used for low-precision GPU BFNN. Available choices: bool, int4, int8, float32. When USE_GPU=0 or no GPU is available, EMBEDDING_INDEX_PRECISION is switched to float32 automatically.
+
+
 #### prefetch server on arXiv
-DATA_PATH=$PWD/data/arXiv NUM_PROCESSES=10 PRIVATE=0 USE_GPU=0 PORT=8021 SERVICE_SUFFIX=arxiv docker-compose -f docker-compose-document-prefetch.yaml -p document_prefetch_arxiv up --build -d
+DATA_PATH=$PWD/data/arXiv NUM_PROCESSES=10 NUM_EMBEDDING_INDEX_SHARDS=1 PRIVATE=0 USE_GPU=1 EMBEDDING_INDEX_PRECISION=int4 SERVICE_SUFFIX=arxiv docker-compose -f docker-compose-document-prefetch.yaml -p document_prefetch_arxiv up --build -d
 
 
 #### prefetch server on PMCOA
-DATA_PATH=$PWD/data/PMCOA NUM_PROCESSES=10 PRIVATE=0 USE_GPU=0 PORT=8022 SERVICE_SUFFIX=pmcoa docker-compose -f docker-compose-document-prefetch.yaml -p document_prefetch_pmcoa up --build -d
+DATA_PATH=$PWD/data/PMCOA NUM_PROCESSES=10 NUM_EMBEDDING_INDEX_SHARDS=1 PRIVATE=0 USE_GPU=1 EMBEDDING_INDEX_PRECISION=int4 SERVICE_SUFFIX=pmcoa docker-compose -f docker-compose-document-prefetch.yaml -p document_prefetch_pmcoa up --build -d
 
 
+#### prefetch server on S2ORC
+DATA_PATH=$PWD/data/S2ORC NUM_PROCESSES=10 NUM_EMBEDDING_INDEX_SHARDS=1 PRIVATE=0 USE_GPU=1 EMBEDDING_INDEX_PRECISION=int4 SERVICE_SUFFIX=s2orc docker-compose -f docker-compose-document-prefetch.yaml -p document_prefetch_s2orc up --build -d
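The comment block above ties these settings to the host hardware. A minimal bash sketch of how the values could be derived (a hypothetical helper, not part of the commit; assumes nproc and nvidia-smi are available):

    NUM_CPU_CORES=$(nproc)
    NUM_GPUS=$(nvidia-smi --list-gpus 2>/dev/null | wc -l)
    if [ "$NUM_GPUS" -gt 0 ]; then
        USE_GPU=1
        NUM_EMBEDDING_INDEX_SHARDS=$NUM_GPUS               # one shard per GPU for brute-force NN search
    else
        USE_GPU=0
        NUM_EMBEDDING_INDEX_SHARDS=$((2 * NUM_CPU_CORES))  # many shards for CPU-approximate NN search
    fi
    NUM_PROCESSES=$((2 * NUM_CPU_CORES))                   # 2x CPU cores, per the comment above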
@@ -0,0 +1,3 @@
docker-compose -f docker-compose-document-prefetch.yaml -p document_prefetch_arxiv down
docker-compose -f docker-compose-document-prefetch.yaml -p document_prefetch_pmcoa down
docker-compose -f docker-compose-document-prefetch.yaml -p document_prefetch_s2orc down
backend/1_document_prefetch/src/build_database/PMCOA/Dockerfile (0 additions & 3 deletions; diff not shown)
backend/1_document_prefetch/src/build_database/S2ORC/Dockerfile (14 additions & 0 deletions)
@@ -0,0 +1,14 @@
FROM paper_database_manager as base

ENV ROOT_DATA_PATH=/app/data

SHELL ["conda", "run", "-n", "my_env", "/bin/bash", "-c"]

WORKDIR /app/src
COPY . .

RUN pip install -r requirements.txt

## Note: when calling docker run, one must map the host machine's volume to /app/data
## The host volume is expected to contain all the data needed for the search engine
CMD [ "bash", "run.sh" ]
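Per the note above, the host data directory must be mounted at /app/data when running this image. A usage sketch (image name, environment variables, and host path taken from the compose file earlier in this commit):

    docker run --rm \
        -e NUM_PROCESSES=100 -e COLLECTION=S2ORC \
        -v $PWD/data/S2ORC:/app/data \
        document_prefetch_build_database_s2orc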
backend/1_document_prefetch/src/build_database/S2ORC/build_normalized_sqlite_database.py (124 additions & 0 deletions)
@@ -0,0 +1,124 @@
import subprocess
import threading
from tqdm import tqdm
import os
import numpy as np
from raw_sqlite_utils import SqliteClient as RawSqliteClient
import time
import json
from modules.paper_database.database_managers import SqliteClient
import shutil


# import os,sys,inspect
# current_dir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
# root_dir = os.path.dirname(os.path.dirname(current_dir))
# sys.path.insert(0, root_dir)
# sys.path.insert(0, current_dir)


import argparse


### get all needed environment variables
ROOT_DATA_PATH = os.getenv("ROOT_DATA_PATH")
COLLECTION = os.getenv("COLLECTION")
NUM_PROCESSES = int(os.getenv("NUM_PROCESSES"))


if __name__ == "__main__":

    parser = argparse.ArgumentParser()
    parser.add_argument("-metadata_raw_sql_path", default = ROOT_DATA_PATH + "/raw/sqliteDB/metadata.db")
    parser.add_argument("-pdf_parses_raw_sql_path", default = ROOT_DATA_PATH + "/raw/sqliteDB/pdf_parses.db")
    parser.add_argument("-json_schema_path", default = "json_schema.json")
    parser.add_argument("-output_file_name", default = ROOT_DATA_PATH + "/sqlite_database_buffer/normalized_data.jsonl")
    parser.add_argument("-start", type = int, default = None)
    parser.add_argument("-size", type = int, default = None)
    parser.add_argument("-collection", default = COLLECTION)
    parser.add_argument("-batch_size", type = int, default = 5000)
    parser.add_argument("-n_processes", type = int, default = NUM_PROCESSES)
    parser.add_argument("-output_sqlite_database_name", default = ROOT_DATA_PATH + "/sqlite_database_buffer/DB.db")

    args = parser.parse_args()

    metadata_sql = RawSqliteClient( args.metadata_raw_sql_path )
    if args.start is None or args.size is None:
        print("No proper start and size values were specified, processing the whole collection ...")
        print("Counting the total number of examples ...")
        args.start = 0
        args.size = metadata_sql.get_max_rowid( args.collection )
    else:
        try:
            assert args.start is not None and args.size is not None
            assert args.start >= 0 and args.size >= 0
        except:
            print("Error: wrong start and size values were provided!")
            os.sys.exit(1)

    ## recreate the output buffer folders from scratch
    output_folder = os.path.dirname( args.output_file_name )
    try:
        shutil.rmtree( output_folder )
    except:
        pass
    os.makedirs( output_folder )

    output_sqlite_database_folder = os.path.dirname( args.output_sqlite_database_name )
    try:
        shutil.rmtree( output_sqlite_database_folder )
    except:
        pass
    os.makedirs( output_sqlite_database_folder )

    num_of_examples_per_process = int( np.ceil( args.size / args.n_processes ) )
    print("Start multiple subprocesses ...")

    ## fan out: each thread launches a normalize_raw_sqlite.py subprocess
    ## that handles one contiguous slice of row ids
    threads = []
    for offset in range( args.start, args.start + args.size, num_of_examples_per_process ):
        t = threading.Thread( target = subprocess.run, args = (
            list(map( str, [
                "python",
                "normalize_raw_sqlite.py",
                "-metadata_raw_sql_path", args.metadata_raw_sql_path,
                "-pdf_parses_raw_sql_path", args.pdf_parses_raw_sql_path,
                "-json_schema_path", args.json_schema_path,
                "-output_file_name", args.output_file_name,
                "-output_file_name_suffix", "_%d"%( offset ),
                "-start", offset,
                "-size", min( num_of_examples_per_process, args.start + args.size - offset ),
                "-collection", args.collection,
                "-batch_size", args.batch_size
            ] ) ),
        ) )
        threads.append(t)
        t.start()
    for t in threads:
        t.join()


    print("Dumping to the final sqlite database, this may take time ...")

    final_sql = SqliteClient( args.output_sqlite_database_name )

    ## merge the per-process jsonl shards, ordered by their numeric offset suffix
    output_base_name = os.path.basename( args.output_file_name )
    flist = [ output_folder + "/" + fname for fname in os.listdir( output_folder ) if fname.startswith( output_base_name + "_" ) ]
    flist.sort( key = lambda x: int( x.split("_")[-1] ) )

    paper_buffer = []
    for fname in flist:
        print(fname)
        with open( fname, "r" ) as f:
            for line in f:
                line_data = json.loads(line)
                paper_buffer.append(line_data)

                if len(paper_buffer) >= args.batch_size:
                    final_sql.insert_papers( paper_buffer, args.collection )
                    paper_buffer = []
        os.remove( fname )

    if len(paper_buffer) > 0:
        final_sql.insert_papers( paper_buffer, args.collection )
        paper_buffer = []

    print("All Done!")
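A hypothetical standalone invocation of this script (the environment variables mirror those set by the Dockerfile and compose file above; all paths fall back to the defaults under ROOT_DATA_PATH):

    ROOT_DATA_PATH=$PWD/data/S2ORC COLLECTION=S2ORC NUM_PROCESSES=100 \
        python build_normalized_sqlite_database.py -batch_size 5000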
backend/1_document_prefetch/src/build_database/S2ORC/build_raw_sqlite_database.py (58 additions & 0 deletions)
@@ -0,0 +1,58 @@
from raw_sqlite_utils import SqliteClient
import numpy as np
import json
import time
import os
from tqdm import tqdm
import re
import argparse


def dump_to_sqlite( folder, db_path, buffer_size, paper_id_matcher, collection ):
    db_path_dir_name = os.path.dirname(db_path)
    if not os.path.exists( db_path_dir_name ):
        os.makedirs( db_path_dir_name )

    ## jsonl shards are processed in the order of their numeric file suffix
    flist = [ folder + "/" + _ for _ in os.listdir(folder) if _.endswith(".jsonl") ]
    flist.sort( key = lambda x: int( x.split("_")[-1].split(".")[0] ) )

    sql_client = SqliteClient(db_path)

    paper_list_buffer = []
    for fname in flist:
        print(fname)
        with open( fname, "r" ) as f:
            for line in tqdm(f):
                ## the paper_id appears within the first 50 characters of each jsonl line,
                ## so a regex on that prefix avoids parsing the whole JSON record
                paper_id = int( paper_id_matcher.findall( line[:50] )[0] )
                paper_list_buffer.append( { "paper_id": paper_id, "Text": line } )
                if len(paper_list_buffer) >= buffer_size:
                    sql_client.insert_papers( collection, paper_list_buffer )
                    paper_list_buffer = []
    if len( paper_list_buffer ) > 0:
        sql_client.insert_papers( collection, paper_list_buffer )
        paper_list_buffer = []


ROOT_DATA_PATH = os.getenv("ROOT_DATA_PATH")
COLLECTION = os.getenv("COLLECTION")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("-metadata_jsonl_folder", default = ROOT_DATA_PATH + "/raw/metadata/")
    parser.add_argument("-metadata_db_path", default = ROOT_DATA_PATH + "/raw/sqliteDB/metadata.db")
    parser.add_argument("-pdf_parses_jsonl_folder", default = ROOT_DATA_PATH + "/raw/pdf_parses/")
    parser.add_argument("-pdf_parses_db_path", default = ROOT_DATA_PATH + "/raw/sqliteDB/pdf_parses.db")
    parser.add_argument("-buffer_size", type = int, default = 1000)
    parser.add_argument("-collection", default = COLLECTION)
    args = parser.parse_args()

    paper_id_matcher = re.compile(r'(?<="paper_id": ")\d*(?=")')

    print("Converting metadata raw jsonl files to a single metadata sqlite ...")
    dump_to_sqlite( args.metadata_jsonl_folder, args.metadata_db_path, args.buffer_size, paper_id_matcher, args.collection )

    print("Converting pdf_parses raw jsonl files to a single pdf_parses sqlite ...")
    dump_to_sqlite( args.pdf_parses_jsonl_folder, args.pdf_parses_db_path, args.buffer_size, paper_id_matcher, args.collection )

    print("All Done!")
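As with the normalization step, a hypothetical standalone invocation (environment variables and defaults come from the script itself; it expects the raw jsonl shards under $ROOT_DATA_PATH/raw/):

    ROOT_DATA_PATH=$PWD/data/S2ORC COLLECTION=S2ORC \
        python build_raw_sqlite_database.py -buffer_size 1000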
backend/1_document_prefetch/src/build_database/S2ORC/json_schema.json (1 addition & 0 deletions)
@@ -0,0 +1 @@
{"type": "object", "properties": {"Author": {"type": "array", "items": {"type": "object", "properties": {"FamilyName": {"type": "string"}, "GivenName": {"type": "string"}}, "required": ["FamilyName", "GivenName"]}}, "Title": {"type": "string"}, "Abstract": {"type": "string"}, "Venue": {"type": "string"}, "DOI": {"type": "string"}, "URL": {"type": "string"}, "PublicationDate": {"type": "object", "properties": {"Year": {"type": "string"}, "Month": {"type": "string"}, "Day": {"type": "string"}}}, "Content": {"type": "object", "properties": {"Abstract": {"type": "string"}, "Abstract_Parsed": {"type": "array", "items": {"type": "object", "properties": {"section_id": {"type": "string"}, "section_title": {"type": "string"}, "section_text": {"type": "array", "items": {"type": "object", "properties": {"paragraph_id": {"type": "string"}, "paragraph_text": {"type": "array", "items": {"type": "object", "properties": {"sentence_id": {"type": "string"}, "sentence_text": {"type": "string"},"cite_spans":{"type":"array","items":{"type":"object","properties":{"start":{"type":"string"}, "end":{"type":"string"},"text":{"type":"string"},"ref_id":{"type":"string"}} } } }}}}}}}}}, "Fullbody": {"type": "string"}, "Fullbody_Parsed": {"type": "array", "items": {"type": "object", "properties": {"section_id": {"type": "string"}, "section_title": {"type": "string"}, "section_text": {"type": "array", "items": {"type": "object", "properties": {"paragraph_id": {"type": "string"}, "paragraph_text": {"type": "array", "items": {"type": "object", "properties": {"sentence_id": {"type": "string"}, "sentence_text": {"type": "string"}, "cite_spans":{"type":"array","items":{"type":"object","properties":{"start":{"type":"string"}, "end":{"type":"string"},"text":{"type":"string"},"ref_id":{"type":"string"}} } } }}}}}}}}}}, "required": ["Abstract", "Abstract_Parsed", "Fullbody", "Fullbody_Parsed"]}, "Reference": {"type": "array", "items": {"type": "object", "properties": {"Title": {"type": "string"}, "Author": {"type": "array", "items": {"type": "object", "properties": {"FamilyName": {"type": "string"}, "GivenName": {"type": "string"}}, "required": ["FamilyName", "GivenName"]}}, "PublicationDate": {"type": "object", "properties": {"Year": {"type": "string"}, "Month": {"type": "string"}, "Day": {"type": "string"}}}, "Venue": {"type": "string"}, "ReferenceText": {"type": "string"}, "PaperID": {"type": "object", "properties": {"collection": {"type": "string"}, "id_field": {"type": "string"}, "id_type": {"type": "string"}, "id_value": {"type": "string"}}}}, "required": ["Title", "Author", "PublicationDate", "Venue", "ReferenceText"]}}, "S2CID": {"type": "string"}, "PMID": {"type": "string"}, "PMCID": {"type": "string"}, "ArxivId": {"type": "string"}, "ACLId": {"type": "string"}, "MAGId": {"type": "string"}, "Abstract_in_metadata": {"type": "boolean"}, "Last_update_unixtime": {"type": "number"}, "isDuplicated": {"type": "boolean"}}, "required": ["Author", "Title", "Abstract", "Venue", "DOI", "URL", "PublicationDate", "Content","Reference"]} |