diff --git a/README.md b/README.md
index 1990edf..e6343ca 100644
--- a/README.md
+++ b/README.md
@@ -115,7 +115,32 @@ node -v
 ```
 ## Prepare Raw Corpus
-Here we demonstrated building the search engine on papers from PubMed Open Access (PMCOA) and arXiv.
+Here we demonstrate building the search engine on papers from S2ORC, PubMed Open Access (PMCOA), and arXiv.
+### S2ORC
+S2ORC can be accessed via the [Semantic Scholar API](https://www.semanticscholar.org/product/api). Here we download only a tiny subset to demonstrate the pipeline.
+
+```bash
+mkdir -p backend/1_document_prefetch/data/S2ORC/raw/metadata
+mkdir -p backend/1_document_prefetch/data/S2ORC/raw/pdf_parses
+wget -P backend/1_document_prefetch/data/S2ORC/raw/metadata https://huggingface.co/scieditor/example_data_S2ORC/raw/main/metadata/metadata_0.jsonl
+wget -P backend/1_document_prefetch/data/S2ORC/raw/metadata https://huggingface.co/scieditor/example_data_S2ORC/raw/main/metadata/metadata_1.jsonl
+wget -P backend/1_document_prefetch/data/S2ORC/raw/pdf_parses https://huggingface.co/scieditor/example_data_S2ORC/resolve/main/pdf_parses/pdf_parses_0.jsonl
+wget -P backend/1_document_prefetch/data/S2ORC/raw/pdf_parses https://huggingface.co/scieditor/example_data_S2ORC/resolve/main/pdf_parses/pdf_parses_1.jsonl
+
+```
+The downloaded files are organized as follows:
+```
+backend/1_document_prefetch/data/S2ORC/
+└── raw
+    ├── metadata
+    │   ├── metadata_0.jsonl
+    │   └── metadata_1.jsonl
+    └── pdf_parses
+        ├── pdf_parses_0.jsonl
+        └── pdf_parses_1.jsonl
+```
+
+
 ### PMCOA
 We can download the .tar.gz files from the official FTP service https://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_bulk/oa_comm/xml/, and put the files into the folder:
 ```
@@ -208,6 +233,7 @@ cd $BASE_DIR/backend/5_title_generic_search && docker-compose up --build -d
 cd $BASE_DIR/backend/final_api_gateway && docker-compose up --build -d
 cd $BASE_DIR
+
 ```
 By default, port 8060 is used by the final API gateway to communicate with the frontend or API developers.
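As a quick sanity check of the S2ORC example files downloaded above, the following minimal sketch (assuming the files sit at the paths shown in the README snippet; it is not part of the patch itself) prints a few top-level fields of the first record in each `.jsonl` file. The field names used here (`paper_id`, `title`, `authors`, `body_text`, `bib_entries`) are the ones the S2ORC build scripts added in this patch read.

```python
import json

# Base path used by the download commands in the README above.
base = "backend/1_document_prefetch/data/S2ORC/raw"

# One metadata record: bibliographic fields (title, authors, abstract, doi, year, ...).
with open(f"{base}/metadata/metadata_0.jsonl") as f:
    metadata = json.loads(f.readline())
print(metadata["paper_id"], metadata.get("title"), len(metadata.get("authors", [])))

# One pdf_parses record: parsed full text, keyed by the same paper_id as the metadata.
with open(f"{base}/pdf_parses/pdf_parses_0.jsonl") as f:
    parsed = json.loads(f.readline())
print(parsed["paper_id"], len(parsed.get("body_text", [])), len(parsed.get("bib_entries", {})))
```

The build scripts join the two files on `paper_id`, so a record whose id appears only in the metadata file will end up without a parsed full body.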
diff --git a/backend/1_document_prefetch/docker-compose-build-database-PMCOA.yaml b/backend/1_document_prefetch/docker-compose-build-database-PMCOA.yaml index 43d93a7..783e33a 100644 --- a/backend/1_document_prefetch/docker-compose-build-database-PMCOA.yaml +++ b/backend/1_document_prefetch/docker-compose-build-database-PMCOA.yaml @@ -12,5 +12,6 @@ services: image: document_prefetch_build_database_pmcoa environment: NUM_PROCESSES: 100 + COLLECTION: PMCOA volumes: - $PWD/data/PMCOA:/app/data diff --git a/backend/1_document_prefetch/docker-compose-build-database-S2ORC.yaml b/backend/1_document_prefetch/docker-compose-build-database-S2ORC.yaml new file mode 100644 index 0000000..03e6209 --- /dev/null +++ b/backend/1_document_prefetch/docker-compose-build-database-S2ORC.yaml @@ -0,0 +1,17 @@ +version: '3' + +services: + + paper_database_manager: + build: ./src/modules/paper_database + image: paper_database_manager + command: ["echo","hello"] + + document_prefetch_build_database_s2orc: + build: ./src/build_database/S2ORC + image: document_prefetch_build_database_s2orc + environment: + NUM_PROCESSES: 100 + COLLECTION: S2ORC + volumes: + - $PWD/data/S2ORC:/app/data diff --git a/backend/1_document_prefetch/docker-compose-build-database-arXiv.yaml b/backend/1_document_prefetch/docker-compose-build-database-arXiv.yaml index 1c6004d..f55136e 100644 --- a/backend/1_document_prefetch/docker-compose-build-database-arXiv.yaml +++ b/backend/1_document_prefetch/docker-compose-build-database-arXiv.yaml @@ -17,5 +17,6 @@ services: image: document_prefetch_build_database_arxiv environment: NUM_PROCESSES: 100 + COLLECTION: arXiv volumes: - $PWD/data/arXiv:/app/data diff --git a/backend/1_document_prefetch/docker-compose-document-prefetch.yaml b/backend/1_document_prefetch/docker-compose-document-prefetch.yaml index 04f3ad7..7967155 100644 --- a/backend/1_document_prefetch/docker-compose-document-prefetch.yaml +++ b/backend/1_document_prefetch/docker-compose-document-prefetch.yaml @@ -2,9 +2,11 @@ version: '3' ### environment variables # NUM_PROCESSES +# NUM_EMBEDDING_INDEX_SHARDS # DATA_PATH # PRIVATE # USE_GPU +# EMBEDDING_INDEX_PRECISION # SERVICE_SUFFIX services: @@ -21,7 +23,8 @@ services: depends_on: - document_prefetch_base environment: - NUM_EMBEDDING_INDEX_SHARDS: ${NUM_PROCESSES} + NUM_PROCESSES: ${NUM_PROCESSES} + NUM_EMBEDDING_INDEX_SHARDS: ${NUM_EMBEDDING_INDEX_SHARDS} NUM_INVERTED_INDEX_SHARDS: 10 SERVICE_SUFFIX: ${SERVICE_SUFFIX} volumes: @@ -40,6 +43,7 @@ services: NVIDIA_VISIBLE_DEVICES: all IS_PRIVATE_SERVER: ${PRIVATE} USE_GPU: ${USE_GPU} + EMBEDDING_INDEX_PRECISION: ${EMBEDDING_INDEX_PRECISION} SERVICE_SUFFIX: ${SERVICE_SUFFIX} volumes: - ${DATA_PATH}:/app/data @@ -89,8 +93,8 @@ services: image: document_prefetch_service_overall environment: SERVICE_SUFFIX: ${SERVICE_SUFFIX} - ports: - - ${PORT}:8060 + # ports: + # - ${PORT}:8060 networks: - common_network hostname: document_prefetch_service_overall_${SERVICE_SUFFIX} diff --git a/backend/1_document_prefetch/script_build_all_databases.sh b/backend/1_document_prefetch/script_build_all_databases.sh index 953c642..c1396d6 100644 --- a/backend/1_document_prefetch/script_build_all_databases.sh +++ b/backend/1_document_prefetch/script_build_all_databases.sh @@ -4,3 +4,5 @@ docker-compose -f docker-compose-build-database-arXiv.yaml up --build docker-compose -f docker-compose-build-database-PMCOA.yaml up --build +docker-compose -f docker-compose-build-database-S2ORC.yaml up --build + diff --git 
a/backend/1_document_prefetch/script_start_all_services.sh b/backend/1_document_prefetch/script_start_all_services.sh
index 88fb601..9fb044b 100644
--- a/backend/1_document_prefetch/script_start_all_services.sh
+++ b/backend/1_document_prefetch/script_start_all_services.sh
@@ -1,9 +1,18 @@
 #!/bin/bash
+#######
+# NUM_PROCESSES: number of processes used to build the inverted index and the embedding index. To fully utilize the CPU cores on a large corpus such as S2ORC, set NUM_PROCESSES to twice the number of CPU cores.
+# NUM_EMBEDDING_INDEX_SHARDS: number of sharded embedding index files. When using CPU approximate nearest neighbor search (USE_GPU=0), set NUM_EMBEDDING_INDEX_SHARDS to a large value (e.g., twice the number of CPU cores). When using GPU brute-force nearest neighbor search (USE_GPU=1), set NUM_EMBEDDING_INDEX_SHARDS to the number of available GPUs.
+# EMBEDDING_INDEX_PRECISION: precision of the embedding index, used for low-precision GPU brute-force nearest neighbor search. Available choices: bool, int4, int8, float32. When USE_GPU=0 or no GPU is available, EMBEDDING_INDEX_PRECISION is switched to float32 automatically.
+
 #### prefetch server on arXiv
-DATA_PATH=$PWD/data/arXiv NUM_PROCESSES=10 PRIVATE=0 USE_GPU=0 PORT=8021 SERVICE_SUFFIX=arxiv docker-compose -f docker-compose-document-prefetch.yaml -p document_prefetch_arxiv up --build -d
+DATA_PATH=$PWD/data/arXiv NUM_PROCESSES=10 NUM_EMBEDDING_INDEX_SHARDS=1 PRIVATE=0 USE_GPU=1 EMBEDDING_INDEX_PRECISION=int4 SERVICE_SUFFIX=arxiv docker-compose -f docker-compose-document-prefetch.yaml -p document_prefetch_arxiv up --build -d
 #### prefetch server on PMCOA
-DATA_PATH=$PWD/data/PMCOA NUM_PROCESSES=10 PRIVATE=0 USE_GPU=0 PORT=8022 SERVICE_SUFFIX=pmcoa docker-compose -f docker-compose-document-prefetch.yaml -p document_prefetch_pmcoa up --build -d
+DATA_PATH=$PWD/data/PMCOA NUM_PROCESSES=10 NUM_EMBEDDING_INDEX_SHARDS=1 PRIVATE=0 USE_GPU=1 EMBEDDING_INDEX_PRECISION=int4 SERVICE_SUFFIX=pmcoa docker-compose -f docker-compose-document-prefetch.yaml -p document_prefetch_pmcoa up --build -d
+
+
+#### prefetch server on S2ORC
+DATA_PATH=$PWD/data/S2ORC NUM_PROCESSES=10 NUM_EMBEDDING_INDEX_SHARDS=1 PRIVATE=0 USE_GPU=1 EMBEDDING_INDEX_PRECISION=int4 SERVICE_SUFFIX=s2orc docker-compose -f docker-compose-document-prefetch.yaml -p document_prefetch_s2orc up --build -d
diff --git a/backend/1_document_prefetch/script_stop_all_services.sh b/backend/1_document_prefetch/script_stop_all_services.sh
new file mode 100644
index 0000000..9107137
--- /dev/null
+++ b/backend/1_document_prefetch/script_stop_all_services.sh
@@ -0,0 +1,3 @@
+docker-compose -f docker-compose-document-prefetch.yaml -p document_prefetch_arxiv down
+docker-compose -f docker-compose-document-prefetch.yaml -p document_prefetch_pmcoa down
+docker-compose -f docker-compose-document-prefetch.yaml -p document_prefetch_s2orc down
\ No newline at end of file
diff --git a/backend/1_document_prefetch/src/build_database/PMCOA/Dockerfile b/backend/1_document_prefetch/src/build_database/PMCOA/Dockerfile
index a9f1928..13843de 100644
--- a/backend/1_document_prefetch/src/build_database/PMCOA/Dockerfile
+++ b/backend/1_document_prefetch/src/build_database/PMCOA/Dockerfile
@@ -1,9 +1,6 @@
 FROM paper_database_manager as base
-ENV COLLECTION="PMCOA"
 ENV ROOT_DATA_PATH=/app/data
-## Here setting the default number of processes to 16, and this can be overwritten when calling docker run by setting -e (or --env)
-ENV NUM_PROCESSES=16
 SHELL ["conda", "run", "-n", "my_env", "/bin/bash", "-c"]
diff --git
a/backend/1_document_prefetch/src/build_database/S2ORC/Dockerfile b/backend/1_document_prefetch/src/build_database/S2ORC/Dockerfile new file mode 100644 index 0000000..c23a987 --- /dev/null +++ b/backend/1_document_prefetch/src/build_database/S2ORC/Dockerfile @@ -0,0 +1,14 @@ +FROM paper_database_manager as base + +ENV ROOT_DATA_PATH=/app/data + +SHELL ["conda", "run", "-n", "my_env", "/bin/bash", "-c"] + +WORKDIR /app/src +COPY . . + +RUN pip install -r requirements.txt + +## Note: when calling docker run, one must map the host machine's volume to /app/data +## The host volume is expected to contain all the data needed for the search engine +CMD [ "bash", "run.sh" ] diff --git a/backend/1_document_prefetch/src/build_database/S2ORC/build_normalized_sqlite_database.py b/backend/1_document_prefetch/src/build_database/S2ORC/build_normalized_sqlite_database.py new file mode 100644 index 0000000..07bc70c --- /dev/null +++ b/backend/1_document_prefetch/src/build_database/S2ORC/build_normalized_sqlite_database.py @@ -0,0 +1,124 @@ +import subprocess +import threading +from tqdm import tqdm +import os +import numpy as np +from raw_sqlite_utils import SqliteClient as RawSqliteClient +import time +import json +from modules.paper_database.database_managers import SqliteClient +import shutil + + +# import os,sys,inspect +# current_dir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe()))) +# root_dir = os.path.dirname(os.path.dirname(current_dir)) +# sys.path.insert(0, root_dir) +# sys.path.insert(0, current_dir) + + +import argparse + + +### get all needed environment variables +ROOT_DATA_PATH = os.getenv("ROOT_DATA_PATH") +COLLECTION = os.getenv("COLLECTION") +NUM_PROCESSES = int(os.getenv("NUM_PROCESSES")) + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument("-metadata_raw_sql_path", default = ROOT_DATA_PATH + "/raw/sqliteDB/metadata.db" ) + parser.add_argument("-pdf_parses_raw_sql_path", default = ROOT_DATA_PATH + "/raw/sqliteDB/pdf_parses.db" ) + parser.add_argument("-json_schema_path", default = "json_schema.json" ) + parser.add_argument("-output_file_name", default = ROOT_DATA_PATH + "/sqlite_database_buffer/normalized_data.jsonl" ) + parser.add_argument("-start", type = int, default = None ) + parser.add_argument("-size", type =int, default = None ) + parser.add_argument("-collection", default = COLLECTION ) + parser.add_argument("-batch_size", type = int, default = 5000 ) + parser.add_argument("-n_processes", type = int, default = NUM_PROCESSES ) + parser.add_argument("-output_sqlite_database_name", default = ROOT_DATA_PATH + "/sqlite_database_buffer/DB.db" ) + + args = parser.parse_args() + + metadata_sql = RawSqliteClient( args.metadata_raw_sql_path ) + if args.start is None or args.size is None: + print("No proper start and size value are specified, processing the whole document ...") + print("Counting the total number of examples ...") + args.start = 0 + args.size = metadata_sql.get_max_rowid(args.collection) + else: + try: + assert args.start is not None and args.size is not None + assert args.start >= 0 and args.size >= 0 + except: + print("Error: Wrong start and size value were provided!") + os.sys.exit(1) + + output_folder = os.path.dirname( args.output_file_name ) + try: + shutil.rmtree( output_folder ) + except: + pass + os.makedirs( output_folder ) + + output_sqlite_database_folder = os.path.dirname( args.output_sqlite_database_name ) + try: + shutil.rmtree( output_sqlite_database_folder ) + except: + pass + os.makedirs( 
output_sqlite_database_folder ) + + num_of_examples_per_process = int( np.ceil( args.size / args.n_processes ) ) + print("Start multiple subprocesses ...") + + threads = [] + for offset in range( args.start, args.start + args.size, num_of_examples_per_process ): + t = threading.Thread( target = subprocess.run, args = ( + list(map( str, [ + "python", + "normalize_raw_sqlite.py", + "-metadata_raw_sql_path", args.metadata_raw_sql_path, + "-pdf_parses_raw_sql_path", args.pdf_parses_raw_sql_path, + "-json_schema_path", args.json_schema_path , + "-output_file_name", args.output_file_name, + "-output_file_name_suffix", "_%d"%( offset ), + "-start", offset, + "-size", min(num_of_examples_per_process, args.start + args.size - offset ), + "-collection", args.collection, + "-batch_size", args.batch_size + ] ) ) , + ) ) + threads.append(t) + t.start() + for t in threads: + t.join() + + + print("Dumping to the final sqlite database, this may take time ...") + + final_sql = SqliteClient( args.output_sqlite_database_name ) + + output_base_name = os.path.basename( args.output_file_name ) + flist =[ output_folder +"/"+fname for fname in os.listdir( output_folder ) if fname.startswith(output_base_name+"_") ] + flist.sort( key = lambda x:int(x.split("_")[-1]) ) + + paper_buffer = [] + for fname in flist: + print(fname) + with open( fname ,"r" ) as f: + for line in f: + line_data = json.loads(line) + paper_buffer.append(line_data) + + if len(paper_buffer) >= args.batch_size: + final_sql.insert_papers( paper_buffer, args.collection ) + paper_buffer = [] + os.remove( fname ) + + if len(paper_buffer)>0: + final_sql.insert_papers( paper_buffer, args.collection ) + paper_buffer = [] + + print("All Done!") \ No newline at end of file diff --git a/backend/1_document_prefetch/src/build_database/S2ORC/build_raw_sqlite_database.py b/backend/1_document_prefetch/src/build_database/S2ORC/build_raw_sqlite_database.py new file mode 100644 index 0000000..cd79448 --- /dev/null +++ b/backend/1_document_prefetch/src/build_database/S2ORC/build_raw_sqlite_database.py @@ -0,0 +1,58 @@ +from raw_sqlite_utils import SqliteClient +import numpy as np +import json +import time +import os +from tqdm import tqdm +import re +import argparse + +def dump_to_sqlite( folder, db_path, buffer_size, paper_id_matcher, collection ): + db_path_dir_name = os.path.dirname(db_path) + if not os.path.exists( db_path_dir_name ): + os.makedirs( db_path_dir_name ) + + flist = [folder + "/" + _ for _ in os.listdir(folder) if _.endswith(".jsonl") ] + flist.sort( key = lambda x:int( x.split("_")[-1].split(".")[0] ) ) + + sql_client = SqliteClient(db_path) + + paper_list_buffer = [] + for fname in flist: + print(fname) + with open( fname,"r" ) as f: + for line in tqdm(f): + paper_id = int(paper_id_matcher.findall(line[:50])[0]) + paper_list_buffer.append( { "paper_id":paper_id,"Text":line } ) + if len(paper_list_buffer) >= buffer_size: + sql_client.insert_papers( collection, paper_list_buffer ) + paper_list_buffer = [] + if len( paper_list_buffer ) > 0: + sql_client.insert_papers( collection, paper_list_buffer ) + paper_list_buffer = [] + +ROOT_DATA_PATH = os.getenv("ROOT_DATA_PATH") +COLLECTION = os.getenv("COLLECTION") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("-metadata_jsonl_folder", default = ROOT_DATA_PATH + "/raw/metadata/" ) + parser.add_argument("-metadata_db_path", default = ROOT_DATA_PATH + "/raw/sqliteDB/metadata.db") + parser.add_argument("-pdf_parses_jsonl_folder", default = ROOT_DATA_PATH + 
"/raw/pdf_parses/") + parser.add_argument("-pdf_parses_db_path", default = ROOT_DATA_PATH + "/raw/sqliteDB/pdf_parses.db") + parser.add_argument("-buffer_size", type = int, default = 1000 ) + parser.add_argument("-collection", default = COLLECTION) + args = parser.parse_args() + + + paper_id_matcher = re.compile('(?<="paper_id": ")\d*(?=")') + + print("Converting metadata raw jsonl files to a single metadata sqlite ...") + dump_to_sqlite( args.metadata_jsonl_folder, args.metadata_db_path, args.buffer_size, paper_id_matcher, args.collection ) + + print("Converting pdf_parses raw jsonl files to a single metadata sqlite ...") + dump_to_sqlite( args.pdf_parses_jsonl_folder, args.pdf_parses_db_path, args.buffer_size, paper_id_matcher, args.collection ) + + print("All Done!") + \ No newline at end of file diff --git a/backend/1_document_prefetch/src/build_database/S2ORC/json_schema.json b/backend/1_document_prefetch/src/build_database/S2ORC/json_schema.json new file mode 100644 index 0000000..97c33be --- /dev/null +++ b/backend/1_document_prefetch/src/build_database/S2ORC/json_schema.json @@ -0,0 +1 @@ +{"type": "object", "properties": {"Author": {"type": "array", "items": {"type": "object", "properties": {"FamilyName": {"type": "string"}, "GivenName": {"type": "string"}}, "required": ["FamilyName", "GivenName"]}}, "Title": {"type": "string"}, "Abstract": {"type": "string"}, "Venue": {"type": "string"}, "DOI": {"type": "string"}, "URL": {"type": "string"}, "PublicationDate": {"type": "object", "properties": {"Year": {"type": "string"}, "Month": {"type": "string"}, "Day": {"type": "string"}}}, "Content": {"type": "object", "properties": {"Abstract": {"type": "string"}, "Abstract_Parsed": {"type": "array", "items": {"type": "object", "properties": {"section_id": {"type": "string"}, "section_title": {"type": "string"}, "section_text": {"type": "array", "items": {"type": "object", "properties": {"paragraph_id": {"type": "string"}, "paragraph_text": {"type": "array", "items": {"type": "object", "properties": {"sentence_id": {"type": "string"}, "sentence_text": {"type": "string"},"cite_spans":{"type":"array","items":{"type":"object","properties":{"start":{"type":"string"}, "end":{"type":"string"},"text":{"type":"string"},"ref_id":{"type":"string"}} } } }}}}}}}}}, "Fullbody": {"type": "string"}, "Fullbody_Parsed": {"type": "array", "items": {"type": "object", "properties": {"section_id": {"type": "string"}, "section_title": {"type": "string"}, "section_text": {"type": "array", "items": {"type": "object", "properties": {"paragraph_id": {"type": "string"}, "paragraph_text": {"type": "array", "items": {"type": "object", "properties": {"sentence_id": {"type": "string"}, "sentence_text": {"type": "string"}, "cite_spans":{"type":"array","items":{"type":"object","properties":{"start":{"type":"string"}, "end":{"type":"string"},"text":{"type":"string"},"ref_id":{"type":"string"}} } } }}}}}}}}}}, "required": ["Abstract", "Abstract_Parsed", "Fullbody", "Fullbody_Parsed"]}, "Reference": {"type": "array", "items": {"type": "object", "properties": {"Title": {"type": "string"}, "Author": {"type": "array", "items": {"type": "object", "properties": {"FamilyName": {"type": "string"}, "GivenName": {"type": "string"}}, "required": ["FamilyName", "GivenName"]}}, "PublicationDate": {"type": "object", "properties": {"Year": {"type": "string"}, "Month": {"type": "string"}, "Day": {"type": "string"}}}, "Venue": {"type": "string"}, "ReferenceText": {"type": "string"}, "PaperID": {"type": "object", "properties": {"collection": 
{"type": "string"}, "id_field": {"type": "string"}, "id_type": {"type": "string"}, "id_value": {"type": "string"}}}}, "required": ["Title", "Author", "PublicationDate", "Venue", "ReferenceText"]}}, "S2CID": {"type": "string"}, "PMID": {"type": "string"}, "PMCID": {"type": "string"}, "ArxivId": {"type": "string"}, "ACLId": {"type": "string"}, "MAGId": {"type": "string"}, "Abstract_in_metadata": {"type": "boolean"}, "Last_update_unixtime": {"type": "number"}, "isDuplicated": {"type": "boolean"}}, "required": ["Author", "Title", "Abstract", "Venue", "DOI", "URL", "PublicationDate", "Content","Reference"]} \ No newline at end of file diff --git a/backend/1_document_prefetch/src/build_database/S2ORC/normalization_utils.py b/backend/1_document_prefetch/src/build_database/S2ORC/normalization_utils.py new file mode 100644 index 0000000..f8cb3ba --- /dev/null +++ b/backend/1_document_prefetch/src/build_database/S2ORC/normalization_utils.py @@ -0,0 +1,424 @@ +import re +import time +import numpy as np +from jsonschema import validate +from nltk.tokenize import sent_tokenize +import json +class DocumentNormalizer: + def __init__(self, json_schema_path ): + self.json_schema = json.load(open(json_schema_path,"r")) + self.cit_marker_matcher = re.compile("(^[^A-Za-z\d]*)([0-9]+)(?=[^A-Za-z\d]*$)") + self.sentence_boundary_matcher = re.compile("\.\s") + + def normalize( self, paper, requires_validation = True ): + ##### Author ##### + parsed_authors = self.parse_author( paper ) + ##### Title ##### + parsed_title = self.parse_title( paper ) + ##### Venue ##### + parsed_venue = self.parse_venue( paper ) + ##### DOI ##### + parsed_doi = self.parse_doi(paper) + ##### URL ##### + parsed_url = self.parse_url(paper) + ##### PublicationDate ##### + parsed_pub_date = self.parse_pub_date(paper) + ##### Reference ##### + parsed_reference, bib_entry_key_to_row_id_mapper = self.parse_reference(paper) + + ##### Content ##### + parsed_content = self.parse_content(paper, bib_entry_key_to_row_id_mapper) + ##### Abstract (The abstract text stored in the metadata) ##### + abstract_text = (" ".join(self.get_sentence_list_from_parsed_sections( parsed_content["Abstract_Parsed"] ))).strip() + + ##### Last_update_unixtime ###### + Last_update_unixtime = int(time.time()) + ##### Others ##### + Abstract_in_metadata = abstract_text != "" + isDuplicated = False + + normalized_paper = { + "Author":parsed_authors, + "Title":parsed_title, + "Abstract":abstract_text, + "Venue":parsed_venue, + "DOI":parsed_doi, + "URL":parsed_url, + "PublicationDate":parsed_pub_date, + "Content":parsed_content, + "Reference":parsed_reference, + "Last_update_unixtime":Last_update_unixtime, + "Abstract_in_metadata":Abstract_in_metadata, + "isDuplicated":isDuplicated + } + + ##### Additional IDs, this is only added for S2ORC dataset ##### + additional_ids = self.parse_additional_ids(paper) + normalized_paper.update( additional_ids ) + + if requires_validation: + try: + validate(instance=normalized_paper, schema=self.json_schema) + except: + return None + return normalized_paper + + def get_sentence_list_from_parsed_sections(self, parsed_sections ): + sentence_list = [] + for section in parsed_sections: + sentence_list.append(str(section.get( "section_title", "" ))) + for para in section.get("section_text",[]): + for sen in para.get("paragraph_text", []): + sentence_list.append( str(sen.get("sentence_text","")) ) + return sentence_list + + + def parse_author(self, paper ): + try: + parsed_authors = [] + authors = paper.get("authors", [] ) + for author in 
authors: + parsed_authors.append( + { + "GivenName":str( author.get( "first", "" ).replace("None","") ), + "FamilyName":str( author.get( "last", "" ).replace("None","") ) + } + ) + except: + parsed_authors = [] + return parsed_authors + + def parse_title(self, paper ): + try: + parsed_title = str(paper.get("title", "")).replace("None","").lstrip("[").rstrip("]") + except: + parsed_title = "" + return parsed_title + + def parse_venue(self, paper): + try: + parsed_venue = str(paper.get("venue", "")).replace("None","") + except: + parsed_venue = "" + + if parsed_venue.strip() == "": + try: + parsed_venue = str(paper.get("journal","")).replace("None","") + except: + parsed_venue = "" + return parsed_venue + + def parse_doi(self, paper): + try: + parsed_doi = str( paper.get("doi","") ).replace("None","") + except: + parsed_doi = "" + return parsed_doi + + def parse_url(self, paper): + try: + parsed_doi = str(paper.get("doi","")).strip().replace("%", "%25").replace('"', "%22").replace("#", "%23").replace(" ", "%20").replace("?", "%3F").replace("None","") + if parsed_doi.strip() != "": + parsed_url = "https://doi.org/" + parsed_doi + else: + parsed_url = str(paper.get("s2_url", "")) + except: + parsed_url = "" + return parsed_url + + + def parse_pub_date( self, paper ): + try: + year = str(int(paper.get("year", ""))).replace("None","") + except: + year = "" + return { + "Year":year + } + + def parse_para( self, para, bib_entry_key_to_row_id_mapper ): + paragraph_text = [{ "sentence_id":str(sen_id), "sentence_text": str(sen), "cite_spans":[] } + for sen_id, sen in enumerate(self.sent_tok( str(para.get("text",""))) )] + para_cite_spans = para.get( "cite_spans", [] ) + for cite_span in para_cite_spans: + start, end = cite_span["start"], cite_span["end"] + for sen in paragraph_text: + if start < len( sen["sentence_text"] ): + end = min( end, len( sen["sentence_text"] ) ) + sen["cite_spans"].append( + { + "start":start, + "end":end, + "text":sen["sentence_text"][start:end], + "ref_id":cite_span["ref_id"] + } + ) + break + else: + start -= len( sen["sentence_text"] ) + end -= len( sen["sentence_text"] ) + cleaned_paragraph_text = [] + for sen in paragraph_text: + sentence_text = sen["sentence_text"] + cite_spans = sen["cite_spans"] + + sentence_text = sentence_text.rstrip() + + cite_spans.sort( key= lambda x:x["start"] ) + + cleaned_cite_spans = [] + for sen_cite_span in cite_spans: + if sen_cite_span["ref_id"] not in bib_entry_key_to_row_id_mapper: + continue + + start, end = sen_cite_span["start"], sen_cite_span["end"] + ## make sure ther is no overlapping between multiple citation markers + if len(cleaned_cite_spans) > 0 and start < int(cleaned_cite_spans[-1]["end"]): + continue + + if start >= len(sentence_text): + continue + end = min( end, len(sentence_text) ) + + sen_cite_span["start"] = str(start) + sen_cite_span["end"] = str(end) + sen_cite_span["text"] = sentence_text[start:end] + sen_cite_span["ref_id"] = str(bib_entry_key_to_row_id_mapper[ sen_cite_span["ref_id"] ]) + + cleaned_cite_spans.append( sen_cite_span ) + + sentence_id = str(len(cleaned_paragraph_text)) + cleaned_paragraph_text.append( + { + "sentence_id":sentence_id, + "sentence_text":sentence_text, + "cite_spans":cleaned_cite_spans + } + ) + + return cleaned_paragraph_text + + + def parse_para_list( self, para_list, bib_entry_key_to_row_id_mapper ): + section_list = [] + current_section = None + + for para in para_list: + paragraph_text = self.parse_para( para, bib_entry_key_to_row_id_mapper ) + + para_section = 
str(para.get("section","")) + + if current_section is None or (para_section != "" and para_section != current_section["section_title"]): + if current_section is not None: + section_list.append(current_section) + current_section = { + "section_id":str(len(section_list)), + "section_title":para_section, + "section_text":[ + { + "paragraph_id":"0", + "paragraph_text":paragraph_text + } + ] + } + else: + next_para_id = len(current_section["section_text"]) + current_section["section_text"].append( + { + "paragraph_id":str(next_para_id), + "paragraph_text":paragraph_text + } + ) + if current_section is not None: + section_list.append(current_section) + + if (" ".join(self.get_sentence_list_from_parsed_sections( section_list ))).strip() == "": + section_list = [] + + return section_list + + def parse_content( self, paper, bib_entry_key_to_row_id_mapper ): + ### Abstract + abstract = "" + ### Abstract_Parsed + try: + pdf_parsed_abstract = paper.get("pdf_parses",{}).get("abstract",[]) + if len( pdf_parsed_abstract ) == 0: + abstract_text = str(paper.get("abstract","")) + if abstract_text != "None" and abstract_text != "": + pdf_parsed_abstract = [ { "section":"Abstract", "text":abstract_text } ] + assert len(pdf_parsed_abstract) > 0 + + abstract_parsed = self.parse_para_list( pdf_parsed_abstract, bib_entry_key_to_row_id_mapper ) + except: + abstract_parsed = [] + + ### Fullbody + fullbody = "" + + ### Fullbody_Parsed + try: + fullbody_parsed = self.parse_para_list( paper.get( "pdf_parses", {} ).get("body_text", []), bib_entry_key_to_row_id_mapper ) + except: + fullbody_parsed = [] + return { + "Abstract":abstract, + "Abstract_Parsed":abstract_parsed, + "Fullbody":fullbody, + "Fullbody_Parsed":fullbody_parsed + } + + def parse_reference(self, paper): + try: + bibref_text = {} + body_text = paper.get("pdf_parses",{}).get("body_text", []) + for para in body_text: + for cit in para.get("cite_spans", []): + if isinstance(cit, dict): + ref_id, ref_text = cit.get("ref_id",""), cit.get("text","") + if ref_id != "": + bibref_text[ref_id] = ref_text + + for ref_id in bibref_text: + ref_text = bibref_text[ref_id] + matched_texts = self.cit_marker_matcher.findall(ref_text) + if len(matched_texts) > 0: + ref_text = matched_texts[0][1]+"." 
+ else: + ref_text = "" + bibref_text[ref_id] = ref_text + + except: + bibref_text = {} + + try: + reference = [] + bib_entry_key_to_row_id_mapper = {} + + bib_entries = paper.get("pdf_parses",{}).get("bib_entries",{}) + bib_entry_keys = list(bib_entries.keys()) + try: + bib_entry_keys.sort( key = lambda x : int(x[6:]) ) + except: + pass + + for bib_entry_key in bib_entry_keys: + try: + parsed_entry = self.convert_bibentry_to_metadata( bib_entries[bib_entry_key] ) + reference_text = self.get_citation_from_paper_metadata(parsed_entry) + if bibref_text.get(bib_entry_key,"").strip() != "": + reference_text = bibref_text[bib_entry_key] + " "+ reference_text + parsed_entry["ReferenceText"] = reference_text + + bib_entry_key_to_row_id_mapper[bib_entry_key] = len(reference) + reference.append(parsed_entry) + except: + continue + except: + reference = [] + bib_entry_key_to_row_id_mapper = {} + + return reference, bib_entry_key_to_row_id_mapper + + def parse_additional_ids(self, paper): + try: + S2CID = str(paper.get("paper_id", "")).replace("None","") + PMID = str(paper.get("pubmed_id", "")).replace("None","") + PMCID = str(paper.get("pmc_id", "")).replace("None","") + ArxivId = str(paper.get("arxiv_id", "")).replace("None","") + ACLId = str(paper.get("acl_id","")).replace("None","") + MAGId = str(paper.get("mag_id","")).replace("None","") + except: + S2CID = "" + PMID = "" + PMCID = "" + ArxivId = "" + ACLId = "" + MAGId = "" + return { + "S2CID":S2CID, + "PMID":PMID, + "PMCID":PMCID, + "ArxivId":ArxivId, + "ACLId":ACLId, + "MAGId":MAGId + } + + + def sent_tok(self, text, min_sen_len = 10 ): + + sens = self.sentence_boundary_matcher.split( text ) + for pos in range( len(sens)-1 ): + sens[pos] += ". " + + return self.merge_sens( sens, min_sen_len = min_sen_len ) + + def merge_sens(self, sens, min_sen_len = 10 ): + out_sens =[] + current_sen = None + + for sen in sens: + sen_len = len(sen.split()) + if sen_len >= min_sen_len: + if current_sen is not None: + out_sens.append( current_sen ) + current_sen = sen + else: + if current_sen is not None: + current_sen += sen + else: + current_sen = sen + if current_sen is not None: + if len( current_sen.split() ) < min_sen_len and len( out_sens ) > 0: + out_sens[-1] += current_sen + else: + out_sens.append(current_sen) + return out_sens + + def convert_bibentry_to_metadata(self, bibentry): + metadata = {} + metadata["Title"] = bibentry["title"] + metadata["Author"] = [] + for author in bibentry.get("authors",[]): + metadata["Author"].append({ + "GivenName":author.get("first",""), + "FamilyName": author.get("last", "") + }) + metadata["Venue"] = bibentry.get("venue","") + metadata["PublicationDate"] = {"Year":str( bibentry.get("year","") )} + return metadata + + + def get_citation_from_paper_metadata(self, paper_metadata ): + author = paper_metadata.get("Author",[]) + title = paper_metadata.get("Title","") + venue = paper_metadata.get("Venue","") + year = paper_metadata.get("PublicationDate",{}).get("Year","") + + author_list = [] + for pos,author_item in enumerate(author): + if pos == 0: + author_list.append( "%s, %s"%( author_item.get("FamilyName",""), author_item.get("GivenName","") ) ) + else: + author_list.append( "%s %s"%( author_item.get("GivenName",""), author_item.get("FamilyName","") ) ) + + if len(author_list)>3: + author_info = author_list[0] + " et al" + elif len(author_list)>1: + author_info = ", ".join( author_list[:-1] ) + ", and " + author_list[-1] + elif len(author_list)==1: + author_info = author_list[0] + else: + author_info = "" + 
author_info += "." + + title_info = "“"+title.rstrip(".")+".”" + journal_info = venue + if year.strip() != "": + year_info = "(%s)"%(year) + else: + year_info = "" + + citation_text = " ".join(" ".join( [author_info, title_info, journal_info, year_info ] ).split()) +"." + + return citation_text diff --git a/backend/1_document_prefetch/src/build_database/S2ORC/normalize_raw_sqlite.py b/backend/1_document_prefetch/src/build_database/S2ORC/normalize_raw_sqlite.py new file mode 100644 index 0000000..2e1ceb2 --- /dev/null +++ b/backend/1_document_prefetch/src/build_database/S2ORC/normalize_raw_sqlite.py @@ -0,0 +1,93 @@ +from raw_sqlite_utils import SqliteClient as RawSqliteClient +from normalization_utils import DocumentNormalizer +import json +import time +import numpy as np +import os +from tqdm import tqdm +import argparse + + +def get_papers(paper_id_list, metadata_sql, pdf_parses_sql, only_metadata = False): + + paper_metadata_list = [None if _ is None else json.loads( _["Text"] ) for _ in metadata_sql.get_papers( paper_id_list )] + if only_metadata: + for pos in range(len( paper_metadata_list )): + paper_metadata_list[pos]["pdf_parses"] = {} + + result = paper_metadata_list + else: + ## if we also want to get the pdf parses, we need to first get the real paper id and use the new id to query pdf parses sqlite! + mapped_paper_id_list = [] + for paper_id, paper_metadata in zip( paper_id_list, paper_metadata_list ): + try: + mapped_paper_id = { + "collection":paper_id["collection"], + "id_field":"paper_id", ## here the id_field must be paper_id. This is the only id there is consistent between metadata and pdf_parses! + "id_type":"int", + "id_value":int( paper_metadata["paper_id"] ) + } + except: + mapped_paper_id = None + mapped_paper_id_list.append(mapped_paper_id) + + paper_fullbody_list = [None if _ is None else json.loads( _["Text"] ) for _ in pdf_parses_sql.get_papers( mapped_paper_id_list )] + + for pos in range(len( paper_metadata_list )): + if paper_metadata_list[pos] is None: + continue + if paper_fullbody_list[pos] is None: + paper_metadata_list[pos]["pdf_parses"] = {} + else: + paper_metadata_list[pos]["pdf_parses"] = paper_fullbody_list[pos] + if paper_metadata_list[pos]["abstract"] is None or paper_metadata_list[pos].get("abstract","").strip() == "": + paper_metadata_list[pos]["abstract"] = (" ".join([ para["text"] for para in paper_metadata_list[pos].get("pdf_parses",{}).get("abstract",[]) ])).strip() + + + result = paper_metadata_list + + return result + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + + parser.add_argument("-metadata_raw_sql_path" ) + parser.add_argument("-pdf_parses_raw_sql_path" ) + parser.add_argument("-json_schema_path" ) + parser.add_argument("-output_file_name" ) + parser.add_argument("-output_file_name_suffix", default = "" ) + parser.add_argument("-start", type = int, default = 0 ) + parser.add_argument("-size", type =int, default = 0 ) + parser.add_argument("-collection" ) + parser.add_argument("-batch_size", type = int ) + + args = parser.parse_args() + + args.output_file_name += args.output_file_name_suffix + + output_folder = os.path.dirname( args.output_file_name ) + if not os.path.exists( output_folder ): + os.makedirs( output_folder ) + + metadata_sql = RawSqliteClient( args.metadata_raw_sql_path ) + pdf_parses_sql = RawSqliteClient( args.pdf_parses_raw_sql_path ) + document_normalizer = DocumentNormalizer(args.json_schema_path) + + max_rowid = metadata_sql.get_max_rowid(args.collection) + if args.size == 0: + args.size = 
max_rowid + + with open( args.output_file_name,"w" ) as fw: + end = min( args.start + args.size, max_rowid) + for pos in tqdm(range( args.start, end , args.batch_size )): + rowid_list = [ {"collection":args.collection,"id_field":"ROWID","id_value":int(_)+1} for _ in range( pos, min(pos +args.batch_size, end ) ) ] + papers = get_papers(rowid_list, metadata_sql, pdf_parses_sql ) + for paper in papers: + if paper is None: + continue + normalized_paper = document_normalizer.normalize( paper ) + if normalized_paper is None: + continue + fw.write( json.dumps( normalized_paper )+"\n" ) \ No newline at end of file diff --git a/backend/1_document_prefetch/src/build_database/S2ORC/raw_sqlite_utils.py b/backend/1_document_prefetch/src/build_database/S2ORC/raw_sqlite_utils.py new file mode 100644 index 0000000..f8124b1 --- /dev/null +++ b/backend/1_document_prefetch/src/build_database/S2ORC/raw_sqlite_utils.py @@ -0,0 +1,83 @@ +import sqlite3 +import numpy as np +import threading +import json +import time + +class SqliteClient: + def __init__(self, db_address , check_same_thread=False): + self.conn = sqlite3.connect(db_address, check_same_thread = check_same_thread) + self.cur = self.conn.cursor() + self.cur.execute( "SELECT name FROM sqlite_master WHERE type='table'" ) + try: + self.collections = set([_[0] for _ in self.cur.fetchall()] ) + except: + self.collections = set([]) + + + def get_max_rowid(self, collection ): + if collection not in self.collections: + return 0 + self.cur.execute('SELECT max(rowid) From %s'%( collection )) + try: + max_rowid = self.cur.fetchone()[0] + except: + max_rowid = 0 + return max_rowid + + def get_paper( self, collection, id_field, id_value ): + try: + assert collection in self.collections + sql_command = """ + SELECT paper_id, Text FROM %s WHERE %s = %d + """%( collection, id_field, id_value ) + + self.cur.execute(sql_command) + res = self.cur.fetchall() + assert len(res) > 0 + res = res[0] + except: + return None + return {"paper_id":res[0],"Text": res[1]} + + def get_papers( self, paper_id_list ): + papers = [] + for paper_id in paper_id_list: + try: + paper = self.get_paper( paper_id["collection"], paper_id["id_field"], paper_id["id_value"] ) + except: + paper = None + papers.append( paper ) + return papers + + def insert_papers( self, collection, papers ): + starting_id_int = self.get_max_rowid( collection ) +1 + + if collection not in self.collections: + self.cur.execute( "CREATE TABLE %s( paper_id INT, Text TEXT);"%( collection ) ) + self.cur.execute( "CREATE INDEX IF NOT EXISTS paper_id ON %s (paper_id ASC);"%(collection) ) + + self.collections.add(collection ) + values = [] + for paper in papers: + values.append( "(%d,'%s')"%( int(paper["paper_id"]), paper["Text"].replace("'","''") ) ) + values = ",".join( values ) + self.cur.execute( "INSERT INTO %s('paper_id','Text') VALUES %s;"%(collection, values ) ) + self.conn.commit() + + + + def update_paper( self, paper_id, paper_text ): + self.cur.execute("""UPDATE %s + SET Text = '%s' + WHERE + %s = %d; + """ %( paper_id["collection"], + paper_text.replace("'","''"), + paper_id["id_field"], + int(paper_id["id_value"]) + ) ) + self.conn.commit() + + def __del__(self): + self.conn.close() diff --git a/backend/1_document_prefetch/src/build_database/S2ORC/requirements.txt b/backend/1_document_prefetch/src/build_database/S2ORC/requirements.txt new file mode 100644 index 0000000..7c9f971 --- /dev/null +++ b/backend/1_document_prefetch/src/build_database/S2ORC/requirements.txt @@ -0,0 +1,5 @@ +lxml==4.9.0 +unidecode 
+nltk==3.7 +jsonschema==4.17.3 +six==1.16.0 \ No newline at end of file diff --git a/backend/1_document_prefetch/src/build_database/S2ORC/run.sh b/backend/1_document_prefetch/src/build_database/S2ORC/run.sh new file mode 100644 index 0000000..323ac10 --- /dev/null +++ b/backend/1_document_prefetch/src/build_database/S2ORC/run.sh @@ -0,0 +1,6 @@ +#!/bin/bash + +source activate my_env + +python build_raw_sqlite_database.py +python build_normalized_sqlite_database.py \ No newline at end of file diff --git a/backend/1_document_prefetch/src/build_database/arXiv/Dockerfile b/backend/1_document_prefetch/src/build_database/arXiv/Dockerfile index 534fc62..2f32c8a 100644 --- a/backend/1_document_prefetch/src/build_database/arXiv/Dockerfile +++ b/backend/1_document_prefetch/src/build_database/arXiv/Dockerfile @@ -1,9 +1,6 @@ FROM document_prefetch_service_pdf_parsing as base -ENV COLLECTION="arXiv" ENV ROOT_DATA_PATH=/app/data -## Here setting the default number of processes to 16, and this can be overwritten when calling docker run by setting -e (or --env) -ENV NUM_PROCESSES=16 SHELL ["conda", "run", "-n", "my_env", "/bin/bash", "-c"] diff --git a/backend/1_document_prefetch/src/service_build_index/adjust_num_shards_for_embedding_index.py b/backend/1_document_prefetch/src/service_build_index/adjust_num_shards_for_embedding_index.py new file mode 100644 index 0000000..afa15f8 --- /dev/null +++ b/backend/1_document_prefetch/src/service_build_index/adjust_num_shards_for_embedding_index.py @@ -0,0 +1,89 @@ +import os +import json +from tqdm import tqdm +import time +import numpy as np +from glob import glob + +import pickle +import argparse + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("-embedding_index_folder" ) + parser.add_argument("-embedding_index_name_prefix" ) + parser.add_argument("-num_shards", type = int ) + + args = parser.parse_args() + + embedding_index_names = glob( args.embedding_index_folder + "/" + args.embedding_index_name_prefix + "*" ) + embedding_index_names.sort( key = lambda x:int(x.split("_")[-1]) ) + + assert len(embedding_index_names) > 0 + + print("Start loading embeddings ...") + + embed_info = pickle.load(open(embedding_index_names[0],"rb")) + if len(embedding_index_names) == 1: + full_embedding_matrix = embed_info["embedding_matrix"] + full_pos_to_doc_id_mapper = embed_info["pos_to_doc_id_mapper"] + else: + shard_size, embed_dim = embed_info["embedding_matrix"].shape + estimated_max_num_embeddings = max( 200000000, int(shard_size * len(embedding_index_names) * 1.5) ) + + full_embedding_matrix = np.zeros( ( estimated_max_num_embeddings, embed_dim ), dtype = np.float32 ) + full_pos_to_doc_id_mapper = [] + + current_embed_idx = 0 + for count, fname in enumerate(embedding_index_names): + print(count, fname) + embed_info = pickle.load(open(fname,"rb")) + full_embedding_matrix[current_embed_idx:current_embed_idx+embed_info["embedding_matrix"].shape[0] ] = embed_info["embedding_matrix"] + full_pos_to_doc_id_mapper += embed_info["pos_to_doc_id_mapper"] + current_embed_idx = current_embed_idx+embed_info["embedding_matrix"].shape[0] + + full_embedding_matrix = full_embedding_matrix[:current_embed_idx] + print(full_embedding_matrix.shape) + + print("Removing old embeddings ...") + for count, fname in enumerate(embedding_index_names): + os.remove( fname ) + + + print("Start dumping embeddings ...") + new_shard_size = int( np.ceil( full_embedding_matrix.shape[0] / args.num_shards ) ) + + + shard_number = 0 + for pos in range( 
0,full_embedding_matrix.shape[0], new_shard_size ): + + print(pos) + embedding_matrix = full_embedding_matrix[pos:pos + new_shard_size] + pos_to_doc_id_mapper = full_pos_to_doc_id_mapper[pos:pos + new_shard_size] + + doc_ids_for_collection = {} + for count, item in enumerate(pos_to_doc_id_mapper): + if item["collection"] not in doc_ids_for_collection: + doc_ids_for_collection[ item["collection"] ] = [ list(), list() ] + doc_ids_for_collection[ item["collection"] ][0].append( item["id_value"] ) + doc_ids_for_collection[ item["collection"] ][1].append( count ) + for collection in doc_ids_for_collection: + doc_ids_for_collection[collection][0] = np.array(doc_ids_for_collection[collection][0]) + doc_ids_for_collection[collection][1] = np.array(doc_ids_for_collection[collection][1]) + + doc_id_to_pos_mapper = {} + for collection in doc_ids_for_collection: + max_doc_id = np.max(doc_ids_for_collection[collection][0]) + doc_id_to_pos_array = np.ones( int(np.ceil( (max_doc_id +1) / 8 ) * 8), dtype = np.int32 ) * (-1) + doc_id_to_pos_array[doc_ids_for_collection[collection][0]] = doc_ids_for_collection[collection][1] + doc_id_to_pos_mapper[collection] = doc_id_to_pos_array + + with open( args.embedding_index_folder + "/" + args.embedding_index_name_prefix + str(shard_number),"wb" ) as f: + pickle.dump({ + "embedding_matrix":embedding_matrix, + "doc_id_to_pos_mapper":doc_id_to_pos_mapper, + "pos_to_doc_id_mapper":pos_to_doc_id_mapper + }, f, -1) + + shard_number +=1 + diff --git a/backend/1_document_prefetch/src/service_build_index/build_embedding_index.py b/backend/1_document_prefetch/src/service_build_index/build_embedding_index.py index e8abbb3..96b6e80 100644 --- a/backend/1_document_prefetch/src/service_build_index/build_embedding_index.py +++ b/backend/1_document_prefetch/src/service_build_index/build_embedding_index.py @@ -17,6 +17,7 @@ ROOT_DATA_PATH = os.getenv("ROOT_DATA_PATH") +NUM_PROCESSES = int(os.getenv("NUM_PROCESSES")) NUM_EMBEDDING_INDEX_SHARDS = int(os.getenv("NUM_EMBEDDING_INDEX_SHARDS")) SENT2VEC_MODEL_PATH = os.getenv("SENT2VEC_MODEL_PATH") @@ -29,7 +30,7 @@ parser.add_argument("-text_encoder_model_path", default = SENT2VEC_MODEL_PATH ) parser.add_argument("-start", type = int, default = None) parser.add_argument("-size", type = int, default = None) - parser.add_argument("-n_processes", type = int, default = NUM_EMBEDDING_INDEX_SHARDS ) + parser.add_argument("-n_processes", type = int, default = NUM_PROCESSES ) parser.add_argument("-n_docs_per_process", type = int, default = None ) args = parser.parse_args() @@ -94,6 +95,16 @@ ## make sure all processes have been finished! for t in threads: t.join() + + ## adjust the number of shards for embedding index, as specified by NUM_EMBEDDING_INDEX_SHARDS + ## This is needed because for CPU approximate nearest search, we may need more shards, but for GPU brute-force nearest neighbor search, we only need one or two shards, since too many shards will increase the GPU memory overhead. 
+ + print("Adjusting number of shards for embedding index ...") + subprocess.run( ["python", "adjust_num_shards_for_embedding_index.py", + "-embedding_index_folder", ROOT_DATA_PATH + "/ranking_buffer/embedding_index/", + "-embedding_index_name_prefix", "embedding_index.db_", + "-num_shards", str( NUM_EMBEDDING_INDEX_SHARDS ) + ] ) print("All Done!") diff --git a/backend/1_document_prefetch/src/service_ranking/service.py b/backend/1_document_prefetch/src/service_ranking/service.py index c2bdf63..98a529d 100644 --- a/backend/1_document_prefetch/src/service_ranking/service.py +++ b/backend/1_document_prefetch/src/service_ranking/service.py @@ -33,6 +33,8 @@ IS_PRIVATE_SERVER = int( os.getenv("IS_PRIVATE_SERVER") ) USE_GPU = int( os.getenv("USE_GPU") ) +EMBEDDING_INDEX_PRECISION = os.getenv("EMBEDDING_INDEX_PRECISION") + SERVICE_SUFFIX = os.getenv("SERVICE_SUFFIX") @@ -282,17 +284,15 @@ def update_inverted_index(): parser.add_argument( "-is_private_server", type = int, default = IS_PRIVATE_SERVER ) - parser.add_argument( "-internal_precision", default = "float32" ) + parser.add_argument( "-internal_precision", default = EMBEDDING_INDEX_PRECISION ) parser.add_argument( "-requires_precision_conversion", type = int, default = 1 ) parser.add_argument( "-num_threads_per_shard", type = int, default = 1 ) parser.add_argument( "-normalize_query_embedding", type = int, default = 1 ) args = parser.parse_args() - wait_for_service(ADDRESS_SERVICE_BUILD_INDEX) - #Convert folders to absolute path args.inverted_index_folder = os.path.abspath( args.inverted_index_folder ) args.embedding_index_folder = os.path.abspath( args.embedding_index_folder ) @@ -312,6 +312,9 @@ def update_inverted_index(): else: args.gpu_list = [] + if len(args.gpu_list) == 0: + args.internal_precision = "float32" + mp.set_start_method('spawn') shard_list = os.listdir(args.embedding_index_folder) diff --git a/backend/1_helper_functions/docker-compose.yaml b/backend/1_helper_functions/docker-compose.yaml index 5aa3d90..032c9c9 100644 --- a/backend/1_helper_functions/docker-compose.yaml +++ b/backend/1_helper_functions/docker-compose.yaml @@ -11,8 +11,8 @@ services: networks: - common_network hostname: helper_functions_service - ports: - - 8030:8060 + # ports: + # - 8030:8060 networks: common_network: external: true \ No newline at end of file diff --git a/backend/2_fast_metadata_search/docker-compose.yaml b/backend/2_fast_metadata_search/docker-compose.yaml index ce18867..9161e85 100644 --- a/backend/2_fast_metadata_search/docker-compose.yaml +++ b/backend/2_fast_metadata_search/docker-compose.yaml @@ -8,12 +8,12 @@ services: image: fast_metadata_search_service environment: PYTHONUNBUFFERED: 1 - DUPLICATE_CHECKING_SERVICE_ADDRESS_LIST: http://document_prefetch_service_overall_arxiv:8060/check-duplicate,http://document_prefetch_service_overall_pmcoa:8060/check-duplicate + DUPLICATE_CHECKING_SERVICE_ADDRESS_LIST: http://document_prefetch_service_overall_arxiv:8060/check-duplicate,http://document_prefetch_service_overall_pmcoa:8060/check-duplicate,http://document_prefetch_service_overall_s2orc:8060/check-duplicate networks: - common_network hostname: fast_metadata_search_service - ports: - - 8028:8060 + # ports: + # - 8028:8060 networks: common_network: external: true \ No newline at end of file diff --git a/backend/2_paper_database_service/docker-compose.yaml b/backend/2_paper_database_service/docker-compose.yaml index fb50654..899d499 100644 --- a/backend/2_paper_database_service/docker-compose.yaml +++ 
b/backend/2_paper_database_service/docker-compose.yaml @@ -7,12 +7,12 @@ services: build: . image: paper_database_service environment: - SERVICE_ADDRESSES: http://document_prefetch_service_overall_arxiv:8060/get-papers,http://document_prefetch_service_overall_pmcoa:8060/get-papers + SERVICE_ADDRESSES: http://document_prefetch_service_overall_arxiv:8060/get-papers,http://document_prefetch_service_overall_pmcoa:8060/get-papers,http://document_prefetch_service_overall_s2orc:8060/get-papers networks: - common_network hostname: paper_database_service - ports: - - 8023:8060 + # ports: + # - 8023:8060 networks: common_network: external: true \ No newline at end of file diff --git a/backend/3_citation_formating/docker-compose.yaml b/backend/3_citation_formating/docker-compose.yaml index 092bc79..c71f2f2 100644 --- a/backend/3_citation_formating/docker-compose.yaml +++ b/backend/3_citation_formating/docker-compose.yaml @@ -12,8 +12,8 @@ services: networks: - common_network hostname: citation_formating_service - ports: - - 8031:8060 + # ports: + # - 8031:8060 networks: common_network: external: true \ No newline at end of file diff --git a/backend/3_citation_generation/docker-compose.yaml b/backend/3_citation_generation/docker-compose.yaml index 78bb7a5..7778c2a 100644 --- a/backend/3_citation_generation/docker-compose.yaml +++ b/backend/3_citation_generation/docker-compose.yaml @@ -15,8 +15,8 @@ services: networks: - common_network hostname: citation_generation_service - ports: - - 8027:8060 + # ports: + # - 8027:8060 networks: common_network: external: true diff --git a/backend/3_document_reranking/Dockerfile b/backend/3_document_reranking/Dockerfile index 4b483e9..03ba017 100644 --- a/backend/3_document_reranking/Dockerfile +++ b/backend/3_document_reranking/Dockerfile @@ -10,6 +10,6 @@ RUN python -c "from huggingface_hub import hf_hub_download; hf_hub_download(repo RUN python -c "from transformers import BertTokenizerFast, BertForNextSentencePrediction; model_path = 'scieditor/document-reranking-scibert'; tokenizer = BertTokenizerFast.from_pretrained(model_path); model = BertForNextSentencePrediction.from_pretrained(model_path)" WORKDIR /app/src -COPY . . +# COPY . . 
CMD [ "bash", "run_service.sh" ] \ No newline at end of file diff --git a/backend/3_document_reranking/docker-compose.yaml b/backend/3_document_reranking/docker-compose.yaml index 476dfea..8d1f2f3 100644 --- a/backend/3_document_reranking/docker-compose.yaml +++ b/backend/3_document_reranking/docker-compose.yaml @@ -14,8 +14,10 @@ services: networks: - common_network hostname: document_reranking_service - ports: - - 8024:8060 + # ports: + # - 8024:8060 + volumes: + - .:/app/src networks: common_network: external: true diff --git a/backend/3_document_reranking/service.py b/backend/3_document_reranking/service.py index b97cd4c..80c1d33 100644 --- a/backend/3_document_reranking/service.py +++ b/backend/3_document_reranking/service.py @@ -18,9 +18,10 @@ import re from nltk.tokenize import RegexpTokenizer - + import nltk from nltk.corpus import stopwords +nltk.download('omw-1.4') stopwords_set = set(stopwords.words('english')) import GPUtil @@ -124,7 +125,6 @@ def rerank_by_scibert( paper_id_list, ranking_source, keywords ): if ranking_source.strip() == "" and keywords.strip() == "": return paper_id_list - print(ranking_source, keywords ) tic = time.time() @@ -146,10 +146,11 @@ def rerank_by_scibert( paper_id_list, ranking_source, keywords ): """ handle the exact match when users use the title to search for a paper """ try: sorted_paper_id_list, sorted_sims = rerank_based_on_query2section_similarity( paper_id_list, ranking_source, return_similarity = True ) + prefix_papers = [] - for pos in range( min( len(sorted_paper_id_list), 5 ) ): + for pos in range( min( len(sorted_paper_id_list), 10 ) ): sorted_paper_id, sim = sorted_paper_id_list[pos], sorted_sims[pos] - if sim > 0.98: + if sim > 0.9: prefix_papers.append( sorted_paper_id ) except: prefix_papers = [] @@ -157,7 +158,7 @@ def rerank_by_scibert( paper_id_list, ranking_source, keywords ): for paper_id in selected_papers_to_be_reranked: matched = False for pid in prefix_papers: - if paper_id == pid: + if paper_id == pid: matched = True break if not matched: @@ -174,7 +175,7 @@ def document_rerank(): global sem sem.acquire() - + try: if not request.json: assert False @@ -204,7 +205,7 @@ def document_rerank(): sem.release() abort(400) - sem.release() + sem.release() return json.dumps(json_out), 201 diff --git a/backend/3_extractive_summarization/docker-compose.yaml b/backend/3_extractive_summarization/docker-compose.yaml index ce63a6f..376693b 100644 --- a/backend/3_extractive_summarization/docker-compose.yaml +++ b/backend/3_extractive_summarization/docker-compose.yaml @@ -15,8 +15,8 @@ services: networks: - common_network hostname: extractive_summarization_service - ports: - - 8026:8060 + # ports: + # - 8026:8060 networks: common_network: external: true diff --git a/backend/4_document_search_overall/Dockerfile b/backend/4_document_search_overall/Dockerfile index 4b483e9..03ba017 100644 --- a/backend/4_document_search_overall/Dockerfile +++ b/backend/4_document_search_overall/Dockerfile @@ -10,6 +10,6 @@ RUN python -c "from huggingface_hub import hf_hub_download; hf_hub_download(repo RUN python -c "from transformers import BertTokenizerFast, BertForNextSentencePrediction; model_path = 'scieditor/document-reranking-scibert'; tokenizer = BertTokenizerFast.from_pretrained(model_path); model = BertForNextSentencePrediction.from_pretrained(model_path)" WORKDIR /app/src -COPY . . +# COPY . . 
CMD [ "bash", "run_service.sh" ] \ No newline at end of file diff --git a/backend/4_document_search_overall/docker-compose.yaml b/backend/4_document_search_overall/docker-compose.yaml index 18b6e40..2a46fab 100644 --- a/backend/4_document_search_overall/docker-compose.yaml +++ b/backend/4_document_search_overall/docker-compose.yaml @@ -4,18 +4,24 @@ version: '3' services: document_search_overall_service: + volumes: + - .:/app/src build: . image: document_search_overall_service environment: PYTHONUNBUFFERED: 1 PAPER_DATABASE_SERVICE_ADDRESS: http://paper_database_service:8060/get-papers RERANK_SERVICE_ADDRESS: http://document_reranking_service:8060/document-rerank - PREFETCH_SERVICE_ADDRESSES: http://document_prefetch_service_overall_arxiv:8060/document-search,http://document_prefetch_service_overall_pmcoa:8060/document-search + PREFETCH_SERVICE_ADDRESSES: http://document_prefetch_service_overall_arxiv:8060/document-search,http://document_prefetch_service_overall_s2orc:8060/document-search,http://document_prefetch_service_overall_pmcoa:8060/document-search + # ,http://document_prefetch_service_overall_pmcoa:8060/document-search + # ,http://document_prefetch_service_overall_s2orc:8060/document-search networks: - common_network hostname: document_search_overall_service - ports: - - 8025:8060 + # ports: + # - 8025:8060 networks: common_network: - external: true \ No newline at end of file + external: true + + \ No newline at end of file diff --git a/backend/4_document_search_overall/service.py b/backend/4_document_search_overall/service.py index 155e90d..b6ce4a7 100644 --- a/backend/4_document_search_overall/service.py +++ b/backend/4_document_search_overall/service.py @@ -32,15 +32,6 @@ def get_papers( paper_list, projection = None ): headers = {"Content-Type": "application/json", 'Connection': 'close'} ).json()["response"] return results -def get_sentence_list_from_parsed( parsed ): - sentence_list = [] - for section in parsed: - sentence_list.append(str(section.get( "section_title", "" ))) - for para in section.get("section_text",[]): - for sen in para.get("paragraph_text", []): - sentence_list.append( str(sen.get("sentence_text","")) ) - return sentence_list - """ current version of request information. 
'ranking_id_value': '', 'ranking_id_field': '', 'ranking_id_type': '', @@ -169,30 +160,30 @@ def remove_duplicate( paper_id_list ): return [paper_id_list[idx] for idx in sorted(doc_indices_wo_duplicates) ] -def get_section_text_list( paper, top_n_sections = None ): - if paper is None: - paper = {} - title = paper.get("Title","") - abstract_parsed = paper.get("Content",{}).get("Abstract_Parsed",[]) - fullbody_parsed = paper.get("Content",{}).get("Fullbody_Parsed",[]) - fulltext_parsed = abstract_parsed + fullbody_parsed - - section_text_list = [title] - for section in fulltext_parsed: - section_text = "" +def get_sentence_list_from_parsed( parsed ): + sentence_list = [] + for section in parsed: + sentence_list.append(str(section.get( "section_title", "" ))) for para in section.get("section_text",[]): - for sen in para.get("paragraph_text",[]): - section_text += sen.get("sentence_text", "") + " " - section_text_list.append( section_text ) + for sen in para.get("paragraph_text", []): + sentence_list.append( str(sen.get("sentence_text","")) ) + return sentence_list + +def parse_document( doc_data ): + ngram_set = set() + + ## Title + title = str(doc_data.get("Title", "")).strip() + ## Abstract + abstract_sen_list = get_sentence_list_from_parsed(doc_data.get( "Content", {} ).get( "Abstract_Parsed", [] )) + ## Fullbody + fullbody_sen_list = get_sentence_list_from_parsed(doc_data.get( "Content", {} ).get( "Fullbody_Parsed", [] )) - if top_n_sections is not None: - section_text_list = section_text_list[:top_n_sections] + sen_list = [ title ] + abstract_sen_list + fullbody_sen_list + ## no need to tokenize here, since it is done internally within sentence ranker + doc_text = " ".join( sen_list ) - return section_text_list - -def get_doc_text( paper ): - section_text_list = get_section_text_list(paper) - return " ".join(section_text_list) + return doc_text def rank_based_on_query_to_doc_similarity( paper_id_list, ranking_source, nResults = None ): global sentence_ranker @@ -202,14 +193,14 @@ def rank_based_on_query_to_doc_similarity( paper_id_list, ranking_source, nResul tic = time.time() - paper_content_list = get_papers( paper_id_list, { "Title":1, "Content.Abstract_Parsed":1, "Content.Fullbody_Parsed": 1 } ) + paper_content_list = get_papers( paper_id_list ) if len(paper_content_list) != len(paper_id_list): return paper_id_list print( "load paper time:", time.time() - tic ) try: - doc_text_list = [ get_doc_text( paper_content ) for paper_content in paper_content_list ] + doc_text_list = [ parse_document( paper_content ) for paper_content in paper_content_list ] _, doc_indices = sentence_ranker.rank_sentences( ranking_source, doc_text_list ) selected_papers_to_be_reranked = [ paper_id_list[idx] for idx in doc_indices ] @@ -267,6 +258,13 @@ def document_search(): requires_reranking = request_info.get( "requires_reranking", True ) reranking_method = request_info.get( "reranking_method", "scibert" ) + + requires_removing_duplicates = True + requires_additional_prefetching = True + requires_reranking = True + reranking_method = "scibert" + + ## prefetch results from a list of prefetching document search servers prefetched_paper_id_list, nMatchingDocuments = prefetch( ranking_source + " " + keywords.replace( "", " " ).replace( "", " " ).replace( "", " " ), @@ -299,6 +297,7 @@ def document_search(): abort(400) sem.release() + return json.dumps(json_out), 201 diff --git a/backend/5_title_generic_search/docker-compose.yaml b/backend/5_title_generic_search/docker-compose.yaml index 98e54a8..1b60cd6 100644 
--- a/backend/5_title_generic_search/docker-compose.yaml +++ b/backend/5_title_generic_search/docker-compose.yaml @@ -14,8 +14,8 @@ services: networks: - common_network hostname: title_generic_search_service - ports: - - 8029:8060 + # ports: + # - 8029:8060 networks: common_network: external: true \ No newline at end of file diff --git a/backend/Documentation of Microservices.ipynb b/backend/Documentation of Microservices.ipynb index c2536cc..c574627 100644 --- a/backend/Documentation of Microservices.ipynb +++ b/backend/Documentation of Microservices.ipynb @@ -317,17 +317,29 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 11, "id": "516e216d-1db8-4489-b948-e32b0a281289", "metadata": { "tags": [] }, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'query_id': '7368e3be-1890-441e-a796-2ce04c9ec969', 'response': [{'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 798}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 499}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 664}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 251}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 38}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 722}, {'collection': 'arXiv', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 214}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 504}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 1011}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 638}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 346}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 666}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 905}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 237}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 708}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 712}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 340}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 1007}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 787}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 535}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 404}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 700}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 461}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 784}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 212}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 12}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 143}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 640}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 292}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 843}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 44}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 557}, {'collection': 'PMCOA', 'id_field': 'id_int', 
'id_type': 'int', 'id_value': 547}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 813}, {'collection': 'arXiv', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 450}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 822}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 23}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 752}, {'collection': 'arXiv', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 339}, {'collection': 'arXiv', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 750}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 530}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 786}, {'collection': 'arXiv', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 251}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 781}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 1027}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 502}, {'collection': 'arXiv', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 437}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 639}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 77}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 704}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 331}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 870}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 993}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 641}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 1006}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 732}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 645}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 324}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 680}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 89}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 808}, {'collection': 'arXiv', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 32}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 966}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 554}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 505}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 5}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 815}, {'collection': 'arXiv', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 190}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 391}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 359}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 740}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 768}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 466}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 624}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 97}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 
337}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 869}, {'collection': 'arXiv', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 589}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 246}, {'collection': 'arXiv', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 181}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 725}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 858}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 847}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 1020}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 1019}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 855}, {'collection': 'arXiv', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 67}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 501}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 650}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 898}, {'collection': 'arXiv', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 174}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 586}, {'collection': 'arXiv', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 77}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 979}, {'collection': 'arXiv', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 307}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 394}, {'collection': 'arXiv', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 670}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 986}, {'collection': 'arXiv', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 112}, {'collection': 'arXiv', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 610}], 'search_stats': {'DurationTotalSearch': 9200, 'nMatchingDocuments': 143}}\n" + "ename": "JSONDecodeError", + "evalue": "[Errno Expecting value] \n\n400 Bad Request\n
<h1>Bad Request</h1>\n<p>The browser (or proxy) sent a request that this server could not understand.</p>
\n: 0", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mJSONDecodeError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m~/anaconda3/envs/scieditor/lib/python3.7/site-packages/requests/models.py\u001b[0m in \u001b[0;36mjson\u001b[0;34m(self, **kwargs)\u001b[0m\n\u001b[1;32m 909\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 910\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mcomplexjson\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mloads\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtext\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 911\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mJSONDecodeError\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/anaconda3/envs/scieditor/lib/python3.7/json/__init__.py\u001b[0m in \u001b[0;36mloads\u001b[0;34m(s, encoding, cls, object_hook, parse_float, parse_int, parse_constant, object_pairs_hook, **kw)\u001b[0m\n\u001b[1;32m 347\u001b[0m parse_constant is None and object_pairs_hook is None and not kw):\n\u001b[0;32m--> 348\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0m_default_decoder\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdecode\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ms\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 349\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mcls\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/anaconda3/envs/scieditor/lib/python3.7/json/decoder.py\u001b[0m in \u001b[0;36mdecode\u001b[0;34m(self, s, _w)\u001b[0m\n\u001b[1;32m 336\u001b[0m \"\"\"\n\u001b[0;32m--> 337\u001b[0;31m \u001b[0mobj\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mend\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mraw_decode\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ms\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0midx\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0m_w\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ms\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 338\u001b[0m \u001b[0mend\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_w\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ms\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mend\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/anaconda3/envs/scieditor/lib/python3.7/json/decoder.py\u001b[0m in \u001b[0;36mraw_decode\u001b[0;34m(self, s, idx)\u001b[0m\n\u001b[1;32m 354\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mStopIteration\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0merr\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 355\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mJSONDecodeError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Expecting value\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0ms\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0merr\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalue\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 356\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mobj\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mend\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mJSONDecodeError\u001b[0m: Expecting value: line 1 column 1 (char 0)", + "\nDuring handling of the above exception, another exception occurred:\n", + "\u001b[0;31mJSONDecodeError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m/tmp/ipykernel_2116499/1365860442.py\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 7\u001b[0m output = requests.post( api_gateway_address+\"/ml-api/doc-search/v1.0\", \n\u001b[1;32m 8\u001b[0m \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mjson\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdumps\u001b[0m\u001b[0;34m(\u001b[0m \u001b[0mquery_data\u001b[0m \u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 9\u001b[0;31m headers = {\"Content-Type\": \"application/json\"} ).json()\n\u001b[0m\u001b[1;32m 10\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 11\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0moutput\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/anaconda3/envs/scieditor/lib/python3.7/site-packages/requests/models.py\u001b[0m in \u001b[0;36mjson\u001b[0;34m(self, **kwargs)\u001b[0m\n\u001b[1;32m 915\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mRequestsJSONDecodeError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0me\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmessage\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 916\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 917\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mRequestsJSONDecodeError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0me\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmsg\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdoc\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpos\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 918\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 919\u001b[0m \u001b[0;34m@\u001b[0m\u001b[0mproperty\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mJSONDecodeError\u001b[0m: [Errno Expecting value] \n\n400 Bad Request\n
<h1>Bad Request</h1>\n<p>The browser (or proxy) sent a request that this server could not understand.</p>
\n: 0" ] } ], @@ -995,7 +1007,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -1009,7 +1021,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.12" + "version": "3.7.13" } }, "nbformat": 4,