diff --git a/README.md b/README.md
index 1990edf..e6343ca 100644
--- a/README.md
+++ b/README.md
@@ -115,7 +115,32 @@ node -v
 ```
 ## Prepare Raw Corpus
-Here we demonstrated building the search engine on papers from PubMed Open Access (PMCOA) and arXiv.
+Here we demonstrate building the search engine on papers from S2ORC, PubMed Open Access (PMCOA), and arXiv.
+### S2ORC
+S2ORC can be accessed via the [Semantic Scholar API](https://www.semanticscholar.org/product/api). Here we download only a tiny subset to demonstrate the pipeline.
+
+```bash
+mkdir -p backend/1_document_prefetch/data/S2ORC/raw/metadata
+mkdir -p backend/1_document_prefetch/data/S2ORC/raw/pdf_parses
+wget -P backend/1_document_prefetch/data/S2ORC/raw/metadata https://huggingface.co/scieditor/example_data_S2ORC/raw/main/metadata/metadata_0.jsonl
+wget -P backend/1_document_prefetch/data/S2ORC/raw/metadata https://huggingface.co/scieditor/example_data_S2ORC/raw/main/metadata/metadata_1.jsonl
+wget -P backend/1_document_prefetch/data/S2ORC/raw/pdf_parses https://huggingface.co/scieditor/example_data_S2ORC/resolve/main/pdf_parses/pdf_parses_0.jsonl
+wget -P backend/1_document_prefetch/data/S2ORC/raw/pdf_parses https://huggingface.co/scieditor/example_data_S2ORC/resolve/main/pdf_parses/pdf_parses_1.jsonl
+
+```
+The downloaded files are organized as follows:
+```
+backend/1_document_prefetch/data/S2ORC/
+└── raw
+    ├── metadata
+    │   ├── metadata_0.jsonl
+    │   └── metadata_1.jsonl
+    └── pdf_parses
+        ├── pdf_parses_0.jsonl
+        └── pdf_parses_1.jsonl
+```
+
+
 ### PMCOA
 We can download the .tar.gz files from the official FTP service https://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_bulk/oa_comm/xml/, and put the files into the folder:
 ```
@@ -208,6 +233,7 @@ cd $BASE_DIR/backend/5_title_generic_search && docker-compose up --build -d
 cd $BASE_DIR/backend/final_api_gateway && docker-compose up --build -d
 cd $BASE_DIR
+
 ```
 By default, port 8060 is used by the final API gateway to communicate with the frontend or API developers.
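As a quick sanity check of the S2ORC example files downloaded above, the following minimal sketch (assuming the files sit at the paths shown in the README snippet; it is not part of the patch itself) prints a few top-level fields of the first record in each `.jsonl` file. The field names used here (`paper_id`, `title`, `authors`, `body_text`, `bib_entries`) are the ones the S2ORC build scripts added in this patch read.

```python
import json

# Base path used by the download commands in the README above.
base = "backend/1_document_prefetch/data/S2ORC/raw"

# One metadata record: bibliographic fields (title, authors, abstract, doi, year, ...).
with open(f"{base}/metadata/metadata_0.jsonl") as f:
    metadata = json.loads(f.readline())
print(metadata["paper_id"], metadata.get("title"), len(metadata.get("authors", [])))

# One pdf_parses record: parsed full text, keyed by the same paper_id as the metadata.
with open(f"{base}/pdf_parses/pdf_parses_0.jsonl") as f:
    parsed = json.loads(f.readline())
print(parsed["paper_id"], len(parsed.get("body_text", [])), len(parsed.get("bib_entries", {})))
```

The build scripts join the two files on `paper_id`, so a record whose id appears only in the metadata file will end up without a parsed full body.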
diff --git a/backend/1_document_prefetch/docker-compose-build-database-PMCOA.yaml b/backend/1_document_prefetch/docker-compose-build-database-PMCOA.yaml index 43d93a7..783e33a 100644 --- a/backend/1_document_prefetch/docker-compose-build-database-PMCOA.yaml +++ b/backend/1_document_prefetch/docker-compose-build-database-PMCOA.yaml @@ -12,5 +12,6 @@ services: image: document_prefetch_build_database_pmcoa environment: NUM_PROCESSES: 100 + COLLECTION: PMCOA volumes: - $PWD/data/PMCOA:/app/data diff --git a/backend/1_document_prefetch/docker-compose-build-database-S2ORC.yaml b/backend/1_document_prefetch/docker-compose-build-database-S2ORC.yaml new file mode 100644 index 0000000..03e6209 --- /dev/null +++ b/backend/1_document_prefetch/docker-compose-build-database-S2ORC.yaml @@ -0,0 +1,17 @@ +version: '3' + +services: + + paper_database_manager: + build: ./src/modules/paper_database + image: paper_database_manager + command: ["echo","hello"] + + document_prefetch_build_database_s2orc: + build: ./src/build_database/S2ORC + image: document_prefetch_build_database_s2orc + environment: + NUM_PROCESSES: 100 + COLLECTION: S2ORC + volumes: + - $PWD/data/S2ORC:/app/data diff --git a/backend/1_document_prefetch/docker-compose-build-database-arXiv.yaml b/backend/1_document_prefetch/docker-compose-build-database-arXiv.yaml index 1c6004d..f55136e 100644 --- a/backend/1_document_prefetch/docker-compose-build-database-arXiv.yaml +++ b/backend/1_document_prefetch/docker-compose-build-database-arXiv.yaml @@ -17,5 +17,6 @@ services: image: document_prefetch_build_database_arxiv environment: NUM_PROCESSES: 100 + COLLECTION: arXiv volumes: - $PWD/data/arXiv:/app/data diff --git a/backend/1_document_prefetch/docker-compose-document-prefetch.yaml b/backend/1_document_prefetch/docker-compose-document-prefetch.yaml index 04f3ad7..7967155 100644 --- a/backend/1_document_prefetch/docker-compose-document-prefetch.yaml +++ b/backend/1_document_prefetch/docker-compose-document-prefetch.yaml @@ -2,9 +2,11 @@ version: '3' ### environment variables # NUM_PROCESSES +# NUM_EMBEDDING_INDEX_SHARDS # DATA_PATH # PRIVATE # USE_GPU +# EMBEDDING_INDEX_PRECISION # SERVICE_SUFFIX services: @@ -21,7 +23,8 @@ services: depends_on: - document_prefetch_base environment: - NUM_EMBEDDING_INDEX_SHARDS: ${NUM_PROCESSES} + NUM_PROCESSES: ${NUM_PROCESSES} + NUM_EMBEDDING_INDEX_SHARDS: ${NUM_EMBEDDING_INDEX_SHARDS} NUM_INVERTED_INDEX_SHARDS: 10 SERVICE_SUFFIX: ${SERVICE_SUFFIX} volumes: @@ -40,6 +43,7 @@ services: NVIDIA_VISIBLE_DEVICES: all IS_PRIVATE_SERVER: ${PRIVATE} USE_GPU: ${USE_GPU} + EMBEDDING_INDEX_PRECISION: ${EMBEDDING_INDEX_PRECISION} SERVICE_SUFFIX: ${SERVICE_SUFFIX} volumes: - ${DATA_PATH}:/app/data @@ -89,8 +93,8 @@ services: image: document_prefetch_service_overall environment: SERVICE_SUFFIX: ${SERVICE_SUFFIX} - ports: - - ${PORT}:8060 + # ports: + # - ${PORT}:8060 networks: - common_network hostname: document_prefetch_service_overall_${SERVICE_SUFFIX} diff --git a/backend/1_document_prefetch/script_build_all_databases.sh b/backend/1_document_prefetch/script_build_all_databases.sh index 953c642..c1396d6 100644 --- a/backend/1_document_prefetch/script_build_all_databases.sh +++ b/backend/1_document_prefetch/script_build_all_databases.sh @@ -4,3 +4,5 @@ docker-compose -f docker-compose-build-database-arXiv.yaml up --build docker-compose -f docker-compose-build-database-PMCOA.yaml up --build +docker-compose -f docker-compose-build-database-S2ORC.yaml up --build + diff --git 
a/backend/1_document_prefetch/script_start_all_services.sh b/backend/1_document_prefetch/script_start_all_services.sh
index 88fb601..9fb044b 100644
--- a/backend/1_document_prefetch/script_start_all_services.sh
+++ b/backend/1_document_prefetch/script_start_all_services.sh
@@ -1,9 +1,18 @@
 #!/bin/bash
+#######
+# NUM_PROCESSES: number of processes used to build the inverted index and the embedding index. To fully utilize the CPU cores on a large corpus such as S2ORC, set NUM_PROCESSES to twice the number of CPU cores.
+# NUM_EMBEDDING_INDEX_SHARDS: number of sharded embedding index files. When using CPU approximate nearest neighbor search (USE_GPU=0), set NUM_EMBEDDING_INDEX_SHARDS to a large value (e.g., twice the number of CPU cores). When using GPU brute-force nearest neighbor search (USE_GPU=1), set NUM_EMBEDDING_INDEX_SHARDS to the number of available GPUs.
+# EMBEDDING_INDEX_PRECISION: precision of the embedding index, used for low-precision GPU brute-force nearest neighbor search. Available choices: bool, int4, int8, float32. When USE_GPU=0 or no GPU is available, EMBEDDING_INDEX_PRECISION is switched to float32 automatically.
+
 #### prefetch server on arXiv
-DATA_PATH=$PWD/data/arXiv NUM_PROCESSES=10 PRIVATE=0 USE_GPU=0 PORT=8021 SERVICE_SUFFIX=arxiv docker-compose -f docker-compose-document-prefetch.yaml -p document_prefetch_arxiv up --build -d
+DATA_PATH=$PWD/data/arXiv NUM_PROCESSES=10 NUM_EMBEDDING_INDEX_SHARDS=1 PRIVATE=0 USE_GPU=1 EMBEDDING_INDEX_PRECISION=int4 SERVICE_SUFFIX=arxiv docker-compose -f docker-compose-document-prefetch.yaml -p document_prefetch_arxiv up --build -d
 #### prefetch server on PMCOA
-DATA_PATH=$PWD/data/PMCOA NUM_PROCESSES=10 PRIVATE=0 USE_GPU=0 PORT=8022 SERVICE_SUFFIX=pmcoa docker-compose -f docker-compose-document-prefetch.yaml -p document_prefetch_pmcoa up --build -d
+DATA_PATH=$PWD/data/PMCOA NUM_PROCESSES=10 NUM_EMBEDDING_INDEX_SHARDS=1 PRIVATE=0 USE_GPU=1 EMBEDDING_INDEX_PRECISION=int4 SERVICE_SUFFIX=pmcoa docker-compose -f docker-compose-document-prefetch.yaml -p document_prefetch_pmcoa up --build -d
+
+
+#### prefetch server on S2ORC
+DATA_PATH=$PWD/data/S2ORC NUM_PROCESSES=10 NUM_EMBEDDING_INDEX_SHARDS=1 PRIVATE=0 USE_GPU=1 EMBEDDING_INDEX_PRECISION=int4 SERVICE_SUFFIX=s2orc docker-compose -f docker-compose-document-prefetch.yaml -p document_prefetch_s2orc up --build -d
diff --git a/backend/1_document_prefetch/script_stop_all_services.sh b/backend/1_document_prefetch/script_stop_all_services.sh
new file mode 100644
index 0000000..9107137
--- /dev/null
+++ b/backend/1_document_prefetch/script_stop_all_services.sh
@@ -0,0 +1,3 @@
+docker-compose -f docker-compose-document-prefetch.yaml -p document_prefetch_arxiv down
+docker-compose -f docker-compose-document-prefetch.yaml -p document_prefetch_pmcoa down
+docker-compose -f docker-compose-document-prefetch.yaml -p document_prefetch_s2orc down
\ No newline at end of file
diff --git a/backend/1_document_prefetch/src/build_database/PMCOA/Dockerfile b/backend/1_document_prefetch/src/build_database/PMCOA/Dockerfile
index a9f1928..13843de 100644
--- a/backend/1_document_prefetch/src/build_database/PMCOA/Dockerfile
+++ b/backend/1_document_prefetch/src/build_database/PMCOA/Dockerfile
@@ -1,9 +1,6 @@
 FROM paper_database_manager as base
-ENV COLLECTION="PMCOA"
 ENV ROOT_DATA_PATH=/app/data
-## Here setting the default number of processes to 16, and this can be overwritten when calling docker run by setting -e (or --env)
-ENV NUM_PROCESSES=16
 SHELL ["conda", "run", "-n", "my_env", "/bin/bash", "-c"]
diff --git
a/backend/1_document_prefetch/src/build_database/S2ORC/Dockerfile b/backend/1_document_prefetch/src/build_database/S2ORC/Dockerfile new file mode 100644 index 0000000..c23a987 --- /dev/null +++ b/backend/1_document_prefetch/src/build_database/S2ORC/Dockerfile @@ -0,0 +1,14 @@ +FROM paper_database_manager as base + +ENV ROOT_DATA_PATH=/app/data + +SHELL ["conda", "run", "-n", "my_env", "/bin/bash", "-c"] + +WORKDIR /app/src +COPY . . + +RUN pip install -r requirements.txt + +## Note: when calling docker run, one must map the host machine's volume to /app/data +## The host volume is expected to contain all the data needed for the search engine +CMD [ "bash", "run.sh" ] diff --git a/backend/1_document_prefetch/src/build_database/S2ORC/build_normalized_sqlite_database.py b/backend/1_document_prefetch/src/build_database/S2ORC/build_normalized_sqlite_database.py new file mode 100644 index 0000000..07bc70c --- /dev/null +++ b/backend/1_document_prefetch/src/build_database/S2ORC/build_normalized_sqlite_database.py @@ -0,0 +1,124 @@ +import subprocess +import threading +from tqdm import tqdm +import os +import numpy as np +from raw_sqlite_utils import SqliteClient as RawSqliteClient +import time +import json +from modules.paper_database.database_managers import SqliteClient +import shutil + + +# import os,sys,inspect +# current_dir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe()))) +# root_dir = os.path.dirname(os.path.dirname(current_dir)) +# sys.path.insert(0, root_dir) +# sys.path.insert(0, current_dir) + + +import argparse + + +### get all needed environment variables +ROOT_DATA_PATH = os.getenv("ROOT_DATA_PATH") +COLLECTION = os.getenv("COLLECTION") +NUM_PROCESSES = int(os.getenv("NUM_PROCESSES")) + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument("-metadata_raw_sql_path", default = ROOT_DATA_PATH + "/raw/sqliteDB/metadata.db" ) + parser.add_argument("-pdf_parses_raw_sql_path", default = ROOT_DATA_PATH + "/raw/sqliteDB/pdf_parses.db" ) + parser.add_argument("-json_schema_path", default = "json_schema.json" ) + parser.add_argument("-output_file_name", default = ROOT_DATA_PATH + "/sqlite_database_buffer/normalized_data.jsonl" ) + parser.add_argument("-start", type = int, default = None ) + parser.add_argument("-size", type =int, default = None ) + parser.add_argument("-collection", default = COLLECTION ) + parser.add_argument("-batch_size", type = int, default = 5000 ) + parser.add_argument("-n_processes", type = int, default = NUM_PROCESSES ) + parser.add_argument("-output_sqlite_database_name", default = ROOT_DATA_PATH + "/sqlite_database_buffer/DB.db" ) + + args = parser.parse_args() + + metadata_sql = RawSqliteClient( args.metadata_raw_sql_path ) + if args.start is None or args.size is None: + print("No proper start and size value are specified, processing the whole document ...") + print("Counting the total number of examples ...") + args.start = 0 + args.size = metadata_sql.get_max_rowid(args.collection) + else: + try: + assert args.start is not None and args.size is not None + assert args.start >= 0 and args.size >= 0 + except: + print("Error: Wrong start and size value were provided!") + os.sys.exit(1) + + output_folder = os.path.dirname( args.output_file_name ) + try: + shutil.rmtree( output_folder ) + except: + pass + os.makedirs( output_folder ) + + output_sqlite_database_folder = os.path.dirname( args.output_sqlite_database_name ) + try: + shutil.rmtree( output_sqlite_database_folder ) + except: + pass + os.makedirs( 
output_sqlite_database_folder ) + + num_of_examples_per_process = int( np.ceil( args.size / args.n_processes ) ) + print("Start multiple subprocesses ...") + + threads = [] + for offset in range( args.start, args.start + args.size, num_of_examples_per_process ): + t = threading.Thread( target = subprocess.run, args = ( + list(map( str, [ + "python", + "normalize_raw_sqlite.py", + "-metadata_raw_sql_path", args.metadata_raw_sql_path, + "-pdf_parses_raw_sql_path", args.pdf_parses_raw_sql_path, + "-json_schema_path", args.json_schema_path , + "-output_file_name", args.output_file_name, + "-output_file_name_suffix", "_%d"%( offset ), + "-start", offset, + "-size", min(num_of_examples_per_process, args.start + args.size - offset ), + "-collection", args.collection, + "-batch_size", args.batch_size + ] ) ) , + ) ) + threads.append(t) + t.start() + for t in threads: + t.join() + + + print("Dumping to the final sqlite database, this may take time ...") + + final_sql = SqliteClient( args.output_sqlite_database_name ) + + output_base_name = os.path.basename( args.output_file_name ) + flist =[ output_folder +"/"+fname for fname in os.listdir( output_folder ) if fname.startswith(output_base_name+"_") ] + flist.sort( key = lambda x:int(x.split("_")[-1]) ) + + paper_buffer = [] + for fname in flist: + print(fname) + with open( fname ,"r" ) as f: + for line in f: + line_data = json.loads(line) + paper_buffer.append(line_data) + + if len(paper_buffer) >= args.batch_size: + final_sql.insert_papers( paper_buffer, args.collection ) + paper_buffer = [] + os.remove( fname ) + + if len(paper_buffer)>0: + final_sql.insert_papers( paper_buffer, args.collection ) + paper_buffer = [] + + print("All Done!") \ No newline at end of file diff --git a/backend/1_document_prefetch/src/build_database/S2ORC/build_raw_sqlite_database.py b/backend/1_document_prefetch/src/build_database/S2ORC/build_raw_sqlite_database.py new file mode 100644 index 0000000..cd79448 --- /dev/null +++ b/backend/1_document_prefetch/src/build_database/S2ORC/build_raw_sqlite_database.py @@ -0,0 +1,58 @@ +from raw_sqlite_utils import SqliteClient +import numpy as np +import json +import time +import os +from tqdm import tqdm +import re +import argparse + +def dump_to_sqlite( folder, db_path, buffer_size, paper_id_matcher, collection ): + db_path_dir_name = os.path.dirname(db_path) + if not os.path.exists( db_path_dir_name ): + os.makedirs( db_path_dir_name ) + + flist = [folder + "/" + _ for _ in os.listdir(folder) if _.endswith(".jsonl") ] + flist.sort( key = lambda x:int( x.split("_")[-1].split(".")[0] ) ) + + sql_client = SqliteClient(db_path) + + paper_list_buffer = [] + for fname in flist: + print(fname) + with open( fname,"r" ) as f: + for line in tqdm(f): + paper_id = int(paper_id_matcher.findall(line[:50])[0]) + paper_list_buffer.append( { "paper_id":paper_id,"Text":line } ) + if len(paper_list_buffer) >= buffer_size: + sql_client.insert_papers( collection, paper_list_buffer ) + paper_list_buffer = [] + if len( paper_list_buffer ) > 0: + sql_client.insert_papers( collection, paper_list_buffer ) + paper_list_buffer = [] + +ROOT_DATA_PATH = os.getenv("ROOT_DATA_PATH") +COLLECTION = os.getenv("COLLECTION") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("-metadata_jsonl_folder", default = ROOT_DATA_PATH + "/raw/metadata/" ) + parser.add_argument("-metadata_db_path", default = ROOT_DATA_PATH + "/raw/sqliteDB/metadata.db") + parser.add_argument("-pdf_parses_jsonl_folder", default = ROOT_DATA_PATH + 
"/raw/pdf_parses/") + parser.add_argument("-pdf_parses_db_path", default = ROOT_DATA_PATH + "/raw/sqliteDB/pdf_parses.db") + parser.add_argument("-buffer_size", type = int, default = 1000 ) + parser.add_argument("-collection", default = COLLECTION) + args = parser.parse_args() + + + paper_id_matcher = re.compile('(?<="paper_id": ")\d*(?=")') + + print("Converting metadata raw jsonl files to a single metadata sqlite ...") + dump_to_sqlite( args.metadata_jsonl_folder, args.metadata_db_path, args.buffer_size, paper_id_matcher, args.collection ) + + print("Converting pdf_parses raw jsonl files to a single metadata sqlite ...") + dump_to_sqlite( args.pdf_parses_jsonl_folder, args.pdf_parses_db_path, args.buffer_size, paper_id_matcher, args.collection ) + + print("All Done!") + \ No newline at end of file diff --git a/backend/1_document_prefetch/src/build_database/S2ORC/json_schema.json b/backend/1_document_prefetch/src/build_database/S2ORC/json_schema.json new file mode 100644 index 0000000..97c33be --- /dev/null +++ b/backend/1_document_prefetch/src/build_database/S2ORC/json_schema.json @@ -0,0 +1 @@ +{"type": "object", "properties": {"Author": {"type": "array", "items": {"type": "object", "properties": {"FamilyName": {"type": "string"}, "GivenName": {"type": "string"}}, "required": ["FamilyName", "GivenName"]}}, "Title": {"type": "string"}, "Abstract": {"type": "string"}, "Venue": {"type": "string"}, "DOI": {"type": "string"}, "URL": {"type": "string"}, "PublicationDate": {"type": "object", "properties": {"Year": {"type": "string"}, "Month": {"type": "string"}, "Day": {"type": "string"}}}, "Content": {"type": "object", "properties": {"Abstract": {"type": "string"}, "Abstract_Parsed": {"type": "array", "items": {"type": "object", "properties": {"section_id": {"type": "string"}, "section_title": {"type": "string"}, "section_text": {"type": "array", "items": {"type": "object", "properties": {"paragraph_id": {"type": "string"}, "paragraph_text": {"type": "array", "items": {"type": "object", "properties": {"sentence_id": {"type": "string"}, "sentence_text": {"type": "string"},"cite_spans":{"type":"array","items":{"type":"object","properties":{"start":{"type":"string"}, "end":{"type":"string"},"text":{"type":"string"},"ref_id":{"type":"string"}} } } }}}}}}}}}, "Fullbody": {"type": "string"}, "Fullbody_Parsed": {"type": "array", "items": {"type": "object", "properties": {"section_id": {"type": "string"}, "section_title": {"type": "string"}, "section_text": {"type": "array", "items": {"type": "object", "properties": {"paragraph_id": {"type": "string"}, "paragraph_text": {"type": "array", "items": {"type": "object", "properties": {"sentence_id": {"type": "string"}, "sentence_text": {"type": "string"}, "cite_spans":{"type":"array","items":{"type":"object","properties":{"start":{"type":"string"}, "end":{"type":"string"},"text":{"type":"string"},"ref_id":{"type":"string"}} } } }}}}}}}}}}, "required": ["Abstract", "Abstract_Parsed", "Fullbody", "Fullbody_Parsed"]}, "Reference": {"type": "array", "items": {"type": "object", "properties": {"Title": {"type": "string"}, "Author": {"type": "array", "items": {"type": "object", "properties": {"FamilyName": {"type": "string"}, "GivenName": {"type": "string"}}, "required": ["FamilyName", "GivenName"]}}, "PublicationDate": {"type": "object", "properties": {"Year": {"type": "string"}, "Month": {"type": "string"}, "Day": {"type": "string"}}}, "Venue": {"type": "string"}, "ReferenceText": {"type": "string"}, "PaperID": {"type": "object", "properties": {"collection": 
{"type": "string"}, "id_field": {"type": "string"}, "id_type": {"type": "string"}, "id_value": {"type": "string"}}}}, "required": ["Title", "Author", "PublicationDate", "Venue", "ReferenceText"]}}, "S2CID": {"type": "string"}, "PMID": {"type": "string"}, "PMCID": {"type": "string"}, "ArxivId": {"type": "string"}, "ACLId": {"type": "string"}, "MAGId": {"type": "string"}, "Abstract_in_metadata": {"type": "boolean"}, "Last_update_unixtime": {"type": "number"}, "isDuplicated": {"type": "boolean"}}, "required": ["Author", "Title", "Abstract", "Venue", "DOI", "URL", "PublicationDate", "Content","Reference"]} \ No newline at end of file diff --git a/backend/1_document_prefetch/src/build_database/S2ORC/normalization_utils.py b/backend/1_document_prefetch/src/build_database/S2ORC/normalization_utils.py new file mode 100644 index 0000000..f8cb3ba --- /dev/null +++ b/backend/1_document_prefetch/src/build_database/S2ORC/normalization_utils.py @@ -0,0 +1,424 @@ +import re +import time +import numpy as np +from jsonschema import validate +from nltk.tokenize import sent_tokenize +import json +class DocumentNormalizer: + def __init__(self, json_schema_path ): + self.json_schema = json.load(open(json_schema_path,"r")) + self.cit_marker_matcher = re.compile("(^[^A-Za-z\d]*)([0-9]+)(?=[^A-Za-z\d]*$)") + self.sentence_boundary_matcher = re.compile("\.\s") + + def normalize( self, paper, requires_validation = True ): + ##### Author ##### + parsed_authors = self.parse_author( paper ) + ##### Title ##### + parsed_title = self.parse_title( paper ) + ##### Venue ##### + parsed_venue = self.parse_venue( paper ) + ##### DOI ##### + parsed_doi = self.parse_doi(paper) + ##### URL ##### + parsed_url = self.parse_url(paper) + ##### PublicationDate ##### + parsed_pub_date = self.parse_pub_date(paper) + ##### Reference ##### + parsed_reference, bib_entry_key_to_row_id_mapper = self.parse_reference(paper) + + ##### Content ##### + parsed_content = self.parse_content(paper, bib_entry_key_to_row_id_mapper) + ##### Abstract (The abstract text stored in the metadata) ##### + abstract_text = (" ".join(self.get_sentence_list_from_parsed_sections( parsed_content["Abstract_Parsed"] ))).strip() + + ##### Last_update_unixtime ###### + Last_update_unixtime = int(time.time()) + ##### Others ##### + Abstract_in_metadata = abstract_text != "" + isDuplicated = False + + normalized_paper = { + "Author":parsed_authors, + "Title":parsed_title, + "Abstract":abstract_text, + "Venue":parsed_venue, + "DOI":parsed_doi, + "URL":parsed_url, + "PublicationDate":parsed_pub_date, + "Content":parsed_content, + "Reference":parsed_reference, + "Last_update_unixtime":Last_update_unixtime, + "Abstract_in_metadata":Abstract_in_metadata, + "isDuplicated":isDuplicated + } + + ##### Additional IDs, this is only added for S2ORC dataset ##### + additional_ids = self.parse_additional_ids(paper) + normalized_paper.update( additional_ids ) + + if requires_validation: + try: + validate(instance=normalized_paper, schema=self.json_schema) + except: + return None + return normalized_paper + + def get_sentence_list_from_parsed_sections(self, parsed_sections ): + sentence_list = [] + for section in parsed_sections: + sentence_list.append(str(section.get( "section_title", "" ))) + for para in section.get("section_text",[]): + for sen in para.get("paragraph_text", []): + sentence_list.append( str(sen.get("sentence_text","")) ) + return sentence_list + + + def parse_author(self, paper ): + try: + parsed_authors = [] + authors = paper.get("authors", [] ) + for author in 
authors: + parsed_authors.append( + { + "GivenName":str( author.get( "first", "" ).replace("None","") ), + "FamilyName":str( author.get( "last", "" ).replace("None","") ) + } + ) + except: + parsed_authors = [] + return parsed_authors + + def parse_title(self, paper ): + try: + parsed_title = str(paper.get("title", "")).replace("None","").lstrip("[").rstrip("]") + except: + parsed_title = "" + return parsed_title + + def parse_venue(self, paper): + try: + parsed_venue = str(paper.get("venue", "")).replace("None","") + except: + parsed_venue = "" + + if parsed_venue.strip() == "": + try: + parsed_venue = str(paper.get("journal","")).replace("None","") + except: + parsed_venue = "" + return parsed_venue + + def parse_doi(self, paper): + try: + parsed_doi = str( paper.get("doi","") ).replace("None","") + except: + parsed_doi = "" + return parsed_doi + + def parse_url(self, paper): + try: + parsed_doi = str(paper.get("doi","")).strip().replace("%", "%25").replace('"', "%22").replace("#", "%23").replace(" ", "%20").replace("?", "%3F").replace("None","") + if parsed_doi.strip() != "": + parsed_url = "https://doi.org/" + parsed_doi + else: + parsed_url = str(paper.get("s2_url", "")) + except: + parsed_url = "" + return parsed_url + + + def parse_pub_date( self, paper ): + try: + year = str(int(paper.get("year", ""))).replace("None","") + except: + year = "" + return { + "Year":year + } + + def parse_para( self, para, bib_entry_key_to_row_id_mapper ): + paragraph_text = [{ "sentence_id":str(sen_id), "sentence_text": str(sen), "cite_spans":[] } + for sen_id, sen in enumerate(self.sent_tok( str(para.get("text",""))) )] + para_cite_spans = para.get( "cite_spans", [] ) + for cite_span in para_cite_spans: + start, end = cite_span["start"], cite_span["end"] + for sen in paragraph_text: + if start < len( sen["sentence_text"] ): + end = min( end, len( sen["sentence_text"] ) ) + sen["cite_spans"].append( + { + "start":start, + "end":end, + "text":sen["sentence_text"][start:end], + "ref_id":cite_span["ref_id"] + } + ) + break + else: + start -= len( sen["sentence_text"] ) + end -= len( sen["sentence_text"] ) + cleaned_paragraph_text = [] + for sen in paragraph_text: + sentence_text = sen["sentence_text"] + cite_spans = sen["cite_spans"] + + sentence_text = sentence_text.rstrip() + + cite_spans.sort( key= lambda x:x["start"] ) + + cleaned_cite_spans = [] + for sen_cite_span in cite_spans: + if sen_cite_span["ref_id"] not in bib_entry_key_to_row_id_mapper: + continue + + start, end = sen_cite_span["start"], sen_cite_span["end"] + ## make sure ther is no overlapping between multiple citation markers + if len(cleaned_cite_spans) > 0 and start < int(cleaned_cite_spans[-1]["end"]): + continue + + if start >= len(sentence_text): + continue + end = min( end, len(sentence_text) ) + + sen_cite_span["start"] = str(start) + sen_cite_span["end"] = str(end) + sen_cite_span["text"] = sentence_text[start:end] + sen_cite_span["ref_id"] = str(bib_entry_key_to_row_id_mapper[ sen_cite_span["ref_id"] ]) + + cleaned_cite_spans.append( sen_cite_span ) + + sentence_id = str(len(cleaned_paragraph_text)) + cleaned_paragraph_text.append( + { + "sentence_id":sentence_id, + "sentence_text":sentence_text, + "cite_spans":cleaned_cite_spans + } + ) + + return cleaned_paragraph_text + + + def parse_para_list( self, para_list, bib_entry_key_to_row_id_mapper ): + section_list = [] + current_section = None + + for para in para_list: + paragraph_text = self.parse_para( para, bib_entry_key_to_row_id_mapper ) + + para_section = 
str(para.get("section","")) + + if current_section is None or (para_section != "" and para_section != current_section["section_title"]): + if current_section is not None: + section_list.append(current_section) + current_section = { + "section_id":str(len(section_list)), + "section_title":para_section, + "section_text":[ + { + "paragraph_id":"0", + "paragraph_text":paragraph_text + } + ] + } + else: + next_para_id = len(current_section["section_text"]) + current_section["section_text"].append( + { + "paragraph_id":str(next_para_id), + "paragraph_text":paragraph_text + } + ) + if current_section is not None: + section_list.append(current_section) + + if (" ".join(self.get_sentence_list_from_parsed_sections( section_list ))).strip() == "": + section_list = [] + + return section_list + + def parse_content( self, paper, bib_entry_key_to_row_id_mapper ): + ### Abstract + abstract = "" + ### Abstract_Parsed + try: + pdf_parsed_abstract = paper.get("pdf_parses",{}).get("abstract",[]) + if len( pdf_parsed_abstract ) == 0: + abstract_text = str(paper.get("abstract","")) + if abstract_text != "None" and abstract_text != "": + pdf_parsed_abstract = [ { "section":"Abstract", "text":abstract_text } ] + assert len(pdf_parsed_abstract) > 0 + + abstract_parsed = self.parse_para_list( pdf_parsed_abstract, bib_entry_key_to_row_id_mapper ) + except: + abstract_parsed = [] + + ### Fullbody + fullbody = "" + + ### Fullbody_Parsed + try: + fullbody_parsed = self.parse_para_list( paper.get( "pdf_parses", {} ).get("body_text", []), bib_entry_key_to_row_id_mapper ) + except: + fullbody_parsed = [] + return { + "Abstract":abstract, + "Abstract_Parsed":abstract_parsed, + "Fullbody":fullbody, + "Fullbody_Parsed":fullbody_parsed + } + + def parse_reference(self, paper): + try: + bibref_text = {} + body_text = paper.get("pdf_parses",{}).get("body_text", []) + for para in body_text: + for cit in para.get("cite_spans", []): + if isinstance(cit, dict): + ref_id, ref_text = cit.get("ref_id",""), cit.get("text","") + if ref_id != "": + bibref_text[ref_id] = ref_text + + for ref_id in bibref_text: + ref_text = bibref_text[ref_id] + matched_texts = self.cit_marker_matcher.findall(ref_text) + if len(matched_texts) > 0: + ref_text = matched_texts[0][1]+"." 
+ else: + ref_text = "" + bibref_text[ref_id] = ref_text + + except: + bibref_text = {} + + try: + reference = [] + bib_entry_key_to_row_id_mapper = {} + + bib_entries = paper.get("pdf_parses",{}).get("bib_entries",{}) + bib_entry_keys = list(bib_entries.keys()) + try: + bib_entry_keys.sort( key = lambda x : int(x[6:]) ) + except: + pass + + for bib_entry_key in bib_entry_keys: + try: + parsed_entry = self.convert_bibentry_to_metadata( bib_entries[bib_entry_key] ) + reference_text = self.get_citation_from_paper_metadata(parsed_entry) + if bibref_text.get(bib_entry_key,"").strip() != "": + reference_text = bibref_text[bib_entry_key] + " "+ reference_text + parsed_entry["ReferenceText"] = reference_text + + bib_entry_key_to_row_id_mapper[bib_entry_key] = len(reference) + reference.append(parsed_entry) + except: + continue + except: + reference = [] + bib_entry_key_to_row_id_mapper = {} + + return reference, bib_entry_key_to_row_id_mapper + + def parse_additional_ids(self, paper): + try: + S2CID = str(paper.get("paper_id", "")).replace("None","") + PMID = str(paper.get("pubmed_id", "")).replace("None","") + PMCID = str(paper.get("pmc_id", "")).replace("None","") + ArxivId = str(paper.get("arxiv_id", "")).replace("None","") + ACLId = str(paper.get("acl_id","")).replace("None","") + MAGId = str(paper.get("mag_id","")).replace("None","") + except: + S2CID = "" + PMID = "" + PMCID = "" + ArxivId = "" + ACLId = "" + MAGId = "" + return { + "S2CID":S2CID, + "PMID":PMID, + "PMCID":PMCID, + "ArxivId":ArxivId, + "ACLId":ACLId, + "MAGId":MAGId + } + + + def sent_tok(self, text, min_sen_len = 10 ): + + sens = self.sentence_boundary_matcher.split( text ) + for pos in range( len(sens)-1 ): + sens[pos] += ". " + + return self.merge_sens( sens, min_sen_len = min_sen_len ) + + def merge_sens(self, sens, min_sen_len = 10 ): + out_sens =[] + current_sen = None + + for sen in sens: + sen_len = len(sen.split()) + if sen_len >= min_sen_len: + if current_sen is not None: + out_sens.append( current_sen ) + current_sen = sen + else: + if current_sen is not None: + current_sen += sen + else: + current_sen = sen + if current_sen is not None: + if len( current_sen.split() ) < min_sen_len and len( out_sens ) > 0: + out_sens[-1] += current_sen + else: + out_sens.append(current_sen) + return out_sens + + def convert_bibentry_to_metadata(self, bibentry): + metadata = {} + metadata["Title"] = bibentry["title"] + metadata["Author"] = [] + for author in bibentry.get("authors",[]): + metadata["Author"].append({ + "GivenName":author.get("first",""), + "FamilyName": author.get("last", "") + }) + metadata["Venue"] = bibentry.get("venue","") + metadata["PublicationDate"] = {"Year":str( bibentry.get("year","") )} + return metadata + + + def get_citation_from_paper_metadata(self, paper_metadata ): + author = paper_metadata.get("Author",[]) + title = paper_metadata.get("Title","") + venue = paper_metadata.get("Venue","") + year = paper_metadata.get("PublicationDate",{}).get("Year","") + + author_list = [] + for pos,author_item in enumerate(author): + if pos == 0: + author_list.append( "%s, %s"%( author_item.get("FamilyName",""), author_item.get("GivenName","") ) ) + else: + author_list.append( "%s %s"%( author_item.get("GivenName",""), author_item.get("FamilyName","") ) ) + + if len(author_list)>3: + author_info = author_list[0] + " et al" + elif len(author_list)>1: + author_info = ", ".join( author_list[:-1] ) + ", and " + author_list[-1] + elif len(author_list)==1: + author_info = author_list[0] + else: + author_info = "" + 
author_info += "." + + title_info = "“"+title.rstrip(".")+".”" + journal_info = venue + if year.strip() != "": + year_info = "(%s)"%(year) + else: + year_info = "" + + citation_text = " ".join(" ".join( [author_info, title_info, journal_info, year_info ] ).split()) +"." + + return citation_text diff --git a/backend/1_document_prefetch/src/build_database/S2ORC/normalize_raw_sqlite.py b/backend/1_document_prefetch/src/build_database/S2ORC/normalize_raw_sqlite.py new file mode 100644 index 0000000..2e1ceb2 --- /dev/null +++ b/backend/1_document_prefetch/src/build_database/S2ORC/normalize_raw_sqlite.py @@ -0,0 +1,93 @@ +from raw_sqlite_utils import SqliteClient as RawSqliteClient +from normalization_utils import DocumentNormalizer +import json +import time +import numpy as np +import os +from tqdm import tqdm +import argparse + + +def get_papers(paper_id_list, metadata_sql, pdf_parses_sql, only_metadata = False): + + paper_metadata_list = [None if _ is None else json.loads( _["Text"] ) for _ in metadata_sql.get_papers( paper_id_list )] + if only_metadata: + for pos in range(len( paper_metadata_list )): + paper_metadata_list[pos]["pdf_parses"] = {} + + result = paper_metadata_list + else: + ## if we also want to get the pdf parses, we need to first get the real paper id and use the new id to query pdf parses sqlite! + mapped_paper_id_list = [] + for paper_id, paper_metadata in zip( paper_id_list, paper_metadata_list ): + try: + mapped_paper_id = { + "collection":paper_id["collection"], + "id_field":"paper_id", ## here the id_field must be paper_id. This is the only id there is consistent between metadata and pdf_parses! + "id_type":"int", + "id_value":int( paper_metadata["paper_id"] ) + } + except: + mapped_paper_id = None + mapped_paper_id_list.append(mapped_paper_id) + + paper_fullbody_list = [None if _ is None else json.loads( _["Text"] ) for _ in pdf_parses_sql.get_papers( mapped_paper_id_list )] + + for pos in range(len( paper_metadata_list )): + if paper_metadata_list[pos] is None: + continue + if paper_fullbody_list[pos] is None: + paper_metadata_list[pos]["pdf_parses"] = {} + else: + paper_metadata_list[pos]["pdf_parses"] = paper_fullbody_list[pos] + if paper_metadata_list[pos]["abstract"] is None or paper_metadata_list[pos].get("abstract","").strip() == "": + paper_metadata_list[pos]["abstract"] = (" ".join([ para["text"] for para in paper_metadata_list[pos].get("pdf_parses",{}).get("abstract",[]) ])).strip() + + + result = paper_metadata_list + + return result + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + + parser.add_argument("-metadata_raw_sql_path" ) + parser.add_argument("-pdf_parses_raw_sql_path" ) + parser.add_argument("-json_schema_path" ) + parser.add_argument("-output_file_name" ) + parser.add_argument("-output_file_name_suffix", default = "" ) + parser.add_argument("-start", type = int, default = 0 ) + parser.add_argument("-size", type =int, default = 0 ) + parser.add_argument("-collection" ) + parser.add_argument("-batch_size", type = int ) + + args = parser.parse_args() + + args.output_file_name += args.output_file_name_suffix + + output_folder = os.path.dirname( args.output_file_name ) + if not os.path.exists( output_folder ): + os.makedirs( output_folder ) + + metadata_sql = RawSqliteClient( args.metadata_raw_sql_path ) + pdf_parses_sql = RawSqliteClient( args.pdf_parses_raw_sql_path ) + document_normalizer = DocumentNormalizer(args.json_schema_path) + + max_rowid = metadata_sql.get_max_rowid(args.collection) + if args.size == 0: + args.size = 
max_rowid + + with open( args.output_file_name,"w" ) as fw: + end = min( args.start + args.size, max_rowid) + for pos in tqdm(range( args.start, end , args.batch_size )): + rowid_list = [ {"collection":args.collection,"id_field":"ROWID","id_value":int(_)+1} for _ in range( pos, min(pos +args.batch_size, end ) ) ] + papers = get_papers(rowid_list, metadata_sql, pdf_parses_sql ) + for paper in papers: + if paper is None: + continue + normalized_paper = document_normalizer.normalize( paper ) + if normalized_paper is None: + continue + fw.write( json.dumps( normalized_paper )+"\n" ) \ No newline at end of file diff --git a/backend/1_document_prefetch/src/build_database/S2ORC/raw_sqlite_utils.py b/backend/1_document_prefetch/src/build_database/S2ORC/raw_sqlite_utils.py new file mode 100644 index 0000000..f8124b1 --- /dev/null +++ b/backend/1_document_prefetch/src/build_database/S2ORC/raw_sqlite_utils.py @@ -0,0 +1,83 @@ +import sqlite3 +import numpy as np +import threading +import json +import time + +class SqliteClient: + def __init__(self, db_address , check_same_thread=False): + self.conn = sqlite3.connect(db_address, check_same_thread = check_same_thread) + self.cur = self.conn.cursor() + self.cur.execute( "SELECT name FROM sqlite_master WHERE type='table'" ) + try: + self.collections = set([_[0] for _ in self.cur.fetchall()] ) + except: + self.collections = set([]) + + + def get_max_rowid(self, collection ): + if collection not in self.collections: + return 0 + self.cur.execute('SELECT max(rowid) From %s'%( collection )) + try: + max_rowid = self.cur.fetchone()[0] + except: + max_rowid = 0 + return max_rowid + + def get_paper( self, collection, id_field, id_value ): + try: + assert collection in self.collections + sql_command = """ + SELECT paper_id, Text FROM %s WHERE %s = %d + """%( collection, id_field, id_value ) + + self.cur.execute(sql_command) + res = self.cur.fetchall() + assert len(res) > 0 + res = res[0] + except: + return None + return {"paper_id":res[0],"Text": res[1]} + + def get_papers( self, paper_id_list ): + papers = [] + for paper_id in paper_id_list: + try: + paper = self.get_paper( paper_id["collection"], paper_id["id_field"], paper_id["id_value"] ) + except: + paper = None + papers.append( paper ) + return papers + + def insert_papers( self, collection, papers ): + starting_id_int = self.get_max_rowid( collection ) +1 + + if collection not in self.collections: + self.cur.execute( "CREATE TABLE %s( paper_id INT, Text TEXT);"%( collection ) ) + self.cur.execute( "CREATE INDEX IF NOT EXISTS paper_id ON %s (paper_id ASC);"%(collection) ) + + self.collections.add(collection ) + values = [] + for paper in papers: + values.append( "(%d,'%s')"%( int(paper["paper_id"]), paper["Text"].replace("'","''") ) ) + values = ",".join( values ) + self.cur.execute( "INSERT INTO %s('paper_id','Text') VALUES %s;"%(collection, values ) ) + self.conn.commit() + + + + def update_paper( self, paper_id, paper_text ): + self.cur.execute("""UPDATE %s + SET Text = '%s' + WHERE + %s = %d; + """ %( paper_id["collection"], + paper_text.replace("'","''"), + paper_id["id_field"], + int(paper_id["id_value"]) + ) ) + self.conn.commit() + + def __del__(self): + self.conn.close() diff --git a/backend/1_document_prefetch/src/build_database/S2ORC/requirements.txt b/backend/1_document_prefetch/src/build_database/S2ORC/requirements.txt new file mode 100644 index 0000000..7c9f971 --- /dev/null +++ b/backend/1_document_prefetch/src/build_database/S2ORC/requirements.txt @@ -0,0 +1,5 @@ +lxml==4.9.0 +unidecode 
+nltk==3.7 +jsonschema==4.17.3 +six==1.16.0 \ No newline at end of file diff --git a/backend/1_document_prefetch/src/build_database/S2ORC/run.sh b/backend/1_document_prefetch/src/build_database/S2ORC/run.sh new file mode 100644 index 0000000..323ac10 --- /dev/null +++ b/backend/1_document_prefetch/src/build_database/S2ORC/run.sh @@ -0,0 +1,6 @@ +#!/bin/bash + +source activate my_env + +python build_raw_sqlite_database.py +python build_normalized_sqlite_database.py \ No newline at end of file diff --git a/backend/1_document_prefetch/src/build_database/arXiv/Dockerfile b/backend/1_document_prefetch/src/build_database/arXiv/Dockerfile index 534fc62..2f32c8a 100644 --- a/backend/1_document_prefetch/src/build_database/arXiv/Dockerfile +++ b/backend/1_document_prefetch/src/build_database/arXiv/Dockerfile @@ -1,9 +1,6 @@ FROM document_prefetch_service_pdf_parsing as base -ENV COLLECTION="arXiv" ENV ROOT_DATA_PATH=/app/data -## Here setting the default number of processes to 16, and this can be overwritten when calling docker run by setting -e (or --env) -ENV NUM_PROCESSES=16 SHELL ["conda", "run", "-n", "my_env", "/bin/bash", "-c"] diff --git a/backend/1_document_prefetch/src/service_build_index/adjust_num_shards_for_embedding_index.py b/backend/1_document_prefetch/src/service_build_index/adjust_num_shards_for_embedding_index.py new file mode 100644 index 0000000..afa15f8 --- /dev/null +++ b/backend/1_document_prefetch/src/service_build_index/adjust_num_shards_for_embedding_index.py @@ -0,0 +1,89 @@ +import os +import json +from tqdm import tqdm +import time +import numpy as np +from glob import glob + +import pickle +import argparse + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("-embedding_index_folder" ) + parser.add_argument("-embedding_index_name_prefix" ) + parser.add_argument("-num_shards", type = int ) + + args = parser.parse_args() + + embedding_index_names = glob( args.embedding_index_folder + "/" + args.embedding_index_name_prefix + "*" ) + embedding_index_names.sort( key = lambda x:int(x.split("_")[-1]) ) + + assert len(embedding_index_names) > 0 + + print("Start loading embeddings ...") + + embed_info = pickle.load(open(embedding_index_names[0],"rb")) + if len(embedding_index_names) == 1: + full_embedding_matrix = embed_info["embedding_matrix"] + full_pos_to_doc_id_mapper = embed_info["pos_to_doc_id_mapper"] + else: + shard_size, embed_dim = embed_info["embedding_matrix"].shape + estimated_max_num_embeddings = max( 200000000, int(shard_size * len(embedding_index_names) * 1.5) ) + + full_embedding_matrix = np.zeros( ( estimated_max_num_embeddings, embed_dim ), dtype = np.float32 ) + full_pos_to_doc_id_mapper = [] + + current_embed_idx = 0 + for count, fname in enumerate(embedding_index_names): + print(count, fname) + embed_info = pickle.load(open(fname,"rb")) + full_embedding_matrix[current_embed_idx:current_embed_idx+embed_info["embedding_matrix"].shape[0] ] = embed_info["embedding_matrix"] + full_pos_to_doc_id_mapper += embed_info["pos_to_doc_id_mapper"] + current_embed_idx = current_embed_idx+embed_info["embedding_matrix"].shape[0] + + full_embedding_matrix = full_embedding_matrix[:current_embed_idx] + print(full_embedding_matrix.shape) + + print("Removing old embeddings ...") + for count, fname in enumerate(embedding_index_names): + os.remove( fname ) + + + print("Start dumping embeddings ...") + new_shard_size = int( np.ceil( full_embedding_matrix.shape[0] / args.num_shards ) ) + + + shard_number = 0 + for pos in range( 
0,full_embedding_matrix.shape[0], new_shard_size ): + + print(pos) + embedding_matrix = full_embedding_matrix[pos:pos + new_shard_size] + pos_to_doc_id_mapper = full_pos_to_doc_id_mapper[pos:pos + new_shard_size] + + doc_ids_for_collection = {} + for count, item in enumerate(pos_to_doc_id_mapper): + if item["collection"] not in doc_ids_for_collection: + doc_ids_for_collection[ item["collection"] ] = [ list(), list() ] + doc_ids_for_collection[ item["collection"] ][0].append( item["id_value"] ) + doc_ids_for_collection[ item["collection"] ][1].append( count ) + for collection in doc_ids_for_collection: + doc_ids_for_collection[collection][0] = np.array(doc_ids_for_collection[collection][0]) + doc_ids_for_collection[collection][1] = np.array(doc_ids_for_collection[collection][1]) + + doc_id_to_pos_mapper = {} + for collection in doc_ids_for_collection: + max_doc_id = np.max(doc_ids_for_collection[collection][0]) + doc_id_to_pos_array = np.ones( int(np.ceil( (max_doc_id +1) / 8 ) * 8), dtype = np.int32 ) * (-1) + doc_id_to_pos_array[doc_ids_for_collection[collection][0]] = doc_ids_for_collection[collection][1] + doc_id_to_pos_mapper[collection] = doc_id_to_pos_array + + with open( args.embedding_index_folder + "/" + args.embedding_index_name_prefix + str(shard_number),"wb" ) as f: + pickle.dump({ + "embedding_matrix":embedding_matrix, + "doc_id_to_pos_mapper":doc_id_to_pos_mapper, + "pos_to_doc_id_mapper":pos_to_doc_id_mapper + }, f, -1) + + shard_number +=1 + diff --git a/backend/1_document_prefetch/src/service_build_index/build_embedding_index.py b/backend/1_document_prefetch/src/service_build_index/build_embedding_index.py index e8abbb3..96b6e80 100644 --- a/backend/1_document_prefetch/src/service_build_index/build_embedding_index.py +++ b/backend/1_document_prefetch/src/service_build_index/build_embedding_index.py @@ -17,6 +17,7 @@ ROOT_DATA_PATH = os.getenv("ROOT_DATA_PATH") +NUM_PROCESSES = int(os.getenv("NUM_PROCESSES")) NUM_EMBEDDING_INDEX_SHARDS = int(os.getenv("NUM_EMBEDDING_INDEX_SHARDS")) SENT2VEC_MODEL_PATH = os.getenv("SENT2VEC_MODEL_PATH") @@ -29,7 +30,7 @@ parser.add_argument("-text_encoder_model_path", default = SENT2VEC_MODEL_PATH ) parser.add_argument("-start", type = int, default = None) parser.add_argument("-size", type = int, default = None) - parser.add_argument("-n_processes", type = int, default = NUM_EMBEDDING_INDEX_SHARDS ) + parser.add_argument("-n_processes", type = int, default = NUM_PROCESSES ) parser.add_argument("-n_docs_per_process", type = int, default = None ) args = parser.parse_args() @@ -94,6 +95,16 @@ ## make sure all processes have been finished! for t in threads: t.join() + + ## adjust the number of shards for embedding index, as specified by NUM_EMBEDDING_INDEX_SHARDS + ## This is needed because for CPU approximate nearest search, we may need more shards, but for GPU brute-force nearest neighbor search, we only need one or two shards, since too many shards will increase the GPU memory overhead. 
+ + print("Adjusting number of shards for embedding index ...") + subprocess.run( ["python", "adjust_num_shards_for_embedding_index.py", + "-embedding_index_folder", ROOT_DATA_PATH + "/ranking_buffer/embedding_index/", + "-embedding_index_name_prefix", "embedding_index.db_", + "-num_shards", str( NUM_EMBEDDING_INDEX_SHARDS ) + ] ) print("All Done!") diff --git a/backend/1_document_prefetch/src/service_ranking/service.py b/backend/1_document_prefetch/src/service_ranking/service.py index c2bdf63..98a529d 100644 --- a/backend/1_document_prefetch/src/service_ranking/service.py +++ b/backend/1_document_prefetch/src/service_ranking/service.py @@ -33,6 +33,8 @@ IS_PRIVATE_SERVER = int( os.getenv("IS_PRIVATE_SERVER") ) USE_GPU = int( os.getenv("USE_GPU") ) +EMBEDDING_INDEX_PRECISION = os.getenv("EMBEDDING_INDEX_PRECISION") + SERVICE_SUFFIX = os.getenv("SERVICE_SUFFIX") @@ -282,17 +284,15 @@ def update_inverted_index(): parser.add_argument( "-is_private_server", type = int, default = IS_PRIVATE_SERVER ) - parser.add_argument( "-internal_precision", default = "float32" ) + parser.add_argument( "-internal_precision", default = EMBEDDING_INDEX_PRECISION ) parser.add_argument( "-requires_precision_conversion", type = int, default = 1 ) parser.add_argument( "-num_threads_per_shard", type = int, default = 1 ) parser.add_argument( "-normalize_query_embedding", type = int, default = 1 ) args = parser.parse_args() - wait_for_service(ADDRESS_SERVICE_BUILD_INDEX) - #Convert folders to absolute path args.inverted_index_folder = os.path.abspath( args.inverted_index_folder ) args.embedding_index_folder = os.path.abspath( args.embedding_index_folder ) @@ -312,6 +312,9 @@ def update_inverted_index(): else: args.gpu_list = [] + if len(args.gpu_list) == 0: + args.internal_precision = "float32" + mp.set_start_method('spawn') shard_list = os.listdir(args.embedding_index_folder) diff --git a/backend/1_helper_functions/docker-compose.yaml b/backend/1_helper_functions/docker-compose.yaml index 5aa3d90..032c9c9 100644 --- a/backend/1_helper_functions/docker-compose.yaml +++ b/backend/1_helper_functions/docker-compose.yaml @@ -11,8 +11,8 @@ services: networks: - common_network hostname: helper_functions_service - ports: - - 8030:8060 + # ports: + # - 8030:8060 networks: common_network: external: true \ No newline at end of file diff --git a/backend/2_fast_metadata_search/docker-compose.yaml b/backend/2_fast_metadata_search/docker-compose.yaml index ce18867..9161e85 100644 --- a/backend/2_fast_metadata_search/docker-compose.yaml +++ b/backend/2_fast_metadata_search/docker-compose.yaml @@ -8,12 +8,12 @@ services: image: fast_metadata_search_service environment: PYTHONUNBUFFERED: 1 - DUPLICATE_CHECKING_SERVICE_ADDRESS_LIST: http://document_prefetch_service_overall_arxiv:8060/check-duplicate,http://document_prefetch_service_overall_pmcoa:8060/check-duplicate + DUPLICATE_CHECKING_SERVICE_ADDRESS_LIST: http://document_prefetch_service_overall_arxiv:8060/check-duplicate,http://document_prefetch_service_overall_pmcoa:8060/check-duplicate,http://document_prefetch_service_overall_s2orc:8060/check-duplicate networks: - common_network hostname: fast_metadata_search_service - ports: - - 8028:8060 + # ports: + # - 8028:8060 networks: common_network: external: true \ No newline at end of file diff --git a/backend/2_paper_database_service/docker-compose.yaml b/backend/2_paper_database_service/docker-compose.yaml index fb50654..899d499 100644 --- a/backend/2_paper_database_service/docker-compose.yaml +++ 
b/backend/2_paper_database_service/docker-compose.yaml @@ -7,12 +7,12 @@ services: build: . image: paper_database_service environment: - SERVICE_ADDRESSES: http://document_prefetch_service_overall_arxiv:8060/get-papers,http://document_prefetch_service_overall_pmcoa:8060/get-papers + SERVICE_ADDRESSES: http://document_prefetch_service_overall_arxiv:8060/get-papers,http://document_prefetch_service_overall_pmcoa:8060/get-papers,http://document_prefetch_service_overall_s2orc:8060/get-papers networks: - common_network hostname: paper_database_service - ports: - - 8023:8060 + # ports: + # - 8023:8060 networks: common_network: external: true \ No newline at end of file diff --git a/backend/3_citation_formating/docker-compose.yaml b/backend/3_citation_formating/docker-compose.yaml index 092bc79..c71f2f2 100644 --- a/backend/3_citation_formating/docker-compose.yaml +++ b/backend/3_citation_formating/docker-compose.yaml @@ -12,8 +12,8 @@ services: networks: - common_network hostname: citation_formating_service - ports: - - 8031:8060 + # ports: + # - 8031:8060 networks: common_network: external: true \ No newline at end of file diff --git a/backend/3_citation_generation/docker-compose.yaml b/backend/3_citation_generation/docker-compose.yaml index 78bb7a5..7778c2a 100644 --- a/backend/3_citation_generation/docker-compose.yaml +++ b/backend/3_citation_generation/docker-compose.yaml @@ -15,8 +15,8 @@ services: networks: - common_network hostname: citation_generation_service - ports: - - 8027:8060 + # ports: + # - 8027:8060 networks: common_network: external: true diff --git a/backend/3_document_reranking/Dockerfile b/backend/3_document_reranking/Dockerfile index 4b483e9..03ba017 100644 --- a/backend/3_document_reranking/Dockerfile +++ b/backend/3_document_reranking/Dockerfile @@ -10,6 +10,6 @@ RUN python -c "from huggingface_hub import hf_hub_download; hf_hub_download(repo RUN python -c "from transformers import BertTokenizerFast, BertForNextSentencePrediction; model_path = 'scieditor/document-reranking-scibert'; tokenizer = BertTokenizerFast.from_pretrained(model_path); model = BertForNextSentencePrediction.from_pretrained(model_path)" WORKDIR /app/src -COPY . . +# COPY . . 
CMD [ "bash", "run_service.sh" ] \ No newline at end of file diff --git a/backend/3_document_reranking/docker-compose.yaml b/backend/3_document_reranking/docker-compose.yaml index 476dfea..8d1f2f3 100644 --- a/backend/3_document_reranking/docker-compose.yaml +++ b/backend/3_document_reranking/docker-compose.yaml @@ -14,8 +14,10 @@ services: networks: - common_network hostname: document_reranking_service - ports: - - 8024:8060 + # ports: + # - 8024:8060 + volumes: + - .:/app/src networks: common_network: external: true diff --git a/backend/3_document_reranking/service.py b/backend/3_document_reranking/service.py index b97cd4c..80c1d33 100644 --- a/backend/3_document_reranking/service.py +++ b/backend/3_document_reranking/service.py @@ -18,9 +18,10 @@ import re from nltk.tokenize import RegexpTokenizer - + import nltk from nltk.corpus import stopwords +nltk.download('omw-1.4') stopwords_set = set(stopwords.words('english')) import GPUtil @@ -124,7 +125,6 @@ def rerank_by_scibert( paper_id_list, ranking_source, keywords ): if ranking_source.strip() == "" and keywords.strip() == "": return paper_id_list - print(ranking_source, keywords ) tic = time.time() @@ -146,10 +146,11 @@ def rerank_by_scibert( paper_id_list, ranking_source, keywords ): """ handle the exact match when users use the title to search for a paper """ try: sorted_paper_id_list, sorted_sims = rerank_based_on_query2section_similarity( paper_id_list, ranking_source, return_similarity = True ) + prefix_papers = [] - for pos in range( min( len(sorted_paper_id_list), 5 ) ): + for pos in range( min( len(sorted_paper_id_list), 10 ) ): sorted_paper_id, sim = sorted_paper_id_list[pos], sorted_sims[pos] - if sim > 0.98: + if sim > 0.9: prefix_papers.append( sorted_paper_id ) except: prefix_papers = [] @@ -157,7 +158,7 @@ def rerank_by_scibert( paper_id_list, ranking_source, keywords ): for paper_id in selected_papers_to_be_reranked: matched = False for pid in prefix_papers: - if paper_id == pid: + if paper_id == pid: matched = True break if not matched: @@ -174,7 +175,7 @@ def document_rerank(): global sem sem.acquire() - + try: if not request.json: assert False @@ -204,7 +205,7 @@ def document_rerank(): sem.release() abort(400) - sem.release() + sem.release() return json.dumps(json_out), 201 diff --git a/backend/3_extractive_summarization/docker-compose.yaml b/backend/3_extractive_summarization/docker-compose.yaml index ce63a6f..376693b 100644 --- a/backend/3_extractive_summarization/docker-compose.yaml +++ b/backend/3_extractive_summarization/docker-compose.yaml @@ -15,8 +15,8 @@ services: networks: - common_network hostname: extractive_summarization_service - ports: - - 8026:8060 + # ports: + # - 8026:8060 networks: common_network: external: true diff --git a/backend/4_document_search_overall/Dockerfile b/backend/4_document_search_overall/Dockerfile index 4b483e9..03ba017 100644 --- a/backend/4_document_search_overall/Dockerfile +++ b/backend/4_document_search_overall/Dockerfile @@ -10,6 +10,6 @@ RUN python -c "from huggingface_hub import hf_hub_download; hf_hub_download(repo RUN python -c "from transformers import BertTokenizerFast, BertForNextSentencePrediction; model_path = 'scieditor/document-reranking-scibert'; tokenizer = BertTokenizerFast.from_pretrained(model_path); model = BertForNextSentencePrediction.from_pretrained(model_path)" WORKDIR /app/src -COPY . . +# COPY . . 
CMD [ "bash", "run_service.sh" ] \ No newline at end of file diff --git a/backend/4_document_search_overall/docker-compose.yaml b/backend/4_document_search_overall/docker-compose.yaml index 18b6e40..2a46fab 100644 --- a/backend/4_document_search_overall/docker-compose.yaml +++ b/backend/4_document_search_overall/docker-compose.yaml @@ -4,18 +4,24 @@ version: '3' services: document_search_overall_service: + volumes: + - .:/app/src build: . image: document_search_overall_service environment: PYTHONUNBUFFERED: 1 PAPER_DATABASE_SERVICE_ADDRESS: http://paper_database_service:8060/get-papers RERANK_SERVICE_ADDRESS: http://document_reranking_service:8060/document-rerank - PREFETCH_SERVICE_ADDRESSES: http://document_prefetch_service_overall_arxiv:8060/document-search,http://document_prefetch_service_overall_pmcoa:8060/document-search + PREFETCH_SERVICE_ADDRESSES: http://document_prefetch_service_overall_arxiv:8060/document-search,http://document_prefetch_service_overall_s2orc:8060/document-search,http://document_prefetch_service_overall_pmcoa:8060/document-search + # ,http://document_prefetch_service_overall_pmcoa:8060/document-search + # ,http://document_prefetch_service_overall_s2orc:8060/document-search networks: - common_network hostname: document_search_overall_service - ports: - - 8025:8060 + # ports: + # - 8025:8060 networks: common_network: - external: true \ No newline at end of file + external: true + + \ No newline at end of file diff --git a/backend/4_document_search_overall/service.py b/backend/4_document_search_overall/service.py index 155e90d..b6ce4a7 100644 --- a/backend/4_document_search_overall/service.py +++ b/backend/4_document_search_overall/service.py @@ -32,15 +32,6 @@ def get_papers( paper_list, projection = None ): headers = {"Content-Type": "application/json", 'Connection': 'close'} ).json()["response"] return results -def get_sentence_list_from_parsed( parsed ): - sentence_list = [] - for section in parsed: - sentence_list.append(str(section.get( "section_title", "" ))) - for para in section.get("section_text",[]): - for sen in para.get("paragraph_text", []): - sentence_list.append( str(sen.get("sentence_text","")) ) - return sentence_list - """ current version of request information. 
'ranking_id_value': '', 'ranking_id_field': '', 'ranking_id_type': '', @@ -169,30 +160,30 @@ def remove_duplicate( paper_id_list ): return [paper_id_list[idx] for idx in sorted(doc_indices_wo_duplicates) ] -def get_section_text_list( paper, top_n_sections = None ): - if paper is None: - paper = {} - title = paper.get("Title","") - abstract_parsed = paper.get("Content",{}).get("Abstract_Parsed",[]) - fullbody_parsed = paper.get("Content",{}).get("Fullbody_Parsed",[]) - fulltext_parsed = abstract_parsed + fullbody_parsed - - section_text_list = [title] - for section in fulltext_parsed: - section_text = "" +def get_sentence_list_from_parsed( parsed ): + sentence_list = [] + for section in parsed: + sentence_list.append(str(section.get( "section_title", "" ))) for para in section.get("section_text",[]): - for sen in para.get("paragraph_text",[]): - section_text += sen.get("sentence_text", "") + " " - section_text_list.append( section_text ) + for sen in para.get("paragraph_text", []): + sentence_list.append( str(sen.get("sentence_text","")) ) + return sentence_list + +def parse_document( doc_data ): + ngram_set = set() + + ## Title + title = str(doc_data.get("Title", "")).strip() + ## Abstract + abstract_sen_list = get_sentence_list_from_parsed(doc_data.get( "Content", {} ).get( "Abstract_Parsed", [] )) + ## Fullbody + fullbody_sen_list = get_sentence_list_from_parsed(doc_data.get( "Content", {} ).get( "Fullbody_Parsed", [] )) - if top_n_sections is not None: - section_text_list = section_text_list[:top_n_sections] + sen_list = [ title ] + abstract_sen_list + fullbody_sen_list + ## no need to tokenize here, since it is done internally within sentence ranker + doc_text = " ".join( sen_list ) - return section_text_list - -def get_doc_text( paper ): - section_text_list = get_section_text_list(paper) - return " ".join(section_text_list) + return doc_text def rank_based_on_query_to_doc_similarity( paper_id_list, ranking_source, nResults = None ): global sentence_ranker @@ -202,14 +193,14 @@ def rank_based_on_query_to_doc_similarity( paper_id_list, ranking_source, nResul tic = time.time() - paper_content_list = get_papers( paper_id_list, { "Title":1, "Content.Abstract_Parsed":1, "Content.Fullbody_Parsed": 1 } ) + paper_content_list = get_papers( paper_id_list ) if len(paper_content_list) != len(paper_id_list): return paper_id_list print( "load paper time:", time.time() - tic ) try: - doc_text_list = [ get_doc_text( paper_content ) for paper_content in paper_content_list ] + doc_text_list = [ parse_document( paper_content ) for paper_content in paper_content_list ] _, doc_indices = sentence_ranker.rank_sentences( ranking_source, doc_text_list ) selected_papers_to_be_reranked = [ paper_id_list[idx] for idx in doc_indices ] @@ -267,6 +258,13 @@ def document_search(): requires_reranking = request_info.get( "requires_reranking", True ) reranking_method = request_info.get( "reranking_method", "scibert" ) + + requires_removing_duplicates = True + requires_additional_prefetching = True + requires_reranking = True + reranking_method = "scibert" + + ## prefetch results from a list of prefetching document search servers prefetched_paper_id_list, nMatchingDocuments = prefetch( ranking_source + " " + keywords.replace( "", " " ).replace( "", " " ).replace( "", " " ), @@ -299,6 +297,7 @@ def document_search(): abort(400) sem.release() + return json.dumps(json_out), 201 diff --git a/backend/5_title_generic_search/docker-compose.yaml b/backend/5_title_generic_search/docker-compose.yaml index 98e54a8..1b60cd6 100644 
--- a/backend/5_title_generic_search/docker-compose.yaml +++ b/backend/5_title_generic_search/docker-compose.yaml @@ -14,8 +14,8 @@ services: networks: - common_network hostname: title_generic_search_service - ports: - - 8029:8060 + # ports: + # - 8029:8060 networks: common_network: external: true \ No newline at end of file diff --git a/backend/Documentation of Microservices.ipynb b/backend/Documentation of Microservices.ipynb index c2536cc..c574627 100644 --- a/backend/Documentation of Microservices.ipynb +++ b/backend/Documentation of Microservices.ipynb @@ -317,17 +317,29 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 11, "id": "516e216d-1db8-4489-b948-e32b0a281289", "metadata": { "tags": [] }, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'query_id': '7368e3be-1890-441e-a796-2ce04c9ec969', 'response': [{'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 798}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 499}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 664}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 251}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 38}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 722}, {'collection': 'arXiv', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 214}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 504}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 1011}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 638}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 346}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 666}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 905}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 237}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 708}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 712}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 340}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 1007}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 787}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 535}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 404}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 700}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 461}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 784}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 212}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 12}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 143}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 640}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 292}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 843}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 44}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 557}, {'collection': 'PMCOA', 'id_field': 'id_int', 
'id_type': 'int', 'id_value': 547}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 813}, {'collection': 'arXiv', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 450}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 822}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 23}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 752}, {'collection': 'arXiv', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 339}, {'collection': 'arXiv', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 750}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 530}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 786}, {'collection': 'arXiv', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 251}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 781}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 1027}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 502}, {'collection': 'arXiv', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 437}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 639}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 77}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 704}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 331}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 870}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 993}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 641}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 1006}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 732}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 645}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 324}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 680}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 89}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 808}, {'collection': 'arXiv', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 32}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 966}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 554}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 505}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 5}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 815}, {'collection': 'arXiv', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 190}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 391}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 359}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 740}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 768}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 466}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 624}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 97}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 
337}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 869}, {'collection': 'arXiv', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 589}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 246}, {'collection': 'arXiv', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 181}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 725}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 858}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 847}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 1020}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 1019}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 855}, {'collection': 'arXiv', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 67}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 501}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 650}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 898}, {'collection': 'arXiv', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 174}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 586}, {'collection': 'arXiv', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 77}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 979}, {'collection': 'arXiv', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 307}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 394}, {'collection': 'arXiv', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 670}, {'collection': 'PMCOA', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 986}, {'collection': 'arXiv', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 112}, {'collection': 'arXiv', 'id_field': 'id_int', 'id_type': 'int', 'id_value': 610}], 'search_stats': {'DurationTotalSearch': 9200, 'nMatchingDocuments': 143}}\n" + "ename": "JSONDecodeError", + "evalue": "[Errno Expecting value] \n\n400 Bad Request\n
<h1>Bad Request</h1>\n<p>The browser (or proxy) sent a request that this server could not understand.</p>
\n: 0", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mJSONDecodeError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m~/anaconda3/envs/scieditor/lib/python3.7/site-packages/requests/models.py\u001b[0m in \u001b[0;36mjson\u001b[0;34m(self, **kwargs)\u001b[0m\n\u001b[1;32m 909\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 910\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mcomplexjson\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mloads\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtext\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 911\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mJSONDecodeError\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/anaconda3/envs/scieditor/lib/python3.7/json/__init__.py\u001b[0m in \u001b[0;36mloads\u001b[0;34m(s, encoding, cls, object_hook, parse_float, parse_int, parse_constant, object_pairs_hook, **kw)\u001b[0m\n\u001b[1;32m 347\u001b[0m parse_constant is None and object_pairs_hook is None and not kw):\n\u001b[0;32m--> 348\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0m_default_decoder\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdecode\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ms\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 349\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mcls\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/anaconda3/envs/scieditor/lib/python3.7/json/decoder.py\u001b[0m in \u001b[0;36mdecode\u001b[0;34m(self, s, _w)\u001b[0m\n\u001b[1;32m 336\u001b[0m \"\"\"\n\u001b[0;32m--> 337\u001b[0;31m \u001b[0mobj\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mend\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mraw_decode\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ms\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0midx\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0m_w\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ms\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 338\u001b[0m \u001b[0mend\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_w\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ms\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mend\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/anaconda3/envs/scieditor/lib/python3.7/json/decoder.py\u001b[0m in \u001b[0;36mraw_decode\u001b[0;34m(self, s, idx)\u001b[0m\n\u001b[1;32m 354\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mStopIteration\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0merr\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 355\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mJSONDecodeError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Expecting value\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0ms\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0merr\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalue\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 356\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mobj\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mend\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mJSONDecodeError\u001b[0m: Expecting value: line 1 column 1 (char 0)", + "\nDuring handling of the above exception, another exception occurred:\n", + "\u001b[0;31mJSONDecodeError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m/tmp/ipykernel_2116499/1365860442.py\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 7\u001b[0m output = requests.post( api_gateway_address+\"/ml-api/doc-search/v1.0\", \n\u001b[1;32m 8\u001b[0m \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mjson\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdumps\u001b[0m\u001b[0;34m(\u001b[0m \u001b[0mquery_data\u001b[0m \u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 9\u001b[0;31m headers = {\"Content-Type\": \"application/json\"} ).json()\n\u001b[0m\u001b[1;32m 10\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 11\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0moutput\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/anaconda3/envs/scieditor/lib/python3.7/site-packages/requests/models.py\u001b[0m in \u001b[0;36mjson\u001b[0;34m(self, **kwargs)\u001b[0m\n\u001b[1;32m 915\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mRequestsJSONDecodeError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0me\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmessage\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 916\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 917\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mRequestsJSONDecodeError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0me\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmsg\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdoc\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpos\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 918\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 919\u001b[0m \u001b[0;34m@\u001b[0m\u001b[0mproperty\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mJSONDecodeError\u001b[0m: [Errno Expecting value] \n\n400 Bad Request\n
<h1>Bad Request</h1>\n<p>The browser (or proxy) sent a request that this server could not understand.</p>
\n: 0" ] } ], @@ -995,7 +1007,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -1009,7 +1021,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.12" + "version": "3.7.13" } }, "nbformat": 4,