
Commit

update
nianlonggu committed Jul 28, 2023
1 parent 1fa9cc1 commit d91dc79
Showing 36 changed files with 1,074 additions and 86 deletions.
28 changes: 27 additions & 1 deletion README.md
@@ -115,7 +115,32 @@ node -v
```

## Prepare Raw Corpus
Here we demonstrated building the search engine on papers from PubMed Open Access (PMCOA) and arXiv.
Here we demonstrate building the search engine on papers from S2ORC, PubMed Open Access (PMCOA), and arXiv.
### S2ORC
S2ORC can be accessed via the [Semantic Scholar API](https://www.semanticscholar.org/product/api). Here we download only a tiny subset to demonstrate the pipeline.

```bash
mkdir -p backend/1_document_prefetch/data/S2ORC/raw/metadata
mkdir -p backend/1_document_prefetch/data/S2ORC/raw/pdf_parses
wget -P backend/1_document_prefetch/data/S2ORC/raw/metadata https://huggingface.co/scieditor/example_data_S2ORC/raw/main/metadata/metadata_0.jsonl
wget -P backend/1_document_prefetch/data/S2ORC/raw/metadata https://huggingface.co/scieditor/example_data_S2ORC/raw/main/metadata/metadata_1.jsonl
wget -P backend/1_document_prefetch/data/S2ORC/raw/pdf_parses https://huggingface.co/scieditor/example_data_S2ORC/resolve/main/pdf_parses/pdf_parses_0.jsonl
wget -P backend/1_document_prefetch/data/S2ORC/raw/pdf_parses https://huggingface.co/scieditor/example_data_S2ORC/resolve/main/pdf_parses/pdf_parses_1.jsonl

```
This is how the files are organized:
```
backend/1_document_prefetch/data/S2ORC/
└── raw
├── metadata
│ ├── metadata_0.jsonl
│ └── metadata_1.jsonl
└── pdf_parses
├── pdf_parses_0.jsonl
└── pdf_parses_1.jsonl
```
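Each `.jsonl` file contains one JSON record per line. To sanity-check the download, you can peek at the first metadata record (a minimal sketch; only the `paper_id` field is relied on by the build scripts here, the remaining field names follow the public S2ORC release):
```python
import json

# Peek at the first record of the example metadata shard.
path = "backend/1_document_prefetch/data/S2ORC/raw/metadata/metadata_0.jsonl"
with open(path) as f:
    record = json.loads(f.readline())

print(record["paper_id"])     # key used when building the raw sqlite databases
print(sorted(record.keys()))  # remaining fields follow the S2ORC metadata release
```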


### PMCOA
We can download the .tar.gz files from the official FTP service at https://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_bulk/oa_comm/xml/ and put them into the folder:
```
@@ -208,6 +233,7 @@ cd $BASE_DIR/backend/5_title_generic_search && docker-compose up --build -d
cd $BASE_DIR/backend/final_api_gateway && docker-compose up --build -d

cd $BASE_DIR

```

By default, port 8060 is used by the final API gateway to communicate with the frontend or API developers.
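Once all services are up, a quick way to check that the gateway is reachable (a minimal sketch; the root path and response format are assumptions, so consult the frontend or API documentation for the actual routes):
```python
import requests  # assumes the requests package is installed

# The final API gateway listens on port 8060 by default; any HTTP response
# (even a 404 for an unknown route) confirms that the gateway container is up.
resp = requests.get("http://localhost:8060/", timeout=10)
print(resp.status_code)
```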
@@ -12,5 +12,6 @@ services:
    image: document_prefetch_build_database_pmcoa
    environment:
      NUM_PROCESSES: 100
      COLLECTION: PMCOA
    volumes:
      - $PWD/data/PMCOA:/app/data
@@ -0,0 +1,17 @@
version: '3'

services:

  paper_database_manager:
    build: ./src/modules/paper_database
    image: paper_database_manager
    command: ["echo","hello"]

  document_prefetch_build_database_s2orc:
    build: ./src/build_database/S2ORC
    image: document_prefetch_build_database_s2orc
    environment:
      NUM_PROCESSES: 100
      COLLECTION: S2ORC
    volumes:
      - $PWD/data/S2ORC:/app/data
@@ -17,5 +17,6 @@ services:
    image: document_prefetch_build_database_arxiv
    environment:
      NUM_PROCESSES: 100
      COLLECTION: arXiv
    volumes:
      - $PWD/data/arXiv:/app/data
@@ -2,9 +2,11 @@ version: '3'

### environment variables
# NUM_PROCESSES
# NUM_EMBEDDING_INDEX_SHARDS
# DATA_PATH
# PRIVATE
# USE_GPU
# EMBEDDING_INDEX_PRECISION
# SERVICE_SUFFIX

services:
@@ -21,7 +23,8 @@ services:
    depends_on:
      - document_prefetch_base
    environment:
      NUM_EMBEDDING_INDEX_SHARDS: ${NUM_PROCESSES}
      NUM_PROCESSES: ${NUM_PROCESSES}
      NUM_EMBEDDING_INDEX_SHARDS: ${NUM_EMBEDDING_INDEX_SHARDS}
      NUM_INVERTED_INDEX_SHARDS: 10
      SERVICE_SUFFIX: ${SERVICE_SUFFIX}
    volumes:
@@ -40,6 +43,7 @@ services:
      NVIDIA_VISIBLE_DEVICES: all
      IS_PRIVATE_SERVER: ${PRIVATE}
      USE_GPU: ${USE_GPU}
      EMBEDDING_INDEX_PRECISION: ${EMBEDDING_INDEX_PRECISION}
      SERVICE_SUFFIX: ${SERVICE_SUFFIX}
    volumes:
      - ${DATA_PATH}:/app/data
@@ -89,8 +93,8 @@ services:
    image: document_prefetch_service_overall
    environment:
      SERVICE_SUFFIX: ${SERVICE_SUFFIX}
    ports:
      - ${PORT}:8060
    # ports:
    #   - ${PORT}:8060
    networks:
      - common_network
    hostname: document_prefetch_service_overall_${SERVICE_SUFFIX}
2 changes: 2 additions & 0 deletions backend/1_document_prefetch/script_build_all_databases.sh
@@ -4,3 +4,5 @@ docker-compose -f docker-compose-build-database-arXiv.yaml up --build

docker-compose -f docker-compose-build-database-PMCOA.yaml up --build

docker-compose -f docker-compose-build-database-S2ORC.yaml up --build

13 changes: 11 additions & 2 deletions backend/1_document_prefetch/script_start_all_services.sh
@@ -1,9 +1,18 @@
#!/bin/bash

#######
# NUM_PROCESSES: number of processes used to build the inverted index and the embedding index. To fully utilize the CPU cores on a large corpus such as S2ORC, set NUM_PROCESSES to 2x the number of CPU cores.
# NUM_EMBEDDING_INDEX_SHARDS: number of sharded embedding-index files. For CPU approximate nearest-neighbor search (USE_GPU=0), set NUM_EMBEDDING_INDEX_SHARDS to a large value (e.g., 2x the number of CPU cores). For GPU brute-force nearest-neighbor search (USE_GPU=1), set NUM_EMBEDDING_INDEX_SHARDS to the number of available GPUs.
# EMBEDDING_INDEX_PRECISION: precision used for low-precision GPU brute-force nearest-neighbor search (BFNN). Available choices: bool, int4, int8, float32. When USE_GPU=0 or no GPU is available, EMBEDDING_INDEX_PRECISION falls back to float32 automatically.
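## Example of a CPU-only configuration (a sketch, not part of this script's defaults): with USE_GPU=0,
## EMBEDDING_INDEX_PRECISION is forced to float32 and NUM_EMBEDDING_INDEX_SHARDS should be large, e.g.:
# DATA_PATH=$PWD/data/arXiv NUM_PROCESSES=10 NUM_EMBEDDING_INDEX_SHARDS=20 PRIVATE=0 USE_GPU=0 EMBEDDING_INDEX_PRECISION=float32 SERVICE_SUFFIX=arxiv docker-compose -f docker-compose-document-prefetch.yaml -p document_prefetch_arxiv up --build -d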


#### prefetch server on arXiv
DATA_PATH=$PWD/data/arXiv NUM_PROCESSES=10 PRIVATE=0 USE_GPU=0 PORT=8021 SERVICE_SUFFIX=arxiv docker-compose -f docker-compose-document-prefetch.yaml -p document_prefetch_arxiv up --build -d
DATA_PATH=$PWD/data/arXiv NUM_PROCESSES=10 NUM_EMBEDDING_INDEX_SHARDS=1 PRIVATE=0 USE_GPU=1 EMBEDDING_INDEX_PRECISION=int4 SERVICE_SUFFIX=arxiv docker-compose -f docker-compose-document-prefetch.yaml -p document_prefetch_arxiv up --build -d


#### prefetch server on PMCOA
DATA_PATH=$PWD/data/PMCOA NUM_PROCESSES=10 PRIVATE=0 USE_GPU=0 PORT=8022 SERVICE_SUFFIX=pmcoa docker-compose -f docker-compose-document-prefetch.yaml -p document_prefetch_pmcoa up --build -d
DATA_PATH=$PWD/data/PMCOA NUM_PROCESSES=10 NUM_EMBEDDING_INDEX_SHARDS=1 PRIVATE=0 USE_GPU=1 EMBEDDING_INDEX_PRECISION=int4 SERVICE_SUFFIX=pmcoa docker-compose -f docker-compose-document-prefetch.yaml -p document_prefetch_pmcoa up --build -d


#### prefetch server on S2ORC
DATA_PATH=$PWD/data/S2ORC NUM_PROCESSES=10 NUM_EMBEDDING_INDEX_SHARDS=1 PRIVATE=0 USE_GPU=1 EMBEDDING_INDEX_PRECISION=int4 SERVICE_SUFFIX=s2orc docker-compose -f docker-compose-document-prefetch.yaml -p document_prefetch_s2orc up --build -d
3 changes: 3 additions & 0 deletions backend/1_document_prefetch/script_stop_all_services.sh
@@ -0,0 +1,3 @@
docker-compose -f docker-compose-document-prefetch.yaml -p document_prefetch_arxiv down
docker-compose -f docker-compose-document-prefetch.yaml -p document_prefetch_pmcoa down
docker-compose -f docker-compose-document-prefetch.yaml -p document_prefetch_s2orc down
@@ -1,9 +1,6 @@
FROM paper_database_manager as base

ENV COLLECTION="PMCOA"
ENV ROOT_DATA_PATH=/app/data
## Set the default number of processes to 16; this can be overridden when calling docker run with -e (or --env)
ENV NUM_PROCESSES=16

SHELL ["conda", "run", "-n", "my_env", "/bin/bash", "-c"]

14 changes: 14 additions & 0 deletions backend/1_document_prefetch/src/build_database/S2ORC/Dockerfile
@@ -0,0 +1,14 @@
FROM paper_database_manager as base

ENV ROOT_DATA_PATH=/app/data

SHELL ["conda", "run", "-n", "my_env", "/bin/bash", "-c"]

WORKDIR /app/src
COPY . .

RUN pip install -r requirements.txt

## Note: when calling docker run, one must map the host machine's volume to /app/data
## The host volume is expected to contain all the data needed for the search engine
CMD [ "bash", "run.sh" ]
@@ -0,0 +1,124 @@
import subprocess
import threading
from tqdm import tqdm
import os
import numpy as np
from raw_sqlite_utils import SqliteClient as RawSqliteClient
import time
import json
from modules.paper_database.database_managers import SqliteClient
import shutil


# import os,sys,inspect
# current_dir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
# root_dir = os.path.dirname(os.path.dirname(current_dir))
# sys.path.insert(0, root_dir)
# sys.path.insert(0, current_dir)


import argparse


### get all needed environment variables
ROOT_DATA_PATH = os.getenv("ROOT_DATA_PATH")
COLLECTION = os.getenv("COLLECTION")
NUM_PROCESSES = int(os.getenv("NUM_PROCESSES"))


if __name__ == "__main__":

    parser = argparse.ArgumentParser()
    parser.add_argument("-metadata_raw_sql_path", default = ROOT_DATA_PATH + "/raw/sqliteDB/metadata.db" )
    parser.add_argument("-pdf_parses_raw_sql_path", default = ROOT_DATA_PATH + "/raw/sqliteDB/pdf_parses.db" )
    parser.add_argument("-json_schema_path", default = "json_schema.json" )
    parser.add_argument("-output_file_name", default = ROOT_DATA_PATH + "/sqlite_database_buffer/normalized_data.jsonl" )
    parser.add_argument("-start", type = int, default = None )
    parser.add_argument("-size", type = int, default = None )
    parser.add_argument("-collection", default = COLLECTION )
    parser.add_argument("-batch_size", type = int, default = 5000 )
    parser.add_argument("-n_processes", type = int, default = NUM_PROCESSES )
    parser.add_argument("-output_sqlite_database_name", default = ROOT_DATA_PATH + "/sqlite_database_buffer/DB.db" )

    args = parser.parse_args()

    metadata_sql = RawSqliteClient( args.metadata_raw_sql_path )
    # If no explicit range is given, process every row of the raw metadata database.
    if args.start is None or args.size is None:
        print("No valid start and size values were specified; processing the whole collection ...")
        print("Counting the total number of examples ...")
        args.start = 0
        args.size = metadata_sql.get_max_rowid(args.collection)
    else:
        try:
            assert args.start is not None and args.size is not None
            assert args.start >= 0 and args.size >= 0
        except:
            print("Error: invalid start and size values were provided!")
            os.sys.exit(1)

    # Recreate the buffer folders for the normalized jsonl shards and the final sqlite database.
    output_folder = os.path.dirname( args.output_file_name )
    try:
        shutil.rmtree( output_folder )
    except:
        pass
    os.makedirs( output_folder )

    output_sqlite_database_folder = os.path.dirname( args.output_sqlite_database_name )
    try:
        shutil.rmtree( output_sqlite_database_folder )
    except:
        pass
    os.makedirs( output_sqlite_database_folder )

    num_of_examples_per_process = int( np.ceil( args.size / args.n_processes ) )
    print("Starting multiple subprocesses ...")

    # Launch one normalize_raw_sqlite.py subprocess per chunk of rows; each subprocess
    # writes its own shard of normalized jsonl records.
    threads = []
    for offset in range( args.start, args.start + args.size, num_of_examples_per_process ):
        t = threading.Thread( target = subprocess.run, args = (
            list(map( str, [
                "python",
                "normalize_raw_sqlite.py",
                "-metadata_raw_sql_path", args.metadata_raw_sql_path,
                "-pdf_parses_raw_sql_path", args.pdf_parses_raw_sql_path,
                "-json_schema_path", args.json_schema_path,
                "-output_file_name", args.output_file_name,
                "-output_file_name_suffix", "_%d"%( offset ),
                "-start", offset,
                "-size", min(num_of_examples_per_process, args.start + args.size - offset ),
                "-collection", args.collection,
                "-batch_size", args.batch_size
            ] ) ),
        ) )
        threads.append(t)
        t.start()
    for t in threads:
        t.join()

    print("Dumping to the final sqlite database; this may take a while ...")

    final_sql = SqliteClient( args.output_sqlite_database_name )

    # Merge the per-process shard files into the final sqlite database in batches.
    output_base_name = os.path.basename( args.output_file_name )
    flist = [ output_folder + "/" + fname for fname in os.listdir( output_folder ) if fname.startswith(output_base_name+"_") ]
    flist.sort( key = lambda x: int(x.split("_")[-1]) )

    paper_buffer = []
    for fname in flist:
        print(fname)
        with open( fname, "r" ) as f:
            for line in f:
                line_data = json.loads(line)
                paper_buffer.append(line_data)

                if len(paper_buffer) >= args.batch_size:
                    final_sql.insert_papers( paper_buffer, args.collection )
                    paper_buffer = []
        os.remove( fname )

    if len(paper_buffer) > 0:
        final_sql.insert_papers( paper_buffer, args.collection )
        paper_buffer = []

    print("All Done!")
@@ -0,0 +1,58 @@
from raw_sqlite_utils import SqliteClient
import numpy as np
import json
import time
import os
from tqdm import tqdm
import re
import argparse

def dump_to_sqlite( folder, db_path, buffer_size, paper_id_matcher, collection ):
    db_path_dir_name = os.path.dirname(db_path)
    if not os.path.exists( db_path_dir_name ):
        os.makedirs( db_path_dir_name )

    flist = [ folder + "/" + _ for _ in os.listdir(folder) if _.endswith(".jsonl") ]
    flist.sort( key = lambda x: int( x.split("_")[-1].split(".")[0] ) )

    sql_client = SqliteClient(db_path)

    paper_list_buffer = []
    for fname in flist:
        print(fname)
        with open( fname, "r" ) as f:
            for line in tqdm(f):
                # The "paper_id" field appears near the start of each raw jsonl line,
                # so it is extracted from the first 50 characters without parsing the full JSON.
                paper_id = int(paper_id_matcher.findall(line[:50])[0])
                paper_list_buffer.append( { "paper_id":paper_id, "Text":line } )
                if len(paper_list_buffer) >= buffer_size:
                    sql_client.insert_papers( collection, paper_list_buffer )
                    paper_list_buffer = []
    if len( paper_list_buffer ) > 0:
        sql_client.insert_papers( collection, paper_list_buffer )
        paper_list_buffer = []

ROOT_DATA_PATH = os.getenv("ROOT_DATA_PATH")
COLLECTION = os.getenv("COLLECTION")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("-metadata_jsonl_folder", default = ROOT_DATA_PATH + "/raw/metadata/" )
    parser.add_argument("-metadata_db_path", default = ROOT_DATA_PATH + "/raw/sqliteDB/metadata.db")
    parser.add_argument("-pdf_parses_jsonl_folder", default = ROOT_DATA_PATH + "/raw/pdf_parses/")
    parser.add_argument("-pdf_parses_db_path", default = ROOT_DATA_PATH + "/raw/sqliteDB/pdf_parses.db")
    parser.add_argument("-buffer_size", type = int, default = 1000 )
    parser.add_argument("-collection", default = COLLECTION)
    args = parser.parse_args()

    # Matches the value of the "paper_id" field in a raw S2ORC jsonl line.
    paper_id_matcher = re.compile(r'(?<="paper_id": ")\d*(?=")')

    print("Converting metadata raw jsonl files to a single metadata sqlite ...")
    dump_to_sqlite( args.metadata_jsonl_folder, args.metadata_db_path, args.buffer_size, paper_id_matcher, args.collection )

    print("Converting pdf_parses raw jsonl files to a single pdf_parses sqlite ...")
    dump_to_sqlite( args.pdf_parses_jsonl_folder, args.pdf_parses_db_path, args.buffer_size, paper_id_matcher, args.collection )

    print("All Done!")

@@ -0,0 +1 @@
{"type": "object", "properties": {"Author": {"type": "array", "items": {"type": "object", "properties": {"FamilyName": {"type": "string"}, "GivenName": {"type": "string"}}, "required": ["FamilyName", "GivenName"]}}, "Title": {"type": "string"}, "Abstract": {"type": "string"}, "Venue": {"type": "string"}, "DOI": {"type": "string"}, "URL": {"type": "string"}, "PublicationDate": {"type": "object", "properties": {"Year": {"type": "string"}, "Month": {"type": "string"}, "Day": {"type": "string"}}}, "Content": {"type": "object", "properties": {"Abstract": {"type": "string"}, "Abstract_Parsed": {"type": "array", "items": {"type": "object", "properties": {"section_id": {"type": "string"}, "section_title": {"type": "string"}, "section_text": {"type": "array", "items": {"type": "object", "properties": {"paragraph_id": {"type": "string"}, "paragraph_text": {"type": "array", "items": {"type": "object", "properties": {"sentence_id": {"type": "string"}, "sentence_text": {"type": "string"},"cite_spans":{"type":"array","items":{"type":"object","properties":{"start":{"type":"string"}, "end":{"type":"string"},"text":{"type":"string"},"ref_id":{"type":"string"}} } } }}}}}}}}}, "Fullbody": {"type": "string"}, "Fullbody_Parsed": {"type": "array", "items": {"type": "object", "properties": {"section_id": {"type": "string"}, "section_title": {"type": "string"}, "section_text": {"type": "array", "items": {"type": "object", "properties": {"paragraph_id": {"type": "string"}, "paragraph_text": {"type": "array", "items": {"type": "object", "properties": {"sentence_id": {"type": "string"}, "sentence_text": {"type": "string"}, "cite_spans":{"type":"array","items":{"type":"object","properties":{"start":{"type":"string"}, "end":{"type":"string"},"text":{"type":"string"},"ref_id":{"type":"string"}} } } }}}}}}}}}}, "required": ["Abstract", "Abstract_Parsed", "Fullbody", "Fullbody_Parsed"]}, "Reference": {"type": "array", "items": {"type": "object", "properties": {"Title": {"type": "string"}, "Author": {"type": "array", "items": {"type": "object", "properties": {"FamilyName": {"type": "string"}, "GivenName": {"type": "string"}}, "required": ["FamilyName", "GivenName"]}}, "PublicationDate": {"type": "object", "properties": {"Year": {"type": "string"}, "Month": {"type": "string"}, "Day": {"type": "string"}}}, "Venue": {"type": "string"}, "ReferenceText": {"type": "string"}, "PaperID": {"type": "object", "properties": {"collection": {"type": "string"}, "id_field": {"type": "string"}, "id_type": {"type": "string"}, "id_value": {"type": "string"}}}}, "required": ["Title", "Author", "PublicationDate", "Venue", "ReferenceText"]}}, "S2CID": {"type": "string"}, "PMID": {"type": "string"}, "PMCID": {"type": "string"}, "ArxivId": {"type": "string"}, "ACLId": {"type": "string"}, "MAGId": {"type": "string"}, "Abstract_in_metadata": {"type": "boolean"}, "Last_update_unixtime": {"type": "number"}, "isDuplicated": {"type": "boolean"}}, "required": ["Author", "Title", "Abstract", "Venue", "DOI", "URL", "PublicationDate", "Content","Reference"]}