Skip to content

Commit

Permalink
update
Browse files Browse the repository at this point in the history
  • Loading branch information
nianlonggu committed Jul 28, 2023
1 parent d91dc79 commit cd40f01
Show file tree
Hide file tree
Showing 3 changed files with 21 additions and 11 deletions.
8 changes: 1 addition & 7 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,7 @@ LIVE DEMO: https://scilit.vercel.app/
PAPER: https://aclanthology.org/2023.acl-demo.22.pdf

![](frontend/screenshots/frontend.png)

## System Requirements
* OS: Ubuntu 22.04 LTS or Debian 10, with one 16 GB GPU to support the NLP functions.
* Storage: >= 100 GB
* RAM: >= 500 GB for 150 million papers


## Installation
### Install Docker
This step is NOT needed if Docker engine has already been installed.
Expand Down
8 changes: 5 additions & 3 deletions backend/1_document_prefetch/script_start_all_services.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,17 +2,19 @@

#######
# NUM_PROCESSES: number of processes used to build the inverted index and the
#   embedding index. To fully utilize the CPU cores on a large corpus such as
#   S2ORC, set NUM_PROCESSES to 2x the number of CPU cores.
#
# NUM_EMBEDDING_INDEX_SHARDS: number of sharded embedding index files. When
#   using CPU approximate nearest-neighbor search (USE_GPU=0), set
#   NUM_EMBEDDING_INDEX_SHARDS to a large value (e.g., 2x the number of CPU
#   cores). When using GPU brute-force nearest-neighbor search (USE_GPU=1),
#   set NUM_EMBEDDING_INDEX_SHARDS to the number of available GPUs.
#
# EMBEDDING_INDEX_PRECISION: used for low-precision GPU BFNN. Available
#   choices: bool, int4, int8, float32. When USE_GPU=0 or no GPU is
#   available, EMBEDDING_INDEX_PRECISION is switched to float32 automatically.


#### prefetch server on arXiv
DATA_PATH=$PWD/data/arXiv NUM_PROCESSES=10 NUM_EMBEDDING_INDEX_SHARDS=20 PRIVATE=0 USE_GPU=0 EMBEDDING_INDEX_PRECISION=int4 SERVICE_SUFFIX=arxiv docker-compose -f docker-compose-document-prefetch.yaml -p document_prefetch_arxiv up --build -d


#### prefetch server on PMCOA
DATA_PATH=$PWD/data/PMCOA NUM_PROCESSES=10 NUM_EMBEDDING_INDEX_SHARDS=20 PRIVATE=0 USE_GPU=0 EMBEDDING_INDEX_PRECISION=int4 SERVICE_SUFFIX=pmcoa docker-compose -f docker-compose-document-prefetch.yaml -p document_prefetch_pmcoa up --build -d


#### prefetch server on S2ORC
DATA_PATH=$PWD/data/S2ORC NUM_PROCESSES=10 NUM_EMBEDDING_INDEX_SHARDS=20 PRIVATE=0 USE_GPU=0 EMBEDDING_INDEX_PRECISION=int4 SERVICE_SUFFIX=s2orc docker-compose -f docker-compose-document-prefetch.yaml -p document_prefetch_s2orc up --build -d
16 changes: 15 additions & 1 deletion backend/1_document_prefetch/src/service_build_index/service.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@

# Root directory for all on-disk index/database artifacts; supplied via the environment.
ROOT_DATA_PATH = os.getenv("ROOT_DATA_PATH")
# Suffix appended to docker service hostnames so multiple corpora (arxiv/pmcoa/s2orc)
# can run side by side — set per corpus in script_start_all_services.sh.
SERVICE_SUFFIX = os.getenv("SERVICE_SUFFIX")
# Target number of embedding-index shard files.
# NOTE(review): int(os.getenv(...)) raises TypeError if the variable is unset —
# presumably it is always provided by docker-compose; confirm, or add a default.
NUM_EMBEDDING_INDEX_SHARDS = int(os.getenv("NUM_EMBEDDING_INDEX_SHARDS"))


# Internal addresses of the sibling services; hostnames are derived from
# SERVICE_SUFFIX and all listen on port 8060.
ADDRESS_SERVICE_PAPER_DATABASE = f"http://document_prefetch_service_paper_database_{SERVICE_SUFFIX}:8060"
ADDRESS_SERVICE_RANKING = f"http://document_prefetch_service_ranking_{SERVICE_SUFFIX}:8060"
Expand Down Expand Up @@ -154,6 +156,14 @@ def build_index_pipeline( reboot_services_after_building = True ):
headers = {"Content-Type":"application/json"} ).json()["response"])
except:
print("fail")


def adjust_num_shards_for_embedding_index():
    """Re-shard the on-disk embedding index to NUM_EMBEDDING_INDEX_SHARDS shards.

    Delegates to the helper script ``adjust_num_shards_for_embedding_index.py``,
    pointing it at the shard files under ``<ROOT_DATA_PATH>/ranking/embedding_index/``
    (files prefixed ``embedding_index.db_``).

    Raises:
        subprocess.CalledProcessError: if the helper script exits with a
            non-zero status. Previously such failures were silently ignored,
            which would leave the index with the wrong shard count.
    """
    subprocess.run(
        [
            "python", "adjust_num_shards_for_embedding_index.py",
            "-embedding_index_folder", ROOT_DATA_PATH + "/ranking/embedding_index/",
            "-embedding_index_name_prefix", "embedding_index.db_",
            "-num_shards", str(NUM_EMBEDDING_INDEX_SHARDS),
        ],
        check=True,  # surface re-sharding failures instead of continuing with a stale index
    )


@app.route('/build-index', methods=['POST'])
Expand Down Expand Up @@ -194,7 +204,11 @@ def build_index():

## In this case, the service for paper database, ranking and duplicate checking cannot be running, so we cannot signal them to reboot by sending http requests. Therefore, we set reboot_services_after_building to False
build_index_pipeline(reboot_services_after_building = False)


if len( glob( ROOT_DATA_PATH + "/ranking/embedding_index/embedding_index.db*" ) ) != NUM_EMBEDDING_INDEX_SHARDS:
print("Specified number of embedding index has changed, adjusting the number of shards ...")
adjust_num_shards_for_embedding_index()


print("\n\nWaiting for requests...")
sem = threading.Semaphore()
Expand Down

0 comments on commit cd40f01

Please sign in to comment.