From 236b386ddc11d292b4b736162b59488a02236d6c Mon Sep 17 00:00:00 2001 From: Jimmy Lin Date: Fri, 3 Jun 2022 12:13:48 -0400 Subject: [PATCH] Add --download option to run_regression.py (#1893) Option automatically downloads the corpus from our servers. --- README.md | 16 +-- ...ssions-dl19-doc-segmented-unicoil-noexp.md | 20 ++-- .../regressions-dl19-doc-segmented-unicoil.md | 20 ++-- .../regressions-dl19-passage-unicoil-noexp.md | 20 ++-- docs/regressions-dl19-passage-unicoil.md | 20 ++-- ...ssions-dl20-doc-segmented-unicoil-noexp.md | 20 ++-- .../regressions-dl20-doc-segmented-unicoil.md | 20 ++-- .../regressions-dl20-passage-unicoil-noexp.md | 20 ++-- docs/regressions-dl20-passage-unicoil.md | 20 ++-- ...ons-dl21-doc-segmented-unicoil-0shot-v2.md | 31 +++-- ...ssions-dl21-doc-segmented-unicoil-0shot.md | 29 ++++- ...21-doc-segmented-unicoil-noexp-0shot-v2.md | 31 +++-- ...-dl21-doc-segmented-unicoil-noexp-0shot.md | 29 ++++- .../regressions-dl21-passage-unicoil-0shot.md | 29 ++++- ...ssions-dl21-passage-unicoil-noexp-0shot.md | 23 +++- ...ons-msmarco-doc-segmented-unicoil-noexp.md | 20 ++-- ...gressions-msmarco-doc-segmented-unicoil.md | 20 ++-- ...gressions-msmarco-passage-unicoil-noexp.md | 20 ++-- docs/regressions-msmarco-passage-unicoil.md | 20 ++-- ...marco-v2-doc-segmented-unicoil-0shot-v2.md | 29 ++++- ...-msmarco-v2-doc-segmented-unicoil-0shot.md | 29 ++++- ...v2-doc-segmented-unicoil-noexp-0shot-v2.md | 29 ++++- ...co-v2-doc-segmented-unicoil-noexp-0shot.md | 29 ++++- ...ssions-msmarco-v2-passage-unicoil-0shot.md | 29 ++++- ...-msmarco-v2-passage-unicoil-noexp-0shot.md | 29 ++++- src/main/python/run_regression.py | 106 +++++++++++++++++- .../dl19-doc-segmented-unicoil-noexp.template | 20 ++-- .../dl19-doc-segmented-unicoil.template | 20 ++-- .../dl19-passage-unicoil-noexp.template | 20 ++-- .../templates/dl19-passage-unicoil.template | 20 ++-- .../dl20-doc-segmented-unicoil-noexp.template | 20 ++-- .../dl20-doc-segmented-unicoil.template | 20 ++-- 
.../dl20-passage-unicoil-noexp.template | 20 ++-- .../templates/dl20-passage-unicoil.template | 20 ++-- ...21-doc-segmented-unicoil-0shot-v2.template | 31 +++-- .../dl21-doc-segmented-unicoil-0shot.template | 29 ++++- ...-segmented-unicoil-noexp-0shot-v2.template | 31 +++-- ...doc-segmented-unicoil-noexp-0shot.template | 29 ++++- .../dl21-passage-unicoil-0shot.template | 29 ++++- .../dl21-passage-unicoil-noexp-0shot.template | 23 +++- ...marco-doc-segmented-unicoil-noexp.template | 20 ++-- .../msmarco-doc-segmented-unicoil.template | 20 ++-- .../msmarco-passage-unicoil-noexp.template | 20 ++-- .../msmarco-passage-unicoil.template | 20 ++-- ...v2-doc-segmented-unicoil-0shot-v2.template | 29 ++++- ...co-v2-doc-segmented-unicoil-0shot.template | 29 ++++- ...-segmented-unicoil-noexp-0shot-v2.template | 29 ++++- ...doc-segmented-unicoil-noexp-0shot.template | 29 ++++- .../msmarco-v2-passage-unicoil-0shot.template | 29 ++++- ...co-v2-passage-unicoil-noexp-0shot.template | 29 ++++- .../dl19-doc-segmented-unicoil-noexp.yaml | 3 + .../dl19-doc-segmented-unicoil.yaml | 3 + .../dl19-passage-unicoil-noexp.yaml | 3 + .../regression/dl19-passage-unicoil.yaml | 3 + .../dl20-doc-segmented-unicoil-noexp.yaml | 3 + .../dl20-doc-segmented-unicoil.yaml | 3 + .../dl20-passage-unicoil-noexp.yaml | 3 + .../regression/dl20-passage-unicoil.yaml | 3 + .../dl21-doc-segmented-unicoil-0shot-v2.yaml | 4 + .../dl21-doc-segmented-unicoil-0shot.yaml | 4 + ...-doc-segmented-unicoil-noexp-0shot-v2.yaml | 4 + ...l21-doc-segmented-unicoil-noexp-0shot.yaml | 4 + .../dl21-passage-unicoil-0shot.yaml | 4 + .../dl21-passage-unicoil-noexp-0shot.yaml | 4 + .../msmarco-doc-segmented-unicoil-noexp.yaml | 3 + .../msmarco-doc-segmented-unicoil.yaml | 3 + .../msmarco-passage-unicoil-noexp.yaml | 3 + .../regression/msmarco-passage-unicoil.yaml | 3 + ...rco-v2-doc-segmented-unicoil-0shot-v2.yaml | 4 + ...smarco-v2-doc-segmented-unicoil-0shot.yaml | 4 + ...-doc-segmented-unicoil-noexp-0shot-v2.yaml | 4 + 
...-v2-doc-segmented-unicoil-noexp-0shot.yaml | 4 + .../msmarco-v2-passage-unicoil-0shot.yaml | 4 + ...smarco-v2-passage-unicoil-noexp-0shot.yaml | 4 + src/test/java/io/anserini/doc/DataModel.java | 28 +++++ 75 files changed, 1060 insertions(+), 346 deletions(-) diff --git a/README.md b/README.md index 36f12eba5a..09292558c6 100644 --- a/README.md +++ b/README.md @@ -62,8 +62,8 @@ See individual pages for details! | doc2query | [+](docs/regressions-msmarco-passage-doc2query.md) | | doc2query-T5 | [+](docs/regressions-msmarco-passage-docTTTTTquery.md) | [+](docs/regressions-dl19-passage-docTTTTTquery.md) | [+](docs/regressions-dl20-passage-docTTTTTquery.md) | | **Learned sparse lexical (uniCOIL family)** | -| uniCOIL noexp | [+](docs/regressions-msmarco-passage-unicoil-noexp.md) | [+](docs/regressions-dl19-passage-unicoil-noexp.md) | [+](docs/regressions-dl20-passage-unicoil-noexp.md) | -| uniCOIL with doc2query-T5 | [+](docs/regressions-msmarco-passage-unicoil.md) | [+](docs/regressions-dl19-passage-unicoil.md) | [+](docs/regressions-dl20-passage-unicoil.md) | +| uniCOIL noexp | [✓](docs/regressions-msmarco-passage-unicoil-noexp.md) | [✓](docs/regressions-dl19-passage-unicoil-noexp.md) | [✓](docs/regressions-dl20-passage-unicoil-noexp.md) | +| uniCOIL with doc2query-T5 | [✓](docs/regressions-msmarco-passage-unicoil.md) | [✓](docs/regressions-dl19-passage-unicoil.md) | [✓](docs/regressions-dl20-passage-unicoil.md) | | uniCOIL with TILDE | [+](docs/regressions-msmarco-passage-unicoil-tilde-expansion.md) | | **Learned sparse lexical (other)** | | DeepImpact | [+](docs/regressions-msmarco-passage-deepimpact.md) | @@ -83,8 +83,8 @@ See individual pages for details! 
| WP baselines | [+](docs/regressions-msmarco-doc-segmented-wp.md) | [+](docs/regressions-dl19-doc-segmented-wp.md) | [+](docs/regressions-dl20-doc-segmented-wp.md) | | doc2query-T5 | [+](docs/regressions-msmarco-doc-segmented-docTTTTTquery.md) | [+](docs/regressions-dl19-doc-segmented-docTTTTTquery.md) | [+](docs/regressions-dl20-doc-segmented-docTTTTTquery.md) | | **Learned sparse lexical** | -| uniCOIL noexp | [+](docs/regressions-msmarco-doc-segmented-unicoil-noexp.md) | [+](docs/regressions-dl19-doc-segmented-unicoil-noexp.md) | [+](docs/regressions-dl20-doc-segmented-unicoil-noexp.md) | -| uniCOIL with doc2query-T5 | [+](docs/regressions-msmarco-doc-segmented-unicoil.md) | [+](docs/regressions-dl19-doc-segmented-unicoil.md) | [+](docs/regressions-dl20-doc-segmented-unicoil.md) | +| uniCOIL noexp | [✓](docs/regressions-msmarco-doc-segmented-unicoil-noexp.md) | [✓](docs/regressions-dl19-doc-segmented-unicoil-noexp.md) | [✓](docs/regressions-dl20-doc-segmented-unicoil-noexp.md) | +| uniCOIL with doc2query-T5 | [✓](docs/regressions-msmarco-doc-segmented-unicoil.md) | [✓](docs/regressions-dl19-doc-segmented-unicoil.md) | [✓](docs/regressions-dl20-doc-segmented-unicoil.md) | ### MS MARCO V2 Passage Corpus @@ -97,8 +97,8 @@ See individual pages for details! 
| baselines | [+](docs/regressions-msmarco-v2-passage-augmented.md) | [+](docs/regressions-dl21-passage-augmented.md) | | doc2query-T5 | [+](docs/regressions-msmarco-v2-passage-augmented-d2q-t5.md) | [+](docs/regressions-dl21-passage-augmented-d2q-t5.md) | | **Learned sparse lexical** | -| uniCOIL noexp zero-shot | [+](docs/regressions-msmarco-v2-passage-unicoil-noexp-0shot.md) | [+](docs/regressions-dl21-passage-unicoil-noexp-0shot.md) | -| uniCOIL with doc2query-T5 zero-shot | [+](docs/regressions-msmarco-v2-passage-unicoil-0shot.md) | [+](docs/regressions-dl21-passage-unicoil-0shot.md) | +| uniCOIL noexp zero-shot | [✓](docs/regressions-msmarco-v2-passage-unicoil-noexp-0shot.md) | [✓](docs/regressions-dl21-passage-unicoil-noexp-0shot.md) | +| uniCOIL with doc2query-T5 zero-shot | [✓](docs/regressions-msmarco-v2-passage-unicoil-0shot.md) | [✓](docs/regressions-dl21-passage-unicoil-0shot.md) | ### MS MARCO V2 Document Corpus @@ -111,8 +111,8 @@ See individual pages for details! | baselines | [+](docs/regressions-msmarco-v2-doc-segmented.md) | [+](docs/regressions-dl21-doc-segmented.md) | | doc2query-T5 | [+](docs/regressions-msmarco-v2-doc-segmented-d2q-t5.md) | [+](docs/regressions-dl21-doc-segmented-d2q-t5.md) | | **Learned sparse lexical** | -| uniCOIL noexp zero-shot | [+](docs/regressions-msmarco-v2-doc-segmented-unicoil-noexp-0shot-v2.md) | [+](docs/regressions-dl21-doc-segmented-unicoil-noexp-0shot-v2.md) | -| uniCOIL with doc2query-T5 zero-shot | [+](docs/regressions-msmarco-v2-doc-segmented-unicoil-0shot-v2.md) | [+](docs/regressions-dl21-doc-segmented-unicoil-0shot-v2.md) | +| uniCOIL noexp zero-shot | [✓](docs/regressions-msmarco-v2-doc-segmented-unicoil-noexp-0shot-v2.md) | [✓](docs/regressions-dl21-doc-segmented-unicoil-noexp-0shot-v2.md) | +| uniCOIL with doc2query-T5 zero-shot | [✓](docs/regressions-msmarco-v2-doc-segmented-unicoil-0shot-v2.md) | [✓](docs/regressions-dl21-doc-segmented-unicoil-0shot-v2.md) | ### Regressions for BEIR (v1.0.0) diff 
--git a/docs/regressions-dl19-doc-segmented-unicoil-noexp.md b/docs/regressions-dl19-doc-segmented-unicoil-noexp.md index 1771f3e51c..26b25d10e5 100644 --- a/docs/regressions-dl19-doc-segmented-unicoil-noexp.md +++ b/docs/regressions-dl19-doc-segmented-unicoil-noexp.md @@ -20,11 +20,18 @@ From one of our Waterloo servers (e.g., `orca`), the following command will perf python src/main/python/run_regression.py --index --verify --search --regression dl19-doc-segmented-unicoil-noexp ``` -## Corpus Download - -We make available a version of the MS MARCO segmented document corpus that has already been processed with uniCOIL, i.e., gone through document expansion and term reweighting. +We make available a version of the MS MARCO document corpus that has already been processed with uniCOIL, i.e., we have performed model inference on every document and stored the output sparse vectors. Thus, no neural inference is involved. -For details on how to train uniCOIL and perform inference, please see [this guide](https://github.com/luyug/COIL/tree/main/uniCOIL). + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression dl19-doc-segmented-unicoil-noexp +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download Download the corpus and unpack into `collections/`: @@ -34,16 +41,13 @@ tar xvf collections/msmarco-doc-segmented-unicoil-noexp.tar -C collections/ ``` To confirm, `msmarco-doc-segmented-unicoil-noexp.tar` is 11 GB and has MD5 checksum `11b226e1cacd9c8ae0a660fd14cdd710`. 
- -With the corpus downloaded, the following command will perform the complete regression, end to end, on any machine: +With the corpus downloaded, the following command will perform the remaining steps below: ```bash python src/main/python/run_regression.py --index --verify --search --regression dl19-doc-segmented-unicoil-noexp \ --corpus-path collections/msmarco-doc-segmented-unicoil-noexp ``` -Alternatively, you can simply copy/paste from the commands below and obtain the same results. - ## Indexing Sample indexing command: diff --git a/docs/regressions-dl19-doc-segmented-unicoil.md b/docs/regressions-dl19-doc-segmented-unicoil.md index 1bc3d1055e..ccbdd8860f 100644 --- a/docs/regressions-dl19-doc-segmented-unicoil.md +++ b/docs/regressions-dl19-doc-segmented-unicoil.md @@ -20,11 +20,18 @@ From one of our Waterloo servers (e.g., `orca`), the following command will perf python src/main/python/run_regression.py --index --verify --search --regression dl19-doc-segmented-unicoil ``` -## Corpus Download - -We make available a version of the MS MARCO segmented document corpus that has already been processed with uniCOIL, i.e., gone through document expansion and term reweighting. +We make available a version of the MS MARCO document corpus that has already been processed with uniCOIL, i.e., we have applied doc2query-T5 expansions, performed model inference on every document, and stored the output sparse vectors. Thus, no neural inference is involved. -For details on how to train uniCOIL and perform inference, please see [this guide](https://github.com/luyug/COIL/tree/main/uniCOIL). 
+ +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression dl19-doc-segmented-unicoil +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download Download the corpus and unpack into `collections/`: @@ -34,16 +41,13 @@ tar xvf collections/msmarco-doc-segmented-unicoil.tar -C collections/ ``` To confirm, `msmarco-doc-segmented-unicoil.tar` is 19 GB and has MD5 checksum `6a00e2c0c375cb1e52c83ae5ac377ebb`. - -With the corpus downloaded, the following command will perform the complete regression, end to end, on any machine: +With the corpus downloaded, the following command will perform the remaining steps below: ```bash python src/main/python/run_regression.py --index --verify --search --regression dl19-doc-segmented-unicoil \ --corpus-path collections/msmarco-doc-segmented-unicoil ``` -Alternatively, you can simply copy/paste from the commands below and obtain the same results. - ## Indexing Sample indexing command: diff --git a/docs/regressions-dl19-passage-unicoil-noexp.md b/docs/regressions-dl19-passage-unicoil-noexp.md index 9eabd6ea04..3d5ac98053 100644 --- a/docs/regressions-dl19-passage-unicoil-noexp.md +++ b/docs/regressions-dl19-passage-unicoil-noexp.md @@ -22,11 +22,18 @@ From one of our Waterloo servers (e.g., `orca`), the following command will perf python src/main/python/run_regression.py --index --verify --search --regression dl19-passage-unicoil-noexp ``` -## Corpus Download - -We make available a version of the MS MARCO passage corpus that has already been processed with uniCOIL, i.e., gone through document expansion and term reweighting. 
+We make available a version of the MS MARCO passage corpus that has already been processed with uniCOIL, i.e., we have performed model inference on every document and stored the output sparse vectors. Thus, no neural inference is involved. -For details on how to train uniCOIL and perform inference, please see [this guide](https://github.com/luyug/COIL/tree/main/uniCOIL). + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression dl19-passage-unicoil-noexp +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download Download the corpus and unpack into `collections/`: @@ -36,16 +43,13 @@ tar xvf collections/msmarco-passage-unicoil-noexp.tar -C collections/ ``` To confirm, `msmarco-passage-unicoil-noexp.tar` is 2.7 GB and has MD5 checksum `f17ddd8c7c00ff121c3c3b147d2e17d8`. - -With the corpus downloaded, the following command will perform the complete regression, end to end, on any machine: +With the corpus downloaded, the following command will perform the remaining steps below: ```bash python src/main/python/run_regression.py --index --verify --search --regression dl19-passage-unicoil-noexp \ --corpus-path collections/msmarco-passage-unicoil-noexp ``` -Alternatively, you can simply copy/paste from the commands below and obtain the same results. 
- ## Indexing Sample indexing command: diff --git a/docs/regressions-dl19-passage-unicoil.md b/docs/regressions-dl19-passage-unicoil.md index 1a650e63a5..66f2292f8b 100644 --- a/docs/regressions-dl19-passage-unicoil.md +++ b/docs/regressions-dl19-passage-unicoil.md @@ -22,11 +22,18 @@ From one of our Waterloo servers (e.g., `orca`), the following command will perf python src/main/python/run_regression.py --index --verify --search --regression dl19-passage-unicoil ``` -## Corpus Download - -We make available a version of the MS MARCO passage corpus that has already been processed with uniCOIL, i.e., gone through document expansion and term reweighting. +We make available a version of the MS MARCO passage corpus that has already been processed with uniCOIL, i.e., we have applied doc2query-T5 expansions, performed model inference on every document, and stored the output sparse vectors. Thus, no neural inference is involved. -For details on how to train uniCOIL and perform inference, please see [this guide](https://github.com/luyug/COIL/tree/main/uniCOIL). + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression dl19-passage-unicoil +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download Download the corpus and unpack into `collections/`: @@ -36,16 +43,13 @@ tar xvf collections/msmarco-passage-unicoil.tar -C collections/ ``` To confirm, `msmarco-passage-unicoil.tar` is 3.4 GB and has MD5 checksum `78eef752c78c8691f7d61600ceed306f`. 
- -With the corpus downloaded, the following command will perform the complete regression, end to end, on any machine: +With the corpus downloaded, the following command will perform the remaining steps below: ```bash python src/main/python/run_regression.py --index --verify --search --regression dl19-passage-unicoil \ --corpus-path collections/msmarco-passage-unicoil ``` -Alternatively, you can simply copy/paste from the commands below and obtain the same results. - ## Indexing Sample indexing command: diff --git a/docs/regressions-dl20-doc-segmented-unicoil-noexp.md b/docs/regressions-dl20-doc-segmented-unicoil-noexp.md index 419df463d5..106c9a6723 100644 --- a/docs/regressions-dl20-doc-segmented-unicoil-noexp.md +++ b/docs/regressions-dl20-doc-segmented-unicoil-noexp.md @@ -20,11 +20,18 @@ From one of our Waterloo servers (e.g., `orca`), the following command will perf python src/main/python/run_regression.py --index --verify --search --regression dl20-doc-segmented-unicoil-noexp ``` -## Corpus Download - -We make available a version of the MS MARCO segmented document corpus that has already been processed with uniCOIL, i.e., gone through document expansion and term reweighting. +We make available a version of the MS MARCO document corpus that has already been processed with uniCOIL, i.e., we have performed model inference on every document and stored the output sparse vectors. Thus, no neural inference is involved. -For details on how to train uniCOIL and perform inference, please see [this guide](https://github.com/luyug/COIL/tree/main/uniCOIL). 
+ +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression dl20-doc-segmented-unicoil-noexp +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download Download the corpus and unpack into `collections/`: @@ -34,16 +41,13 @@ tar xvf collections/msmarco-doc-segmented-unicoil-noexp.tar -C collections/ ``` To confirm, `msmarco-doc-segmented-unicoil-noexp.tar` is 11 GB and has MD5 checksum `11b226e1cacd9c8ae0a660fd14cdd710`. - -With the corpus downloaded, the following command will perform the complete regression, end to end, on any machine: +With the corpus downloaded, the following command will perform the remaining steps below: ```bash python src/main/python/run_regression.py --index --verify --search --regression dl20-doc-segmented-unicoil-noexp \ --corpus-path collections/msmarco-doc-segmented-unicoil-noexp ``` -Alternatively, you can simply copy/paste from the commands below and obtain the same results. - ## Indexing Sample indexing command: diff --git a/docs/regressions-dl20-doc-segmented-unicoil.md b/docs/regressions-dl20-doc-segmented-unicoil.md index 2247fbc143..e908120a1b 100644 --- a/docs/regressions-dl20-doc-segmented-unicoil.md +++ b/docs/regressions-dl20-doc-segmented-unicoil.md @@ -20,11 +20,18 @@ From one of our Waterloo servers (e.g., `orca`), the following command will perf python src/main/python/run_regression.py --index --verify --search --regression dl20-doc-segmented-unicoil ``` -## Corpus Download - -We make available a version of the MS MARCO segmented document corpus that has already been processed with uniCOIL, i.e., gone through document expansion and term reweighting. 
+We make available a version of the MS MARCO document corpus that has already been processed with uniCOIL, i.e., we have applied doc2query-T5 expansions, performed model inference on every document, and stored the output sparse vectors. Thus, no neural inference is involved. -For details on how to train uniCOIL and perform inference, please see [this guide](https://github.com/luyug/COIL/tree/main/uniCOIL). + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression dl20-doc-segmented-unicoil +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download Download the corpus and unpack into `collections/`: @@ -34,16 +41,13 @@ tar xvf collections/msmarco-doc-segmented-unicoil.tar -C collections/ ``` To confirm, `msmarco-doc-segmented-unicoil.tar` is 19 GB and has MD5 checksum `6a00e2c0c375cb1e52c83ae5ac377ebb`. - -With the corpus downloaded, the following command will perform the complete regression, end to end, on any machine: +With the corpus downloaded, the following command will perform the remaining steps below: ```bash python src/main/python/run_regression.py --index --verify --search --regression dl20-doc-segmented-unicoil \ --corpus-path collections/msmarco-doc-segmented-unicoil ``` -Alternatively, you can simply copy/paste from the commands below and obtain the same results. 
- ## Indexing Sample indexing command: diff --git a/docs/regressions-dl20-passage-unicoil-noexp.md b/docs/regressions-dl20-passage-unicoil-noexp.md index 50395ec0eb..317f16f0c6 100644 --- a/docs/regressions-dl20-passage-unicoil-noexp.md +++ b/docs/regressions-dl20-passage-unicoil-noexp.md @@ -22,11 +22,18 @@ From one of our Waterloo servers (e.g., `orca`), the following command will perf python src/main/python/run_regression.py --index --verify --search --regression dl20-passage-unicoil-noexp ``` -## Corpus Download - -We make available a version of the MS MARCO passage corpus that has already been processed with uniCOIL, i.e., gone through document expansion and term reweighting. +We make available a version of the MS MARCO passage corpus that has already been processed with uniCOIL, i.e., we have performed model inference on every document and stored the output sparse vectors. Thus, no neural inference is involved. -For details on how to train uniCOIL and perform inference, please see [this guide](https://github.com/luyug/COIL/tree/main/uniCOIL). + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression dl20-passage-unicoil-noexp +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download Download the corpus and unpack into `collections/`: @@ -36,16 +43,13 @@ tar xvf collections/msmarco-passage-unicoil-noexp.tar -C collections/ ``` To confirm, `msmarco-passage-unicoil-noexp.tar` is 2.7 GB and has MD5 checksum `f17ddd8c7c00ff121c3c3b147d2e17d8`. 
- -With the corpus downloaded, the following command will perform the complete regression, end to end, on any machine: +With the corpus downloaded, the following command will perform the remaining steps below: ```bash python src/main/python/run_regression.py --index --verify --search --regression dl20-passage-unicoil-noexp \ --corpus-path collections/msmarco-passage-unicoil-noexp ``` -Alternatively, you can simply copy/paste from the commands below and obtain the same results. - ## Indexing Sample indexing command: diff --git a/docs/regressions-dl20-passage-unicoil.md b/docs/regressions-dl20-passage-unicoil.md index 8de5d646c5..2d11316e2d 100644 --- a/docs/regressions-dl20-passage-unicoil.md +++ b/docs/regressions-dl20-passage-unicoil.md @@ -22,11 +22,18 @@ From one of our Waterloo servers (e.g., `orca`), the following command will perf python src/main/python/run_regression.py --index --verify --search --regression dl20-passage-unicoil ``` -## Corpus Download - -We make available a version of the MS MARCO passage corpus that has already been processed with uniCOIL, i.e., gone through document expansion and term reweighting. +We make available a version of the MS MARCO passage corpus that has already been processed with uniCOIL, i.e., we have applied doc2query-T5 expansions, performed model inference on every document, and stored the output sparse vectors. Thus, no neural inference is involved. -For details on how to train uniCOIL and perform inference, please see [this guide](https://github.com/luyug/COIL/tree/main/uniCOIL). + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression dl20-passage-unicoil +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. 
+ +## Corpus Download Download the corpus and unpack into `collections/`: @@ -36,16 +43,13 @@ tar xvf collections/msmarco-passage-unicoil.tar -C collections/ ``` To confirm, `msmarco-passage-unicoil.tar` is 3.4 GB and has MD5 checksum `78eef752c78c8691f7d61600ceed306f`. - -With the corpus downloaded, the following command will perform the complete regression, end to end, on any machine: +With the corpus downloaded, the following command will perform the remaining steps below: ```bash python src/main/python/run_regression.py --index --verify --search --regression dl20-passage-unicoil \ --corpus-path collections/msmarco-passage-unicoil ``` -Alternatively, you can simply copy/paste from the commands below and obtain the same results. - ## Indexing Sample indexing command: diff --git a/docs/regressions-dl21-doc-segmented-unicoil-0shot-v2.md b/docs/regressions-dl21-doc-segmented-unicoil-0shot-v2.md index 7a1296ebf0..f6cac75cbd 100644 --- a/docs/regressions-dl21-doc-segmented-unicoil-0shot-v2.md +++ b/docs/regressions-dl21-doc-segmented-unicoil-0shot-v2.md @@ -23,15 +23,26 @@ Note that this page is automatically generated from [this template](../src/main/ From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: -``` +```bash python src/main/python/run_regression.py --index --verify --search --regression dl21-doc-segmented-unicoil-0shot-v2 ``` -## Corpus +We make available a version of the MS MARCO document corpus that has already been processed with uniCOIL (per above), i.e., we have applied doc2query-T5 expansions, performed model inference on every document, and stored the output sparse vectors. +Thus, no neural inference is involved. 
-Download, unpack, and prepare the corpus: +From any machine, the following command will download the corpus and perform the complete regression, end to end: +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression dl21-doc-segmented-unicoil-0shot-v2 ``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download, unpack, and prepare the corpus: + +```bash # Download wget https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/data/msmarco_v2_doc_segmented_unicoil_0shot_v2.tar -P collections/ @@ -43,12 +54,18 @@ mv collections/msmarco_v2_doc_segmented_unicoil_0shot_v2 collections/msmarco-v2- ``` To confirm, `msmarco_v2_doc_segmented_unicoil_0shot_v2.tar` is 72 GB and has an MD5 checksum of `c5639748c2cbad0152e10b0ebde3b804`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression dl21-doc-segmented-unicoil-0shot-v2 \ + --corpus-path collections/msmarco-v2-doc-segmented-unicoil-0shot-v2 +``` ## Indexing Sample indexing command: -``` +```bash target/appassembler/bin/IndexCollection \ -collection JsonVectorCollection \ -input /path/to/msmarco-v2-doc-segmented-unicoil-0shot-v2 \ @@ -58,7 +75,7 @@ target/appassembler/bin/IndexCollection \ >& logs/log.msmarco-v2-doc-segmented-unicoil-0shot-v2 & ``` -The path `/path/to/msmarco-v2-doc-segmented-unicoil-0shot/` should point to the corpus downloaded above. +The path `/path/to/msmarco-v2-doc-segmented-unicoil-0shot-v2/` should point to the corpus downloaded above. 
The important indexing options to note here are `-impact -pretokenized`: the first tells Anserini not to encode BM25 doclengths into Lucene's norms (which is the default) and the second option says not to apply any additional tokenization on the uniCOIL tokens. Upon completion, we should have an index with 124,131,414 documents. @@ -73,7 +90,7 @@ The original data can be found [here](https://trec.nist.gov/data/deep2021.html). After indexing has completed, you should be able to perform retrieval as follows: -``` +```bash target/appassembler/bin/SearchCollection \ -index indexes/lucene-index.msmarco-v2-doc-segmented-unicoil-0shot-v2/ \ -topics src/main/resources/topics-and-qrels/topics.dl21.unicoil.0shot.tsv.gz \ @@ -84,7 +101,7 @@ target/appassembler/bin/SearchCollection \ Evaluation can be performed using `trec_eval`: -``` +```bash tools/eval/trec_eval.9.0.4/trec_eval -c -M 100 -m map src/main/resources/topics-and-qrels/qrels.dl21-doc.txt runs/run.msmarco-v2-doc-segmented-unicoil-0shot-v2.unicoil-0shot.topics.dl21.unicoil.0shot.txt tools/eval/trec_eval.9.0.4/trec_eval -c -m recall.100 src/main/resources/topics-and-qrels/qrels.dl21-doc.txt runs/run.msmarco-v2-doc-segmented-unicoil-0shot-v2.unicoil-0shot.topics.dl21.unicoil.0shot.txt tools/eval/trec_eval.9.0.4/trec_eval -c -m recall.1000 src/main/resources/topics-and-qrels/qrels.dl21-doc.txt runs/run.msmarco-v2-doc-segmented-unicoil-0shot-v2.unicoil-0shot.topics.dl21.unicoil.0shot.txt diff --git a/docs/regressions-dl21-doc-segmented-unicoil-0shot.md b/docs/regressions-dl21-doc-segmented-unicoil-0shot.md index 61117937de..ec29fae92c 100644 --- a/docs/regressions-dl21-doc-segmented-unicoil-0shot.md +++ b/docs/regressions-dl21-doc-segmented-unicoil-0shot.md @@ -22,15 +22,26 @@ Note that this page is automatically generated from [this template](../src/main/ From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: -``` +```bash python 
src/main/python/run_regression.py --index --verify --search --regression dl21-doc-segmented-unicoil-0shot ``` -## Corpus +We make available a version of the MS MARCO document corpus that has already been processed with uniCOIL (per above), i.e., we have applied doc2query-T5 expansions, performed model inference on every document, and stored the output sparse vectors. +Thus, no neural inference is involved. -Download, unpack, and prepare the corpus: +From any machine, the following command will download the corpus and perform the complete regression, end to end: +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression dl21-doc-segmented-unicoil-0shot ``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download, unpack, and prepare the corpus: + +```bash # Download wget https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/data/msmarco_v2_doc_segmented_unicoil_0shot.tar -P collections/ @@ -42,12 +53,18 @@ mv collections/msmarco_v2_doc_segmented_unicoil_0shot collections/msmarco-v2-doc ``` To confirm, `msmarco_v2_doc_segmented_unicoil_0shot.tar` is 62 GB and has an MD5 checksum of `889db095113cc4fe152382ccff73304a`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression dl21-doc-segmented-unicoil-0shot \ + --corpus-path collections/msmarco-v2-doc-segmented-unicoil-0shot +``` ## Indexing Sample indexing command: -``` +```bash target/appassembler/bin/IndexCollection \ -collection JsonVectorCollection \ -input /path/to/msmarco-v2-doc-segmented-unicoil-0shot \ @@ -72,7 +89,7 @@ The original data can be found [here](https://trec.nist.gov/data/deep2021.html). 
After indexing has completed, you should be able to perform retrieval as follows: -``` +```bash target/appassembler/bin/SearchCollection \ -index indexes/lucene-index.msmarco-v2-doc-segmented-unicoil-0shot/ \ -topics src/main/resources/topics-and-qrels/topics.dl21.unicoil.0shot.tsv.gz \ @@ -83,7 +100,7 @@ target/appassembler/bin/SearchCollection \ Evaluation can be performed using `trec_eval`: -``` +```bash tools/eval/trec_eval.9.0.4/trec_eval -c -M 100 -m map src/main/resources/topics-and-qrels/qrels.dl21-doc.txt runs/run.msmarco-v2-doc-segmented-unicoil-0shot.unicoil-0shot.topics.dl21.unicoil.0shot.txt tools/eval/trec_eval.9.0.4/trec_eval -c -m recall.100 src/main/resources/topics-and-qrels/qrels.dl21-doc.txt runs/run.msmarco-v2-doc-segmented-unicoil-0shot.unicoil-0shot.topics.dl21.unicoil.0shot.txt tools/eval/trec_eval.9.0.4/trec_eval -c -m recall.1000 src/main/resources/topics-and-qrels/qrels.dl21-doc.txt runs/run.msmarco-v2-doc-segmented-unicoil-0shot.unicoil-0shot.topics.dl21.unicoil.0shot.txt diff --git a/docs/regressions-dl21-doc-segmented-unicoil-noexp-0shot-v2.md b/docs/regressions-dl21-doc-segmented-unicoil-noexp-0shot-v2.md index ac9b75e22f..b7ca97fda9 100644 --- a/docs/regressions-dl21-doc-segmented-unicoil-noexp-0shot-v2.md +++ b/docs/regressions-dl21-doc-segmented-unicoil-noexp-0shot-v2.md @@ -23,15 +23,26 @@ Note that this page is automatically generated from [this template](../src/main/ From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: -``` +```bash python src/main/python/run_regression.py --index --verify --search --regression dl21-doc-segmented-unicoil-noexp-0shot-v2 ``` -## Corpus +We make available a version of the MS MARCO document corpus that has already been processed with uniCOIL (per above), i.e., we have performed model inference on every document and stored the output sparse vectors. +Thus, no neural inference is involved. 
-Download, unpack, and prepare the corpus: +From any machine, the following command will download the corpus and perform the complete regression, end to end: +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression dl21-doc-segmented-unicoil-noexp-0shot-v2 ``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download, unpack, and prepare the corpus: + +```bash # Download wget https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/data/msmarco_v2_doc_segmented_unicoil_noexp_0shot_v2.tar -P collections/ @@ -43,12 +54,18 @@ mv collections/msmarco_v2_doc_segmented_unicoil_noexp_0shot_v2 collections/msmar ``` To confirm, `msmarco_v2_doc_segmented_unicoil_noexp_0shot_v2.tar` is 55 GB and has an MD5 checksum of `97ba262c497164de1054f357caea0c63`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression dl21-doc-segmented-unicoil-noexp-0shot-v2 \ + --corpus-path collections/msmarco-v2-doc-segmented-unicoil-noexp-0shot-v2 +``` ## Indexing Sample indexing command: -``` +```bash target/appassembler/bin/IndexCollection \ -collection JsonVectorCollection \ -input /path/to/msmarco-v2-doc-segmented-unicoil-noexp-0shot-v2 \ @@ -58,7 +75,7 @@ target/appassembler/bin/IndexCollection \ >& logs/log.msmarco-v2-doc-segmented-unicoil-noexp-0shot-v2 & ``` -The path `/path/to/msmarco-v2-doc-segmented-unicoil-noexp-0shot/` should point to the corpus downloaded above. +The path `/path/to/msmarco-v2-doc-segmented-unicoil-noexp-0shot-v2/` should point to the corpus downloaded above. 
The important indexing options to note here are `-impact -pretokenized`: the first tells Anserini not to encode BM25 doclengths into Lucene's norms (which is the default) and the second option says not to apply any additional tokenization on the uniCOIL tokens. Upon completion, we should have an index with 124,131,404 documents. @@ -73,7 +90,7 @@ The original data can be found [here](https://trec.nist.gov/data/deep2021.html). After indexing has completed, you should be able to perform retrieval as follows: -``` +```bash target/appassembler/bin/SearchCollection \ -index indexes/lucene-index.msmarco-v2-doc-segmented-unicoil-noexp-0shot-v2/ \ -topics src/main/resources/topics-and-qrels/topics.dl21.unicoil-noexp.0shot.tsv.gz \ @@ -84,7 +101,7 @@ target/appassembler/bin/SearchCollection \ Evaluation can be performed using `trec_eval`: -``` +```bash tools/eval/trec_eval.9.0.4/trec_eval -c -M 100 -m map src/main/resources/topics-and-qrels/qrels.dl21-doc.txt runs/run.msmarco-v2-doc-segmented-unicoil-noexp-0shot-v2.unicoil-noexp-0shot.topics.dl21.unicoil-noexp.0shot.txt tools/eval/trec_eval.9.0.4/trec_eval -c -m recall.100 src/main/resources/topics-and-qrels/qrels.dl21-doc.txt runs/run.msmarco-v2-doc-segmented-unicoil-noexp-0shot-v2.unicoil-noexp-0shot.topics.dl21.unicoil-noexp.0shot.txt tools/eval/trec_eval.9.0.4/trec_eval -c -m recall.1000 src/main/resources/topics-and-qrels/qrels.dl21-doc.txt runs/run.msmarco-v2-doc-segmented-unicoil-noexp-0shot-v2.unicoil-noexp-0shot.topics.dl21.unicoil-noexp.0shot.txt diff --git a/docs/regressions-dl21-doc-segmented-unicoil-noexp-0shot.md b/docs/regressions-dl21-doc-segmented-unicoil-noexp-0shot.md index cd34f6c92e..e54c96f80d 100644 --- a/docs/regressions-dl21-doc-segmented-unicoil-noexp-0shot.md +++ b/docs/regressions-dl21-doc-segmented-unicoil-noexp-0shot.md @@ -22,15 +22,26 @@ Note that this page is automatically generated from [this template](../src/main/ From one of our Waterloo servers (e.g., `orca`), the following command will 
perform the complete regression, end to end: -``` +```bash python src/main/python/run_regression.py --index --verify --search --regression dl21-doc-segmented-unicoil-noexp-0shot ``` -## Corpus +We make available a version of the MS MARCO document corpus that has already been processed with uniCOIL (per above), i.e., we have performed model inference on every document and stored the output sparse vectors. +Thus, no neural inference is involved. -Download, unpack, and prepare the corpus: +From any machine, the following command will download the corpus and perform the complete regression, end to end: +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression dl21-doc-segmented-unicoil-noexp-0shot ``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download, unpack, and prepare the corpus: + +```bash # Download wget https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/data/msmarco_v2_doc_segmented_unicoil_noexp_0shot_v2.tar -P collections/ @@ -42,12 +53,18 @@ mv collections/msmarco_v2_doc_segmented_unicoil_noexp_0shot_v2 collections/msmar ``` To confirm, `msmarco_v2_doc_segmented_unicoil_noexp_0shot_v2.tar` is 55 GB and has an MD5 checksum of `97ba262c497164de1054f357caea0c63`.
+With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression dl21-doc-segmented-unicoil-noexp-0shot \ + --corpus-path collections/msmarco-v2-doc-segmented-unicoil-noexp-0shot +``` ## Indexing Sample indexing command: -``` +```bash target/appassembler/bin/IndexCollection \ -collection JsonVectorCollection \ -input /path/to/msmarco-v2-doc-segmented-unicoil-noexp-0shot \ @@ -72,7 +89,7 @@ The original data can be found [here](https://trec.nist.gov/data/deep2021.html). After indexing has completed, you should be able to perform retrieval as follows: -``` +```bash target/appassembler/bin/SearchCollection \ -index indexes/lucene-index.msmarco-v2-doc-segmented-unicoil-noexp-0shot/ \ -topics src/main/resources/topics-and-qrels/topics.dl21.unicoil-noexp.0shot.tsv.gz \ @@ -83,7 +100,7 @@ target/appassembler/bin/SearchCollection \ Evaluation can be performed using `trec_eval`: -``` +```bash tools/eval/trec_eval.9.0.4/trec_eval -c -M 100 -m map src/main/resources/topics-and-qrels/qrels.dl21-doc.txt runs/run.msmarco-v2-doc-segmented-unicoil-noexp-0shot.unicoil-noexp-0shot.topics.dl21.unicoil-noexp.0shot.txt tools/eval/trec_eval.9.0.4/trec_eval -c -m recall.100 src/main/resources/topics-and-qrels/qrels.dl21-doc.txt runs/run.msmarco-v2-doc-segmented-unicoil-noexp-0shot.unicoil-noexp-0shot.topics.dl21.unicoil-noexp.0shot.txt tools/eval/trec_eval.9.0.4/trec_eval -c -m recall.1000 src/main/resources/topics-and-qrels/qrels.dl21-doc.txt runs/run.msmarco-v2-doc-segmented-unicoil-noexp-0shot.unicoil-noexp-0shot.topics.dl21.unicoil-noexp.0shot.txt diff --git a/docs/regressions-dl21-passage-unicoil-0shot.md b/docs/regressions-dl21-passage-unicoil-0shot.md index 826cf775b5..bc35d2c913 100644 --- a/docs/regressions-dl21-passage-unicoil-0shot.md +++ b/docs/regressions-dl21-passage-unicoil-0shot.md @@ -17,15 +17,26 @@ Note that this page is automatically generated from 
[this template](../src/main/ From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: -``` +```bash python src/main/python/run_regression.py --index --verify --search --regression dl21-passage-unicoil-0shot ``` -## Corpus +We make available a version of the MS MARCO passage corpus that has already been processed with uniCOIL, i.e., we have applied doc2query-T5 expansions, performed model inference on every document, and stored the output sparse vectors. +Thus, no neural inference is involved. -Download, unpack, and prepare the corpus: +From any machine, the following command will download the corpus and perform the complete regression, end to end: +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression dl21-passage-unicoil-0shot ``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download, unpack, and prepare the corpus: + +```bash # Download wget https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/data/msmarco_v2_passage_unicoil_0shot.tar -P collections/ @@ -37,12 +48,18 @@ mv collections/msmarco_v2_passage_unicoil_0shot collections/msmarco-v2-passage-u ``` To confirm, `msmarco_v2_passage_unicoil_0shot.tar` is 41 GB and has an MD5 checksum of `1949a00bfd5e1f1a230a04bbc1f01539`. 
+With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression dl21-passage-unicoil-0shot \ + --corpus-path collections/msmarco-v2-passage-unicoil-0shot +``` ## Indexing Sample indexing command: -``` +```bash target/appassembler/bin/IndexCollection \ -collection JsonVectorCollection \ -input /path/to/msmarco-v2-passage-unicoil-0shot \ @@ -67,7 +84,7 @@ The original data can be found [here](https://trec.nist.gov/data/deep2021.html). After indexing has completed, you should be able to perform retrieval as follows: -``` +```bash target/appassembler/bin/SearchCollection \ -index indexes/lucene-index.msmarco-v2-passage-unicoil-0shot/ \ -topics src/main/resources/topics-and-qrels/topics.dl21.unicoil.0shot.tsv.gz \ @@ -78,7 +95,7 @@ target/appassembler/bin/SearchCollection \ Evaluation can be performed using `trec_eval`: -``` +```bash tools/eval/trec_eval.9.0.4/trec_eval -c -M 100 -m map -l 2 src/main/resources/topics-and-qrels/qrels.dl21-passage.txt runs/run.msmarco-v2-passage-unicoil-0shot.unicoil-0shot.topics.dl21.unicoil.0shot.txt tools/eval/trec_eval.9.0.4/trec_eval -c -M 100 -m recip_rank -l 2 src/main/resources/topics-and-qrels/qrels.dl21-passage.txt runs/run.msmarco-v2-passage-unicoil-0shot.unicoil-0shot.topics.dl21.unicoil.0shot.txt tools/eval/trec_eval.9.0.4/trec_eval -c -m ndcg_cut.10 src/main/resources/topics-and-qrels/qrels.dl21-passage.txt runs/run.msmarco-v2-passage-unicoil-0shot.unicoil-0shot.topics.dl21.unicoil.0shot.txt diff --git a/docs/regressions-dl21-passage-unicoil-noexp-0shot.md b/docs/regressions-dl21-passage-unicoil-noexp-0shot.md index 8914dfed79..845ba2a64c 100644 --- a/docs/regressions-dl21-passage-unicoil-noexp-0shot.md +++ b/docs/regressions-dl21-passage-unicoil-noexp-0shot.md @@ -17,15 +17,26 @@ Note that this page is automatically generated from [this template](../src/main/ From one of our Waterloo servers (e.g., `orca`), 
the following command will perform the complete regression, end to end: -``` +```bash python src/main/python/run_regression.py --index --verify --search --regression dl21-passage-unicoil-noexp-0shot ``` -## Corpus +We make available a version of the MS MARCO passage corpus that has already been processed with uniCOIL, i.e., we have performed model inference on every document and stored the output sparse vectors. +Thus, no neural inference is involved. -Download, unpack, and prepare the corpus: +From any machine, the following command will download the corpus and perform the complete regression, end to end: +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression dl21-passage-unicoil-noexp-0shot ``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download, unpack, and prepare the corpus: + +```bash # Download wget https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/data/msmarco_v2_passage_unicoil_noexp_0shot.tar -P collections/ @@ -37,6 +48,12 @@ mv collections/msmarco_v2_passage_unicoil_noexp_0shot collections/msmarco-v2-pas ``` To confirm, `msmarco_v2_passage_unicoil_noexp_0shot.tar` is 24 GB and has an MD5 checksum of `d9cc1ed3049746e68a2c91bf90e5212d`. 
+With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression dl21-passage-unicoil-noexp-0shot \ + --corpus-path collections/msmarco-v2-passage-unicoil-noexp-0shot +``` ## Indexing diff --git a/docs/regressions-msmarco-doc-segmented-unicoil-noexp.md b/docs/regressions-msmarco-doc-segmented-unicoil-noexp.md index 30244b7eee..adb9bac6fb 100644 --- a/docs/regressions-msmarco-doc-segmented-unicoil-noexp.md +++ b/docs/regressions-msmarco-doc-segmented-unicoil-noexp.md @@ -20,11 +20,18 @@ From one of our Waterloo servers (e.g., `orca`), the following command will perf python src/main/python/run_regression.py --index --verify --search --regression msmarco-doc-segmented-unicoil-noexp ``` -## Corpus Download - -We make available a version of the MS MARCO segmented document corpus that has already been processed with uniCOIL, i.e., gone through document expansion and term reweighting. +We make available a version of the MS MARCO document corpus that has already been processed with uniCOIL, i.e., we have performed model inference on every document and stored the output sparse vectors. Thus, no neural inference is involved. -For details on how to train uniCOIL and perform inference, please see [this guide](https://github.com/luyug/COIL/tree/main/uniCOIL). + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression msmarco-doc-segmented-unicoil-noexp +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. 
+ +## Corpus Download Download the corpus and unpack into `collections/`: @@ -34,16 +41,13 @@ tar xvf collections/msmarco-doc-segmented-unicoil-noexp.tar -C collections/ ``` To confirm, `msmarco-doc-segmented-unicoil-noexp.tar` is 11 GB and has MD5 checksum `11b226e1cacd9c8ae0a660fd14cdd710`. - -With the corpus downloaded, the following command will perform the complete regression, end to end, on any machine: +With the corpus downloaded, the following command will perform the remaining steps below: ```bash python src/main/python/run_regression.py --index --verify --search --regression msmarco-doc-segmented-unicoil-noexp \ --corpus-path collections/msmarco-doc-segmented-unicoil-noexp ``` -Alternatively, you can simply copy/paste from the commands below and obtain the same results. - ## Indexing Sample indexing command: diff --git a/docs/regressions-msmarco-doc-segmented-unicoil.md b/docs/regressions-msmarco-doc-segmented-unicoil.md index 645edb8ece..a13456693b 100644 --- a/docs/regressions-msmarco-doc-segmented-unicoil.md +++ b/docs/regressions-msmarco-doc-segmented-unicoil.md @@ -20,11 +20,18 @@ From one of our Waterloo servers (e.g., `orca`), the following command will perf python src/main/python/run_regression.py --index --verify --search --regression msmarco-doc-segmented-unicoil ``` -## Corpus Download - -We make available a version of the MS MARCO segmented document corpus that has already been processed with uniCOIL, i.e., gone through document expansion and term reweighting. +We make available a version of the MS MARCO document corpus that has already been processed with uniCOIL, i.e., we have applied doc2query-T5 expansions, performed model inference on every document, and stored the output sparse vectors. Thus, no neural inference is involved. -For details on how to train uniCOIL and perform inference, please see [this guide](https://github.com/luyug/COIL/tree/main/uniCOIL). 
+ +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression msmarco-doc-segmented-unicoil +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download Download the corpus and unpack into `collections/`: @@ -34,16 +41,13 @@ tar xvf collections/msmarco-doc-segmented-unicoil.tar -C collections/ ``` To confirm, `msmarco-doc-segmented-unicoil.tar` is 19 GB and has MD5 checksum `6a00e2c0c375cb1e52c83ae5ac377ebb`. - -With the corpus downloaded, the following command will perform the complete regression, end to end, on any machine: +With the corpus downloaded, the following command will perform the remaining steps below: ```bash python src/main/python/run_regression.py --index --verify --search --regression msmarco-doc-segmented-unicoil \ --corpus-path collections/msmarco-doc-segmented-unicoil ``` -Alternatively, you can simply copy/paste from the commands below and obtain the same results. - ## Indexing Sample indexing command: diff --git a/docs/regressions-msmarco-passage-unicoil-noexp.md b/docs/regressions-msmarco-passage-unicoil-noexp.md index 993d53d7c5..54f7b1f187 100644 --- a/docs/regressions-msmarco-passage-unicoil-noexp.md +++ b/docs/regressions-msmarco-passage-unicoil-noexp.md @@ -19,11 +19,18 @@ From one of our Waterloo servers (e.g., `orca`), the following command will perf python src/main/python/run_regression.py --index --verify --search --regression msmarco-passage-unicoil-noexp ``` -## Corpus Download - -We make available a version of the MS MARCO passage corpus that has already been processed with uniCOIL, i.e., gone through document expansion and term reweighting. 
+We make available a version of the MS MARCO passage corpus that has already been processed with uniCOIL, i.e., we have performed model inference on every document and stored the output sparse vectors. Thus, no neural inference is involved. -For details on how to train uniCOIL and perform inference, please see [this guide](https://github.com/luyug/COIL/tree/main/uniCOIL). + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression msmarco-passage-unicoil-noexp +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download Download the corpus and unpack into `collections/`: @@ -33,16 +40,13 @@ tar xvf collections/msmarco-passage-unicoil-noexp.tar -C collections/ ``` To confirm, `msmarco-passage-unicoil-noexp.tar` is 2.7 GB and has MD5 checksum `f17ddd8c7c00ff121c3c3b147d2e17d8`. - -With the corpus downloaded, the following command will perform the complete regression, end to end, on any machine: +With the corpus downloaded, the following command will perform the remaining steps below: ```bash python src/main/python/run_regression.py --index --verify --search --regression msmarco-passage-unicoil-noexp \ --corpus-path collections/msmarco-passage-unicoil-noexp ``` -Alternatively, you can simply copy/paste from the commands below and obtain the same results. 
- ## Indexing Sample indexing command: diff --git a/docs/regressions-msmarco-passage-unicoil.md b/docs/regressions-msmarco-passage-unicoil.md index 1bba680865..d30fa85c9d 100644 --- a/docs/regressions-msmarco-passage-unicoil.md +++ b/docs/regressions-msmarco-passage-unicoil.md @@ -16,11 +16,18 @@ From one of our Waterloo servers (e.g., `orca`), the following command will perf python src/main/python/run_regression.py --index --verify --search --regression msmarco-passage-unicoil ``` -## Corpus Download - -We make available a version of the MS MARCO passage corpus that has already been processed with uniCOIL, i.e., gone through document expansion and term reweighting. +We make available a version of the MS MARCO passage corpus that has already been processed with uniCOIL, i.e., we have applied doc2query-T5 expansions, performed model inference on every document, and stored the output sparse vectors. Thus, no neural inference is involved. -For details on how to train uniCOIL and perform inference, please see [this guide](https://github.com/luyug/COIL/tree/main/uniCOIL). + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression msmarco-passage-unicoil +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download Download the corpus and unpack into `collections/`: @@ -30,16 +37,13 @@ tar xvf collections/msmarco-passage-unicoil.tar -C collections/ ``` To confirm, `msmarco-passage-unicoil.tar` is 3.4 GB and has MD5 checksum `78eef752c78c8691f7d61600ceed306f`. 
- -With the corpus downloaded, the following command will perform the complete regression, end to end, on any machine: +With the corpus downloaded, the following command will perform the remaining steps below: ```bash python src/main/python/run_regression.py --index --verify --search --regression msmarco-passage-unicoil \ --corpus-path collections/msmarco-passage-unicoil ``` -Alternatively, you can simply copy/paste from the commands below and obtain the same results. - ## Indexing Sample indexing command: diff --git a/docs/regressions-msmarco-v2-doc-segmented-unicoil-0shot-v2.md b/docs/regressions-msmarco-v2-doc-segmented-unicoil-0shot-v2.md index bc246ac91d..afa5cbcc2f 100644 --- a/docs/regressions-msmarco-v2-doc-segmented-unicoil-0shot-v2.md +++ b/docs/regressions-msmarco-v2-doc-segmented-unicoil-0shot-v2.md @@ -20,15 +20,26 @@ Note that this page is automatically generated from [this template](../src/main/ From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: -``` +```bash python src/main/python/run_regression.py --index --verify --search --regression msmarco-v2-doc-segmented-unicoil-0shot-v2 ``` -## Corpus +We make available a version of the MS MARCO document corpus that has already been processed with uniCOIL (per above), i.e., we have applied doc2query-T5 expansions, performed model inference on every document, and stored the output sparse vectors. +Thus, no neural inference is involved. -Download, unpack, and prepare the corpus: +From any machine, the following command will download the corpus and perform the complete regression, end to end: +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression msmarco-v2-doc-segmented-unicoil-0shot-v2 ``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. 
+ +## Corpus Download + +Download, unpack, and prepare the corpus: + +```bash # Download wget https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/data/msmarco_v2_doc_segmented_unicoil_0shot_v2.tar -P collections/ @@ -40,12 +51,18 @@ mv collections/msmarco_v2_doc_segmented_unicoil_0shot_v2 collections/msmarco-v2- ``` To confirm, `msmarco_v2_doc_segmented_unicoil_0shot_v2.tar` is 72 GB and has an MD5 checksum of `c5639748c2cbad0152e10b0ebde3b804`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression msmarco-v2-doc-segmented-unicoil-0shot-v2 \ + --corpus-path collections/msmarco-v2-doc-segmented-unicoil-0shot-v2 +``` ## Indexing Sample indexing command: -``` +```bash target/appassembler/bin/IndexCollection \ -collection JsonVectorCollection \ -input /path/to/msmarco-v2-doc-segmented-unicoil-0shot-v2 \ @@ -69,7 +86,7 @@ These regression experiments use the [dev queries](../src/main/resources/topics- After indexing has completed, you should be able to perform retrieval as follows: -``` +```bash target/appassembler/bin/SearchCollection \ -index indexes/lucene-index.msmarco-v2-doc-segmented-unicoil-0shot-v2/ \ -topics src/main/resources/topics-and-qrels/topics.msmarco-v2-doc.dev.unicoil.0shot.tsv.gz \ @@ -86,7 +103,7 @@ target/appassembler/bin/SearchCollection \ Evaluation can be performed using `trec_eval`: -``` +```bash tools/eval/trec_eval.9.0.4/trec_eval -c -m recall.100 src/main/resources/topics-and-qrels/qrels.msmarco-v2-doc.dev.txt runs/run.msmarco-v2-doc-segmented-unicoil-0shot-v2.unicoil-0shot.topics.msmarco-v2-doc.dev.unicoil.0shot.txt tools/eval/trec_eval.9.0.4/trec_eval -c -m recall.1000 src/main/resources/topics-and-qrels/qrels.msmarco-v2-doc.dev.txt runs/run.msmarco-v2-doc-segmented-unicoil-0shot-v2.unicoil-0shot.topics.msmarco-v2-doc.dev.unicoil.0shot.txt tools/eval/trec_eval.9.0.4/trec_eval -c -M 100 -m map -c -M 100 -m recip_rank 
src/main/resources/topics-and-qrels/qrels.msmarco-v2-doc.dev.txt runs/run.msmarco-v2-doc-segmented-unicoil-0shot-v2.unicoil-0shot.topics.msmarco-v2-doc.dev.unicoil.0shot.txt diff --git a/docs/regressions-msmarco-v2-doc-segmented-unicoil-0shot.md b/docs/regressions-msmarco-v2-doc-segmented-unicoil-0shot.md index dea6944c0c..7be83951dc 100644 --- a/docs/regressions-msmarco-v2-doc-segmented-unicoil-0shot.md +++ b/docs/regressions-msmarco-v2-doc-segmented-unicoil-0shot.md @@ -19,15 +19,26 @@ Note that this page is automatically generated from [this template](../src/main/ From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: -``` +```bash python src/main/python/run_regression.py --index --verify --search --regression msmarco-v2-doc-segmented-unicoil-0shot ``` -## Corpus +We make available a version of the MS MARCO document corpus that has already been processed with uniCOIL (per above), i.e., we have applied doc2query-T5 expansions, performed model inference on every document, and stored the output sparse vectors. +Thus, no neural inference is involved. -Download, unpack, and prepare the corpus: +From any machine, the following command will download the corpus and perform the complete regression, end to end: +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression msmarco-v2-doc-segmented-unicoil-0shot ``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. 
+ +## Corpus Download + +Download, unpack, and prepare the corpus: + +```bash # Download wget https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/data/msmarco_v2_doc_segmented_unicoil_0shot.tar -P collections/ @@ -39,12 +50,18 @@ mv collections/msmarco_v2_doc_segmented_unicoil_0shot collections/msmarco-v2-doc ``` To confirm, `msmarco_v2_doc_segmented_unicoil_0shot.tar` is 62 GB and has an MD5 checksum of `889db095113cc4fe152382ccff73304a`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression msmarco-v2-doc-segmented-unicoil-0shot \ + --corpus-path collections/msmarco-v2-doc-segmented-unicoil-0shot +``` ## Indexing Sample indexing command: -``` +```bash target/appassembler/bin/IndexCollection \ -collection JsonVectorCollection \ -input /path/to/msmarco-v2-doc-segmented-unicoil-0shot \ @@ -68,7 +85,7 @@ These regression experiments use the [dev queries](../src/main/resources/topics- After indexing has completed, you should be able to perform retrieval as follows: -``` +```bash target/appassembler/bin/SearchCollection \ -index indexes/lucene-index.msmarco-v2-doc-segmented-unicoil-0shot/ \ -topics src/main/resources/topics-and-qrels/topics.msmarco-v2-doc.dev.unicoil.0shot.tsv.gz \ @@ -85,7 +102,7 @@ target/appassembler/bin/SearchCollection \ Evaluation can be performed using `trec_eval`: -``` +```bash tools/eval/trec_eval.9.0.4/trec_eval -c -m recall.100 src/main/resources/topics-and-qrels/qrels.msmarco-v2-doc.dev.txt runs/run.msmarco-v2-doc-segmented-unicoil-0shot.unicoil-0shot.topics.msmarco-v2-doc.dev.unicoil.0shot.txt tools/eval/trec_eval.9.0.4/trec_eval -c -m recall.1000 src/main/resources/topics-and-qrels/qrels.msmarco-v2-doc.dev.txt runs/run.msmarco-v2-doc-segmented-unicoil-0shot.unicoil-0shot.topics.msmarco-v2-doc.dev.unicoil.0shot.txt tools/eval/trec_eval.9.0.4/trec_eval -c -M 100 -m map -c -M 100 -m recip_rank 
src/main/resources/topics-and-qrels/qrels.msmarco-v2-doc.dev.txt runs/run.msmarco-v2-doc-segmented-unicoil-0shot.unicoil-0shot.topics.msmarco-v2-doc.dev.unicoil.0shot.txt diff --git a/docs/regressions-msmarco-v2-doc-segmented-unicoil-noexp-0shot-v2.md b/docs/regressions-msmarco-v2-doc-segmented-unicoil-noexp-0shot-v2.md index de64fc89a9..d843786ed9 100644 --- a/docs/regressions-msmarco-v2-doc-segmented-unicoil-noexp-0shot-v2.md +++ b/docs/regressions-msmarco-v2-doc-segmented-unicoil-noexp-0shot-v2.md @@ -20,15 +20,26 @@ Note that this page is automatically generated from [this template](../src/main/ From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: -``` +```bash python src/main/python/run_regression.py --index --verify --search --regression msmarco-v2-doc-segmented-unicoil-noexp-0shot-v2 ``` -## Corpus +We make available a version of the MS MARCO document corpus that has already been processed with uniCOIL (per above), i.e., we have performed model inference on every document and stored the output sparse vectors. +Thus, no neural inference is involved. -Download, unpack, and prepare the corpus: +From any machine, the following command will download the corpus and perform the complete regression, end to end: +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression msmarco-v2-doc-segmented-unicoil-noexp-0shot-v2 ``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. 
+ +## Corpus Download + +Download, unpack, and prepare the corpus: + +```bash # Download wget https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/data/msmarco_v2_doc_segmented_unicoil_noexp_0shot_v2.tar -P collections/ @@ -40,12 +51,18 @@ mv collections/msmarco_v2_doc_segmented_unicoil_noexp_0shot_v2 collections/msmar ``` To confirm, `msmarco_v2_doc_segmented_unicoil_noexp_0shot_v2.tar` is 55 GB and has an MD5 checksum of `97ba262c497164de1054f357caea0c63`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression msmarco-v2-doc-segmented-unicoil-noexp-0shot-v2 \ + --corpus-path collections/msmarco-v2-doc-segmented-unicoil-noexp-0shot-v2 +``` ## Indexing Sample indexing command: -``` +```bash target/appassembler/bin/IndexCollection \ -collection JsonVectorCollection \ -input /path/to/msmarco-v2-doc-segmented-unicoil-noexp-0shot-v2 \ @@ -69,7 +86,7 @@ These regression experiments use the [dev queries](../src/main/resources/topics- After indexing has completed, you should be able to perform retrieval as follows: -``` +```bash target/appassembler/bin/SearchCollection \ -index indexes/lucene-index.msmarco-v2-doc-segmented-unicoil-noexp-0shot-v2/ \ -topics src/main/resources/topics-and-qrels/topics.msmarco-v2-doc.dev.unicoil-noexp.0shot.tsv.gz \ @@ -86,7 +103,7 @@ target/appassembler/bin/SearchCollection \ Evaluation can be performed using `trec_eval`: -``` +```bash tools/eval/trec_eval.9.0.4/trec_eval -c -m recall.100 src/main/resources/topics-and-qrels/qrels.msmarco-v2-doc.dev.txt runs/run.msmarco-v2-doc-segmented-unicoil-noexp-0shot-v2.unicoil-noexp-0shot.topics.msmarco-v2-doc.dev.unicoil-noexp.0shot.txt tools/eval/trec_eval.9.0.4/trec_eval -c -m recall.1000 src/main/resources/topics-and-qrels/qrels.msmarco-v2-doc.dev.txt runs/run.msmarco-v2-doc-segmented-unicoil-noexp-0shot-v2.unicoil-noexp-0shot.topics.msmarco-v2-doc.dev.unicoil-noexp.0shot.txt 
tools/eval/trec_eval.9.0.4/trec_eval -c -M 100 -m map -c -M 100 -m recip_rank src/main/resources/topics-and-qrels/qrels.msmarco-v2-doc.dev.txt runs/run.msmarco-v2-doc-segmented-unicoil-noexp-0shot-v2.unicoil-noexp-0shot.topics.msmarco-v2-doc.dev.unicoil-noexp.0shot.txt diff --git a/docs/regressions-msmarco-v2-doc-segmented-unicoil-noexp-0shot.md b/docs/regressions-msmarco-v2-doc-segmented-unicoil-noexp-0shot.md index 6f8c8a1bd9..1b55757ee4 100644 --- a/docs/regressions-msmarco-v2-doc-segmented-unicoil-noexp-0shot.md +++ b/docs/regressions-msmarco-v2-doc-segmented-unicoil-noexp-0shot.md @@ -19,15 +19,26 @@ Note that this page is automatically generated from [this template](../src/main/ From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: -``` +```bash python src/main/python/run_regression.py --index --verify --search --regression msmarco-v2-doc-segmented-unicoil-noexp-0shot ``` -## Corpus +We make available a version of the MS MARCO document corpus that has already been processed with uniCOIL (per above), i.e., we have performed model inference on every document and stored the output sparse vectors. +Thus, no neural inference is involved. -Download, unpack, and prepare the corpus: +From any machine, the following command will download the corpus and perform the complete regression, end to end: +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression msmarco-v2-doc-segmented-unicoil-noexp-0shot ``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. 
+ +## Corpus Download + +Download, unpack, and prepare the corpus: + +```bash # Download wget https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/data/msmarco_v2_doc_segmented_unicoil_noexp_0shot.tar -P collections/ @@ -39,12 +50,18 @@ mv collections/msmarco_v2_doc_segmented_unicoil_noexp_0shot collections/msmarco- ``` To confirm, `msmarco_v2_doc_segmented_unicoil_noexp_0shot.tar` is 54 GB and has an MD5 checksum of `28261587d6afde56efd8df4f950e7fb4`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression msmarco-v2-doc-segmented-unicoil-noexp-0shot \ + --corpus-path collections/msmarco-v2-doc-segmented-unicoil-noexp-0shot +``` ## Indexing Sample indexing command: -``` +```bash target/appassembler/bin/IndexCollection \ -collection JsonVectorCollection \ -input /path/to/msmarco-v2-doc-segmented-unicoil-noexp-0shot \ @@ -68,7 +85,7 @@ These regression experiments use the [dev queries](../src/main/resources/topics- After indexing has completed, you should be able to perform retrieval as follows: -``` +```bash target/appassembler/bin/SearchCollection \ -index indexes/lucene-index.msmarco-v2-doc-segmented-unicoil-noexp-0shot/ \ -topics src/main/resources/topics-and-qrels/topics.msmarco-v2-doc.dev.unicoil-noexp.0shot.tsv.gz \ @@ -85,7 +102,7 @@ target/appassembler/bin/SearchCollection \ Evaluation can be performed using `trec_eval`: -``` +```bash tools/eval/trec_eval.9.0.4/trec_eval -c -m recall.100 src/main/resources/topics-and-qrels/qrels.msmarco-v2-doc.dev.txt runs/run.msmarco-v2-doc-segmented-unicoil-noexp-0shot.unicoil-noexp-0shot.topics.msmarco-v2-doc.dev.unicoil-noexp.0shot.txt tools/eval/trec_eval.9.0.4/trec_eval -c -m recall.1000 src/main/resources/topics-and-qrels/qrels.msmarco-v2-doc.dev.txt runs/run.msmarco-v2-doc-segmented-unicoil-noexp-0shot.unicoil-noexp-0shot.topics.msmarco-v2-doc.dev.unicoil-noexp.0shot.txt 
tools/eval/trec_eval.9.0.4/trec_eval -c -M 100 -m map -c -M 100 -m recip_rank src/main/resources/topics-and-qrels/qrels.msmarco-v2-doc.dev.txt runs/run.msmarco-v2-doc-segmented-unicoil-noexp-0shot.unicoil-noexp-0shot.topics.msmarco-v2-doc.dev.unicoil-noexp.0shot.txt diff --git a/docs/regressions-msmarco-v2-passage-unicoil-0shot.md b/docs/regressions-msmarco-v2-passage-unicoil-0shot.md index f4a6b55364..983ff2501c 100644 --- a/docs/regressions-msmarco-v2-passage-unicoil-0shot.md +++ b/docs/regressions-msmarco-v2-passage-unicoil-0shot.md @@ -14,15 +14,26 @@ Note that this page is automatically generated from [this template](../src/main/ From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: -``` +```bash python src/main/python/run_regression.py --index --verify --search --regression msmarco-v2-passage-unicoil-0shot ``` -## Corpus +We make available a version of the MS MARCO passage corpus that has already been processed with uniCOIL, i.e., we have applied doc2query-T5 expansions, performed model inference on every document, and stored the output sparse vectors. +Thus, no neural inference is involved. -Download, unpack, and prepare the corpus: +From any machine, the following command will download the corpus and perform the complete regression, end to end: +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression msmarco-v2-passage-unicoil-0shot ``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. 
+ +## Corpus Download + +Download, unpack, and prepare the corpus: + +```bash # Download wget https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/data/msmarco_v2_passage_unicoil_0shot.tar -P collections/ @@ -34,12 +45,18 @@ mv collections/msmarco_v2_passage_unicoil_0shot collections/msmarco-v2-passage-u ``` To confirm, `msmarco_v2_passage_unicoil_0shot.tar` is 41 GB and has an MD5 checksum of `1949a00bfd5e1f1a230a04bbc1f01539`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression msmarco-v2-passage-unicoil-0shot \ + --corpus-path collections/msmarco-v2-passage-unicoil-0shot +``` ## Indexing Sample indexing command: -``` +```bash target/appassembler/bin/IndexCollection \ -collection JsonVectorCollection \ -input /path/to/msmarco-v2-passage-unicoil-0shot \ @@ -63,7 +80,7 @@ These regression experiments use the [dev queries](../src/main/resources/topics- After indexing has completed, you should be able to perform retrieval as follows: -``` +```bash target/appassembler/bin/SearchCollection \ -index indexes/lucene-index.msmarco-v2-passage-unicoil-0shot/ \ -topics src/main/resources/topics-and-qrels/topics.msmarco-v2-passage.dev.unicoil.0shot.tsv.gz \ @@ -80,7 +97,7 @@ target/appassembler/bin/SearchCollection \ Evaluation can be performed using `trec_eval`: -``` +```bash tools/eval/trec_eval.9.0.4/trec_eval -c -m recall.100 src/main/resources/topics-and-qrels/qrels.msmarco-v2-passage.dev.txt runs/run.msmarco-v2-passage-unicoil-0shot.unicoil-0shot.topics.msmarco-v2-passage.dev.unicoil.0shot.txt tools/eval/trec_eval.9.0.4/trec_eval -c -m recall.1000 src/main/resources/topics-and-qrels/qrels.msmarco-v2-passage.dev.txt runs/run.msmarco-v2-passage-unicoil-0shot.unicoil-0shot.topics.msmarco-v2-passage.dev.unicoil.0shot.txt tools/eval/trec_eval.9.0.4/trec_eval -c -M 100 -m map -c -M 100 -m recip_rank 
src/main/resources/topics-and-qrels/qrels.msmarco-v2-passage.dev.txt runs/run.msmarco-v2-passage-unicoil-0shot.unicoil-0shot.topics.msmarco-v2-passage.dev.unicoil.0shot.txt diff --git a/docs/regressions-msmarco-v2-passage-unicoil-noexp-0shot.md b/docs/regressions-msmarco-v2-passage-unicoil-noexp-0shot.md index 0a81582abe..5233b681e9 100644 --- a/docs/regressions-msmarco-v2-passage-unicoil-noexp-0shot.md +++ b/docs/regressions-msmarco-v2-passage-unicoil-noexp-0shot.md @@ -14,15 +14,26 @@ Note that this page is automatically generated from [this template](../src/main/ From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: -``` +```bash python src/main/python/run_regression.py --index --verify --search --regression msmarco-v2-passage-unicoil-noexp-0shot ``` -## Corpus +We make available a version of the MS MARCO passage corpus that has already been processed with uniCOIL, i.e., we have performed model inference on every document and stored the output sparse vectors. +Thus, no neural inference is involved. -Download, unpack, and prepare the corpus: +From any machine, the following command will download the corpus and perform the complete regression, end to end: +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression msmarco-v2-passage-unicoil-noexp-0shot ``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. 
+ +## Corpus Download + +Download, unpack, and prepare the corpus: + +```bash # Download wget https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/data/msmarco_v2_passage_unicoil_noexp_0shot.tar -P collections/ @@ -34,12 +45,18 @@ mv collections/msmarco_v2_passage_unicoil_noexp_0shot collections/msmarco-v2-pas ``` To confirm, `msmarco_v2_passage_unicoil_noexp_0shot.tar` is 24 GB and has an MD5 checksum of `d9cc1ed3049746e68a2c91bf90e5212d`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression msmarco-v2-passage-unicoil-noexp-0shot \ + --corpus-path collections/msmarco-v2-passage-unicoil-noexp-0shot +``` ## Indexing Sample indexing command: -``` +```bash target/appassembler/bin/IndexCollection \ -collection JsonVectorCollection \ -input /path/to/msmarco-v2-passage-unicoil-noexp-0shot \ @@ -63,7 +80,7 @@ These regression experiments use the [dev queries](../src/main/resources/topics- After indexing has completed, you should be able to perform retrieval as follows: -``` +```bash target/appassembler/bin/SearchCollection \ -index indexes/lucene-index.msmarco-v2-passage-unicoil-noexp-0shot/ \ -topics src/main/resources/topics-and-qrels/topics.msmarco-v2-passage.dev.unicoil-noexp.0shot.tsv.gz \ @@ -80,7 +97,7 @@ target/appassembler/bin/SearchCollection \ Evaluation can be performed using `trec_eval`: -``` +```bash tools/eval/trec_eval.9.0.4/trec_eval -c -m recall.100 src/main/resources/topics-and-qrels/qrels.msmarco-v2-passage.dev.txt runs/run.msmarco-v2-passage-unicoil-noexp-0shot.unicoil-noexp-0shot.topics.msmarco-v2-passage.dev.unicoil-noexp.0shot.txt tools/eval/trec_eval.9.0.4/trec_eval -c -m recall.1000 src/main/resources/topics-and-qrels/qrels.msmarco-v2-passage.dev.txt runs/run.msmarco-v2-passage-unicoil-noexp-0shot.unicoil-noexp-0shot.topics.msmarco-v2-passage.dev.unicoil-noexp.0shot.txt tools/eval/trec_eval.9.0.4/trec_eval -c -M 100 -m map -c 
-M 100 -m recip_rank src/main/resources/topics-and-qrels/qrels.msmarco-v2-passage.dev.txt runs/run.msmarco-v2-passage-unicoil-noexp-0shot.unicoil-noexp-0shot.topics.msmarco-v2-passage.dev.unicoil-noexp.0shot.txt diff --git a/src/main/python/run_regression.py b/src/main/python/run_regression.py index e68d6f2c08..b7f4127cdc 100644 --- a/src/main/python/run_regression.py +++ b/src/main/python/run_regression.py @@ -16,16 +16,21 @@ from __future__ import print_function -import itertools -import sys - import argparse +import hashlib +import itertools import logging import os -import yaml +import re +import stat +import tarfile + from multiprocessing import Pool from subprocess import call, Popen, PIPE +from urllib.request import urlretrieve +import yaml +from tqdm import tqdm logger = logging.getLogger('regression_test') logger.setLevel(logging.INFO) @@ -177,10 +182,77 @@ def run_search(cmd): call(' '.join(cmd), shell=True) +# https://gist.github.com/leimao/37ff6e990b3226c2c9670a2cd1e4a6f5 +class TqdmUpTo(tqdm): + def update_to(self, b=1, bsize=1, tsize=None): + """ + b : int, optional + Number of blocks transferred so far [default: 1]. + bsize : int, optional + Size of each block (in tqdm units) [default: 1]. + tsize : int, optional + Total size (in tqdm units). If [default: None] remains unchanged. + """ + if tsize is not None: + self.total = tsize + self.update(b * bsize - self.n) # will also set self.n = b * bsize + + +# For large files, we need to compute MD5 block by block. 
See: +# https://stackoverflow.com/questions/1131220/get-md5-hash-of-big-files-in-python +def compute_md5(file, block_size=2**20): + m = hashlib.md5() + with open(file, 'rb') as f: + while True: + buf = f.read(block_size) + if not buf: + break + m.update(buf) + return m.hexdigest() + + +def download_url(url, save_dir, local_filename=None, md5=None, force=False, verbose=True): + # If caller does not specify local filename, figure it out from the download URL: + if not local_filename: + filename = url.split('/')[-1] + filename = re.sub('\\?dl=1$', '', filename) # Remove the Dropbox 'force download' parameter + else: + # Otherwise, use the specified local_filename: + filename = local_filename + + destination_path = os.path.join(save_dir, filename) + + if verbose: + logger.info(f'Downloading {url} to {destination_path}...') + + # Check to see if file already exists, if so, simply return (quietly) unless force=True, in which case we remove + # destination file and download fresh copy. + if os.path.exists(destination_path): + if verbose: + logger.info(f'{destination_path} already exists!') + if not force: + if verbose: + logger.info(f'Skipping download.') + return destination_path + if verbose: + logger.info(f'force=True, removing {destination_path}; fetching fresh copy...') + os.remove(destination_path) + + with TqdmUpTo(unit='B', unit_scale=True, unit_divisor=1024, miniters=1, desc=filename) as t: + urlretrieve(url, filename=destination_path, reporthook=t.update_to) + + if md5: + md5_computed = compute_md5(destination_path) + assert md5_computed == md5, f'{destination_path} does not match checksum! Expecting {md5} got {md5_computed}.' 
+ + return destination_path + + if __name__ == '__main__': parser = argparse.ArgumentParser(description='Run Anserini regression tests.') parser.add_argument('--regression', required=True, help='Name of the regression test.') parser.add_argument('--corpus-path', dest='corpus_path', default='', help='Override corpus path from YAML') + parser.add_argument('--download', dest='download', action='store_true', help='Download corpus.') parser.add_argument('--index', dest='index', action='store_true', help='Build index.') parser.add_argument('--index-threads', type=int, default=-1, help='Override number of indexing threads from YAML') parser.add_argument('--verify', dest='verify', action='store_true', help='Verify index statistics.') @@ -194,6 +266,32 @@ def run_search(cmd): with open('src/main/resources/regression/{}.yaml'.format(args.regression)) as f: yaml_data = yaml.safe_load(f) + if args.download: + logger.info('='*10 + ' Downloading Corpus ' + '='*10) + if not yaml_data['download_url']: + raise ValueError('Corpus download URL unknown!') + url = yaml_data['download_url'] + download_url(url, 'collections', md5=yaml_data['download_checksum']) + + filename = url.split('/')[-1] + local_tarball = os.path.join('collections', filename) + logger.info(f'Extracting {local_tarball}...') + tarball = tarfile.open(local_tarball) + tarball.extractall('collections') + tarball.close() + + # e.g., MS MARCO V2: need to rename the corpus + if 'download_corpus' in yaml_data: + src = os.path.join('collections', yaml_data['download_corpus']) + dest = os.path.join('collections', yaml_data['corpus']) + logger.info(f'Renaming {src} to {dest}') + os.chmod(src, stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR) + os.rename(src, dest) + + path = os.path.join('collections', yaml_data['corpus']) + logger.info(f'Corpus path is {path}') + args.corpus_path = path + # Build indexes.
if args.index: logger.info('='*10 + ' Indexing ' + '='*10) diff --git a/src/main/resources/docgen/templates/dl19-doc-segmented-unicoil-noexp.template b/src/main/resources/docgen/templates/dl19-doc-segmented-unicoil-noexp.template index 17b220afad..f17dc6a706 100644 --- a/src/main/resources/docgen/templates/dl19-doc-segmented-unicoil-noexp.template +++ b/src/main/resources/docgen/templates/dl19-doc-segmented-unicoil-noexp.template @@ -20,11 +20,18 @@ From one of our Waterloo servers (e.g., `orca`), the following command will perf python src/main/python/run_regression.py --index --verify --search --regression ${test_name} ``` -## Corpus Download - -We make available a version of the MS MARCO segmented document corpus that has already been processed with uniCOIL, i.e., gone through document expansion and term reweighting. +We make available a version of the MS MARCO document corpus that has already been processed with uniCOIL, i.e., we have performed model inference on every document and stored the output sparse vectors. Thus, no neural inference is involved. -For details on how to train uniCOIL and perform inference, please see [this guide](https://github.com/luyug/COIL/tree/main/uniCOIL). + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name} +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download Download the corpus and unpack into `collections/`: @@ -34,16 +41,13 @@ tar xvf collections/msmarco-doc-segmented-unicoil-noexp.tar -C collections/ ``` To confirm, `msmarco-doc-segmented-unicoil-noexp.tar` is 11 GB and has MD5 checksum `11b226e1cacd9c8ae0a660fd14cdd710`. 
- -With the corpus downloaded, the following command will perform the complete regression, end to end, on any machine: +With the corpus downloaded, the following command will perform the remaining steps below: ```bash python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \ --corpus-path collections/${corpus} ``` -Alternatively, you can simply copy/paste from the commands below and obtain the same results. - ## Indexing Sample indexing command: diff --git a/src/main/resources/docgen/templates/dl19-doc-segmented-unicoil.template b/src/main/resources/docgen/templates/dl19-doc-segmented-unicoil.template index 2f8610d0e7..e2f27ced45 100644 --- a/src/main/resources/docgen/templates/dl19-doc-segmented-unicoil.template +++ b/src/main/resources/docgen/templates/dl19-doc-segmented-unicoil.template @@ -20,11 +20,18 @@ From one of our Waterloo servers (e.g., `orca`), the following command will perf python src/main/python/run_regression.py --index --verify --search --regression ${test_name} ``` -## Corpus Download - -We make available a version of the MS MARCO segmented document corpus that has already been processed with uniCOIL, i.e., gone through document expansion and term reweighting. +We make available a version of the MS MARCO document corpus that has already been processed with uniCOIL, i.e., we have applied doc2query-T5 expansions, performed model inference on every document, and stored the output sparse vectors. Thus, no neural inference is involved. -For details on how to train uniCOIL and perform inference, please see [this guide](https://github.com/luyug/COIL/tree/main/uniCOIL). 
+ +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name} +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download Download the corpus and unpack into `collections/`: @@ -34,16 +41,13 @@ tar xvf collections/msmarco-doc-segmented-unicoil.tar -C collections/ ``` To confirm, `msmarco-doc-segmented-unicoil.tar` is 19 GB and has MD5 checksum `6a00e2c0c375cb1e52c83ae5ac377ebb`. - -With the corpus downloaded, the following command will perform the complete regression, end to end, on any machine: +With the corpus downloaded, the following command will perform the remaining steps below: ```bash python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \ --corpus-path collections/${corpus} ``` -Alternatively, you can simply copy/paste from the commands below and obtain the same results. - ## Indexing Sample indexing command: diff --git a/src/main/resources/docgen/templates/dl19-passage-unicoil-noexp.template b/src/main/resources/docgen/templates/dl19-passage-unicoil-noexp.template index c012e6fb2d..4b13b3446b 100644 --- a/src/main/resources/docgen/templates/dl19-passage-unicoil-noexp.template +++ b/src/main/resources/docgen/templates/dl19-passage-unicoil-noexp.template @@ -22,11 +22,18 @@ From one of our Waterloo servers (e.g., `orca`), the following command will perf python src/main/python/run_regression.py --index --verify --search --regression ${test_name} ``` -## Corpus Download - -We make available a version of the MS MARCO passage corpus that has already been processed with uniCOIL, i.e., gone through document expansion and term reweighting. 
+We make available a version of the MS MARCO passage corpus that has already been processed with uniCOIL, i.e., we have performed model inference on every document and stored the output sparse vectors. Thus, no neural inference is involved. -For details on how to train uniCOIL and perform inference, please see [this guide](https://github.com/luyug/COIL/tree/main/uniCOIL). + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name} +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download Download the corpus and unpack into `collections/`: @@ -36,16 +43,13 @@ tar xvf collections/msmarco-passage-unicoil-noexp.tar -C collections/ ``` To confirm, `msmarco-passage-unicoil-noexp.tar` is 2.7 GB and has MD5 checksum `f17ddd8c7c00ff121c3c3b147d2e17d8`. - -With the corpus downloaded, the following command will perform the complete regression, end to end, on any machine: +With the corpus downloaded, the following command will perform the remaining steps below: ```bash python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \ --corpus-path collections/${corpus} ``` -Alternatively, you can simply copy/paste from the commands below and obtain the same results. 
- ## Indexing Sample indexing command: diff --git a/src/main/resources/docgen/templates/dl19-passage-unicoil.template b/src/main/resources/docgen/templates/dl19-passage-unicoil.template index 987a022637..fd038c60fd 100644 --- a/src/main/resources/docgen/templates/dl19-passage-unicoil.template +++ b/src/main/resources/docgen/templates/dl19-passage-unicoil.template @@ -22,11 +22,18 @@ From one of our Waterloo servers (e.g., `orca`), the following command will perf python src/main/python/run_regression.py --index --verify --search --regression ${test_name} ``` -## Corpus Download - -We make available a version of the MS MARCO passage corpus that has already been processed with uniCOIL, i.e., gone through document expansion and term reweighting. +We make available a version of the MS MARCO passage corpus that has already been processed with uniCOIL, i.e., we have applied doc2query-T5 expansions, performed model inference on every document, and stored the output sparse vectors. Thus, no neural inference is involved. -For details on how to train uniCOIL and perform inference, please see [this guide](https://github.com/luyug/COIL/tree/main/uniCOIL). + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name} +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download Download the corpus and unpack into `collections/`: @@ -36,16 +43,13 @@ tar xvf collections/msmarco-passage-unicoil.tar -C collections/ ``` To confirm, `msmarco-passage-unicoil.tar` is 3.4 GB and has MD5 checksum `78eef752c78c8691f7d61600ceed306f`. 
- -With the corpus downloaded, the following command will perform the complete regression, end to end, on any machine: +With the corpus downloaded, the following command will perform the remaining steps below: ```bash python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \ --corpus-path collections/${corpus} ``` -Alternatively, you can simply copy/paste from the commands below and obtain the same results. - ## Indexing Sample indexing command: diff --git a/src/main/resources/docgen/templates/dl20-doc-segmented-unicoil-noexp.template b/src/main/resources/docgen/templates/dl20-doc-segmented-unicoil-noexp.template index 42cd87c4ac..22f4b3f2b6 100644 --- a/src/main/resources/docgen/templates/dl20-doc-segmented-unicoil-noexp.template +++ b/src/main/resources/docgen/templates/dl20-doc-segmented-unicoil-noexp.template @@ -20,11 +20,18 @@ From one of our Waterloo servers (e.g., `orca`), the following command will perf python src/main/python/run_regression.py --index --verify --search --regression ${test_name} ``` -## Corpus Download - -We make available a version of the MS MARCO segmented document corpus that has already been processed with uniCOIL, i.e., gone through document expansion and term reweighting. +We make available a version of the MS MARCO document corpus that has already been processed with uniCOIL, i.e., we have performed model inference on every document and stored the output sparse vectors. Thus, no neural inference is involved. -For details on how to train uniCOIL and perform inference, please see [this guide](https://github.com/luyug/COIL/tree/main/uniCOIL). 
+ +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name} +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download Download the corpus and unpack into `collections/`: @@ -34,16 +41,13 @@ tar xvf collections/msmarco-doc-segmented-unicoil-noexp.tar -C collections/ ``` To confirm, `msmarco-doc-segmented-unicoil-noexp.tar` is 11 GB and has MD5 checksum `11b226e1cacd9c8ae0a660fd14cdd710`. - -With the corpus downloaded, the following command will perform the complete regression, end to end, on any machine: +With the corpus downloaded, the following command will perform the remaining steps below: ```bash python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \ --corpus-path collections/${corpus} ``` -Alternatively, you can simply copy/paste from the commands below and obtain the same results. - ## Indexing Sample indexing command: diff --git a/src/main/resources/docgen/templates/dl20-doc-segmented-unicoil.template b/src/main/resources/docgen/templates/dl20-doc-segmented-unicoil.template index 83b69beca4..bbd6489e7f 100644 --- a/src/main/resources/docgen/templates/dl20-doc-segmented-unicoil.template +++ b/src/main/resources/docgen/templates/dl20-doc-segmented-unicoil.template @@ -20,11 +20,18 @@ From one of our Waterloo servers (e.g., `orca`), the following command will perf python src/main/python/run_regression.py --index --verify --search --regression ${test_name} ``` -## Corpus Download - -We make available a version of the MS MARCO segmented document corpus that has already been processed with uniCOIL, i.e., gone through document expansion and term reweighting. 
+We make available a version of the MS MARCO document corpus that has already been processed with uniCOIL, i.e., we have applied doc2query-T5 expansions, performed model inference on every document, and stored the output sparse vectors. Thus, no neural inference is involved. -For details on how to train uniCOIL and perform inference, please see [this guide](https://github.com/luyug/COIL/tree/main/uniCOIL). + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name} +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download Download the corpus and unpack into `collections/`: @@ -34,16 +41,13 @@ tar xvf collections/msmarco-doc-segmented-unicoil.tar -C collections/ ``` To confirm, `msmarco-doc-segmented-unicoil.tar` is 19 GB and has MD5 checksum `6a00e2c0c375cb1e52c83ae5ac377ebb`. - -With the corpus downloaded, the following command will perform the complete regression, end to end, on any machine: +With the corpus downloaded, the following command will perform the remaining steps below: ```bash python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \ --corpus-path collections/${corpus} ``` -Alternatively, you can simply copy/paste from the commands below and obtain the same results. 
- ## Indexing Sample indexing command: diff --git a/src/main/resources/docgen/templates/dl20-passage-unicoil-noexp.template b/src/main/resources/docgen/templates/dl20-passage-unicoil-noexp.template index e38f7df879..2e2142acaa 100644 --- a/src/main/resources/docgen/templates/dl20-passage-unicoil-noexp.template +++ b/src/main/resources/docgen/templates/dl20-passage-unicoil-noexp.template @@ -22,11 +22,18 @@ From one of our Waterloo servers (e.g., `orca`), the following command will perf python src/main/python/run_regression.py --index --verify --search --regression ${test_name} ``` -## Corpus Download - -We make available a version of the MS MARCO passage corpus that has already been processed with uniCOIL, i.e., gone through document expansion and term reweighting. +We make available a version of the MS MARCO passage corpus that has already been processed with uniCOIL, i.e., we have performed model inference on every document and stored the output sparse vectors. Thus, no neural inference is involved. -For details on how to train uniCOIL and perform inference, please see [this guide](https://github.com/luyug/COIL/tree/main/uniCOIL). + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name} +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download Download the corpus and unpack into `collections/`: @@ -36,16 +43,13 @@ tar xvf collections/msmarco-passage-unicoil-noexp.tar -C collections/ ``` To confirm, `msmarco-passage-unicoil-noexp.tar` is 2.7 GB and has MD5 checksum `f17ddd8c7c00ff121c3c3b147d2e17d8`. 
- -With the corpus downloaded, the following command will perform the complete regression, end to end, on any machine: +With the corpus downloaded, the following command will perform the remaining steps below: ```bash python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \ --corpus-path collections/${corpus} ``` -Alternatively, you can simply copy/paste from the commands below and obtain the same results. - ## Indexing Sample indexing command: diff --git a/src/main/resources/docgen/templates/dl20-passage-unicoil.template b/src/main/resources/docgen/templates/dl20-passage-unicoil.template index 9905eb9b38..9457f31e87 100644 --- a/src/main/resources/docgen/templates/dl20-passage-unicoil.template +++ b/src/main/resources/docgen/templates/dl20-passage-unicoil.template @@ -22,11 +22,18 @@ From one of our Waterloo servers (e.g., `orca`), the following command will perf python src/main/python/run_regression.py --index --verify --search --regression ${test_name} ``` -## Corpus Download - -We make available a version of the MS MARCO passage corpus that has already been processed with uniCOIL, i.e., gone through document expansion and term reweighting. +We make available a version of the MS MARCO passage corpus that has already been processed with uniCOIL, i.e., we have applied doc2query-T5 expansions, performed model inference on every document, and stored the output sparse vectors. Thus, no neural inference is involved. -For details on how to train uniCOIL and perform inference, please see [this guide](https://github.com/luyug/COIL/tree/main/uniCOIL). 
+ +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name} +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download Download the corpus and unpack into `collections/`: @@ -36,16 +43,13 @@ tar xvf collections/msmarco-passage-unicoil.tar -C collections/ ``` To confirm, `msmarco-passage-unicoil.tar` is 3.4 GB and has MD5 checksum `78eef752c78c8691f7d61600ceed306f`. - -With the corpus downloaded, the following command will perform the complete regression, end to end, on any machine: +With the corpus downloaded, the following command will perform the remaining steps below: ```bash python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \ --corpus-path collections/${corpus} ``` -Alternatively, you can simply copy/paste from the commands below and obtain the same results. 
- ## Indexing Sample indexing command: diff --git a/src/main/resources/docgen/templates/dl21-doc-segmented-unicoil-0shot-v2.template b/src/main/resources/docgen/templates/dl21-doc-segmented-unicoil-0shot-v2.template index 376c778c04..24db4231d8 100644 --- a/src/main/resources/docgen/templates/dl21-doc-segmented-unicoil-0shot-v2.template +++ b/src/main/resources/docgen/templates/dl21-doc-segmented-unicoil-0shot-v2.template @@ -23,15 +23,26 @@ Note that this page is automatically generated from [this template](${template}) From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: -``` +```bash python src/main/python/run_regression.py --index --verify --search --regression ${test_name} ``` -## Corpus +We make available a version of the MS MARCO document corpus that has already been processed with uniCOIL (per above), i.e., we have applied doc2query-T5 expansions, performed model inference on every document, and stored the output sparse vectors. +Thus, no neural inference is involved. -Download, unpack, and prepare the corpus: +From any machine, the following command will download the corpus and perform the complete regression, end to end: +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name} ``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download, unpack, and prepare the corpus: + +```bash # Download wget https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/data/msmarco_v2_doc_segmented_unicoil_0shot_v2.tar -P collections/ @@ -43,16 +54,22 @@ mv collections/msmarco_v2_doc_segmented_unicoil_0shot_v2 collections/msmarco-v2- ``` To confirm, `msmarco_v2_doc_segmented_unicoil_0shot_v2.tar` is 72 GB and has an MD5 checksum of `c5639748c2cbad0152e10b0ebde3b804`. 
+With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \ + --corpus-path collections/${corpus} +``` ## Indexing Sample indexing command: -``` +```bash ${index_cmds} ``` -The path `/path/to/msmarco-v2-doc-segmented-unicoil-0shot/` should point to the corpus downloaded above. +The path `/path/to/msmarco-v2-doc-segmented-unicoil-0shot-v2/` should point to the corpus downloaded above. The important indexing options to note here are `-impact -pretokenized`: the first tells Anserini not to encode BM25 doclengths into Lucene's norms (which is the default) and the second option says not to apply any additional tokenization on the uniCOIL tokens. Upon completion, we should have an index with 124,131,414 documents. @@ -67,13 +84,13 @@ The original data can be found [here](https://trec.nist.gov/data/deep2021.html). After indexing has completed, you should be able to perform retrieval as follows: -``` +```bash ${ranking_cmds} ``` Evaluation can be performed using `trec_eval`: -``` +```bash ${eval_cmds} ``` diff --git a/src/main/resources/docgen/templates/dl21-doc-segmented-unicoil-0shot.template b/src/main/resources/docgen/templates/dl21-doc-segmented-unicoil-0shot.template index 0bbd29928e..2b939bc0ff 100644 --- a/src/main/resources/docgen/templates/dl21-doc-segmented-unicoil-0shot.template +++ b/src/main/resources/docgen/templates/dl21-doc-segmented-unicoil-0shot.template @@ -22,15 +22,26 @@ Note that this page is automatically generated from [this template](${template}) From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: -``` +```bash python src/main/python/run_regression.py --index --verify --search --regression ${test_name} ``` -## Corpus +We make available a version of the MS MARCO document corpus that has already been processed with uniCOIL (per above), i.e., we 
have applied doc2query-T5 expansions, performed model inference on every document, and stored the output sparse vectors. +Thus, no neural inference is involved. -Download, unpack, and prepare the corpus: +From any machine, the following command will download the corpus and perform the complete regression, end to end: +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name} ``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download, unpack, and prepare the corpus: + +```bash # Download wget https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/data/msmarco_v2_doc_segmented_unicoil_0shot.tar -P collections/ @@ -42,12 +53,18 @@ mv collections/msmarco_v2_doc_segmented_unicoil_0shot collections/msmarco-v2-doc ``` To confirm, `msmarco_v2_doc_segmented_unicoil_0shot.tar` is 62 GB and has an MD5 checksum of `889db095113cc4fe152382ccff73304a`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \ + --corpus-path collections/${corpus} +``` ## Indexing Sample indexing command: -``` +```bash ${index_cmds} ``` @@ -66,13 +83,13 @@ The original data can be found [here](https://trec.nist.gov/data/deep2021.html). 
After indexing has completed, you should be able to perform retrieval as follows: -``` +```bash ${ranking_cmds} ``` Evaluation can be performed using `trec_eval`: -``` +```bash ${eval_cmds} ``` diff --git a/src/main/resources/docgen/templates/dl21-doc-segmented-unicoil-noexp-0shot-v2.template b/src/main/resources/docgen/templates/dl21-doc-segmented-unicoil-noexp-0shot-v2.template index 25ca6a6111..f461cb4921 100644 --- a/src/main/resources/docgen/templates/dl21-doc-segmented-unicoil-noexp-0shot-v2.template +++ b/src/main/resources/docgen/templates/dl21-doc-segmented-unicoil-noexp-0shot-v2.template @@ -23,15 +23,26 @@ Note that this page is automatically generated from [this template](${template}) From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: -``` +```bash python src/main/python/run_regression.py --index --verify --search --regression ${test_name} ``` -## Corpus +We make available a version of the MS MARCO document corpus that has already been processed with uniCOIL (per above), i.e., we have performed model inference on every document and stored the output sparse vectors. +Thus, no neural inference is involved. -Download, unpack, and prepare the corpus: +From any machine, the following command will download the corpus and perform the complete regression, end to end: +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name} ``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. 
+ +## Corpus Download + +Download, unpack, and prepare the corpus: + +```bash # Download wget https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/data/msmarco_v2_doc_segmented_unicoil_noexp_0shot_v2.tar -P collections/ @@ -43,16 +54,22 @@ mv collections/msmarco_v2_doc_segmented_unicoil_noexp_0shot_v2 collections/msmar ``` To confirm, `msmarco_v2_doc_segmented_unicoil_noexp_0shot_v2.tar` is 55 GB and has an MD5 checksum of `97ba262c497164de1054f357caea0c63`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \ + --corpus-path collections/${corpus} +``` ## Indexing Sample indexing command: -``` +```bash ${index_cmds} ``` -The path `/path/to/msmarco-v2-doc-segmented-unicoil-noexp-0shot/` should point to the corpus downloaded above. +The path `/path/to/msmarco-v2-doc-segmented-unicoil-noexp-0shot-v2/` should point to the corpus downloaded above. The important indexing options to note here are `-impact -pretokenized`: the first tells Anserini not to encode BM25 doclengths into Lucene's norms (which is the default) and the second option says not to apply any additional tokenization on the uniCOIL tokens. Upon completion, we should have an index with 124,131,404 documents. @@ -67,13 +84,13 @@ The original data can be found [here](https://trec.nist.gov/data/deep2021.html). 
After indexing has completed, you should be able to perform retrieval as follows: -``` +```bash ${ranking_cmds} ``` Evaluation can be performed using `trec_eval`: -``` +```bash ${eval_cmds} ``` diff --git a/src/main/resources/docgen/templates/dl21-doc-segmented-unicoil-noexp-0shot.template b/src/main/resources/docgen/templates/dl21-doc-segmented-unicoil-noexp-0shot.template index 084e87e199..96f39124a2 100644 --- a/src/main/resources/docgen/templates/dl21-doc-segmented-unicoil-noexp-0shot.template +++ b/src/main/resources/docgen/templates/dl21-doc-segmented-unicoil-noexp-0shot.template @@ -22,15 +22,26 @@ Note that this page is automatically generated from [this template](${template}) From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: -``` +```bash python src/main/python/run_regression.py --index --verify --search --regression ${test_name} ``` -## Corpus +We make available a version of the MS MARCO document corpus that has already been processed with uniCOIL (per above), i.e., we have performed model inference on every document and stored the output sparse vectors. +Thus, no neural inference is involved. -Download, unpack, and prepare the corpus: +From any machine, the following command will download the corpus and perform the complete regression, end to end: +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name} ``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. 
+ +## Corpus Download + +Download, unpack, and prepare the corpus: + +```bash # Download wget https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/data/msmarco_v2_doc_segmented_unicoil_noexp_0shot_v2.tar -P collections/ @@ -42,12 +53,18 @@ mv collections/msmarco_v2_doc_segmented_unicoil_noexp_0shot_v2 collections/msmar ``` To confirm, `msmarco_v2_doc_segmented_unicoil_noexp_0shot_v2.tar` is 55 GB and has an MD5 checksum of `97ba262c497164de1054f357caea0c63`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \ + --corpus-path collections/${corpus} +``` ## Indexing Sample indexing command: -``` +```bash ${index_cmds} ``` @@ -66,13 +83,13 @@ The original data can be found [here](https://trec.nist.gov/data/deep2021.html). After indexing has completed, you should be able to perform retrieval as follows: -``` +```bash ${ranking_cmds} ``` Evaluation can be performed using `trec_eval`: -``` +```bash ${eval_cmds} ``` diff --git a/src/main/resources/docgen/templates/dl21-passage-unicoil-0shot.template b/src/main/resources/docgen/templates/dl21-passage-unicoil-0shot.template index 1b25a77d9a..f94e21ab2e 100644 --- a/src/main/resources/docgen/templates/dl21-passage-unicoil-0shot.template +++ b/src/main/resources/docgen/templates/dl21-passage-unicoil-0shot.template @@ -17,15 +17,26 @@ Note that this page is automatically generated from [this template](${template}) From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: -``` +```bash python src/main/python/run_regression.py --index --verify --search --regression ${test_name} ``` -## Corpus +We make available a version of the MS MARCO passage corpus that has already been processed with uniCOIL, i.e., we have applied doc2query-T5 expansions, performed model inference on every document, and stored the output
sparse vectors. +Thus, no neural inference is involved. -Download, unpack, and prepare the corpus: +From any machine, the following command will download the corpus and perform the complete regression, end to end: +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name} ``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download, unpack, and prepare the corpus: + +```bash # Download wget https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/data/msmarco_v2_passage_unicoil_0shot.tar -P collections/ @@ -37,12 +48,18 @@ mv collections/msmarco_v2_passage_unicoil_0shot collections/msmarco-v2-passage-u ``` To confirm, `msmarco_v2_passage_unicoil_0shot.tar` is 41 GB and has an MD5 checksum of `1949a00bfd5e1f1a230a04bbc1f01539`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \ + --corpus-path collections/${corpus} +``` ## Indexing Sample indexing command: -``` +```bash ${index_cmds} ``` @@ -61,13 +78,13 @@ The original data can be found [here](https://trec.nist.gov/data/deep2021.html). 
After indexing has completed, you should be able to perform retrieval as follows: -``` +```bash ${ranking_cmds} ``` Evaluation can be performed using `trec_eval`: -``` +```bash ${eval_cmds} ``` diff --git a/src/main/resources/docgen/templates/dl21-passage-unicoil-noexp-0shot.template b/src/main/resources/docgen/templates/dl21-passage-unicoil-noexp-0shot.template index 32c95aab3f..1aafe2dfa6 100644 --- a/src/main/resources/docgen/templates/dl21-passage-unicoil-noexp-0shot.template +++ b/src/main/resources/docgen/templates/dl21-passage-unicoil-noexp-0shot.template @@ -17,15 +17,26 @@ Note that this page is automatically generated from [this template](${template}) From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: -``` +```bash python src/main/python/run_regression.py --index --verify --search --regression ${test_name} ``` -## Corpus +We make available a version of the MS MARCO passage corpus that has already been processed with uniCOIL, i.e., we have performed model inference on every document and stored the output sparse vectors. +Thus, no neural inference is involved. -Download, unpack, and prepare the corpus: +From any machine, the following command will download the corpus and perform the complete regression, end to end: +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name} ``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. 
+ +## Corpus Download + +Download, unpack, and prepare the corpus: + +```bash # Download wget https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/data/msmarco_v2_passage_unicoil_noexp_0shot.tar -P collections/ @@ -37,6 +48,12 @@ mv collections/msmarco_v2_passage_unicoil_noexp_0shot collections/msmarco-v2-pas ``` To confirm, `msmarco_v2_passage_unicoil_noexp_0shot.tar` is 24 GB and has an MD5 checksum of `d9cc1ed3049746e68a2c91bf90e5212d`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \ + --corpus-path collections/${corpus} +``` ## Indexing diff --git a/src/main/resources/docgen/templates/msmarco-doc-segmented-unicoil-noexp.template b/src/main/resources/docgen/templates/msmarco-doc-segmented-unicoil-noexp.template index 28231b6442..7b3c439d95 100644 --- a/src/main/resources/docgen/templates/msmarco-doc-segmented-unicoil-noexp.template +++ b/src/main/resources/docgen/templates/msmarco-doc-segmented-unicoil-noexp.template @@ -20,11 +20,18 @@ From one of our Waterloo servers (e.g., `orca`), the following command will perf python src/main/python/run_regression.py --index --verify --search --regression ${test_name} ``` -## Corpus Download - -We make available a version of the MS MARCO segmented document corpus that has already been processed with uniCOIL, i.e., gone through document expansion and term reweighting. +We make available a version of the MS MARCO document corpus that has already been processed with uniCOIL, i.e., we have performed model inference on every document and stored the output sparse vectors. Thus, no neural inference is involved. -For details on how to train uniCOIL and perform inference, please see [this guide](https://github.com/luyug/COIL/tree/main/uniCOIL). 
+ +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name} +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download Download the corpus and unpack into `collections/`: @@ -34,16 +41,13 @@ tar xvf collections/msmarco-doc-segmented-unicoil-noexp.tar -C collections/ ``` To confirm, `msmarco-doc-segmented-unicoil-noexp.tar` is 11 GB and has MD5 checksum `11b226e1cacd9c8ae0a660fd14cdd710`. - -With the corpus downloaded, the following command will perform the complete regression, end to end, on any machine: +With the corpus downloaded, the following command will perform the remaining steps below: ```bash python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \ --corpus-path collections/${corpus} ``` -Alternatively, you can simply copy/paste from the commands below and obtain the same results. - ## Indexing Sample indexing command: diff --git a/src/main/resources/docgen/templates/msmarco-doc-segmented-unicoil.template b/src/main/resources/docgen/templates/msmarco-doc-segmented-unicoil.template index 4a2885906b..166eb6e3d3 100644 --- a/src/main/resources/docgen/templates/msmarco-doc-segmented-unicoil.template +++ b/src/main/resources/docgen/templates/msmarco-doc-segmented-unicoil.template @@ -20,11 +20,18 @@ From one of our Waterloo servers (e.g., `orca`), the following command will perf python src/main/python/run_regression.py --index --verify --search --regression ${test_name} ``` -## Corpus Download - -We make available a version of the MS MARCO segmented document corpus that has already been processed with uniCOIL, i.e., gone through document expansion and term reweighting. 
+We make available a version of the MS MARCO document corpus that has already been processed with uniCOIL, i.e., we have applied doc2query-T5 expansions, performed model inference on every document, and stored the output sparse vectors. Thus, no neural inference is involved. -For details on how to train uniCOIL and perform inference, please see [this guide](https://github.com/luyug/COIL/tree/main/uniCOIL). + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name} +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download Download the corpus and unpack into `collections/`: @@ -34,16 +41,13 @@ tar xvf collections/msmarco-doc-segmented-unicoil.tar -C collections/ ``` To confirm, `msmarco-doc-segmented-unicoil.tar` is 19 GB and has MD5 checksum `6a00e2c0c375cb1e52c83ae5ac377ebb`. - -With the corpus downloaded, the following command will perform the complete regression, end to end, on any machine: +With the corpus downloaded, the following command will perform the remaining steps below: ```bash python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \ --corpus-path collections/${corpus} ``` -Alternatively, you can simply copy/paste from the commands below and obtain the same results. 
- ## Indexing Sample indexing command: diff --git a/src/main/resources/docgen/templates/msmarco-passage-unicoil-noexp.template b/src/main/resources/docgen/templates/msmarco-passage-unicoil-noexp.template index 0374f46ac3..b57eccd4d5 100644 --- a/src/main/resources/docgen/templates/msmarco-passage-unicoil-noexp.template +++ b/src/main/resources/docgen/templates/msmarco-passage-unicoil-noexp.template @@ -19,11 +19,18 @@ From one of our Waterloo servers (e.g., `orca`), the following command will perf python src/main/python/run_regression.py --index --verify --search --regression ${test_name} ``` -## Corpus Download - -We make available a version of the MS MARCO passage corpus that has already been processed with uniCOIL, i.e., gone through document expansion and term reweighting. +We make available a version of the MS MARCO passage corpus that has already been processed with uniCOIL, i.e., we have performed model inference on every document and stored the output sparse vectors. Thus, no neural inference is involved. -For details on how to train uniCOIL and perform inference, please see [this guide](https://github.com/luyug/COIL/tree/main/uniCOIL). + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name} +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download Download the corpus and unpack into `collections/`: @@ -33,16 +40,13 @@ tar xvf collections/msmarco-passage-unicoil-noexp.tar -C collections/ ``` To confirm, `msmarco-passage-unicoil-noexp.tar` is 2.7 GB and has MD5 checksum `f17ddd8c7c00ff121c3c3b147d2e17d8`. 
- -With the corpus downloaded, the following command will perform the complete regression, end to end, on any machine: +With the corpus downloaded, the following command will perform the remaining steps below: ```bash python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \ --corpus-path collections/${corpus} ``` -Alternatively, you can simply copy/paste from the commands below and obtain the same results. - ## Indexing Sample indexing command: diff --git a/src/main/resources/docgen/templates/msmarco-passage-unicoil.template b/src/main/resources/docgen/templates/msmarco-passage-unicoil.template index cd5cf4bf68..96f26c9b4e 100644 --- a/src/main/resources/docgen/templates/msmarco-passage-unicoil.template +++ b/src/main/resources/docgen/templates/msmarco-passage-unicoil.template @@ -16,11 +16,18 @@ From one of our Waterloo servers (e.g., `orca`), the following command will perf python src/main/python/run_regression.py --index --verify --search --regression ${test_name} ``` -## Corpus Download - -We make available a version of the MS MARCO passage corpus that has already been processed with uniCOIL, i.e., gone through document expansion and term reweighting. +We make available a version of the MS MARCO passage corpus that has already been processed with uniCOIL, i.e., we have applied doc2query-T5 expansions, performed model inference on every document, and stored the output sparse vectors. Thus, no neural inference is involved. -For details on how to train uniCOIL and perform inference, please see [this guide](https://github.com/luyug/COIL/tree/main/uniCOIL). 
+ +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name} +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download Download the corpus and unpack into `collections/`: @@ -30,16 +37,13 @@ tar xvf collections/msmarco-passage-unicoil.tar -C collections/ ``` To confirm, `msmarco-passage-unicoil.tar` is 3.4 GB and has MD5 checksum `78eef752c78c8691f7d61600ceed306f`. - -With the corpus downloaded, the following command will perform the complete regression, end to end, on any machine: +With the corpus downloaded, the following command will perform the remaining steps below: ```bash python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \ --corpus-path collections/${corpus} ``` -Alternatively, you can simply copy/paste from the commands below and obtain the same results. 
- ## Indexing Sample indexing command: diff --git a/src/main/resources/docgen/templates/msmarco-v2-doc-segmented-unicoil-0shot-v2.template b/src/main/resources/docgen/templates/msmarco-v2-doc-segmented-unicoil-0shot-v2.template index 2b57ada9d7..89445a029e 100644 --- a/src/main/resources/docgen/templates/msmarco-v2-doc-segmented-unicoil-0shot-v2.template +++ b/src/main/resources/docgen/templates/msmarco-v2-doc-segmented-unicoil-0shot-v2.template @@ -20,15 +20,26 @@ Note that this page is automatically generated from [this template](${template}) From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: -``` +```bash python src/main/python/run_regression.py --index --verify --search --regression ${test_name} ``` -## Corpus +We make available a version of the MS MARCO document corpus that has already been processed with uniCOIL (per above), i.e., we have applied doc2query-T5 expansions, performed model inference on every document, and stored the output sparse vectors. +Thus, no neural inference is involved. -Download, unpack, and prepare the corpus: +From any machine, the following command will download the corpus and perform the complete regression, end to end: +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name} ``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. 
+ +## Corpus Download + +Download, unpack, and prepare the corpus: + +```bash # Download wget https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/data/msmarco_v2_doc_segmented_unicoil_0shot_v2.tar -P collections/ @@ -40,12 +51,18 @@ mv collections/msmarco_v2_doc_segmented_unicoil_0shot_v2 collections/msmarco-v2- ``` To confirm, `msmarco_v2_doc_segmented_unicoil_0shot_v2.tar` is 72 GB and has an MD5 checksum of `c5639748c2cbad0152e10b0ebde3b804`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \ + --corpus-path collections/${corpus} +``` ## Indexing Sample indexing command: -``` +```bash ${index_cmds} ``` @@ -63,13 +80,13 @@ These regression experiments use the [dev queries](../src/main/resources/topics- After indexing has completed, you should be able to perform retrieval as follows: -``` +```bash ${ranking_cmds} ``` Evaluation can be performed using `trec_eval`: -``` +```bash ${eval_cmds} ``` diff --git a/src/main/resources/docgen/templates/msmarco-v2-doc-segmented-unicoil-0shot.template b/src/main/resources/docgen/templates/msmarco-v2-doc-segmented-unicoil-0shot.template index 1163eae014..a448d0ed42 100644 --- a/src/main/resources/docgen/templates/msmarco-v2-doc-segmented-unicoil-0shot.template +++ b/src/main/resources/docgen/templates/msmarco-v2-doc-segmented-unicoil-0shot.template @@ -19,15 +19,26 @@ Note that this page is automatically generated from [this template](${template}) From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: -``` +```bash python src/main/python/run_regression.py --index --verify --search --regression ${test_name} ``` -## Corpus +We make available a version of the MS MARCO document corpus that has already been processed with uniCOIL (per above), i.e., we have applied doc2query-T5 expansions, performed model inference on every 
document, and stored the output sparse vectors. +Thus, no neural inference is involved. -Download, unpack, and prepare the corpus: +From any machine, the following command will download the corpus and perform the complete regression, end to end: +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name} ``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download, unpack, and prepare the corpus: + +```bash # Download wget https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/data/msmarco_v2_doc_segmented_unicoil_0shot.tar -P collections/ @@ -39,12 +50,18 @@ mv collections/msmarco_v2_doc_segmented_unicoil_0shot collections/msmarco-v2-doc ``` To confirm, `msmarco_v2_doc_segmented_unicoil_0shot.tar` is 62 GB and has an MD5 checksum of `889db095113cc4fe152382ccff73304a`. 
+With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \ + --corpus-path collections/${corpus} +``` ## Indexing Sample indexing command: -``` +```bash ${index_cmds} ``` @@ -62,13 +79,13 @@ These regression experiments use the [dev queries](../src/main/resources/topics- After indexing has completed, you should be able to perform retrieval as follows: -``` +```bash ${ranking_cmds} ``` Evaluation can be performed using `trec_eval`: -``` +```bash ${eval_cmds} ``` diff --git a/src/main/resources/docgen/templates/msmarco-v2-doc-segmented-unicoil-noexp-0shot-v2.template b/src/main/resources/docgen/templates/msmarco-v2-doc-segmented-unicoil-noexp-0shot-v2.template index 2e902baf47..013347cdb8 100644 --- a/src/main/resources/docgen/templates/msmarco-v2-doc-segmented-unicoil-noexp-0shot-v2.template +++ b/src/main/resources/docgen/templates/msmarco-v2-doc-segmented-unicoil-noexp-0shot-v2.template @@ -20,15 +20,26 @@ Note that this page is automatically generated from [this template](${template}) From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: -``` +```bash python src/main/python/run_regression.py --index --verify --search --regression ${test_name} ``` -## Corpus +We make available a version of the MS MARCO document corpus that has already been processed with uniCOIL (per above), i.e., we have performed model inference on every document and stored the output sparse vectors. +Thus, no neural inference is involved. 
-Download, unpack, and prepare the corpus: +From any machine, the following command will download the corpus and perform the complete regression, end to end: +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name} ``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download, unpack, and prepare the corpus: + +```bash # Download wget https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/data/msmarco_v2_doc_segmented_unicoil_noexp_0shot_v2.tar -P collections/ @@ -40,12 +51,18 @@ mv collections/msmarco_v2_doc_segmented_unicoil_noexp_0shot_v2 collections/msmar ``` To confirm, `msmarco_v2_doc_segmented_unicoil_noexp_0shot_v2.tar` is 55 GB and has an MD5 checksum of `97ba262c497164de1054f357caea0c63`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \ + --corpus-path collections/${corpus} +``` ## Indexing Sample indexing command: -``` +```bash ${index_cmds} ``` @@ -63,13 +80,13 @@ These regression experiments use the [dev queries](../src/main/resources/topics- After indexing has completed, you should be able to perform retrieval as follows: -``` +```bash ${ranking_cmds} ``` Evaluation can be performed using `trec_eval`: -``` +```bash ${eval_cmds} ``` diff --git a/src/main/resources/docgen/templates/msmarco-v2-doc-segmented-unicoil-noexp-0shot.template b/src/main/resources/docgen/templates/msmarco-v2-doc-segmented-unicoil-noexp-0shot.template index 32709bbaeb..03699a3cc8 100644 --- a/src/main/resources/docgen/templates/msmarco-v2-doc-segmented-unicoil-noexp-0shot.template +++ b/src/main/resources/docgen/templates/msmarco-v2-doc-segmented-unicoil-noexp-0shot.template @@ -19,15 +19,26 @@ Note that this page 
is automatically generated from [this template](${template}) From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: -``` +```bash python src/main/python/run_regression.py --index --verify --search --regression ${test_name} ``` -## Corpus +We make available a version of the MS MARCO document corpus that has already been processed with uniCOIL (per above), i.e., we have performed model inference on every document and stored the output sparse vectors. +Thus, no neural inference is involved. -Download, unpack, and prepare the corpus: +From any machine, the following command will download the corpus and perform the complete regression, end to end: +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name} ``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download, unpack, and prepare the corpus: + +```bash # Download wget https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/data/msmarco_v2_doc_segmented_unicoil_noexp_0shot.tar -P collections/ @@ -39,12 +50,18 @@ mv collections/msmarco_v2_doc_segmented_unicoil_noexp_0shot collections/msmarco- ``` To confirm, `msmarco_v2_doc_segmented_unicoil_noexp_0shot.tar` is 54 GB and has an MD5 checksum of `28261587d6afde56efd8df4f950e7fb4`. 
+With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \ + --corpus-path collections/${corpus} +``` ## Indexing Sample indexing command: -``` +```bash ${index_cmds} ``` @@ -62,13 +79,13 @@ These regression experiments use the [dev queries](../src/main/resources/topics- After indexing has completed, you should be able to perform retrieval as follows: -``` +```bash ${ranking_cmds} ``` Evaluation can be performed using `trec_eval`: -``` +```bash ${eval_cmds} ``` diff --git a/src/main/resources/docgen/templates/msmarco-v2-passage-unicoil-0shot.template b/src/main/resources/docgen/templates/msmarco-v2-passage-unicoil-0shot.template index 8d91c6d09b..a5e456ede1 100644 --- a/src/main/resources/docgen/templates/msmarco-v2-passage-unicoil-0shot.template +++ b/src/main/resources/docgen/templates/msmarco-v2-passage-unicoil-0shot.template @@ -14,15 +14,26 @@ Note that this page is automatically generated from [this template](${template}) From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: -``` +```bash python src/main/python/run_regression.py --index --verify --search --regression ${test_name} ``` -## Corpus +We make available a version of the MS MARCO passage corpus that has already been processed with uniCOIL, i.e., we have applied doc2query-T5 expansions, performed model inference on every document, and stored the output sparse vectors. +Thus, no neural inference is involved. 
-Download, unpack, and prepare the corpus: +From any machine, the following command will download the corpus and perform the complete regression, end to end: +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name} ``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download, unpack, and prepare the corpus: + +```bash # Download wget https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/data/msmarco_v2_passage_unicoil_0shot.tar -P collections/ @@ -34,12 +45,18 @@ mv collections/msmarco_v2_passage_unicoil_0shot collections/msmarco-v2-passage-u ``` To confirm, `msmarco_v2_passage_unicoil_0shot.tar` is 41 GB and has an MD5 checksum of `1949a00bfd5e1f1a230a04bbc1f01539`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \ + --corpus-path collections/${corpus} +``` ## Indexing Sample indexing command: -``` +```bash ${index_cmds} ``` @@ -57,13 +74,13 @@ These regression experiments use the [dev queries](../src/main/resources/topics- After indexing has completed, you should be able to perform retrieval as follows: -``` +```bash ${ranking_cmds} ``` Evaluation can be performed using `trec_eval`: -``` +```bash ${eval_cmds} ``` diff --git a/src/main/resources/docgen/templates/msmarco-v2-passage-unicoil-noexp-0shot.template b/src/main/resources/docgen/templates/msmarco-v2-passage-unicoil-noexp-0shot.template index 1adb7015ce..24b7afd12b 100644 --- a/src/main/resources/docgen/templates/msmarco-v2-passage-unicoil-noexp-0shot.template +++ b/src/main/resources/docgen/templates/msmarco-v2-passage-unicoil-noexp-0shot.template @@ -14,15 +14,26 @@ Note that this page is automatically generated from [this 
template](${template}) From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: -``` +```bash python src/main/python/run_regression.py --index --verify --search --regression ${test_name} ``` -## Corpus +We make available a version of the MS MARCO passage corpus that has already been processed with uniCOIL, i.e., we have performed model inference on every document and stored the output sparse vectors. +Thus, no neural inference is involved. -Download, unpack, and prepare the corpus: +From any machine, the following command will download the corpus and perform the complete regression, end to end: +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name} ``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download, unpack, and prepare the corpus: + +```bash # Download wget https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/data/msmarco_v2_passage_unicoil_noexp_0shot.tar -P collections/ @@ -34,12 +45,18 @@ mv collections/msmarco_v2_passage_unicoil_noexp_0shot collections/msmarco-v2-pas ``` To confirm, `msmarco_v2_passage_unicoil_noexp_0shot.tar` is 24 GB and has an MD5 checksum of `d9cc1ed3049746e68a2c91bf90e5212d`. 
+With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \ + --corpus-path collections/${corpus} +``` ## Indexing Sample indexing command: -``` +```bash ${index_cmds} ``` @@ -57,13 +74,13 @@ These regression experiments use the [dev queries](../src/main/resources/topics- After indexing has completed, you should be able to perform retrieval as follows: -``` +```bash ${ranking_cmds} ``` Evaluation can be performed using `trec_eval`: -``` +```bash ${eval_cmds} ``` diff --git a/src/main/resources/regression/dl19-doc-segmented-unicoil-noexp.yaml b/src/main/resources/regression/dl19-doc-segmented-unicoil-noexp.yaml index 017ce91548..e24df19f20 100644 --- a/src/main/resources/regression/dl19-doc-segmented-unicoil-noexp.yaml +++ b/src/main/resources/regression/dl19-doc-segmented-unicoil-noexp.yaml @@ -2,6 +2,9 @@ corpus: msmarco-doc-segmented-unicoil-noexp corpus_path: collections/msmarco/msmarco-doc-segmented-unicoil-noexp/ +download_url: https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/data/msmarco-doc-segmented-unicoil-noexp.tar +download_checksum: 11b226e1cacd9c8ae0a660fd14cdd710 + index_path: indexes/lucene-index.msmarco-doc-segmented-unicoil-noexp/ collection_class: JsonVectorCollection generator_class: DefaultLuceneDocumentGenerator diff --git a/src/main/resources/regression/dl19-doc-segmented-unicoil.yaml b/src/main/resources/regression/dl19-doc-segmented-unicoil.yaml index c526ed22bc..12088c713f 100644 --- a/src/main/resources/regression/dl19-doc-segmented-unicoil.yaml +++ b/src/main/resources/regression/dl19-doc-segmented-unicoil.yaml @@ -2,6 +2,9 @@ corpus: msmarco-doc-segmented-unicoil corpus_path: collections/msmarco/msmarco-doc-segmented-unicoil/ +download_url: https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/data/msmarco-doc-segmented-unicoil.tar +download_checksum: 6a00e2c0c375cb1e52c83ae5ac377ebb + index_path: 
indexes/lucene-index.msmarco-doc-segmented-unicoil/ collection_class: JsonVectorCollection generator_class: DefaultLuceneDocumentGenerator diff --git a/src/main/resources/regression/dl19-passage-unicoil-noexp.yaml b/src/main/resources/regression/dl19-passage-unicoil-noexp.yaml index 870c3d20cc..9e16a051fa 100644 --- a/src/main/resources/regression/dl19-passage-unicoil-noexp.yaml +++ b/src/main/resources/regression/dl19-passage-unicoil-noexp.yaml @@ -2,6 +2,9 @@ corpus: msmarco-passage-unicoil-noexp corpus_path: collections/msmarco/msmarco-passage-unicoil-noexp/ +download_url: https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/data/msmarco-passage-unicoil-noexp.tar +download_checksum: f17ddd8c7c00ff121c3c3b147d2e17d8 + index_path: indexes/lucene-index.msmarco-passage-unicoil-noexp/ collection_class: JsonVectorCollection generator_class: DefaultLuceneDocumentGenerator diff --git a/src/main/resources/regression/dl19-passage-unicoil.yaml b/src/main/resources/regression/dl19-passage-unicoil.yaml index de19dc6ce2..ffadde3e52 100644 --- a/src/main/resources/regression/dl19-passage-unicoil.yaml +++ b/src/main/resources/regression/dl19-passage-unicoil.yaml @@ -2,6 +2,9 @@ corpus: msmarco-passage-unicoil corpus_path: collections/msmarco/msmarco-passage-unicoil/ +download_url: https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/data/msmarco-passage-unicoil.tar +download_checksum: 78eef752c78c8691f7d61600ceed306f + index_path: indexes/lucene-index.msmarco-passage-unicoil/ collection_class: JsonVectorCollection generator_class: DefaultLuceneDocumentGenerator diff --git a/src/main/resources/regression/dl20-doc-segmented-unicoil-noexp.yaml b/src/main/resources/regression/dl20-doc-segmented-unicoil-noexp.yaml index adf0aeacf1..995d552c05 100644 --- a/src/main/resources/regression/dl20-doc-segmented-unicoil-noexp.yaml +++ b/src/main/resources/regression/dl20-doc-segmented-unicoil-noexp.yaml @@ -2,6 +2,9 @@ corpus: msmarco-doc-segmented-unicoil-noexp corpus_path: 
collections/msmarco/msmarco-doc-segmented-unicoil-noexp/ +download_url: https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/data/msmarco-doc-segmented-unicoil-noexp.tar +download_checksum: 11b226e1cacd9c8ae0a660fd14cdd710 + index_path: indexes/lucene-index.msmarco-doc-segmented-unicoil-noexp/ collection_class: JsonVectorCollection generator_class: DefaultLuceneDocumentGenerator diff --git a/src/main/resources/regression/dl20-doc-segmented-unicoil.yaml b/src/main/resources/regression/dl20-doc-segmented-unicoil.yaml index 306ea01f2d..7e6328312d 100644 --- a/src/main/resources/regression/dl20-doc-segmented-unicoil.yaml +++ b/src/main/resources/regression/dl20-doc-segmented-unicoil.yaml @@ -2,6 +2,9 @@ corpus: msmarco-doc-segmented-unicoil corpus_path: collections/msmarco/msmarco-doc-segmented-unicoil/ +download_url: https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/data/msmarco-doc-segmented-unicoil.tar +download_checksum: 6a00e2c0c375cb1e52c83ae5ac377ebb + index_path: indexes/lucene-index.msmarco-doc-segmented-unicoil/ collection_class: JsonVectorCollection generator_class: DefaultLuceneDocumentGenerator diff --git a/src/main/resources/regression/dl20-passage-unicoil-noexp.yaml b/src/main/resources/regression/dl20-passage-unicoil-noexp.yaml index 99e7b75162..bee354aa32 100644 --- a/src/main/resources/regression/dl20-passage-unicoil-noexp.yaml +++ b/src/main/resources/regression/dl20-passage-unicoil-noexp.yaml @@ -2,6 +2,9 @@ corpus: msmarco-passage-unicoil-noexp corpus_path: collections/msmarco/msmarco-passage-unicoil-noexp/ +download_url: https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/data/msmarco-passage-unicoil-noexp.tar +download_checksum: f17ddd8c7c00ff121c3c3b147d2e17d8 + index_path: indexes/lucene-index.msmarco-passage-unicoil-noexp/ collection_class: JsonVectorCollection generator_class: DefaultLuceneDocumentGenerator diff --git a/src/main/resources/regression/dl20-passage-unicoil.yaml b/src/main/resources/regression/dl20-passage-unicoil.yaml index e4fe7ccf01..6abd02df96 
100644 --- a/src/main/resources/regression/dl20-passage-unicoil.yaml +++ b/src/main/resources/regression/dl20-passage-unicoil.yaml @@ -2,6 +2,9 @@ corpus: msmarco-passage-unicoil corpus_path: collections/msmarco/msmarco-passage-unicoil/ +download_url: https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/data/msmarco-passage-unicoil.tar +download_checksum: 78eef752c78c8691f7d61600ceed306f + index_path: indexes/lucene-index.msmarco-passage-unicoil/ collection_class: JsonVectorCollection generator_class: DefaultLuceneDocumentGenerator diff --git a/src/main/resources/regression/dl21-doc-segmented-unicoil-0shot-v2.yaml b/src/main/resources/regression/dl21-doc-segmented-unicoil-0shot-v2.yaml index 3ac6fb99e5..1e9b4465a4 100644 --- a/src/main/resources/regression/dl21-doc-segmented-unicoil-0shot-v2.yaml +++ b/src/main/resources/regression/dl21-doc-segmented-unicoil-0shot-v2.yaml @@ -2,6 +2,10 @@ corpus: msmarco-v2-doc-segmented-unicoil-0shot-v2 corpus_path: collections/msmarco/msmarco_v2_doc_segmented_unicoil_0shot_v2/ +download_url: https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/data/msmarco_v2_doc_segmented_unicoil_0shot_v2.tar +download_checksum: c5639748c2cbad0152e10b0ebde3b804 +download_corpus: msmarco_v2_doc_segmented_unicoil_0shot_v2 + index_path: indexes/lucene-index.msmarco-v2-doc-segmented-unicoil-0shot-v2/ collection_class: JsonVectorCollection generator_class: DefaultLuceneDocumentGenerator diff --git a/src/main/resources/regression/dl21-doc-segmented-unicoil-0shot.yaml b/src/main/resources/regression/dl21-doc-segmented-unicoil-0shot.yaml index fc258cae15..614b4c6440 100644 --- a/src/main/resources/regression/dl21-doc-segmented-unicoil-0shot.yaml +++ b/src/main/resources/regression/dl21-doc-segmented-unicoil-0shot.yaml @@ -2,6 +2,10 @@ corpus: msmarco-v2-doc-segmented-unicoil-0shot corpus_path: collections/msmarco/msmarco_v2_doc_segmented_unicoil_0shot/ +download_url: https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/data/msmarco_v2_doc_segmented_unicoil_0shot.tar 
+download_checksum: 889db095113cc4fe152382ccff73304a +download_corpus: msmarco_v2_doc_segmented_unicoil_0shot + index_path: indexes/lucene-index.msmarco-v2-doc-segmented-unicoil-0shot/ collection_class: JsonVectorCollection generator_class: DefaultLuceneDocumentGenerator diff --git a/src/main/resources/regression/dl21-doc-segmented-unicoil-noexp-0shot-v2.yaml b/src/main/resources/regression/dl21-doc-segmented-unicoil-noexp-0shot-v2.yaml index 793fab46f4..092c2d1782 100644 --- a/src/main/resources/regression/dl21-doc-segmented-unicoil-noexp-0shot-v2.yaml +++ b/src/main/resources/regression/dl21-doc-segmented-unicoil-noexp-0shot-v2.yaml @@ -2,6 +2,10 @@ corpus: msmarco-v2-doc-segmented-unicoil-noexp-0shot-v2 corpus_path: collections/msmarco/msmarco_v2_doc_segmented_unicoil_noexp_0shot_v2/ +download_url: https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/data/msmarco_v2_doc_segmented_unicoil_noexp_0shot_v2.tar +download_checksum: 97ba262c497164de1054f357caea0c63 +download_corpus: msmarco_v2_doc_segmented_unicoil_noexp_0shot_v2 + index_path: indexes/lucene-index.msmarco-v2-doc-segmented-unicoil-noexp-0shot-v2/ collection_class: JsonVectorCollection generator_class: DefaultLuceneDocumentGenerator diff --git a/src/main/resources/regression/dl21-doc-segmented-unicoil-noexp-0shot.yaml b/src/main/resources/regression/dl21-doc-segmented-unicoil-noexp-0shot.yaml index 3a7a4953b9..0bb953ff4d 100644 --- a/src/main/resources/regression/dl21-doc-segmented-unicoil-noexp-0shot.yaml +++ b/src/main/resources/regression/dl21-doc-segmented-unicoil-noexp-0shot.yaml @@ -2,6 +2,10 @@ corpus: msmarco-v2-doc-segmented-unicoil-noexp-0shot corpus_path: collections/msmarco/msmarco_v2_doc_segmented_unicoil_noexp_0shot/ +download_url: https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/data/msmarco_v2_doc_segmented_unicoil_noexp_0shot.tar +download_checksum: 28261587d6afde56efd8df4f950e7fb4 +download_corpus: msmarco_v2_doc_segmented_unicoil_noexp_0shot + index_path: 
indexes/lucene-index.msmarco-v2-doc-segmented-unicoil-noexp-0shot/ collection_class: JsonVectorCollection generator_class: DefaultLuceneDocumentGenerator diff --git a/src/main/resources/regression/dl21-passage-unicoil-0shot.yaml b/src/main/resources/regression/dl21-passage-unicoil-0shot.yaml index 8980b6debd..e46b4f1c1a 100644 --- a/src/main/resources/regression/dl21-passage-unicoil-0shot.yaml +++ b/src/main/resources/regression/dl21-passage-unicoil-0shot.yaml @@ -2,6 +2,10 @@ corpus: msmarco-v2-passage-unicoil-0shot corpus_path: collections/msmarco/msmarco_v2_passage_unicoil_0shot/ +download_url: https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/data/msmarco_v2_passage_unicoil_0shot.tar +download_checksum: 1949a00bfd5e1f1a230a04bbc1f01539 +download_corpus: msmarco_v2_passage_unicoil_0shot + index_path: indexes/lucene-index.msmarco-v2-passage-unicoil-0shot/ collection_class: JsonVectorCollection generator_class: DefaultLuceneDocumentGenerator diff --git a/src/main/resources/regression/dl21-passage-unicoil-noexp-0shot.yaml b/src/main/resources/regression/dl21-passage-unicoil-noexp-0shot.yaml index e2e3f151c5..bf0c8c7939 100644 --- a/src/main/resources/regression/dl21-passage-unicoil-noexp-0shot.yaml +++ b/src/main/resources/regression/dl21-passage-unicoil-noexp-0shot.yaml @@ -2,6 +2,10 @@ corpus: msmarco-v2-passage-unicoil-noexp-0shot corpus_path: collections/msmarco/msmarco_v2_passage_unicoil_noexp_0shot/ +download_url: https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/data/msmarco_v2_passage_unicoil_noexp_0shot.tar +download_checksum: d9cc1ed3049746e68a2c91bf90e5212d +download_corpus: msmarco_v2_passage_unicoil_noexp_0shot + index_path: indexes/lucene-index.msmarco-v2-passage-unicoil-noexp-0shot/ collection_class: JsonVectorCollection generator_class: DefaultLuceneDocumentGenerator diff --git a/src/main/resources/regression/msmarco-doc-segmented-unicoil-noexp.yaml b/src/main/resources/regression/msmarco-doc-segmented-unicoil-noexp.yaml index 47f8989d78..320f9e9c46 100644 
--- a/src/main/resources/regression/msmarco-doc-segmented-unicoil-noexp.yaml +++ b/src/main/resources/regression/msmarco-doc-segmented-unicoil-noexp.yaml @@ -2,6 +2,9 @@ corpus: msmarco-doc-segmented-unicoil-noexp corpus_path: collections/msmarco/msmarco-doc-segmented-unicoil-noexp/ +download_url: https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/data/msmarco-doc-segmented-unicoil-noexp.tar +download_checksum: 11b226e1cacd9c8ae0a660fd14cdd710 + index_path: indexes/lucene-index.msmarco-doc-segmented-unicoil-noexp/ collection_class: JsonVectorCollection generator_class: DefaultLuceneDocumentGenerator diff --git a/src/main/resources/regression/msmarco-doc-segmented-unicoil.yaml b/src/main/resources/regression/msmarco-doc-segmented-unicoil.yaml index a09adedab5..18895c13b4 100644 --- a/src/main/resources/regression/msmarco-doc-segmented-unicoil.yaml +++ b/src/main/resources/regression/msmarco-doc-segmented-unicoil.yaml @@ -2,6 +2,9 @@ corpus: msmarco-doc-segmented-unicoil corpus_path: collections/msmarco/msmarco-doc-segmented-unicoil/ +download_url: https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/data/msmarco-doc-segmented-unicoil.tar +download_checksum: 6a00e2c0c375cb1e52c83ae5ac377ebb + index_path: indexes/lucene-index.msmarco-doc-segmented-unicoil/ collection_class: JsonVectorCollection generator_class: DefaultLuceneDocumentGenerator diff --git a/src/main/resources/regression/msmarco-passage-unicoil-noexp.yaml b/src/main/resources/regression/msmarco-passage-unicoil-noexp.yaml index a764e01e55..2798dbb1c7 100644 --- a/src/main/resources/regression/msmarco-passage-unicoil-noexp.yaml +++ b/src/main/resources/regression/msmarco-passage-unicoil-noexp.yaml @@ -2,6 +2,9 @@ corpus: msmarco-passage-unicoil-noexp corpus_path: collections/msmarco/msmarco-passage-unicoil-noexp/ +download_url: https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/data/msmarco-passage-unicoil-noexp.tar +download_checksum: f17ddd8c7c00ff121c3c3b147d2e17d8 + index_path: 
indexes/lucene-index.msmarco-passage-unicoil-noexp/ collection_class: JsonVectorCollection generator_class: DefaultLuceneDocumentGenerator diff --git a/src/main/resources/regression/msmarco-passage-unicoil.yaml b/src/main/resources/regression/msmarco-passage-unicoil.yaml index 06b7c025db..e5fda2f78f 100644 --- a/src/main/resources/regression/msmarco-passage-unicoil.yaml +++ b/src/main/resources/regression/msmarco-passage-unicoil.yaml @@ -2,6 +2,9 @@ corpus: msmarco-passage-unicoil corpus_path: collections/msmarco/msmarco-passage-unicoil/ +download_url: https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/data/msmarco-passage-unicoil.tar +download_checksum: 78eef752c78c8691f7d61600ceed306f + index_path: indexes/lucene-index.msmarco-passage-unicoil/ collection_class: JsonVectorCollection generator_class: DefaultLuceneDocumentGenerator diff --git a/src/main/resources/regression/msmarco-v2-doc-segmented-unicoil-0shot-v2.yaml b/src/main/resources/regression/msmarco-v2-doc-segmented-unicoil-0shot-v2.yaml index e6a8728f30..ab00af36b9 100644 --- a/src/main/resources/regression/msmarco-v2-doc-segmented-unicoil-0shot-v2.yaml +++ b/src/main/resources/regression/msmarco-v2-doc-segmented-unicoil-0shot-v2.yaml @@ -2,6 +2,10 @@ corpus: msmarco-v2-doc-segmented-unicoil-0shot-v2 corpus_path: collections/msmarco/msmarco_v2_doc_segmented_unicoil_0shot_v2/ +download_url: https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/data/msmarco_v2_doc_segmented_unicoil_0shot_v2.tar +download_checksum: c5639748c2cbad0152e10b0ebde3b804 +download_corpus: msmarco_v2_doc_segmented_unicoil_0shot_v2 + index_path: indexes/lucene-index.msmarco-v2-doc-segmented-unicoil-0shot-v2/ collection_class: JsonVectorCollection generator_class: DefaultLuceneDocumentGenerator diff --git a/src/main/resources/regression/msmarco-v2-doc-segmented-unicoil-0shot.yaml b/src/main/resources/regression/msmarco-v2-doc-segmented-unicoil-0shot.yaml index 0c46e799b4..8e4e71e603 100644 --- 
a/src/main/resources/regression/msmarco-v2-doc-segmented-unicoil-0shot.yaml +++ b/src/main/resources/regression/msmarco-v2-doc-segmented-unicoil-0shot.yaml @@ -2,6 +2,10 @@ corpus: msmarco-v2-doc-segmented-unicoil-0shot corpus_path: collections/msmarco/msmarco_v2_doc_segmented_unicoil_0shot/ +download_url: https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/data/msmarco_v2_doc_segmented_unicoil_0shot.tar +download_checksum: 889db095113cc4fe152382ccff73304a +download_corpus: msmarco_v2_doc_segmented_unicoil_0shot + index_path: indexes/lucene-index.msmarco-v2-doc-segmented-unicoil-0shot/ collection_class: JsonVectorCollection generator_class: DefaultLuceneDocumentGenerator diff --git a/src/main/resources/regression/msmarco-v2-doc-segmented-unicoil-noexp-0shot-v2.yaml b/src/main/resources/regression/msmarco-v2-doc-segmented-unicoil-noexp-0shot-v2.yaml index 3a2aab6e98..86e623a7ed 100644 --- a/src/main/resources/regression/msmarco-v2-doc-segmented-unicoil-noexp-0shot-v2.yaml +++ b/src/main/resources/regression/msmarco-v2-doc-segmented-unicoil-noexp-0shot-v2.yaml @@ -2,6 +2,10 @@ corpus: msmarco-v2-doc-segmented-unicoil-noexp-0shot-v2 corpus_path: collections/msmarco/msmarco_v2_doc_segmented_unicoil_noexp_0shot_v2/ +download_url: https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/data/msmarco_v2_doc_segmented_unicoil_noexp_0shot_v2.tar +download_checksum: 97ba262c497164de1054f357caea0c63 +download_corpus: msmarco_v2_doc_segmented_unicoil_noexp_0shot_v2 + index_path: indexes/lucene-index.msmarco-v2-doc-segmented-unicoil-noexp-0shot-v2/ collection_class: JsonVectorCollection generator_class: DefaultLuceneDocumentGenerator diff --git a/src/main/resources/regression/msmarco-v2-doc-segmented-unicoil-noexp-0shot.yaml b/src/main/resources/regression/msmarco-v2-doc-segmented-unicoil-noexp-0shot.yaml index fb20eba3d2..e2bf5d87eb 100644 --- a/src/main/resources/regression/msmarco-v2-doc-segmented-unicoil-noexp-0shot.yaml +++ 
b/src/main/resources/regression/msmarco-v2-doc-segmented-unicoil-noexp-0shot.yaml @@ -2,6 +2,10 @@ corpus: msmarco-v2-doc-segmented-unicoil-noexp-0shot corpus_path: collections/msmarco/msmarco_v2_doc_segmented_unicoil_noexp_0shot/ +download_url: https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/data/msmarco_v2_doc_segmented_unicoil_noexp_0shot.tar +download_checksum: 28261587d6afde56efd8df4f950e7fb4 +download_corpus: msmarco_v2_doc_segmented_unicoil_noexp_0shot + index_path: indexes/lucene-index.msmarco-v2-doc-segmented-unicoil-noexp-0shot/ collection_class: JsonVectorCollection generator_class: DefaultLuceneDocumentGenerator diff --git a/src/main/resources/regression/msmarco-v2-passage-unicoil-0shot.yaml b/src/main/resources/regression/msmarco-v2-passage-unicoil-0shot.yaml index 44eae58b22..fc15f863cc 100644 --- a/src/main/resources/regression/msmarco-v2-passage-unicoil-0shot.yaml +++ b/src/main/resources/regression/msmarco-v2-passage-unicoil-0shot.yaml @@ -2,6 +2,10 @@ corpus: msmarco-v2-passage-unicoil-0shot corpus_path: collections/msmarco/msmarco_v2_passage_unicoil_0shot/ +download_url: https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/data/msmarco_v2_passage_unicoil_0shot.tar +download_checksum: 1949a00bfd5e1f1a230a04bbc1f01539 +download_corpus: msmarco_v2_passage_unicoil_0shot + index_path: indexes/lucene-index.msmarco-v2-passage-unicoil-0shot/ collection_class: JsonVectorCollection generator_class: DefaultLuceneDocumentGenerator diff --git a/src/main/resources/regression/msmarco-v2-passage-unicoil-noexp-0shot.yaml b/src/main/resources/regression/msmarco-v2-passage-unicoil-noexp-0shot.yaml index 8218eacf36..04da480dc0 100644 --- a/src/main/resources/regression/msmarco-v2-passage-unicoil-noexp-0shot.yaml +++ b/src/main/resources/regression/msmarco-v2-passage-unicoil-noexp-0shot.yaml @@ -2,6 +2,10 @@ corpus: msmarco-v2-passage-unicoil-noexp-0shot corpus_path: collections/msmarco/msmarco_v2_passage_unicoil_noexp_0shot/ +download_url: 
https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/data/msmarco_v2_passage_unicoil_noexp_0shot.tar +download_checksum: d9cc1ed3049746e68a2c91bf90e5212d +download_corpus: msmarco_v2_passage_unicoil_noexp_0shot + index_path: indexes/lucene-index.msmarco-v2-passage-unicoil-noexp-0shot/ collection_class: JsonVectorCollection generator_class: DefaultLuceneDocumentGenerator diff --git a/src/test/java/io/anserini/doc/DataModel.java b/src/test/java/io/anserini/doc/DataModel.java index 99a91213f3..eede3a99ed 100755 --- a/src/test/java/io/anserini/doc/DataModel.java +++ b/src/test/java/io/anserini/doc/DataModel.java @@ -47,6 +47,34 @@ public void setCorpus_path(String corpus_path) { this.corpus_path = corpus_path; } + private String download_url; + private String download_checksum; + private String download_corpus; + + public String getDownload_url() { + return download_url; + } + + public void setDownload_url(String download_url) { + this.download_url = download_url; + } + + public String getDownload_checksum() { + return download_checksum; + } + + public void setDownload_checksum(String download_checksum) { + this.download_checksum = download_checksum; + } + + public String getDownload_corpus() { + return download_corpus; + } + + public void setDownload_corpus(String download_corpus) { + this.download_corpus = download_corpus; + } + private String index_path; private String collection_class; private String generator_class;