feat!: Use VRS 2.0 models as primary output (#57)

ave-dcd · Sep 16, 2024 · e491d7c
1 parent 5fa9f29
commit e491d7c
Show file tree

Hide file tree

Showing 26 changed files with 2,049 additions and 1,705 deletions.
diff --git a/.github/workflows/checks.yaml b/.github/workflows/checks.yaml
@@ -21,7 +21,7 @@ jobs:
 
       - name: Run tests
         # only run tests known to work in CI
-        run: python3 -m pytest tests/test_mavedb_data.py tests/test_vrs_map.py
+        run: python3 -m pytest tests/test_mavedb_data.py tests/test_vrs_map.py tests/test_annotate.py
   lint:
     runs-on: ubuntu-latest
     steps:

diff --git a/.gitignore b/.gitignore
@@ -165,3 +165,4 @@ cython_debug/
 # mapping data/output
 notebooks/analysis/analysis_files
 notebooks/analysis/mavedb_files
+**/blat
diff --git a/README.md b/README.md
@@ -8,7 +8,7 @@
 
 <!-- description -->
 
-This library implements a novel method for mapping [MaveDB scoreset data](https://mavedb.org/) to [GA4GH Variation Representation Specification (VRS)](https://vrs.ga4gh.org/en/stable/) objects, enhancing interoperability for genomic medicine applications. See [Arbesfeld et. al. (2023)](https://www.biorxiv.org/content/10.1101/2023.06.20.545702v1) for a preprint edition of the mapping manuscript, or [download the resulting mappings directly](https://mavedb-mapping.s3.us-east-2.amazonaws.com/mappings.tar.gz).
+This library implements a novel method for mapping [MaveDB scoreset data](https://mavedb.org/) to [GA4GH Variation Representation Specification (VRS 2.0)](https://vrs.ga4gh.org/en/2.x/) objects, enhancing interoperability for genomic medicine applications. See [Arbesfeld et al. (2023)](https://www.biorxiv.org/content/10.1101/2023.06.20.545702v1) for a preprint edition of the mapping manuscript, or [download the resulting mappings directly](https://mavedb-mapping.s3.us-east-2.amazonaws.com/mappings.tar.gz).
 
 <!-- /description -->
 
@@ -17,7 +17,7 @@ This library implements a novel method for mapping [MaveDB scoreset data](https:
 * Universal Transcript Archive (UTA): see [README](https://github.com/biocommons/uta?tab=readme-ov-file#installing-uta-locally) for setup instructions. Users with access to Docker on their local devices can use the available Docker image; otherwise, start a relatively recent (version 14+) PostgreSQL instance and add data from the available database dump.
 * SeqRepo: see [README](https://github.com/biocommons/biocommons.seqrepo?tab=readme-ov-file#requirements) for setup instructions. The SeqRepo data directory must be writeable; see specific instructions [here](https://github.com/biocommons/biocommons.seqrepo/blob/main/docs/store.rst) for more.
 * Gene Normalizer: see [documentation](https://gene-normalizer.readthedocs.io/0.3.0-dev1/install.html) for data setup instructions.
-* blat: Must be available on the local PATH and executable by the user. Otherwise, its location can be set manually with the `BLAT_BIN_PATH` env var. See the [UCSC Genome Browser FAQ](https://genome.ucsc.edu/FAQ/FAQblat.html#blat3) for download instructions. 
+* blat: Must be available on the local PATH and executable by the user. Otherwise, its location can be set manually with the `BLAT_BIN_PATH` env var. See the [UCSC Genome Browser FAQ](https://genome.ucsc.edu/FAQ/FAQblat.html#blat3) for download instructions.
 
 
 ## Installation

diff --git a/notebooks/analysis/README.md b/notebooks/analysis/README.md
@@ -6,6 +6,8 @@ Code for data analysis and figure generation for ["Mapping MAVE data for use in
 * [`mapping_analysis.ipynb`](mapping_analysis.ipynb): This notebook computes reference sequence concordance across the generated VRS mapping pairs. The notebook also computes the number of unique pre-mapped and post-mapped variants.
 * [`mavedb_scoreset_breakdown.ipynb`](mavedb_scoreset_breakdown.ipynb): This notebook generates the summary statistics that are described in the manuscript.
 
+**Note that these notebooks may not be using the most recent release of the `dcd_mapping` library** -- they are intended to reflect the state of the code at the time of artifact generation, without any features that have been added since. The included `requirements.txt` file should produce an environment matching this expectation.
+
 ## Environment
 
 A compatible Python environment can be generated using the included `requirements.txt` file.

diff --git a/notebooks/analysis/requirements.txt b/notebooks/analysis/requirements.txt
@@ -32,7 +32,7 @@ comm==0.2.2
 configparser==7.0.0
 cool_seq_tool==0.4.0.dev3
 coverage==7.4.4
--e git+https://github.com/ave-dcd/dcd_mapping/@d00598d6a7535fbe85c567ab56e264e4d3255d4b#egg=dcd_mapping
+dcd_mapping==0.1.3
 debugpy==1.8.1
 decorator==5.1.1
 defusedxml==0.7.1

diff --git a/pyproject.toml b/pyproject.toml
@@ -29,13 +29,14 @@ dependencies = [
     "biopython",
     "tqdm",
     "click",
-    "cool-seq-tool==0.4.0.dev3",
-    "ga4gh.vrs==2.0.0-a6",
+    "cool-seq-tool~=0.6.0",
+    "ga4gh.vrs==2.0.0-a10",
     # probably easiest to just include pg dependency group even if it's not always necessary
-    "gene-normalizer[pg]==0.3.0-dev1",
+    "gene-normalizer[pg]~=0.4.1",
     "pydantic~=2.0",
     "python-dotenv",
     "setuptools>=68.0",  # tmp -- ensure 3.12 compatibility
+    "canonicaljson",
 ]
 dynamic = ["version"]
 
@@ -164,7 +165,8 @@ ignore = [
 # ANN102 - missing-type-cls
 # S101 - assert
 # B011 - assert-false
-"tests/*" = ["ANN001", "ANN2", "ANN102", "S101", "B011", "D103"]
+# INP001 - implicit-namespace-package
+"tests/*" = ["ANN001", "ANN2", "ANN102", "S101", "B011", "D103", "INP001"]
 
 [tool.ruff.format]
 docstring-code-format = true