diff --git a/.travis.yml b/.travis.yml index 9b2b7b1..f24a6f5 100644 --- a/.travis.yml +++ b/.travis.yml @@ -23,117 +23,117 @@ matrix: - pip3 install -r test_requirements.txt - pip3 install . - # - os: linux - # python: "3.8-dev" - # install: - # - pip3 install -r requirements.txt - # - pip3 install -r test_requirements.txt - # - pip3 install . + - os: linux + python: "3.8-dev" + install: + - pip3 install -r requirements.txt + - pip3 install -r test_requirements.txt + - pip3 install . - # - os: linux - # python: "3.9-dev" - # install: - # - pip3 install -r requirements.txt - # - pip3 install -r test_requirements.txt - # - pip3 install . + - os: linux + python: "3.9-dev" + install: + - pip3 install -r requirements.txt + - pip3 install -r test_requirements.txt + - pip3 install . - # - os: linux - # python: "pypy3" - # install: - # - pip3 install -r requirements.txt - # - pip3 install -r test_requirements.txt - # - pip3 install . + - os: linux + python: "pypy3" + install: + - pip3 install -r requirements.txt + - pip3 install -r test_requirements.txt + - pip3 install . - # - name: "Python 3.6.6 OSX" - # os: osx - # language: generic - # env: PYTHON=3.6.6' - # before_install: - # - brew update - # - brew install openssl - # - brew install zlib - # - brew outdated pyenv || brew upgrade pyenv - # - brew install pyenv-virtualenv - # - pyenv install 3.6.6 - # - export PYENV_VERSION=3.6.6 - # - export PATH="/Users/travis/.pyenv/shims:${PATH}" - # - pyenv virtualenv venv - # - source /Users/travis/.pyenv/versions/venv/bin/activate - # - python --version - # install: - # - python -m pip install -U pip - # - sudo pip install setuptools - # - sudo pip install -r requirements.txt -e . - # - sudo pip install -r test_requirements.txt -e . 
+ - name: "Python 3.6.6 OSX" + os: osx + language: generic + env: PYTHON=3.6.6 + before_install: + - brew update + - brew install openssl + - brew install zlib + - brew outdated pyenv || brew upgrade pyenv + - brew install pyenv-virtualenv + - pyenv install 3.6.6 + - export PYENV_VERSION=3.6.6 + - export PATH="/Users/travis/.pyenv/shims:${PATH}" + - pyenv virtualenv venv + - source /Users/travis/.pyenv/versions/venv/bin/activate + - python --version + install: + - python -m pip install -U pip + - sudo pip install setuptools + - sudo pip install -r requirements.txt -e . + - sudo pip install -r test_requirements.txt -e . - # - name: "Python 3.7.7 OSX" - # os: osx - # language: generic - # env: PYTHON=3.7.7' - # before_install: - # - brew update - # - brew install openssl - # - brew install zlib - # - brew outdated pyenv || brew upgrade pyenv - # - brew install pyenv-virtualenv - # - pyenv install 3.7.7 - # - export PYENV_VERSION=3.7.7 - # - export PATH="/Users/travis/.pyenv/shims:${PATH}" - # - pyenv virtualenv venv - # - source /Users/travis/.pyenv/versions/venv/bin/activate - # - python --version - # install: - # - python -m pip install -U pip - # - sudo pip install setuptools - # - sudo pip install -r requirements.txt -e . - # - sudo pip install -r test_requirements.txt -e . + - name: "Python 3.7.7 OSX" + os: osx + language: generic + env: PYTHON=3.7.7 + before_install: + - brew update + - brew install openssl + - brew install zlib + - brew outdated pyenv || brew upgrade pyenv + - brew install pyenv-virtualenv + - pyenv install 3.7.7 + - export PYENV_VERSION=3.7.7 + - export PATH="/Users/travis/.pyenv/shims:${PATH}" + - pyenv virtualenv venv + - source /Users/travis/.pyenv/versions/venv/bin/activate + - python --version + install: + - python -m pip install -U pip + - sudo pip install setuptools + - sudo pip install -r requirements.txt -e . + - sudo pip install -r test_requirements.txt -e . 
- # - name: "Python 3.8.2 OSX" - # os: osx - # language: generic - # env: PYTHON=3.8.2' - # before_install: - # - brew update - # - brew install openssl - # - brew install zlib - # - brew outdated pyenv || brew upgrade pyenv - # - brew install pyenv-virtualenv - # - pyenv install 3.8.2 - # - export PYENV_VERSION=3.8.2 - # - export PATH="/Users/travis/.pyenv/shims:${PATH}" - # - pyenv virtualenv venv - # - source /Users/travis/.pyenv/versions/venv/bin/activate - # - python --version - # install: - # - python -m pip install -U pip - # - sudo pip install setuptools - # - sudo pip install -r requirements.txt -e . - # - sudo pip install -r test_requirements.txt -e . + - name: "Python 3.8.2 OSX" + os: osx + language: generic + env: PYTHON=3.8.2 + before_install: + - brew update + - brew install openssl + - brew install zlib + - brew outdated pyenv || brew upgrade pyenv + - brew install pyenv-virtualenv + - pyenv install 3.8.2 + - export PYENV_VERSION=3.8.2 + - export PATH="/Users/travis/.pyenv/shims:${PATH}" + - pyenv virtualenv venv + - source /Users/travis/.pyenv/versions/venv/bin/activate + - python --version + install: + - python -m pip install -U pip + - sudo pip install setuptools + - sudo pip install -r requirements.txt -e . + - sudo pip install -r test_requirements.txt -e . - # - name: "Python 3.8.0 Windows" - # os: windows - # language: shell - # before_install: - # - choco install python --version 3.8.0 - # - python -m pip install --upgrade pip - # env: PATH=/c/Python38:/c/Python38/Scripts:$PATH - # install: - # - python -m pip install -U pip # maybe dont need this - # - pip install setuptools - # - pip install -r requirements.txt -e . - # - pip install -r test_requirements.txt -e . 
+ - name: "Python 3.8.0 Windows" + os: windows + language: shell + before_install: + - choco install python --version 3.8.0 + - python -m pip install --upgrade pip + env: PATH=/c/Python38:/c/Python38/Scripts:$PATH + install: + - python -m pip install -U pip # maybe dont need this + - pip install setuptools + - pip install -r requirements.txt -e . + - pip install -r test_requirements.txt -e . - # allow_failures: - # - python: "3.8-dev" - # - python: "3.9-dev" - # - python: "pypy3" - # - os: windows + allow_failures: + - python: "3.8-dev" + - python: "3.9-dev" + - python: "pypy3" + - os: windows script: - make test - - make coverage - - make lint +# - make coverage +# - make lint -after_script: - - pip3 install codecov - - codecov +#after_script: +# - pip3 install codecov +# - codecov diff --git a/Makefile b/Makefile index aa96540..8751fb6 100644 --- a/Makefile +++ b/Makefile @@ -16,4 +16,4 @@ lint: install: pip install -r requirements.txt - pip install -e . + pip install . diff --git a/README.md b/README.md index af4fb37..3ab8b8f 100644 --- a/README.md +++ b/README.md @@ -5,10 +5,13 @@ cerebra [![Build Status](https://travis-ci.org/czbiohub/cerebra.svg?branch=master)](https://travis-ci.org/czbiohub/cerebra) [![codecov](https://codecov.io/gh/czbiohub/cerebra/branch/master/graph/badge.svg)](https://codecov.io/gh/czbiohub/cerebra) + What is `cerebra`? ------------------------------------- -This tool allows you to quickly extract meaningful variant information from a DNA or RNA sequencing experiment. -If you're interested in learning what variants are present in your DNA/RNA samples, variant callers such as GATK [HaplotypeCaller](https://software.broadinstitute.org/gatk/documentation/tooldocs/3.8-0/org_broadinstitute_gatk_tools_walkers_haplotypecaller_HaplotypeCaller.php) can be used to generate [variant calling format](https://samtools.github.io/hts-specs/VCFv4.2.pdf) (VCF) files following a sequencing experiment. 
+This tool allows you to quickly summarize and interpret VCF files. + +If you're interested in learning the full complement of genomic variants present in your DNA/RNA samples, a typical workflow might involve sequencing, followed by variant calling with a tool such as GATK [HaplotypeCaller](https://software.broadinstitute.org/gatk/documentation/tooldocs/3.8-0/org_broadinstitute_gatk_tools_walkers_haplotypecaller_HaplotypeCaller.php), which generates [variant calling format](https://samtools.github.io/hts-specs/VCFv4.2.pdf) (VCF) files. + A VCF file looks like this: ```##fileformat=VCFv4.2 @@ -23,15 +26,15 @@ Thus drawing conclusions from VCF files remains a substantial challenge. `cerebra` provides a fast and intuitive framework for summarizing VCF records across samples. -It is comprised of three modules that do the following: +It comprises three modules that do the following: 1) remove germline variants from samples of interest 2) count the total number of variants in a given sample, on a per-gene basis - 3) report peptide-level variants for each sample + 3) report protein variants for each sample `cerebra` gets its name from the eponymous X-men [character](https://en.wikipedia.org/wiki/Cerebra), who had the ability to detect mutant individuals among the general public. -If you're working with tumor data, it might be a good idea to limit the variant search space to only known cancer variants. +If you're working with tumor data, it might be a good idea to limit the variant search space to only known, and potentially actionable, cancer variants. Therefore `cerebra` implements an optional method for restricting to variants also found in the [COSMIC](https://cancer.sanger.ac.uk/cosmic) database. This tool was developed for, but is certainly not limited to, single-cell RNA sequencing data. @@ -49,7 +52,7 @@ Another is [GATK VariantsToTable](https://software.broadinstitute.org/gatk/docum This table contains only genomic (_i.e._ DNA-level) coordinates. 
Often the next questions are: what are the genes associated with these variants, and what are the peptide-level effects of these variants? -`cerebra` queries a reference genome (.fa) and annotation (.gtf) to match each DNA-level variant with its associated gene, and its probable peptide-level variant. +`cerebra` queries a reference genome (.fa) and annotation (.gtf) to match each DNA-level variant with its associated gene, and its probable protein variant. `cerebra` produces the following outfile: ``` @@ -87,18 +90,21 @@ $ python Here _CCN1_ is a gene name while _A16_B000563_, _A1_B001546_, _A1_B002531,_... are RNA-seq sample IDs. `cerebra` reports variants to every gene in the genome, for every sample in a given experiment. The _ENSP*_ numbers refer to [Ensembl](https://www.ensembl.org/index.html) translation IDs -- unique identifiers that correspond to exactly one polypeptide in the Ensembl database. -The strings enclosed in parenthesis refer to the amino-acid level variants reported in that particular sample. +The strings enclosed in parentheses refer to the amino-acid level variants reported in that particular sample. For example the string `Arg209Trp` indicates that position 209 of this particular polypeptide should contain an _Arg_, but the experimental sample instead contains a _Trp_. `cerebra` adheres to HGVS sequence variant [nomenclature](https://varnomen.hgvs.org/) in reporting amino-acid variants. + Features -------- ### `germline-filter` -If the research project is centered around a "tumor/pathogenic vs control" question, then `germline-filter` is the proper starting point. -This module removes germline variants that are common between the control and the experimental tissue so as to not bias the results by including non-pathogenic variants. -The user provides a very simple metadata file that indicates which experimental samples correspond to which control samples. -The output of `germline-filter` is a set of trimmed-down VCFs. 
+Variant calling is often applied to the study of cancer. +If the research project is centered around a “tumor vs. normal” question, then `germline-filter` is the proper starting point. +This module removes germline variants that are common between tumor and normal samples, and thus excludes variants unlikely to be pathogenic for the cancer under study. +The user provides a very simple metadata file (see [USAGE.md](https://github.com/czbiohub/cerebra/blob/master/docs/USAGE.md)) that indicates which tumor samples correspond to which normal samples. +The output of `germline-filter` is a set of trimmed-down VCF files, which will be used for the next two steps. +If you do not have access to “normal” samples then proceed directly to `count-variants` or `find-peptide-variants`. ### `count-variants` This module reports the raw variant counts for every gene across every sample. @@ -106,30 +112,56 @@ The output is a CSV file that contains counts for each sample versus every gene If working with cancer variants, the user has the option of limiting the search space to variants also found in the [COSMIC](https://cancer.sanger.ac.uk/cosmic) database. ### `find-peptide-variants` -This module reports the peptide-level consequence of genomic variants. +This module reports the protein variations associated with each genomic variant. VCF records are converted to peptide-level variants, and then [ENSEMBL](https://uswest.ensembl.org/index.html) protein IDs, -in acordance to the [HGVS](https://varnomen.hgvs.org/) sequence variant nomenclature. -The output is a heirarchically ordered text file (CSV or JSON) that reports the the Ensemble protein ID and the gene associated with each variant, for each experimental sample. +in accordance with the [HGVS](https://varnomen.hgvs.org/) sequence variant nomenclature. +The output is a hierarchically ordered text file (CSV or JSON) that reports the Ensembl protein ID and the gene associated with each variant, for each experimental sample. 
The user again has the option of limiting the search space to variants also found in the [COSMIC](https://cancer.sanger.ac.uk/cosmic) database. +This module also has an option to report the number of variant vs. wildtype reads found at each locus. + + +Dependencies +------------ +`cerebra` depends on some (fairly standard) packages and libraries. +Before installing, it might be a good idea to make sure all of the requisite packages are installed on your system (_note:_ if installing with Docker you can skip this step). + +__MacOS Dependencies:__ +``` +sudo pip install setuptools +brew update +brew install openssl +brew install zlib +``` -Installation +__Linux Dependencies:__ +``` +sudo apt-get install autoconf automake make gcc perl zlib1g-dev libbz2-dev liblzma-dev libcurl4-gnutls-dev libssl-dev +``` + +As of present `cerebra` is not installable on Windows. +`cerebra` depends on the [`pysam`](https://pysam.readthedocs.io/en/latest/index.html) library (or rather, `pysam` is a dependency-of-a-dependency) and currently this library is only available on Unix-like systems. +Windows solutions like [WSL](https://docs.microsoft.com/en-us/windows/wsl/) exist for overcoming precisely this challenge, however, `cerebra` has not been tested on WSL or any other Unix-like subsystem for Windows. + + +Installation (for users) ------------ There are four different methods available to install `cerebra`. 
+Choose one of the following: -__With [Docker](https://hub.docker.com/r/lincolnharris/cerebra)__ +__With [Docker](https://hub.docker.com/r/lincolnharris/cerebra) (recommended)__ ``` docker pull lincolnharris/cerebra docker run -it lincolnharris/cerebra ``` _warning: this image will take up ~1Gb of space._ -__With traditional git clone and the python standard library [venv](https://docs.python.org/3/library/venv.html) module__ +__With traditional git clone and the python standard library [`venv`](https://docs.python.org/3/library/venv.html) module__ ``` git clone https://github.com/czbiohub/cerebra.git cd cerebra python3 -m venv cerebra-dev source cerebra-dev/bin/activate -pip3 install -e . +pip3 install [--user] . ``` __With traditional git clone and [conda](https://docs.conda.io/en/latest/)__ @@ -138,10 +170,12 @@ git clone https://github.com/czbiohub/cerebra.git cd cerebra conda create -n cerebra python=3.7 conda activate cerebra -pip3 install -e . +pip3 install [--user] . ``` -__From [PyPi](https://pypi.org/project/cerebra/) (system-wide installation)__ +__From [PyPi](https://pypi.org/project/cerebra/) (system-wide installation, NOT RECOMMENDED)__ +For novice users, it's generally a better idea to install packages within virtual environments. +However, `cerebra` can be installed system-wide, if you so choose. ``` pip install cerebra @@ -149,24 +183,44 @@ pip install cerebra pip install --user cerebra ``` -`cerebra` depends on some (fairly standard) packages and libraries. -If youre having trouble installing, it might be a good idea to make sure you have all of the requisite dependendices installed first (_note:_ if installing with Docker you can skip this step). -__MacOS Dependencies:__ -``` -sudo pip install setuptools -brew update -brew install openssl -brew install zlib -``` +Installation (for developers) +------------ +Here's how to set up cerebra for local development. 
+After installing the requisite [dependencies](https://github.com/czbiohub/cerebra/blob/joss-review-redux/README.md#dependencies): -__Linux Dependencies:__ -``` -sudo apt-get install autoconf automake make gcc perl zlib1g-dev libbz2-dev liblzma-dev libcurl4-gnutls-dev libssl-dev -``` +1. Fork the `cerebra` repo on GitHub: https://github.com/czbiohub/cerebra +2. Clone your fork locally: -As of present `cerebra` is not installable on Windows. -`cerebra` depends on the [`pysam`](https://pysam.readthedocs.io/en/latest/index.html) library (or rather, `pysam` is a dependency-of-a-dependency) and currently this library is only available on Unix-like systems. + $ git clone https://github.com/your-name/cerebra.git + +3. Install your local copy into a virtualenv. Using the standard library [`venv`](https://docs.python.org/3/library/venv.html) module: + + $ cd cerebra + $ python3 -m venv cerebra-dev + $ source cerebra-dev/bin/activate + $ pip install -r requirements.txt -r test_requirements.txt -e . + +4. Create a branch for local development: + + $ git checkout -b name-of-your-bugfix-or-feature + + Now you can make your changes locally. + +5. When you're done making changes, check that your changes pass `flake8` and `pytest`: + + $ make test + $ make coverage + $ make lint + +6. Commit your changes and push your branch to GitHub: + + $ git add . + $ git commit -m "Your detailed description of your changes." + $ git push origin name-of-your-bugfix-or-feature + +7. Submit a pull request through the GitHub website. +See [CONTRIBUTING.md](https://github.com/czbiohub/cerebra/blob/joss-review-redux/docs/CONTRIBUTING.md) for more. (Quickstart) Usage @@ -183,7 +237,7 @@ Options: -h, --help Show this message and exit. 
Commands: - germline-filter filter out common SNPs/indels between control/germline samples and samples of interest + germline-filter filter out common SNPs/indels between tumor and normal samples count-variants count total number of variants in each sample, and report on a per-gene basis find-peptide-variants report peptide-level SNPs and indels in each sample, and associated coverage ``` @@ -194,7 +248,7 @@ An example workflow might look like this: **Step 1:** ``` -cerebra germline-filter --processes 2 --control_path /path/to/control/vcfs --experimental_path /path/to/experimental/vcfs --metadata /path/to/metadata/file --outdir /path/to/filtered/vcfs +cerebra germline-filter --processes 2 --normal_path /path/to/normal/vcfs --tumor_path /path/to/tumor/vcfs --metadata /path/to/metadata/file --outdir /path/to/filtered/vcfs ``` **Step 2:** @@ -207,7 +261,8 @@ cerebra count-variants --processes 2 --cosmicdb /optional/path/to/cosmic/databas cerebra find-peptide-variants --processes 2 --cosmicdb /optional/path/to/cosmic/database --annotation /path/to/genome/annotation --genomefa /path/to/genome/fasta --report_coverage 1 --output /path/to/output/file /path/to/filtered/vcfs/* ``` -For advanced usage information, see [USAGE.md](https://github.com/czbiohub/cerebra/blob/messing-w-docs/docs/USAGE.md). +For advanced usage information, see [USAGE.md](https://github.com/czbiohub/cerebra/blob/master/docs/USAGE.md). 
+ Authors -------- diff --git a/cerebra/count_variants.py b/cerebra/count_variants.py index 13d34e7..dc42424 100644 --- a/cerebra/count_variants.py +++ b/cerebra/count_variants.py @@ -152,12 +152,12 @@ def process_cell(path): @click.command() @click.option("--processes", default=1, - prompt="number of processes to use for computation", type=int) -@click.option("--cosmicdb", prompt="path to cosmic db file (.tsv)", + help="number of processes to use for computation", type=int) +@click.option("--cosmicdb", help="path to cosmic db file (.tsv)", required=True) -@click.option("--refgenome", prompt="path to reference genome (.gtf)", +@click.option("--refgenome", help="path to reference genome (.gtf)", required=True) -@click.option("--outfile", prompt="path to output file (.csv)", required=True) +@click.option("--outfile", help="path to output file (.csv)", required=True) @click.argument("files", required=True, nargs=-1) def count_variants(processes, cosmicdb, refgenome, outfile, files): """ count total number of variants in each sample, and report diff --git a/cerebra/find_peptide_variants.py b/cerebra/find_peptide_variants.py index 88ab41a..3da3164 100644 --- a/cerebra/find_peptide_variants.py +++ b/cerebra/find_peptide_variants.py @@ -303,9 +303,12 @@ def find_peptide_variants(num_processes, cosmicdb_path, annotation_path, print("Writing file...") csv_path = Path(output_path) - result_df.to_csv(csv_path) + if '.csv' in output_path or 'CSV' in output_path: + result_df.to_csv(csv_path) + else: + result_df.to_csv(str(csv_path) + '.csv') - json_path = str(csv_path).strip('.csv') + '.json' + json_path = str(csv_path).strip('.csv').strip('.CSV') + '.json' result_df.to_json(json_path) print("Done!") diff --git a/cerebra/germline_filter.py b/cerebra/germline_filter.py index 0a2a550..9896fd9 100644 --- a/cerebra/germline_filter.py +++ b/cerebra/germline_filter.py @@ -69,46 +69,45 @@ def write_filtered_vcf(cell_vcf_stream, germline_tree, out_stream): @click.command() 
@click.option("--processes", default=1, - prompt="number of processes to use for computation", type=int) -@click.option("--control_path", "control_path", - prompt="path to control/germline samples vcf files directory", + help="number of processes to use for computation", type=int) +@click.option("--normal_path", "normal_path", + help="path to 'normal' samples vcf files directory", required=True) -@click.option("--experimental_path", "experimental_path", - prompt="path to experimental samples vcf files directory", +@click.option("--tumor_path", "tumor_path", + help="path to tumor samples vcf files directory", required=True) @click.option("--metadata", "metadata_path", - prompt="path to metadata csv file", required=True) + help="path to metadata csv file", required=True) @click.option("--outdir", "out_path", - prompt="path to output vcf files directory", required=True) -def germline_filter(processes, control_path, experimental_path, metadata_path, + help="path to output vcf files directory", required=True) +def germline_filter(processes, normal_path, tumor_path, metadata_path, out_path): - """ filter out common SNPs/indels between control/germline samples and samples - of interest """ - control_path = Path(control_path) - experimental_path = Path(experimental_path) + """ filter out common SNPs/indels between tumor and normal samples """ + normal_path = Path(normal_path) + tumor_path = Path(tumor_path) metadata_path = Path(metadata_path) out_path = Path(out_path) metadata_df = pd.read_csv(metadata_path) # Create a set of all patient IDs from the metadata file. - all_germline_sample_ids = set(metadata_df["germline_sample_id"]) + all_germline_sample_ids = set(metadata_df["normal_sample_id"]) def process_patient(germline_sample_id): # Find all non-tumor bulk VCF files for the patient ID. 
germline_wb_vcf_paths = list( - control_path.glob(germline_sample_id + "_*_*.vcf")) + normal_path.glob(germline_sample_id + "_*_*.vcf")) # Fetch all cell IDs associated with the patient ID. - experimental_sample_ids = metadata_df.loc[ - metadata_df["germline_sample_id"] == - germline_sample_id]["experimental_sample_id"] + tumor_sample_ids = metadata_df.loc[ + metadata_df["normal_sample_id"] == + germline_sample_id]["tumor_sample_id"] # Use the cell IDs to create a list of all single-cell VCF files # for the patient. - cell_vcf_paths = [(experimental_path / experimental_sample_id). - with_suffix(".vcf") for experimental_sample_id - in experimental_sample_ids] + cell_vcf_paths = [(tumor_path / tumor_sample_id). + with_suffix(".vcf") for tumor_sample_id + in tumor_sample_ids] # Create a genome interval tree for the patient's germline bulk VCF # data. Only selects one germline VCF to avoid over-filtering for diff --git a/cerebra/protein_variant_predictor.py b/cerebra/protein_variant_predictor.py index 99fc7f5..25fa23a 100644 --- a/cerebra/protein_variant_predictor.py +++ b/cerebra/protein_variant_predictor.py @@ -1,6 +1,5 @@ from collections import defaultdict, namedtuple from itertools import tee -import re from Bio import Alphabet from Bio.Seq import Seq # need to clean this up @@ -165,7 +164,7 @@ def predict_for_vcf_record(self, vcf_record): ref = vcf_record.REF for alt in vcf_record.ALT: - + # Create a GenomePosition representing the range affected by the # ALT sequence. 
affected_pos = record_pos.shifted_by( diff --git a/cerebra/utils.py b/cerebra/utils.py index 8301ec3..04f985d 100644 --- a/cerebra/utils.py +++ b/cerebra/utils.py @@ -373,7 +373,7 @@ def sequence_variants_are_equivalent(seqvar_a, edit_a, edit_b = posedit_a.edit, posedit_b.edit if not type(edit_a) is type(edit_b): - print(type(edit_a), type(edit_b)) + #print(type(edit_a), type(edit_b)) # for debugging purposes return False _seqs_cmp = lambda a, b: _seqs_are_equal( diff --git a/docs/CONTRIBUTING.md b/docs/CONTRIBUTING.md index efbf91e..7cadfc0 100644 --- a/docs/CONTRIBUTING.md +++ b/docs/CONTRIBUTING.md @@ -15,8 +15,10 @@ Report bugs at https://github.com/czbiohub/cerebra/issues. If you are reporting a bug, please include: - Your operating system name and version. -- Any details about your local setup that might be helpful in troubleshooting. +- The python version you are using. +- Any other details about your local setup that might be helpful in troubleshooting. - Detailed steps to reproduce the bug. +- Screenshots or code snippets, when applicable. ### Fix Bugs @@ -24,14 +26,16 @@ Look through the GitHub issues for bugs. Anything tagged with "bug" is open to w ### Implement Features -Look through the GitHub issues for features. Anything tagged with "feature" is open to whoever wants to implement it. +Look through the GitHub issues for features/enhancement requests. Anything tagged with "enhancement" is open to whoever wants to implement it. ### Write Documentation -cerebra could always use more documentation, whether as -part of the official cerebra docs, in docstrings, or +`cerebra` could always use more documentation, whether as +part of the official `cerebra` docs, in docstrings, or even on the web in blog posts, articles, and such. +If there is a function/module that you think could use additional documentation, or is simply unclear to you, please let us know. 
+ ### Submit Feedback The best way to send feedback is to file an issue at https://github.com/czbiohub/cerebra/issues. @@ -42,50 +46,12 @@ If you are proposing a feature: - Keep the scope as narrow as possible, to make it easier to implement. - Remember that this is a volunteer-driven project, and that contributions are welcome :) -Get Started! ------------- - -Ready to contribute? Here's how to set up cerebra for -local development. - -1. Fork the cerebra repo on GitHub: https://github.com/czbiohub/cerebra -2. Clone your fork locally: - - $ git clone https://github.com/your-name/cerebra.git - -3. Install your local copy into a virtualenv. Using the standard library [`venv`](https://docs.python.org/3/library/venv.html) module: - - $ cd cerebra - $ python3 -m venv cerebra-dev - $ source cerebra-dev/bin/activate - $ pip3 install -e . - -4. Create a branch for local development: - - $ git checkout -b name-of-your-bugfix-or-feature - - Now you can make your changes locally. - -5. When you're done making changes, check that your changes pass flake8 and the tests: - - $ make test - $ make coverage - $ make lint - -6. Commit your changes and push your branch to GitHub: - - $ git add . - $ git commit -m "Your detailed description of your changes." - $ git push origin name-of-your-bugfix-or-feature - -7. Submit a pull request through the GitHub website. - Pull Request Guidelines ----------------------- Before you submit a pull request, check that it meets these guidelines: -1. The pull request should include tests. +1. The pull request should include tests, ideally with >60% code coverage. 2. Functionality should be encapsulated within a function(s) that includes a docstring. 3. The pull request should work for Python 3.6, 3.7 and potentially 3.8. 
Check https://travis-ci.org/github/czbiohub/cerebra/pull_requests and make sure that the tests pass diff --git a/docs/INSTALL.md b/docs/INSTALL.md index 4030336..207aed8 100644 --- a/docs/INSTALL.md +++ b/docs/INSTALL.md @@ -1,21 +1,48 @@ -Installation +Installation Instructions +============ + +Dependencies +------------ +`cerebra` depends on some (fairly standard) packages and libraries. +Before installing, it might be a good idea to make sure all of the requisite packages are installed on your system (_note:_ if installing with Docker you can skip this step). + +__MacOS Dependencies:__ +``` +sudo pip install setuptools +brew update +brew install openssl +brew install zlib +``` + +__Linux Dependencies:__ +``` +sudo apt-get install autoconf automake make gcc perl zlib1g-dev libbz2-dev liblzma-dev libcurl4-gnutls-dev libssl-dev +``` + +As of present `cerebra` is not installable on Windows. +`cerebra` depends on the [`pysam`](https://pysam.readthedocs.io/en/latest/index.html) library (or rather, `pysam` is a dependency-of-a-dependency) and currently this library is only available on Unix-like systems. +Windows solutions like [WSL](https://docs.microsoft.com/en-us/windows/wsl/) exist for overcoming precisely this challenge, however, `cerebra` has not been tested on WSL or any other Unix-like subsystem for Windows. + + +Installation (for users) ------------ There are four different methods available to install `cerebra`. 
+Choose one of the following: -__With [Docker](https://hub.docker.com/r/lincolnharris/cerebra)__ +__With [Docker](https://hub.docker.com/r/lincolnharris/cerebra) (recommended)__ ``` docker pull lincolnharris/cerebra docker run -it lincolnharris/cerebra -``` -_warning: this image will take up ~1Gb of space._ +``` +_warning: this image will take up ~1Gb of space._ -__With traditional git clone and the python standard library [venv](https://docs.python.org/3/library/venv.html) module__ +__With traditional git clone and the python standard library [`venv`](https://docs.python.org/3/library/venv.html) module__ ``` git clone https://github.com/czbiohub/cerebra.git cd cerebra python3 -m venv cerebra-dev source cerebra-dev/bin/activate -pip3 install -e . +pip3 install [--user] . ``` __With traditional git clone and [conda](https://docs.conda.io/en/latest/)__ @@ -24,10 +51,12 @@ git clone https://github.com/czbiohub/cerebra.git cd cerebra conda create -n cerebra python=3.7 conda activate cerebra -pip3 install -e . +pip3 install [--user] . ``` -__From [PyPi](https://pypi.org/project/cerebra/) (system-wide installation)__ +__From [PyPi](https://pypi.org/project/cerebra/) (system-wide installation, NOT RECOMMENDED)__ +For novice users, it's generally a better idea to install packages within virtual environments. +However, `cerebra` can be installed system-wide, if you so choose. ``` pip install cerebra @@ -35,19 +64,41 @@ pip install cerebra pip install --user cerebra ``` -`cerebra` depends on some (fairly standard) packages and libraries. If youre having trouble installing, it might be a good idea to make sure you have all of the requisite dependendices installed first (_note:_ if installing with Docker you can skip this step). -__MacOS Dependencies:__ -``` -sudo pip install setuptools -brew update -brew install openssl -brew install zlib -``` +Installation (for developers) +------------ +Here's how to set up cerebra for local development. 
+After installing the requisite [dependencies](https://github.com/czbiohub/cerebra/blob/joss-review-redux/docs/INSTALL.md#dependencies): -__Linux Dependencies:__ -``` -sudo apt-get install autoconf automake make gcc perl zlib1g-dev libbz2-dev liblzma-dev libcurl4-gnutls-dev libssl-dev -``` +1. Fork the `cerebra` repo on GitHub: https://github.com/czbiohub/cerebra +2. Clone your fork locally: + + $ git clone https://github.com/your-name/cerebra.git + +3. Install your local copy into a virtualenv. Using the standard library [`venv`](https://docs.python.org/3/library/venv.html) module: + + $ cd cerebra + $ python3 -m venv cerebra-dev + $ source cerebra-dev/bin/activate + $ pip install -r requirements.txt -r test_requirements.txt -e . + +4. Create a branch for local development: + + $ git checkout -b name-of-your-bugfix-or-feature + + Now you can make your changes locally. + +5. When you're done making changes, check that your changes pass flake8 and the tests: + + $ make test + $ make coverage + $ make lint + +6. Commit your changes and push your branch to GitHub: + + $ git add . + $ git commit -m "Your detailed description of your changes." + $ git push origin name-of-your-bugfix-or-feature -As of present `cerebra` is not installable on Windows. `cerebra` depends on the [`pysam`](https://pysam.readthedocs.io/en/latest/index.html) library -- or rather, `pysam` is a dependency-of-a-dependency -- and currently this library is only available on Unix-like systems. +7. Submit a pull request through the GitHub website. +See [CONTRIBUTING.md](https://github.com/czbiohub/cerebra/blob/joss-review-redux/docs/CONTRIBUTING.md) for more. diff --git a/docs/USAGE.md b/docs/USAGE.md index 0bb5c2e..3e4bbca 100644 --- a/docs/USAGE.md +++ b/docs/USAGE.md @@ -8,11 +8,11 @@ Before starting it might be a good idea to trim down your VCF files to only the ## Filtering germline variants -If you have access to control tissue, then `germline-filter` is a good place to start. 
-All you have to do is provide a simple metadata file that links each experimental sample to its corresponding control sample. +If you have access to germline or "normal" tissue, then `germline-filter` is a good place to start. +All you have to do is provide a simple metadata file that links each tumor sample to its corresponding normal sample. For example: ``` -experimental_sample_id,germline_sample_id +tumor_sample_id,normal_sample_id sample1,gl_sample1 sample2,gl_sample1 sample3,gl_sample2 @@ -22,7 +22,7 @@ sample5,gl_sample2 Once you have made this metadata file you're ready to run `germline-filter.` An example command line: ``` -cerebra germline-filter --processes 2 --control_path /path/to/control/vcfs --experimental_path /path/to/experimental/vcfs --metadata /path/to/metadata/file --outdir /path/to/filtered/vcfs +cerebra germline-filter --processes 2 --normal_path /path/to/normal/vcfs --tumor_path /path/to/tumor/vcfs --metadata /path/to/metadata/file --outdir /path/to/filtered/vcfs ``` This will create a new directory (`/path/to/filtered/vcfs/`) that contains a set of entirely new VCFs. @@ -30,7 +30,7 @@ This will create a new directory (`/path/to/filtered/vcfs/`) that contains a set ## Counting variants The module `count-variants` module can be run after `germline-filter`, on the new vcfs contained in `/path/to/filtered/vcfs/`. -However, `germline-filter` is entirely optional -- if you dont have access to germline or control samples, `count-variants` is the place to start. +However, `germline-filter` is entirely optional -- if you don't have access to germline or "normal" samples, `count-variants` is the place to start. An example command line: ``` cerebra count-variants --processes 2 --cosmicdb /optional/path/to/cosmic/database --refgenome /path/to/genome/annotation --outfile /path/to/output/file /path/to/filtered/vcfs/* @@ -40,13 +40,31 @@ NOTE that the cosmic database is also optional. 
If you'd like you can download o ## Finding peptide variants -Like `count-variants`, `find-peptide-variants` is a standalone module. You can run it on the VCFs generated by `germline-filter` or on unfiltered VCFs. Also like `count-variants`, this module gives you the option of filtering through a cosmic database. +Like `count-variants`, `find-peptide-variants` is a standalone module. +You can run it on the VCFs generated by `germline-filter` or on unfiltered VCFs. +Also like `count-variants`, this module gives you the option of filtering through a cosmic database. An example command line: ``` cerebra find-peptide-variants --processes 2 --cosmicdb /optional/path/to/cosmic/database --annotation /path/to/genome/annotation --genomefa /path/to/genome/fasta --report_coverage 1 --output /path/to/output/file /path/to/filtered/vcfs/* ``` -The `report_coverage` option will report counts for both variant and wildtype reads at all variant loci. If indicated this option will report counts for both variant and wildtype reads at all variant loci. We reasoned that variants with a high degree of read support are less likely to be false positives. This option is designed to give the user more confidence in individual variant calls. +`report_coverage` is a BOOLEAN option that will report counts for both variant and wildtype reads at all variant loci. +`--report_coverage 1` turns this option on, while `--report_coverage 0` turns it off. +We reasoned that variants with a high degree of read support are less likely to be false positives. +This option is designed to give the user more confidence in individual variant calls. 
+ +For example, when run on the pre-packaged test VCF set (_cerebra/tests/data/test_find_peptide_variants/vcf_), +`cerebra find-peptide-variants --report_coverage 1` should yield the following (partial) entry: + +``` +A1,['ENSP00000395243.3:p.(Leu813delinsArgTrp),[2:0]', 'ENSP00000415559.1:p.(Leu813delinsArgTrp),[2:0]'], +``` +This tells us that the sample _A1_ contains likely variants in the Ensembl peptide IDs _ENSP00000395243.3_ and _ENSP00000415559.1_. +Both variants are insertions of _ArgTrp_ in place of _Leu_ at the 813th amino acid. + +The [_x_:_y_] string represents the absolute number of variant and wildtype reads at that locus. +Thus [2:0] means 2 variant reads and 0 wildtype reads were found at each of these loci. +A coverage string in the format of [_x_:_y_:_z_] would indicate there are two variant alleles at a given locus, _x_ and _y_, in addition to wildtype, _z_. ## Testing @@ -57,4 +75,4 @@ Now you should be able to run: If you've installed `cerebra` in a virtual environment make sure the environment is active. Confirm that all tests have passed. -If otherwise, feel free to submit an [issue report](https://github.com/czbiohub/cerebra/blob/messing-w-docs/docs/CONTRIBUTING.md). +If otherwise, feel free to submit an [issue report](https://github.com/czbiohub/cerebra/blob/master/docs/CONTRIBUTING.md). 
diff --git a/paper/paper.bib b/paper/paper.bib index dfd22fd..3bedba0 100644 --- a/paper/paper.bib +++ b/paper/paper.bib @@ -95,12 +95,12 @@ @article{Huang:2017 doi={10.1186/s13059-017-1248-5} } -@article{Maynard:2019, - title={{Heterogeneity and targeted therapy-induced adaptations in lung cancer revealed by single-cell RNA-seq}}, +@article{Maynard:2020, + title={{Therapy-Induced Evolution of Human Lung Cancer Revealed by Single-Cell RNA Sequencing}}, author={Maynard, A and McCoach, C and Rotow, JK and Harris, L and Haderk, F and Kerr, L and others}, - journal={bioRxiv}, - year={2019}, - doi={10.1101/2019.12.08.868828} + journal={Cell}, + year={2020}, + doi={10.1016/j.cell.2020.07.017} } @inbook{ncbi, diff --git a/paper/paper.md b/paper/paper.md index 2efd8f3..55e0359 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -58,8 +58,7 @@ To address the unmet need for high-throughput VCF summary tools, we introduce `c ## Functionality -`cerebra` comprises three modules: i) `germline-filter` removes variants that are common between control/germline samples -and samples of interest, ii) `count-variants` reports total number of variants in each sample, and iii) `find-peptide-variants` reports the likely protein variants in each sample. +`cerebra` comprises three modules: i) `germline-filter` is intended for working with cancer data and removes variants that are common between tumor and normal samples, ii) `count-variants` reports total number of variants in each sample, and iii) `find-peptide-variants` reports the likely protein variants in each sample. Here we use _variant_ to refer to single nucleotide polymorphisms (SNPs) and short insertions/deletions. `cerebra` is not capable of reporting larger structural variants such as copy number variations and chromosomal rearrangements. 
@@ -84,14 +83,15 @@ We construct a genome interval tree from a genome annotation (.gtf) and a refere ## `germline-filter` -If the research project is centered around a "tumor/pathogenic vs control" question, then `germline-filter` is the proper starting point. -This module removes germline variants that are common between the control and the experimental tissue so as to not bias the results by including non-pathogenic variants. -The user provides a very simple metadata file (see [USAGE.md](https://github.com/czbiohub/cerebra/blob/messing-w-docs/docs/USAGE.md)) that indicates which experimental samples correspond to which control samples. -Using the [vcfpy](https://pypi.org/project/vcfpy/) library we quickly identify shared variants across control/experimental matched VCF files, then write new VCFs that contain only the unique variants [@vcfpy]. -These steps are performed by a [subprocess pool](https://pypi.org/project/pathos/) so that we can process multiple discreet chunks of input at the same time. +Variant calling is often applied to the study of cancer. +If the research project is centered around a "tumor vs. normal" question, then `germline-filter` is the proper starting point. +This module removes germline variants that are common between tumor and normal samples, and thus excludes variants unlikely to be pathogenic for the cancer under study. +The user provides a very simple metadata file (see [USAGE.md](https://github.com/czbiohub/cerebra/blob/master/docs/USAGE.md)) that indicates which tumor samples correspond to which normal samples. +Using the [vcfpy](https://pypi.org/project/vcfpy/) library we quickly identify shared variants across tumor/normal matched VCF files, then write new VCFs that contain only the unique variants [@vcfpy]. +These steps are performed by a [subprocess pool](https://pypi.org/project/pathos/) so that we can process multiple discrete chunks of input at the same time. 
The output of `germline-filter` is a set of trimmed-down VCF files, which will be used for the next two steps. -If you do not have access to "control" tissue then proceed directly to `count-variants` or `find-peptide-variants`. +If you do not have access to "normal" samples then proceed directly to `count-variants` or `find-peptide-variants`. ## `count-variants` @@ -118,7 +118,7 @@ Protein variants are converted to [ENSEMBL](https://uswest.ensembl.org/index.htm The output is a hierarchically ordered text file (CSV or JSON) that reports the the ENSEMBL protein ID and the gene associated with each variant, for each experimental sample. Variant callers are known to produce a great deal of false positives, especially when applied to single-cell RNA-seq data [@Enge:2017]. -To address this concern, we have included the `--report_coverage` option. +To address this concern, we have included the `coverage` option. If indicated this option will report counts for both variant and wildtype reads at all variant loci. We reasoned that variants with a high degree of read support are less likely to be false positives. This option is designed to give the user more confidence in individual variant calls. @@ -131,7 +131,7 @@ It is possible the sample does not actually express both of these isoforms, howe ![For a given mutational event, `cerebra` reports ALL potentially affected isoforms.\label{splice}](fig2.jpg) -To assess performance of `find-peptide-variants` we obtained VCFs from a single-cell RNA-seq study conducted on lung adenocarcinoma patient samples [@Maynard:2019]. +To assess performance of `find-peptide-variants` we obtained VCFs from a single-cell RNA-seq study conducted on lung adenocarcinoma patient samples [@Maynard:2020]. These VCFs were produced with STAR (alignment) and GATK HaplotypeCaller (variant calling), and are on the order of megabytes, typical of a single-cell RNA-seq experiment. 
`cerebra` was run on standard hardware (MacBook Pro, 2.5GHz quad-core processor, 16 GB RAM). As show in \autoref{runtime} `cerebra` processed a set of 100 VCF files in approximately 34 minutes. @@ -147,10 +147,10 @@ Also of note is that `cerebra`'s search operations take advantage of multiproces RNA/DNA sequencing paired with fast and accurate summarizing of variants is often crucial to understanding the biology of an experimental system. We present a tool that can be used to quickly summarize the variant calls contained within a large set of VCF files. -As sequencing costs continue to drop, large-scale variant calling will become accessible to more members of the community, and summary tools like `cerebra` will become increasingly important. +As sequencing costs continue to drop, large-scale variant calling will become more accessible, and summary tools like `cerebra` will become essential for drawing meaningful conclusions in a reasonable time frame. Our tool offers the advantages of parallel processing and a single, easy-to-interpret output file (CSV or JSON). -`cerebra` is already enabling research, see [@Maynard:2019], a study that examines the tumor microenvironment of late-stage drug-resistant carcinomas with single-cell RNA-sequencing. +`cerebra` is already enabling research, see [@Maynard:2020], a study that examines the tumor microenvironment of late-stage drug-resistant carcinomas with single-cell RNA-sequencing. Understanding the mutational landscape of individual tumors was essential to this study, and given the sheer volume of VCF records, would not have been possible without `cerebra`. We hope that `cerebra` can provide an easy-to-use framework for future studies in the same vein. @@ -167,7 +167,7 @@ Please contact `ljharris018@gmail.com` `cerebra` is written in Python 3. Code and detailed installation instructions can be found at https://github.com/czbiohub/cerebra. 
-In addition `cerebra` can be found on [PyPi](https://pypi.org/project/cerebra/) and [Dockerhub](https://hub.docker.com/r/lincolnharris/cerebra). +In addition, `cerebra` can be found on [PyPi](https://pypi.org/project/cerebra/) and [Dockerhub](https://hub.docker.com/r/lincolnharris/cerebra). ## References diff --git a/setup.py b/setup.py index be9860f..7f19a27 100755 --- a/setup.py +++ b/setup.py @@ -21,7 +21,7 @@ setup( name='cerebra', - version='1.1.1', + version='1.1.2', description="finds mutants in your scRNA-seq experiment", long_description=long_description, long_description_content_type='text/markdown', @@ -42,7 +42,7 @@ 'Development Status :: 4 - Beta', 'Intended Audience :: Developers', "Intended Audience :: Science/Research", - "Intended Audience :: Information Technology", + "Topic :: Scientific/Engineering :: Bio-Informatics", 'License :: OSI Approved :: MIT License', 'Natural Language :: English', "Operating System :: Unix", diff --git a/tests/data/test_germline_filter/meta.csv b/tests/data/test_germline_filter/meta.csv index b8513b8..4fa3ce1 100644 --- a/tests/data/test_germline_filter/meta.csv +++ b/tests/data/test_germline_filter/meta.csv @@ -1,3 +1,3 @@ -experimental_sample_id,germline_sample_id +tumor_sample_id,normal_sample_id A1,A1_GL1 A1,A1_GL2 \ No newline at end of file diff --git a/tests/data/test_germline_filter/meta_dummy.csv b/tests/data/test_germline_filter/meta_dummy.csv index 0081980..3231c67 100644 --- a/tests/data/test_germline_filter/meta_dummy.csv +++ b/tests/data/test_germline_filter/meta_dummy.csv @@ -1,4 +1,4 @@ -experimental_sample_id,germline_sample_id +tumor_sample_id,normal_sample_id A1,A1_GL1 A1,A1_GL2 foobar,A1_GL3 \ No newline at end of file diff --git a/tests/data/test_germline_filter/germline/A1_GL1.vcf b/tests/data/test_germline_filter/normal/A1_GL1.vcf similarity index 100% rename from tests/data/test_germline_filter/germline/A1_GL1.vcf rename to tests/data/test_germline_filter/normal/A1_GL1.vcf diff --git 
a/tests/data/test_germline_filter/germline/A1_GL2.vcf b/tests/data/test_germline_filter/normal/A1_GL2.vcf similarity index 100% rename from tests/data/test_germline_filter/germline/A1_GL2.vcf rename to tests/data/test_germline_filter/normal/A1_GL2.vcf diff --git a/tests/data/test_germline_filter/experimental/A1.vcf b/tests/data/test_germline_filter/tumor/A1.vcf similarity index 100% rename from tests/data/test_germline_filter/experimental/A1.vcf rename to tests/data/test_germline_filter/tumor/A1.vcf diff --git a/tests/test_germline_filter.py b/tests/test_germline_filter.py index 6d36ed2..3927cf2 100644 --- a/tests/test_germline_filter.py +++ b/tests/test_germline_filter.py @@ -18,11 +18,11 @@ def setUpClass(self): __file__).parent / "data" / "test_germline_filter" self.vcf_path = Path( - __file__).parent / "data" / "test_germline_filter" / "experimental" + __file__).parent / "data" / "test_germline_filter" / "tumor" self.filt_path = Path( __file__).parent / "data" / "test_germline_filter" / "gl_out" self.germ_path = Path( - __file__).parent / "data" / "test_germline_filter" / "germline" + __file__).parent / "data" / "test_germline_filter" / "normal" def test_germline_filter(self): @@ -74,15 +74,15 @@ def test_basic(self): data_path = os.path.abspath(__file__ + '/../' + 'data/test_germline_filter/') - gl_path = data_path + '/germline/' - experimental_path = data_path + '/experimental/' + gl_path = data_path + '/normal/' + experimental_path = data_path + '/tumor/' meta_path = data_path + '/meta.csv' out_path = data_path + '/gl_out/' runner = CliRunner() result = runner.invoke(germline_filter, ["--processes", 1, - "--control_path", gl_path, - "--experimental_path", experimental_path, + "--normal_path", gl_path, + "--tumor_path", experimental_path, "--metadata", meta_path, "--outdir", out_path]) @@ -98,15 +98,15 @@ def test_failure_multi(self): data_path = os.path.abspath(__file__ + '/../' + 'data/test_germline_filter/') - gl_path = data_path + '/germline/' - 
experimental_path = data_path + '/experimental/' + gl_path = data_path + '/normal/' + experimental_path = data_path + '/tumor/' meta_path = data_path + '/meta_dummy.csv' out_path = data_path + '/gl_out/' runner = CliRunner() result = runner.invoke(germline_filter, ["--processes", 2, - "--control_path", gl_path, - "--experimental_path", experimental_path, + "--normal_path", gl_path, + "--tumor_path", experimental_path, "--metadata", meta_path, "--outdir", out_path])