From 37c49b3187d1d7ad272cfb9fe7e80dc2413873e7 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Wed, 22 Nov 2023 23:27:15 +0000 Subject: [PATCH 01/40] add release github action workflow --- .github/workflows/release.yml | 46 +++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) create mode 100644 .github/workflows/release.yml diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml new file mode 100644 index 00000000..6be0498f --- /dev/null +++ b/.github/workflows/release.yml @@ -0,0 +1,46 @@ +name: Release + +on: + push: + branches: + - master + +jobs: + release: + name: Release + runs-on: ubuntu-latest + steps: + - uses: GoogleCloudPlatform/release-please-action@v3 + id: release + with: + release-type: python + package-name: trtools + + - uses: actions/checkout@v3 + if: ${{ steps.release.outputs.release_created }} + with: + fetch-depth: 2 + + - name: Set up Python + if: ${{ steps.release.outputs.release_created }} + uses: actions/setup-python@v4 + with: + python-version: '3.8' + + - name: Upgrade pip + if: ${{ steps.release.outputs.release_created }} + run: | + pip install --upgrade pip + pip --version + + - name: Build package + if: ${{ steps.release.outputs.release_created }} + run: | + python -m build + + - name: Publish package on PyPI + if: ${{ steps.release.outputs.release_created }} + uses: pypa/gh-action-pypi-publish@v1.5.0 + with: + user: __token__ + password: ${{ secrets.PYPI_TOKEN }} From 8929a1bfc40e9ff8112d7f674831cd23bf64c9e3 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Wed, 22 Nov 2023 23:40:15 +0000 Subject: [PATCH 02/40] ensure PR titles use conventional commits spec --- .github/workflows/conventional-prs.yml | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 .github/workflows/conventional-prs.yml diff --git a/.github/workflows/conventional-prs.yml b/.github/workflows/conventional-prs.yml new 
file mode 100644 index 00000000..ecdcf9b0 --- /dev/null +++ b/.github/workflows/conventional-prs.yml @@ -0,0 +1,23 @@ +name: Title +on: + pull_request_target: + types: + - opened + - reopened + - edited + - synchronize + workflow_call: + +permissions: + contents: read + +jobs: + title-format: + permissions: + pull-requests: read # for amannn/action-semantic-pull-request to analyze PRs + statuses: write # for amannn/action-semantic-pull-request to mark status of analyzed PR + runs-on: ubuntu-latest + steps: + - uses: amannn/action-semantic-pull-request@v5.4.0 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} From 1605bf14ccf8d624caabcf8ab4530b6b8966de96 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Wed, 22 Nov 2023 23:40:44 +0000 Subject: [PATCH 03/40] update publishing instructions --- PUBLISHING.rst | 47 ++++------------------------------------------- 1 file changed, 4 insertions(+), 43 deletions(-) diff --git a/PUBLISHING.rst b/PUBLISHING.rst index fdf9e6f3..ede6a6bb 100644 --- a/PUBLISHING.rst +++ b/PUBLISHING.rst @@ -16,7 +16,6 @@ We use a simplified version of to maintain and publish trtools. We use the master branch as the default branch with the latest stable codebase. The builds from this branch are distributed to PyPI and conda. -The develop branch contains new features that have yet to make their way into master. New Dependencies ---------------- @@ -27,36 +26,14 @@ If you've added dependencies to trtools or its tests, those dependencies should TRTool's Read The Docs webpage. * the appropriate section of the bioconda recipe (see below) - Publishing Steps ---------------- -Once changes have been made to develop that are ready to be published, first choose the new version number according to `semantic versioning `_. Then set up the environment you're going to publish TRTools from: - -#. Create a clean environment. -#. Install setuptools with version >= 40.8.0 -#. 
Additionally, install ``pytest``, ``wheel``, ``build``, and ``twine`` -#. Clone the `trtools repo `_ -#. Check out the develop branch -#. Run :code:`pip install --upgrade pip && pip install -e .` - -Then go through the steps of merging the changes into the master branch: - -#. Run :code:`pytest` and make sure all the tests pass. Then run :code:`./test/cmdline_tests.sh` and make sure those tests pass. -#. Change the 'Unreleased Changes' section of :code:`RELEASE_NOTES.rst` to the new version number. -#. Check if any changes have been made that have not yet been documented in the release notes. If so, document them. -#. Submit a pull request from develop into master on the github webiste. -#. If the code review checks pass, merge the pull request. -#. Tag the merge commit with the package version in vX.Y.Z format. (For more details on tagging, see `below`) +To publish a new version of trtools: -Then go through the steps of publishing the changed code to PyPI: - -1. :code:`cd` into the root of your clone of the trtools repo, checkout master and pull the latest change. Note that the most recent commit *must* be tagged. -2. Run :code:`rm -rf build dist *.egg-info` to make sure all previous build artifacts are removed -3. Run :code:`python -m build` to build the package with the version number you just tagged. (Note: you might need to install ``build`` first.) -5. Run :code:`twine upload dist/*` to upload the distribution to PyPI - -Lastly, the change needs to be published to bioconda. +1. First, locate the most recent PR prefixed "chore(master)" created by our GitHub Actions bot +2. List an admin on our repository (currently: @aryarm) as a reviewer of the PR and ask them to merge it +3. The bot will automatically create a new version on PyPI and tag a release on GitHub A bioconda bot will automatically open a pull request (within a day?) updating the version number and the PyPI reference.
If there are no new dependencies, no changes to the build, @@ -98,19 +75,3 @@ Possible Issues: * bioconda packages should not include large test data files. If the dist/trtools-.tar.gz file contains such files, you'll need to modify the MANIFEST.in file to exclude them, fix the test_trtools.sh script to download them manually and point pytest to them, confirm the tests run in a :code:`conda build` and then restart the publishing process. (This should not happen if new test files are just put in :code:`trtools/testsupport/sample_vcfs` or :code:`trtools/testsupport/sample_regions`) - -Git Tagging ------------ - -Git tags are used to mark specific commits with certain names (i.e. v1.2.0). -Please note that tags are assigned to commits, not branches. -You can tag a commit in two different ways. - -#. Command line: - -.. code-block:: bash - - git tag -a vX.Y.Z -m vX.Y.Z - git push --tags - -2. Web interface: you can go to the releases page of the repository and create a new release. From cf1f7b7e9dc19afee108d3ebc2c16a2c130e4049 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Thu, 23 Nov 2023 00:01:31 +0000 Subject: [PATCH 04/40] also make sure to install build --- .github/workflows/release.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 6be0498f..9bb4d24d 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -31,6 +31,7 @@ jobs: if: ${{ steps.release.outputs.release_created }} run: | pip install --upgrade pip + pip install build pip --version - name: Build package From b91f6095be61c743da7f6e12e9a18ff9dc9cd351 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Sat, 25 Nov 2023 13:39:45 -0800 Subject: [PATCH 05/40] convert RELEASE_NOTES.rst to CHANGELOG.md --- CHANGELOG.md | 293 ++++++++++++++++++++++++++++++++++++++++++++++ RELEASE_NOTES.rst | 265 
----------------------------------------- 2 files changed, 293 insertions(+), 265 deletions(-) create mode 100644 CHANGELOG.md delete mode 100644 RELEASE_NOTES.rst diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 00000000..88f62f39 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,293 @@ +# Changelog + +## [5.1.0](https://github.com/gymrek-lab/TRTools/compare/v5.0.2...v5.1.0) (2023-11-22) + +### Features + +* Added prancSTR for mosaicism detection +* Added simTR for simulating NGS reads with stutter errors at TRs + +## 5.0.2 + +### Bug fixes + +* MergeSTR now will no longer sometimes emit an alternate allele + identical to the ref allele when dealing with flanking base pairs. + +## 5.0.1 + +### Bug fixes + +* Remove outdated call in qcSTR to `np.float()` + +## 5.0.0 + +### Features + +* associaTR has been released! + +Current limitations: + +* Does not support binary phenotypes yet +* Does not support producing data for plotting individual loci yet +* Values in the output file aside from the p-values, coefficients and + standard errors have not been fully tested + +## 4.2.1 + +### Bug fixes + +* Fix bioconda build + +## 4.2.0 + +### Features + +* TRTools can now read VCFs produced by Beagle imputation. + +### Bug fixes + +* MergeSTR now successfully merges files containing multiple + chromosomes instead of emitting a \'stuck in infinite loop\' message + and crashing +* MergeSTR no longer crashes if run with the \--verbose flag and the + last position in each of the VCFs being merged isn\'t identical. +* StatSTR now errors out if any of the files listed in \--samples + contain no samples that are present in the input VCF. +* DumpSTR now reads call depth from the LC format field when the DP + format field is not present. This was intended previously but was + not happening. + +### Documentation + +* Clarified in PUBLISHING.rst how to handle dependencies and how to + publish to bioconda. 
+* requirements.txt was unneeded, so delete it and remove the reference + to it from PUBLISHING.rst + +## 4.1.0 + +### Functionality Changes + +* MergeSTR: Flanking basepairs are now removed from HipSTR records + before merging. In particular, records with different flank lengths + but the same repeat section will now merge instead of being marked + as incompatible. +* CompareSTR: the tool now only compares records that start and end at + the same position. If a partial overlap in records is detected, the + program will output a warning to the user. This warning contains IDs + of the records and their positions. + +### Misc + +* mergeutils: function GetMinHarmonizedRecords was transformed into + GetIncrementAndComparability, which allows the caller to define + custom predicate that decides whether records are comparable. + +## 4.0.2 + +### Bug fixes + +* <https://github.com/gymrek-lab/TRTools/issues/146> fixed record + positions being compared twice +* CompareSTR: Decision on which records are comparable is now based on + data from harmonized TRRecords, and not from the records directly + from VCF readers. Thanks to this, HipSTR records which have + different starting positions, but position of their repeat is at the + same position are compared correctly (harmonization step removes + this difference). +* MergeSTR failed on mixed ploidy samples (i.e. chrX). Fix one such + bug. Note: none of the tools are fully tested for chrX even with + this fix. + +## 4.0.1 + +### Bug fixes + +* <https://github.com/gymrek-lab/TRTools/issues/143> Fix + HipstrMinSuppReads filter when there are called samples but none + have ALLREADS + +## 4.0.0 + +### Features + +* Underlying libraries now use cyvcf2 instead of PyVCF for VCF + parsing. This makes both the underlying VCF reading code and the + TRTools code significantly faster and more memory efficient. For + instance, the loading of VCFs into memory is now \> 15x faster for + VCFs with many samples.
Some tools will still need further updates + to be usable for large datasets, but those updates should now be + possible and much easier. (e.g. emitting progress reports to stdout + as needed, flags to disable computations that cannot be done at such + scale) +* DumpSTR has a new flag \--zip to produce a bgzipped and + tabix-indexed output VCF +* StatSTR now can calculate the entropy of the allele distribution at + each locus with the \--entropy flag +* The [TRTools documentation + website](https://trtools.readthedocs.io/en/latest/) now displays the + release notes. + +### Command line interface changes + +* StatSTR\'s \--region option now requires the input VCF to be + bgzipped and tabix indexed. +* If DumpSTR is used on an input VCF with unexpectedly typed INFO + fields \'AC\', \'REFAC\', \'HET\', \'HWEP\', \'HRUN\' or FORMAT + field \'FILTER\', it now errors out and asks you to rename those + fields before rerunning DumpSTR. (If they already exist but have the + correct number and type DumpSTR will overwrite them and issue a + warning in case that was not intended) +* CompareSTR\'s docs used to claim that when comparing alleles from + different callers those callers must use the same allele notation + (e.g. implying that ExpansionHunter\'s \'\\' and GangSTR\'s + \'ACACACACACAC\' notation would always mismatch). That statement was + never true for length based comparisons - CompareSTR has always been + able to do length based comparisons regardless of notation. The + incorrect claim has been removed from CompareSTR\'s docs. +* CompareSTR\'s docs now explicitly tell the user to order phased + calls to prevent spurious mismatching. If phasing is not desired, + use \--ignore-phasing +* CompareSTR will now error if at a single locus both files do not + have either all unphased calls, or all phased calls. 
If phasing is + not desired, use \--ignore-phasing + +### Output changes + +* DumpSTR call level filters now have the value of the filter and the + value which triggered the filter appended to the filter name in the + FILTER format field. (e.g. GangSTRCallMinDepth20_12 because the + field had a depth of 12 and that\'s lower than the required min + depth of 20) +* DumpSTR locus filter HRUN is now written as HRUN and not HRUN0 in + the samplog output file +* When running DumpSTR, loci where all the calls were either already + nocalls or were filtered by call-level filters before the + locus-level filters were run are now marked as + \'NO_CALLS_REMAINING\' instead of \'PASS\'. +* When DumpSTR filters a call and replaces each of its format fields + with the no call \'.\', fields with more than one value are now + represented correctly. For example, for 2 values \'.,.\' is used + rather than just a single \'.\' +* MergeSTR header lines are now copied over from the input VCFs + instead of only copying over a few recognized fields (e.g. ID and + Length were the only contig fields that were previously retained, + but URL wouldn\'t be) +* MergeSTR output alt alleles for eh and popstr are now ordered by + length. MergeSTR output alt alleles for advntr, gangstr and hipstr, + when there are multiple alt alleles of the same length, are now + ordered alphabetically instead of arbitrarily. 
+* CompareSTR no longer outputs the file \-callcompare.tab - + the existence of that file was never documented, and besides, all + its information could be seen more easily simply by looking at the + input VCFs +* In CompareSTR\'s overall.tab file, the ranges in the format columns + are now written \[a,b) or \[a,b\] instead of a-b +* CompareSTR\'s locuscompare.tab file now outputs loci in the order + they were encountered in the input VCFs as opposed to an arbitrary + order +* The \'sample\' column in CompareSTR\'s locuscompare.tab file has + been renamed to \'numcalls\' to match the other two tab files. + +### Python interface changes + +* The trtools.utils.tr_harmonizer module has been reworked to use + cyvcf2, and in doing so a large portion of its interface has changed + in small ways. +* The big conceptual change is that instead of repeatedly calling a + method on a TRRecord object like GetStringGenotype for each sample + in the VCF, instead you call the new corresponding method + GetStringGenotypes once, and it returns a numpy array of values + where the first axis of the array ranges over the samples. +* The way missing calls and samples with lower than maximal ploidy are + handled is now tested and documented. These representations of these + genotypes have been aligned with cyvcf2\'s standards. For more info, + see the docs of the index, length and string genotype getter + methods.
+ +### Bug fixes + +* The AC, REFAC fields that DumpSTR output used to be incorrect, are + now correct +* If you specify \--drop-filtered DumpSTR will no longer set all + values in the output .loclog.tab file to zero and instead set them + to their proper values (which are the same as if you had not + specified \--drop-filtered) +* DumpSTR now correctly adds ##FILTER=\ to the header line +* DumpSTR now no longer says HipSTRCallFlankIndels is applied to + nocalls +* MergeSTR now outputs the same phase as the input files instead of + always outputting unphased data +* MergeSTR now correctly outputs Number=A, G or R (number of entries + in this field equal to number of alternate alleles at this locus, + the number of alleles including the ref, or the number of unique + polyploid genotypes) correctly in INFO and FORMAT fields instead of + outputting Number=-1, -2 or -3 +* CompareSTR claimed it was outputting the square (Pearson) + correlation coefficient but was actually outputting the raw + (unsquared) correlation coefficient. It is now outputting the + squared coefficient as documented. +* CompareSTR now correctly compares unphased calls without regard to + order in the VCF (e.g. \'AAAA/AAA\' now matches against + \'AAA/AAAA\') +* CompareSTR\'s docs claimed the bubble plots axes were measured in + basepair difference from the reference, but they were actually + measured in number of repeats different from the reference. The + behavior has not been changed and the claim has been updated to + match the behavior. +* When using binned format fields in CompareSTR where the range of + values did not evenly divide into the requested binsize, the highest + valued bin used to always be the same size as all the other bins and + include values over the limit specified by the user. Now it caps at + that maximum. E.g.
binsizes 0:210:50 used to create the bins + \[0,50), \[50,100), \[100,150), \[150, 200), \[200, 250) and now + create the bins \[0,50), \[50,100), \[100,150), \[150, 200), \[200, + 210\] +* When using binned format fields in CompareSTR where the range of + values evenly divided into the requested binsize, loci which + obtained the requested maximum would be excluded. They are now + included. E.g. binsizes 0:200:50 used to create the bins \[0,50), + \[50,100), \[100,150), \[150, 200) and samples with value 200 would + not fall into any bin. This now creates the bins \[0,50), \[50,100), + \[100,150), \[150, 200\] and samples with value 200 fall into the + last bin + +### Quality of life improvements + +* StatSTR, when printing output to a file, now prints timing + diagnostics to stdout. +* DumpSTR will fail faster if output directory does not exist +* When encountering issues with identifying the caller type for each + input VCF, MergeSTR now prints an error and gracefully returns + instead of dying to an uncaught exception +* MergeSTR incompatible INFO field warnings now specify which locus + has an incompatible field + +### Regressions + +* The \--gangstr-require-support filter has been disabled. + +### Outstanding bugs + +* The dumpSTR ExpansionHunter ADFL ADIR ADSP filters have never worked +* DumpSTR remains untested on ExpansionHunter filters and files +* DumpSTR remains untested on loci with variable ploidy and/or + partially genotyped samples (e.g. .\|2) +* When running CompareSTR with the \--stratify options where + \--stratify-file is either not specified or is explicitly set to + zero, for each format field all calls where the value of that field + in vcf1 does not fall into the same bin as the value of that field + in vcf2 are silently not compared for that format field. The correct + behavior here is probably to create paired bins based on a range of + values from vcf1 and a range from vcf2. Regardless, the behavior + here should be documented. 
+ +## 3.0.3 + +### Bug fixes + +* Fixed a spot where qcSTR would crash because we passed Pandas a set + instead of a list +* MergeSTR now writes out the header for the GT FORMAT field diff --git a/RELEASE_NOTES.rst b/RELEASE_NOTES.rst deleted file mode 100644 index fcfbff24..00000000 --- a/RELEASE_NOTES.rst +++ /dev/null @@ -1,265 +0,0 @@ -5.1.0 ------ - -New features: - -* Added prancSTR for mosaicism detection -* Added simTR for simulating NGS reads with stutter errors at TRs - -5.0.2 ------ - -Bug fixes: - -* MergeSTR now will no longer sometimes emit an alternate allele identical to the ref allele when - dealing with flanking base pairs. - -5.0.1 ------ - -Bug fixes: - -* Remove outdated call in qcSTR to np.float() - -5.0.0 ------ - -New features: - -* associaTR has been released! - -Current limitations: - -* Does not support binary phenotypes yet -* Does not support producing data for plotting individual loci yet -* Values in the output file aside from the p-values, coefficients and - standard errors have not been fully tested - -4.2.1 ------ - -Bug fixes: - -* Fix bioconda build - -4.2.0 ------ - -New features: - -* TRTools can now read VCFs produced by Beagle imputation. - -Bug fixes: - -* MergeSTR now successfully merges files containing multiple chromosomes instead of emitting - a 'stuck in infinite loop' message and crashing -* MergeSTR no longer crashes if run with the --verbose flag and the last position in each of the - VCFs being merged isn't identical. -* StatSTR now errors out if any of the files listed in --samples contain no samples that are present - in the input VCF. -* DumpSTR now reads call depth from the LC format field when the DP format field is not present. - This was intended previously but was not happening. - -Doc changes: - -* Clarified in PUBLISHING.rst how to handle dependencies and how to publish to bioconda. 
-* requirements.txt was unneeded, so delete it and remove the reference to it from PUBLISHING.rst - -4.1.0 ------ - -Functionality Changes: - -* MergeSTR: Flanking basepairs are now removed from HipSTR records before merging. - In particular, records with different flank lengths but the same repeat section will now merge instead of being - marked as incompatible. - -* CompareSTR: the tool now only compares records that start and end at the same position. If a partial overlap in records - is detected, the program will output a warning to the user. This warning contains IDs of the records and their positions. - -Misc: - -* mergeutils: function GetMinHarmonizedRecords was transformed into GetIncrementAndComparability, which allows the caller - to define custom predicate that decides whether records are comparable. - -4.0.2 ------ - -Bug fixes: - -* https://github.com/gymrek-lab/TRTools/issues/146 fixed record positions being compared twice -* CompareSTR: Decision on which records are comparable is now based on data from harmonized TRRecords, - and not from the records directly from VCF readers. Thanks to this, HipSTR records which have different starting positions, - but position of their repeat is at the same position are compared correctly (harmonization step removes this difference). -* MergeSTR failed on mixed ploidy samples (i.e. chrX). Fix one such bug. Note: none of the tools are - fully tested for chrX even with this fix. - - -4.0.1 ------ - -Bug fixes: - -* https://github.com/gymrek-lab/TRTools/issues/143 Fix HipstrMinSuppReads filter when - there are called samples but none have ALLREADS - -4.0.0 ------ - -Features: - -* Underlying libraries now use cyvcf2 instead of PyVCF for VCF parsing. - This makes both the underlying VCF reading code and the TRTools code - significantly faster and more memory efficient. For instance, the loading of - VCFs into memory is now > 15x faster for VCFs with many samples. 
- Some tools will still need further updates to be usable for large datasets, - but those updates should now be possible and much easier. - (e.g. emitting progress reports to stdout as needed, flags to disable - computations that cannot be done at such scale) -* DumpSTR has a new flag --zip to produce a bgzipped and tabix-indexed output VCF -* StatSTR now can calculate the entropy of the allele distribution at each locus with the - --entropy flag -* The `TRTools documentation website `_ now - displays the release notes. - -Command line interface changes: - -* StatSTR's --region option now requires the input VCF to be bgzipped and tabix indexed. -* If DumpSTR is used on an input VCF with unexpectedly typed - INFO fields 'AC', 'REFAC', 'HET', 'HWEP', 'HRUN' or FORMAT field 'FILTER', - it now errors out and asks you to rename those fields before rerunning - DumpSTR. (If they already exist but have the correct number and type DumpSTR - will overwrite them and issue a warning in case that was not intended) -* CompareSTR's docs used to claim that when comparing alleles from different callers - those callers must use the same allele notation (e.g. implying that ExpansionHunter's - '' and GangSTR's 'ACACACACACAC' notation would always mismatch). That statement - was never true for length based comparisons - CompareSTR has always been able to - do length based comparisons regardless of notation. The incorrect claim has been - removed from CompareSTR's docs. -* CompareSTR's docs now explicitly tell the user to order phased calls to - prevent spurious mismatching. If phasing is not desired, use --ignore-phasing -* CompareSTR will now error if at a single locus both files do not have either all - unphased calls, or all phased calls. If phasing is not desired, use --ignore-phasing - -Output changes: - -* DumpSTR call level filters now have the value of the filter and the value - which triggered the filter appended to the filter name in the FILTER format field. - (e.g. 
GangSTRCallMinDepth20_12 because the field had a depth of 12 and that's lower - than the required min depth of 20) -* DumpSTR locus filter HRUN is now written as HRUN and not HRUN0 in the - samplog output file -* When running DumpSTR, loci where all the calls were either already nocalls - or were filtered by call-level filters before the locus-level filters were run are now - marked as 'NO_CALLS_REMAINING' instead of 'PASS'. -* When DumpSTR filters a call and replaces each of its format fields with the no call - '.', fields with more than one value are now represented correctly. For example, - for 2 values '.,.' is used rather than just a single '.' -* MergeSTR header lines are now copied over from the input VCFs instead of - only copying over a few recognized fields (e.g. ID and Length - were the only contig fields that were previously retained, but URL wouldn't be) -* MergeSTR output alt alleles for eh and popstr are now ordered by length. - MergeSTR output alt alleles for advntr, gangstr and hipstr, when there are multiple - alt alleles of the same length, are now ordered alphabetically instead - of arbitrarily. -* CompareSTR no longer outputs the file -callcompare.tab - the existence - of that file was never documented, and besides, all its information could - be seen more easily simply by looking at the input VCFs -* In CompareSTR's overall.tab file, the ranges in the format columns are now written - [a,b) or [a,b] instead of a-b -* CompareSTR's locuscompare.tab file now outputs loci in the order they were - encountered in the input VCfs as opposed to an arbitrary order -* The 'sample' column in CompareSTR's locuscompare.tab file has been renamed to - 'numcalls' to match the other two tab files. - -Python interface changes: - -* The trtools.utils.tr_harmonizer module has been reworked to use cyvcf2, - and in doing so a large portion of its interface has changed in small ways. 
-* The big conceptual change is that instead of repeatedly calling a method - on a TRRecord object like GetStringGenotype for each sample in the VCF, - instead you call the new corresponding method GetStringGenotypes once, - and it returns a numpy array of values where the first axis of the array - ranges over the samples. -* The way missing calls and samples with lower than maximal - ploidy are handled is now tested and documented. These representations - of these genotypes have been aligned with cyvcf2's standards. - For more info, see the docs of the index, length and - string genotype getter methods. - -Bug fixes: - -* The AC, REFAC fields that DumpSTR output used to be incorrect, are now correct -* If you specify --drop-filtered DumpSTR will no longer set all values in the - output .loclog.tab file to zero and instead set them to their proper values - (which are the same as if you had not specified --drop-filtered) -* DumpSTR now correctly adds ##FILTER= - to the header line -* DumpSTR now no longer says HipSTRCallFlankIndels is applied to nocalls -* MergeSTR now outputs the same phase as the input files instead of always outputting - unphased data -* MergeSTR now correctly outputs Number=A, G or R (number of entries in this field equal - to number of alternate alleles at this locus, the number of alleles including the ref, - or the number of unique polyploid genotypes) correctly in INFO and FORMAT fields instead - of outputing Number=-1, -2 or -3 -* CompareSTR claimed it was outputting the square (Pearson) correlation coefficient - but was actually outputting the raw (unsquared) correlation coefficient. It is now - outputting the squared coefficient as documented. -* CompareSTR now correctly compares unphased calls without regard to order in the VCF - (e.g. 
'AAAA/AAA' now matches against 'AAA/AAAA') -* CompareSTR's docs claimed the bubble plots axes were measured in basepair difference - from the reference, but they were actually measured in number of repeats different - from the reference. The behavior has not been changed and the claim has been updated - to match the behavior. -* When using binned format fields in CompareSTR where the range of values did not - evenly divide into the requested binsize, the highest valued bin used to always - be the same size as all the other bins and include values over the - limit specified by the user. Now it caps at that maximum. - E.g. binsizes 0:210:50 used to create the bins - [0,50), [50,100), [100,150), [150, 200), [200, 250) - and now create the bins - [0,50), [50,100), [100,150), [150, 200), [200, 210] -* When using binned format fields in CompareSTR where the range of values - evenly divided into the requested binsize, loci which obtained the requested - maximum would be excluded. They are now included. - E.g. binsizes 0:200:50 used to create the bins - [0,50), [50,100), [100,150), [150, 200) and samples with value 200 would - not fall into any bin. This now creates the bins - [0,50), [50,100), [100,150), [150, 200] and samples with value 200 fall into - the last bin - -Quality of life improvements: - -* StatSTR, when printing output to a file, now prints timing diagnostics to stdout. -* DumpSTR will fail faster if output directory does not exist -* When encountering issues with identifying the caller type for each input VCF, - MergeSTR now prints an error and gracefully returns instead of dying to - an uncaught exception -* MergeSTR incompatible INFO field warnings now specify which locus has an - incompatible field - -Regressions: - -* The --gangstr-require-support filter has been disabled. 
- -Outstanding bugs: - -* The dumpSTR ExpansionHunter ADFL ADIR ADSP filters have never worked -* DumpSTR remains untested on ExpansionHunter filters and files -* DumpSTR remains untested on loci with variable ploidy and/or partially - genotyped samples (e.g. .|2) -* When running CompareSTR with the --stratify options where --stratify-file - is either not specified or is explicitly set to zero, for each format field - all calls where the value of that field in vcf1 does not fall into the same - bin as the value of that field in vcf2 are silently not compared for that format field. - The correct behavior here is probably to create paired bins based on a range - of values from vcf1 and a range from vcf2. Regardless, the behavior here should - be documented. - -3.0.3 ------ - -Bug fixes: - -* Fixed a spot where qcSTR would crash because we passed Pandas a set instead of a list -* MergeSTR now writes out the header for the GT FORMAT field From 1fb9610687b544205979c1bd5dae13ac704603ea Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Sat, 25 Nov 2023 13:43:34 -0800 Subject: [PATCH 06/40] delete release notes from rtd since they must be md if we want, we could keep them by using and installing mdinclude (https://sphinx-mdinclude.omnilib.dev/en/latest/example.html) --- doc/RELEASE_NOTES.rst | 5 ----- 1 file changed, 5 deletions(-) delete mode 100644 doc/RELEASE_NOTES.rst diff --git a/doc/RELEASE_NOTES.rst b/doc/RELEASE_NOTES.rst deleted file mode 100644 index eb1b1da4..00000000 --- a/doc/RELEASE_NOTES.rst +++ /dev/null @@ -1,5 +0,0 @@ -Release Notes -============= - -.. 
include:: ../RELEASE_NOTES.rst - From 6f647b7218b76dcd417e7fd33e2fa00c88e5eec3 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Sat, 25 Nov 2023 13:44:07 -0800 Subject: [PATCH 07/40] delete release_notes from TOC --- doc/index.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/doc/index.rst b/doc/index.rst index 9c89d74d..f6955932 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -23,6 +23,5 @@ Table of Contents UTILITIES CALLERS LIBRARY - RELEASE_NOTES site_indices From 0ed7cc0197d232410d3f98a6613971f45dadab71 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Sat, 25 Nov 2023 22:56:58 +0000 Subject: [PATCH 08/40] move pytest and pylint config into pyproject --- .coveragerc | 2 +- .pylintrc | 10 ---------- pyproject.toml | 12 +++++++++++- pytest.ini | 2 -- 4 files changed, 12 insertions(+), 14 deletions(-) delete mode 100644 .pylintrc delete mode 100644 pytest.ini diff --git a/.coveragerc b/.coveragerc index b9e5b326..98c194f9 100644 --- a/.coveragerc +++ b/.coveragerc @@ -1,5 +1,5 @@ [run] -omit = *test*,setup.py +omit = *test* [report] exclude_lines = diff --git a/.pylintrc b/.pylintrc deleted file mode 100644 index 9062d1af..00000000 --- a/.pylintrc +++ /dev/null @@ -1,10 +0,0 @@ -[BASIC] - -function-naming-style=PascalCase -method-naming-style=PascalCase - -[MESSAGES CONTROL] - -# not interested in module naming conventions -disable=C0103 - diff --git a/pyproject.toml b/pyproject.toml index 37c37020..8d699ddb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -56,4 +56,14 @@ simTR = "trtools.simTR:run" Homepage = "https://trtools.readthedocs.org" Documentation = "https://trtools.readthedocs.org" Repository = "https://github.com/gymrek-lab/trtools.git" -Changelog = "https://github.com/gymrek-lab/trtools/blob/master/RELEASE_NOTES.rst" +Changelog = "https://github.com/gymrek-lab/trtools/blob/master/CHANGELOG.md" + +[tool.pytest.ini_options] +addopts = "-p 
trtools.testsupport.dataloader" + +[tool.pylint.BASIC] +function-naming-style = "PascalCase" +method-naming-style = "PascalCase" + +[tool.pylint.'MESSAGES CONTROL'] +disable = "invalid-name" # not interested in module naming conventions diff --git a/pytest.ini b/pytest.ini deleted file mode 100644 index b405bd46..00000000 --- a/pytest.ini +++ /dev/null @@ -1,2 +0,0 @@ -[pytest] -addopts = -p trtools.testsupport.dataloader From ec20a373db7f48e14a9acc10b1ebc5711002d448 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Sat, 25 Nov 2023 22:57:36 +0000 Subject: [PATCH 09/40] update to latest coverage --- .github/workflows/tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index a9e40e99..7a7371f9 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -38,7 +38,7 @@ jobs: - name: Test with pytest shell: bash -el {0} run: | - python -m pytest --cov=. --cov-report term-missing --cov-fail-under 89 --doctest-modules --junitxml=junit/test-results-${{ matrix.python-version }}.xml + python -m pytest --cov=. 
--cov-report term-missing --cov-fail-under 90 --doctest-modules --junitxml=junit/test-results-${{ matrix.python-version }}.xml - name: Test command line shell: bash -el {0} run: | From 94b39a5e5ae8fd6ae2cf98f2cf75a7b8cb3e7330 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Sat, 25 Nov 2023 22:58:24 +0000 Subject: [PATCH 10/40] use stable instead of latest RTD since latest may refer to code that's still in development --- CHANGELOG.md | 2 +- README.rst | 37 +++++++++++++++-------------------- trtools/compareSTR/README.rst | 4 ++-- trtools/qcSTR/README.rst | 2 +- 4 files changed, 20 insertions(+), 25 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 88f62f39..77029f1d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -126,7 +126,7 @@ Current limitations: * StatSTR now can calculate the entropy of the allele distribution at each locus with the \--entropy flag * The [TRTools documentation - website](https://trtools.readthedocs.io/en/latest/) now displays the + website](https://trtools.readthedocs.io/en/stable/) now displays the release notes. ### Command line interface changes diff --git a/README.rst b/README.rst index a5eb2794..1101238f 100644 --- a/README.rst +++ b/README.rst @@ -21,7 +21,7 @@ TRTools TRTools includes a variety of utilities for filtering, quality control and analysis of tandem repeats downstream of genotyping them from next-generation sequencing. It supports multiple recent genotyping tools (see below). -See full documentation and examples at https://trtools.readthedocs.io/en/latest/. +See full documentation and examples at https://trtools.readthedocs.io/en/stable/. If you use TRTools in your work, please cite: Nima Mousavi, Jonathan Margoliash, Neha Pusarla, Shubham Saini, Richard Yanicky, Melissa Gymrek. (2020) TRTools: a toolkit for genome-wide analysis of tandem repeats. Bioinformatics. 
(https://doi.org/10.1093/bioinformatics/btaa736) @@ -82,14 +82,14 @@ Tools ----- TRTools includes the following tools. -* `mergeSTR `_: a tool to merge VCF files across multiple samples genotyped using the same tool -* `dumpSTR `_: a tool for filtering VCF files with TR genotypes -* `qcSTR `_: a tool for generating various quality control plots for a TR callset -* `statSTR `_: a tool for computing various statistics on VCF files -* `compareSTR `_: a tool for comparing TR callsets -* `associaTR `_: a tool for testing TR length-phenotype associations (e.g., running a TR GWAS) -* `prancSTR `_: a tool for identifying somatic mosacisim at TRs. Currently only compatible with HipSTR VCF files. (*beta mode*) -* `simTR `_: a tool for simulating next-generation sequencing reads from TR regions. (*beta mode*) +* `mergeSTR `_: a tool to merge VCF files across multiple samples genotyped using the same tool +* `dumpSTR `_: a tool for filtering VCF files with TR genotypes +* `qcSTR `_: a tool for generating various quality control plots for a TR callset +* `statSTR `_: a tool for computing various statistics on VCF files +* `compareSTR `_: a tool for comparing TR callsets +* `associaTR `_: a tool for testing TR length-phenotype associations (e.g., running a TR GWAS) +* `prancSTR `_: a tool for identifying somatic mosacisim at TRs. Currently only compatible with HipSTR VCF files. (*beta mode*) +* `simTR `_: a tool for simulating next-generation sequencing reads from TR regions. (*beta mode*) Type :code:` --help` to see a full set of options. @@ -102,8 +102,8 @@ It additionally includes a python library, :code:`trtools`, which can be accesse Usage ----- -We recommend new users start with the example commands described in the `command-line interface for each tool `_. -We also suggest going through our `vignettes `_ that walk through some example workflows using TRTools. +We recommend new users start with the example commands described in the `command-line interface for each tool `_. 
+We also suggest going through our `vignettes `_ that walk through some example workflows using TRTools. Supported TR Callers -------------------- @@ -115,7 +115,7 @@ TRTools supports VCFs from the following TR genotyping tools: * HipSTR_ * PopSTR_ version 2 or higher -See our description of the `features and example use-cases `_ of each of these tools. +See our description of the `features and example use-cases `_ of each of these tools. .. please ensure this list of links remains the same as the one in the main README @@ -145,16 +145,14 @@ We appreciate contributions to TRTools. If you would like to contribute a fix or #. Install TRTools from source `as above `_. #. Additionally, install :code:`pytest`, `pytest-cov `_, :code:`sphinx>=3` and :code:`sphinx_rtd_theme`, in your environment. #. Fork the TRTools repository. -#. The :code:`develop` branch contains the latest pre-release codebase. Create a branch off of :code:`develop` titled with the name of your feature. -#. Make your changes. +#. Make your changes. #. Document your changes. - * Add bullet point(s) to the 'Unreleased Changes' section of :code:`RELEASE_NOTES.rst` describing all the user facing changes you've made (if that section doesn't exist, create it at the top of the file). See prior releases in that file for examples. * Ensure all functions, modules, classes etc. conform to `numpy docstring standards `_. If applicable, update the REAMDEs in the directories of the files you changed with new usage information. - * New doc pages for `the website `_ can be created under :code:`/doc` and linked to as appropriate. + * New doc pages for `the website `_ can be created under :code:`/doc` and linked to as appropriate. * If you have added significant amounts of documentation in any of these ways, build the documentation locally to ensure it looks good. 
:code:`cd` to the :code:`doc` directory and run :code:`make clean && make html`, then view :code:`doc/_build/html/index.html` and navigate from there @@ -163,13 +161,10 @@ We appreciate contributions to TRTools. If you would like to contribute a fix or * :code:`cd` to the root of the project and run :code:`python -m pytest --cov=. --cov-report term-missing` to make sure that (1) all tests pass and (2) any code you have added is covered by tests. (Code coverage may **not** go down). -#. Submit a pull request **to the develop branch** of the central repository with a description of what changes you have made. +#. Submit a pull request (PR) **to the master branch** of the central repository with a description of what changes you have made. Title the PR according to the `conventional commits spec `_. A member of the TRTools team will reply and continue the contribution process from there, possibly asking for additional information/effort on your part. Publishing ---------- -If you are a TRTools maintainer and wish to publish changes from the develop branch into master and distribute them to PyPI and bioconda, -please see PUBLISHING.rst in the root of the git repo. +If you are a TRTools maintainer and wish to publish changes and distribute them to PyPI and bioconda, please see PUBLISHING.rst in the root of the git repo. If you are a community member and would like that to happen, contact us (see above). - - diff --git a/trtools/compareSTR/README.rst b/trtools/compareSTR/README.rst index e66b35f5..1ab09936 100644 --- a/trtools/compareSTR/README.rst +++ b/trtools/compareSTR/README.rst @@ -17,7 +17,7 @@ CompareSTR optionally will stratify results based on a user-specified FORMAT fie Note: CompareSTR is designed to be used as a QC tool. While it may be able to pick up certain biological differences in some applications (e.g. 
identifying de novo mutations by comparing parent and child callsets or somatic mutations by comparing callsets from different tissues), use-case specific analyses may be better performed by more specialized tools. -Note: CompareSTR has the ability to stratify comparisons based on quality scores. However, beware that quality scores output by different genotypers may not be directly comparable. You can use `qcSTR `_ to visualize the distribution of quality scores in each VCF file seprately. +Note: CompareSTR has the ability to stratify comparisons based on quality scores. However, beware that quality scores output by different genotypers may not be directly comparable. You can use `qcSTR `_ to visualize the distribution of quality scores in each VCF file seprately. Usage ----- @@ -83,7 +83,7 @@ compareSTR outputs the following text files and plots: * :code:`-samplecompare.tab`: Has columns sample, metric-conc-seq, metric-conc-len, numcalls. One line per sample * :code:`-samplecompare.pdf`: Plots the length concordance metric for each sample considered. -See `Example Commands`_ below for example compareSTR commands for different supported TR genotypers based on example data files in this repository. More detailed use cases are also given in the vignettes https://trtools.readthedocs.io/en/develop/VIGNETTES.html. +See `Example Commands`_ below for example compareSTR commands for different supported TR genotypers based on example data files in this repository. More detailed use cases are also given in the vignettes https://trtools.readthedocs.io/en/stable/VIGNETTES.html Instructions on Compressing and Indexing VCF files -------------------------------------------------- diff --git a/trtools/qcSTR/README.rst b/trtools/qcSTR/README.rst index 538113ea..eec5e1b7 100644 --- a/trtools/qcSTR/README.rst +++ b/trtools/qcSTR/README.rst @@ -68,7 +68,7 @@ metrics. Alternatively, you may specify the type(s) of quality plot(s) you wish the :code:`--quality` option. 
In that case you will get a file named :code:`-quality-.pdf` for each type of plot you requested. Quality plot examples are shown below. To learn more about how qcSTR infers quality scores for VCFs from -different genotypers, see `here `_ +different genotypers, see `here `_ Note: quality score plots are useful when considered in the context of a single genotyper run, and can also be used to compare different invocations of the same genotyper. However, From caff4b9f72e2a0f4ade980f8786ece50cf1a674f Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Sat, 16 Dec 2023 18:31:33 +0000 Subject: [PATCH 11/40] mention maintainer of repo in PUBLISHING doc --- PUBLISHING.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PUBLISHING.rst b/PUBLISHING.rst index ede6a6bb..6d21d93c 100644 --- a/PUBLISHING.rst +++ b/PUBLISHING.rst @@ -32,7 +32,7 @@ Publishing Steps To publish a new version of trtools: 1. First, locate the most recent PR prefixed "chore(main)" created by our Github actions bot -2. List an admin on our repository (currently: @aryarm) as a reviewer of the PR and ask them to merge it +2. List a maintainer of our repository as a reviewer of the PR and ask them to merge it 3. The bot will automatically create a new version on PyPI and tag a release on Github A bioconda bot will automatically open a pull request (within a day?) 
updating the version number From 2b5295fc1fe6bd90863571d3b74fb6ef70d4b266 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Sat, 16 Dec 2023 19:07:31 +0000 Subject: [PATCH 12/40] add min versions for all dependencies my strategy is to use whatever was the current version at the time the package was added as a dependency --- pyproject.toml | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 8d699ddb..86ef0996 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,16 +20,16 @@ classifiers = [ "Topic :: Scientific/Engineering :: Bio-Informatics", ] dependencies = [ - "cyvcf2", - "matplotlib", - "numpy", - "pandas", - "pybedtools", - "pysam", - "scikit-learn", - "scipy", - "statsmodels", - "pyfaidx", + "cyvcf2>=0.20.5", + "matplotlib>=3.0.3", + "numpy>=1.17.3", + "pandas>=0.25.3", + "pybedtools>=0.8.0", + "pysam>=0.15.3", + "scikit-learn>=0.23.1", + "scipy>=1.3.3", + "statsmodels>=0.10.2", + "pyfaidx>=0.5.3", ] dynamic = ["version"] From d1b50ac16f087f656b6de0e7c076565bedc28968 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Sat, 16 Dec 2023 12:45:40 -0800 Subject: [PATCH 13/40] refer to maintainers in publishing docs --- PUBLISHING.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/PUBLISHING.rst b/PUBLISHING.rst index 6d21d93c..41098671 100644 --- a/PUBLISHING.rst +++ b/PUBLISHING.rst @@ -1,7 +1,7 @@ Publishing ---------- -Only maintainers of the trtools repository may publish changes to the package. +Only maintainers of the trtools repository (see @gymrek-lab/trtools) may publish changes to the package. If you are a community member and want to contribute new code, see the contributing section in the README. 
If you are a community member and have already contributed new code and want us to publish it now, please contact us (our contact info is in the README) @@ -32,7 +32,7 @@ Publishing Steps To publish a new version of trtools: 1. First, locate the most recent PR prefixed "chore(main)" created by our Github actions bot -2. List a maintainer of our repository as a reviewer of the PR and ask them to merge it +2. List a maintainer of our repository (@gymrek-lab/trtools) as a reviewer of the PR and ask them to merge it 3. The bot will automatically create a new version on PyPI and tag a release on Github A bioconda bot will automatically open a pull request (within a day?) updating the version number From 45d8c6b9904a4716ea4946ae4a5e70d9c144c2be Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Tue, 19 Dec 2023 19:28:03 +0000 Subject: [PATCH 14/40] Revert "add min versions for all dependencies" This reverts commit 2b5295fc1fe6bd90863571d3b74fb6ef70d4b266. 
--- pyproject.toml | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 86ef0996..8d699ddb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,16 +20,16 @@ classifiers = [ "Topic :: Scientific/Engineering :: Bio-Informatics", ] dependencies = [ - "cyvcf2>=0.20.5", - "matplotlib>=3.0.3", - "numpy>=1.17.3", - "pandas>=0.25.3", - "pybedtools>=0.8.0", - "pysam>=0.15.3", - "scikit-learn>=0.23.1", - "scipy>=1.3.3", - "statsmodels>=0.10.2", - "pyfaidx>=0.5.3", + "cyvcf2", + "matplotlib", + "numpy", + "pandas", + "pybedtools", + "pysam", + "scikit-learn", + "scipy", + "statsmodels", + "pyfaidx", ] dynamic = ["version"] From a992074a98257a2277929150c77803db553753a2 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Wed, 22 Nov 2023 23:27:15 +0000 Subject: [PATCH 15/40] add release github action workflow --- .github/workflows/release.yml | 46 +++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) create mode 100644 .github/workflows/release.yml diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml new file mode 100644 index 00000000..6be0498f --- /dev/null +++ b/.github/workflows/release.yml @@ -0,0 +1,46 @@ +name: Release + +on: + push: + branches: + - master + +jobs: + release: + name: Release + runs-on: ubuntu-latest + steps: + - uses: GoogleCloudPlatform/release-please-action@v3 + id: release + with: + release-type: python + package-name: trtools + + - uses: actions/checkout@v3 + if: ${{ steps.release.outputs.release_created }} + with: + fetch-depth: 2 + + - name: Set up Python + if: ${{ steps.release.outputs.release_created }} + uses: actions/setup-python@v4 + with: + python-version: '3.8' + + - name: Upgrade pip + if: ${{ steps.release.outputs.release_created }} + run: | + pip install --upgrade pip + pip --version + + - name: Build package + if: ${{ steps.release.outputs.release_created }} + run: | + python -m build 
+ + - name: Publish package on PyPI + if: ${{ steps.release.outputs.release_created }} + uses: pypa/gh-action-pypi-publish@v1.5.0 + with: + user: __token__ + password: ${{ secrets.PYPI_TOKEN }} From 6accd5e293ba6316b823ce6621fb5fa2ec88a4cc Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Wed, 22 Nov 2023 23:40:15 +0000 Subject: [PATCH 16/40] ensure PR titles use conventional commits spec --- .github/workflows/conventional-prs.yml | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 .github/workflows/conventional-prs.yml diff --git a/.github/workflows/conventional-prs.yml b/.github/workflows/conventional-prs.yml new file mode 100644 index 00000000..ecdcf9b0 --- /dev/null +++ b/.github/workflows/conventional-prs.yml @@ -0,0 +1,23 @@ +name: Title +on: + pull_request_target: + types: + - opened + - reopened + - edited + - synchronize + workflow_call: + +permissions: + contents: read + +jobs: + title-format: + permissions: + pull-requests: read # for amannn/action-semantic-pull-request to analyze PRs + statuses: write # for amannn/action-semantic-pull-request to mark status of analyzed PR + runs-on: ubuntu-latest + steps: + - uses: amannn/action-semantic-pull-request@v5.4.0 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} From e9527f3dc67c260a1828f0a518b63ffe7d5a1eb9 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Wed, 22 Nov 2023 23:40:44 +0000 Subject: [PATCH 17/40] update publishing instructions --- PUBLISHING.rst | 48 ++++-------------------------------------------- 1 file changed, 4 insertions(+), 44 deletions(-) diff --git a/PUBLISHING.rst b/PUBLISHING.rst index af918262..426008e0 100644 --- a/PUBLISHING.rst +++ b/PUBLISHING.rst @@ -24,38 +24,14 @@ If you've added dependencies to trtools or its tests, those dependencies should TRTool's Read The Docs webpage. 
* the appropriate section of the bioconda recipe (see below) - Publishing Steps ---------------- -Once changes have been made to a branch that are ready to be published, first choose the new version number according to `semantic versioning `_. -Then set up the environment you're going to publish TRTools from: - -#. Create a clean environment. -#. Install setuptools with version >= 40.8.0 -#. Additionally, install ``pytest``, ``wheel``, ``build``, and ``twine`` -#. Clone the `trtools repo `_ -#. Check out the branch with the new work -#. Run :code:`pip install --upgrade pip && pip install -e .` - -Then go through the steps of merging the changes into the master branch: - -#. Run :code:`pytest` and make sure all the tests pass. Then run :code:`./test/cmdline_tests.sh` and make sure those tests pass. -#. Update the version number listed in the :code:`pyproject.toml` file. -#. Change the 'Unreleased Changes' section of :code:`RELEASE_NOTES.rst` to the new version number. -#. Check if any changes have been made that have not yet been documented in the release notes. If so, document them. -#. Submit a pull request from this branch into master on the github website. -#. If the code review checks pass, merge the pull request. -#. Tag the merge commit with the package version in vX.Y.Z format. (For more details on tagging, see `below`) +To publish a new version of trtools: -Then go through the steps of publishing the changed code to PyPI: - -1. :code:`cd` into the root of your clone of the trtools repo, checkout master and pull the latest change. The most recent commit should be tagged. -2. Run :code:`rm -rf build dist *.egg-info` to make sure all previous build artifacts are removed -3. Run :code:`python -m build` to build the package with the version number you just tagged. (Note: you might need to install ``build`` first.) -4. Run :code:`twine upload dist/*` to upload the distribution to PyPI - -Lastly, the change needs to be published to bioconda. +1. 
First, locate the most recent PR prefixed "chore(main)" created by our Github actions bot +2. List an admin on our repository (currently: @aryarm) as a reviewer of the PR and ask them to merge it +3. The bot will automatically create a new version on PyPI and tag a release on Github A bioconda bot will automatically open a pull request (within a day?) updating the version number and the PyPI reference. If there are no new dependencies, no changes to the build, @@ -97,19 +73,3 @@ Possible Issues: * bioconda packages should not include large test data files. If the dist/trtools-.tar.gz file contains such files, you'll need to modify the MANIFEST.in file to exclude them, fix the test_trtools.sh script to download them manually and point pytest to them, confirm the tests run in a :code:`conda build` and then restart the publishing process. (This should not happen if new test files are just put in :code:`trtools/testsupport/sample_vcfs` or :code:`trtools/testsupport/sample_regions`) - -Git Tagging ------------ - -Git tags are used to mark specific commits with certain names (i.e. v1.2.0). -Please note that tags are assigned to commits, not branches. -You can tag a commit in two different ways. - -#. Command line: - -.. code-block:: bash - - git tag -a vX.Y.Z -m vX.Y.Z - git push --tags - -2. Web interface: you can go to the releases page of the repository and create a new release. 
From 6ce51477600e7b4c3813d468e814168258761dab Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Thu, 23 Nov 2023 00:01:31 +0000 Subject: [PATCH 18/40] also make sure to install build --- .github/workflows/release.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 6be0498f..9bb4d24d 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -31,6 +31,7 @@ jobs: if: ${{ steps.release.outputs.release_created }} run: | pip install --upgrade pip + pip install build pip --version - name: Build package From 18cbee89b1cf4987302906d1c35db28d9b1c6142 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Sat, 25 Nov 2023 13:39:45 -0800 Subject: [PATCH 19/40] convert RELEASE_NOTES.rst to CHANGELOG.md --- CHANGELOG.md | 293 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 293 insertions(+) create mode 100644 CHANGELOG.md diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 00000000..88f62f39 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,293 @@ +# Changelog + +## [5.1.0](https://github.com/gymrek-lab/TRTools/compare/v5.0.2...v5.1.0) (2023-11-22) + +### Features + +* Added prancSTR for mosaicism detection +* Added simTR for simulating NGS reads with stutter errors at TRs + +## 5.0.2 + +### Bug fixes + +* MergeSTR now will no longer sometimes emit an alternate allele + identical to the ref allele when dealing with flanking base pairs. + +## 5.0.1 + +### Bug fixes + +* Remove outdated call in qcSTR to `np.float()` + +## 5.0.0 + +### Features + +* associaTR has been released! 
+ +Current limitations: + +* Does not support binary phenotypes yet +* Does not support producing data for plotting individual loci yet +* Values in the output file aside from the p-values, coefficients and + standard errors have not been fully tested + +## 4.2.1 + +### Bug fixes + +* Fix bioconda build + +## 4.2.0 + +### Features + +* TRTools can now read VCFs produced by Beagle imputation. + +### Bug fixes + +* MergeSTR now successfully merges files containing multiple + chromosomes instead of emitting a \'stuck in infinite loop\' message + and crashing +* MergeSTR no longer crashes if run with the \--verbose flag and the + last position in each of the VCFs being merged isn\'t identical. +* StatSTR now errors out if any of the files listed in \--samples + contain no samples that are present in the input VCF. +* DumpSTR now reads call depth from the LC format field when the DP + format field is not present. This was intended previously but was + not happening. + +### Documentation + +* Clarified in PUBLISHING.rst how to handle dependencies and how to + publish to bioconda. +* requirements.txt was unneeded, so delete it and remove the reference + to it from PUBLISHING.rst + +## 4.1.0 + +### Functionality Changes + +* MergeSTR: Flanking basepairs are now removed from HipSTR records + before merging. In particular, records with different flank lengths + but the same repeat section will now merge instead of being marked + as incompatible. +* CompareSTR: the tool now only compares records that start and end at + the same position. If a partial overlap in records is detected, the + program will output a warning to the user. This warning contains IDs + of the records and their positions. + +### Misc + +* mergeutils: function GetMinHarmonizedRecords was transformed into + GetIncrementAndComparability, which allows the caller to define + custom predicate that decides whether records are comparable. 
+ +## 4.0.2 + +### Bug fixes + +* fixed record + positions being compared twice +* CompareSTR: Decision on which records are comparable is now based on + data from harmonized TRRecords, and not from the records directly + from VCF readers. Thanks to this, HipSTR records which have + different starting positions, but position of their repeat is at the + same position are compared correctly (harmonization step removes + this difference). +* MergeSTR failed on mixed ploidy samples (i.e. chrX). Fix one such + bug. Note: none of the tools are fully tested for chrX even with + this fix. + +## 4.0.1 + +### Bug fixes + +* Fix + HipstrMinSuppReads filter when there are called samples but none + have ALLREADS + +## 4.0.0 + +### Features + +* Underlying libraries now use cyvcf2 instead of PyVCF for VCF + parsing. This makes both the underlying VCF reading code and the + TRTools code significantly faster and more memory efficient. For + instance, the loading of VCFs into memory is now \> 15x faster for + VCFs with many samples. Some tools will still need further updates + to be usable for large datasets, but those updates should now be + possible and much easier. (e.g. emitting progress reports to stdout + as needed, flags to disable computations that cannot be done at such + scale) +* DumpSTR has a new flag \--zip to produce a bgzipped and + tabix-indexed output VCF +* StatSTR now can calculate the entropy of the allele distribution at + each locus with the \--entropy flag +* The [TRTools documentation + website](https://trtools.readthedocs.io/en/latest/) now displays the + release notes. + +### Command line interface changes + +* StatSTR\'s \--region option now requires the input VCF to be + bgzipped and tabix indexed. +* If DumpSTR is used on an input VCF with unexpectedly typed INFO + fields \'AC\', \'REFAC\', \'HET\', \'HWEP\', \'HRUN\' or FORMAT + field \'FILTER\', it now errors out and asks you to rename those + fields before rerunning DumpSTR. 
(If they already exist but have the + correct number and type DumpSTR will overwrite them and issue a + warning in case that was not intended) +* CompareSTR\'s docs used to claim that when comparing alleles from + different callers those callers must use the same allele notation + (e.g. implying that ExpansionHunter\'s \'\\' and GangSTR\'s + \'ACACACACACAC\' notation would always mismatch). That statement was + never true for length based comparisons - CompareSTR has always been + able to do length based comparisons regardless of notation. The + incorrect claim has been removed from CompareSTR\'s docs. +* CompareSTR\'s docs now explicitly tell the user to order phased + calls to prevent spurious mismatching. If phasing is not desired, + use \--ignore-phasing +* CompareSTR will now error if at a single locus both files do not + have either all unphased calls, or all phased calls. If phasing is + not desired, use \--ignore-phasing + +### Output changes + +* DumpSTR call level filters now have the value of the filter and the + value which triggered the filter appended to the filter name in the + FILTER format field. (e.g. GangSTRCallMinDepth20_12 because the + field had a depth of 12 and that\'s lower than the required min + depth of 20) +* DumpSTR locus filter HRUN is now written as HRUN and not HRUN0 in + the samplog output file +* When running DumpSTR, loci where all the calls were either already + nocalls or were filtered by call-level filters before the + locus-level filters were run are now marked as + \'NO_CALLS_REMAINING\' instead of \'PASS\'. +* When DumpSTR filters a call and replaces each of its format fields + with the no call \'.\', fields with more than one value are now + represented correctly. For example, for 2 values \'.,.\' is used + rather than just a single \'.\' +* MergeSTR header lines are now copied over from the input VCFs + instead of only copying over a few recognized fields (e.g. 
ID and + Length were the only contig fields that were previously retained, + but URL wouldn\'t be) +* MergeSTR output alt alleles for eh and popstr are now ordered by + length. MergeSTR output alt alleles for advntr, gangstr and hipstr, + when there are multiple alt alleles of the same length, are now + ordered alphabetically instead of arbitrarily. +* CompareSTR no longer outputs the file \-callcompare.tab - + the existence of that file was never documented, and besides, all + its information could be seen more easily simply by looking at the + input VCFs +* In CompareSTR\'s overall.tab file, the ranges in the format columns + are now written \[a,b) or \[a,b\] instead of a-b +* CompareSTR\'s locuscompare.tab file now outputs loci in the order + they were encountered in the input VCfs as opposed to an arbitrary + order +* The \'sample\' column in CompareSTR\'s locuscompare.tab file has + been renamed to \'numcalls\' to match the other two tab files. + +### Python interface changes + +* The trtools.utils.tr_harmonizer module has been reworked to use + cyvcf2, and in doing so a large portion of its interface has changed + in small ways. +* The big conceptual change is that instead of repeatedly calling a + method on a TRRecord object like GetStringGenotype for each sample + in the VCF, instead you call the new corresponding method + GetStringGenotypes once, and it returns a numpy array of values + where the first axis of the array ranges over the samples. +* The way missing calls and samples with lower than maximal ploidy are + handled is now tested and documented. These representations of these + genotypes have been aligned with cyvcf2\'s standards. For more info, + see the docs of the index, length and string genotype getter + methods. 
+ +### Bug fixes + +* The AC, REFAC fields that DumpSTR output used to be incorrect, are + now correct +* If you specify \--drop-filtered DumpSTR will no longer set all + values in the output .loclog.tab file to zero and instead set them + to their proper values (which are the same as if you had not + specified \--drop-filtered) +* DumpSTR now correctly adds ##FILTER=\<ID=PASS,Description=\"All filters passed\"\> to the header line +* DumpSTR now no longer says HipSTRCallFlankIndels is applied to + nocalls +* MergeSTR now outputs the same phase as the input files instead of + always outputting unphased data +* MergeSTR now correctly outputs Number=A, G or R (number of entries + in this field equal to number of alternate alleles at this locus, + the number of alleles including the ref, or the number of unique + polyploid genotypes) correctly in INFO and FORMAT fields instead of + outputting Number=-1, -2 or -3 +* CompareSTR claimed it was outputting the square (Pearson) + correlation coefficient but was actually outputting the raw + (unsquared) correlation coefficient. It is now outputting the + squared coefficient as documented. +* CompareSTR now correctly compares unphased calls without regard to + order in the VCF (e.g. \'AAAA/AAA\' now matches against + \'AAA/AAAA\') +* CompareSTR\'s docs claimed the bubble plots axes were measured in + basepair difference from the reference, but they were actually + measured in number of repeats different from the reference. The + behavior has not been changed and the claim has been updated to + match the behavior. +* When using binned format fields in CompareSTR where the range of + values did not evenly divide into the requested binsize, the highest + valued bin used to always be the same size as all the other bins and + include values over the limit specified by the user. Now it caps at + that maximum. E.g. 
binsizes 0:210:50 used to create the bins + \[0,50), \[50,100), \[100,150), \[150, 200), \[200, 250) and now + create the bins \[0,50), \[50,100), \[100,150), \[150, 200), \[200, + 210\] +* When using binned format fields in CompareSTR where the range of + values evenly divided into the requested binsize, loci which + obtained the requested maximum would be excluded. They are now + included. E.g. binsizes 0:200:50 used to create the bins \[0,50), + \[50,100), \[100,150), \[150, 200) and samples with value 200 would + not fall into any bin. This now creates the bins \[0,50), \[50,100), + \[100,150), \[150, 200\] and samples with value 200 fall into the + last bin + +### Quality of life improvements + +* StatSTR, when printing output to a file, now prints timing + diagnostics to stdout. +* DumpSTR will fail faster if output directory does not exist +* When encountering issues with identifying the caller type for each + input VCF, MergeSTR now prints an error and gracefully returns + instead of dying to an uncaught exception +* MergeSTR incompatible INFO field warnings now specify which locus + has an incompatible field + +### Regressions + +* The \--gangstr-require-support filter has been disabled. + +### Outstanding bugs + +* The dumpSTR ExpansionHunter ADFL ADIR ADSP filters have never worked +* DumpSTR remains untested on ExpansionHunter filters and files +* DumpSTR remains untested on loci with variable ploidy and/or + partially genotyped samples (e.g. .\|2) +* When running CompareSTR with the \--stratify options where + \--stratify-file is either not specified or is explicitly set to + zero, for each format field all calls where the value of that field + in vcf1 does not fall into the same bin as the value of that field + in vcf2 are silently not compared for that format field. The correct + behavior here is probably to create paired bins based on a range of + values from vcf1 and a range from vcf2. Regardless, the behavior + here should be documented. 
+ +## 3.0.3 + +### Bug fixes + +* Fixed a spot where qcSTR would crash because we passed Pandas a set + instead of a list +* MergeSTR now writes out the header for the GT FORMAT field From d589b9657beb546ad01cd43284e1ef52d1f62a34 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Sat, 25 Nov 2023 13:43:34 -0800 Subject: [PATCH 20/40] delete release notes from rtd since they must be md if we want, we could keep them by using and installing mdinclude (https://sphinx-mdinclude.omnilib.dev/en/latest/example.html) --- doc/RELEASE_NOTES.rst | 5 ----- 1 file changed, 5 deletions(-) delete mode 100644 doc/RELEASE_NOTES.rst diff --git a/doc/RELEASE_NOTES.rst b/doc/RELEASE_NOTES.rst deleted file mode 100644 index eb1b1da4..00000000 --- a/doc/RELEASE_NOTES.rst +++ /dev/null @@ -1,5 +0,0 @@ -Release Notes -============= - -.. include:: ../RELEASE_NOTES.rst - From 8e3937e5c6fd8cad94ad20f5cd1bd9ffaa317209 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Sat, 25 Nov 2023 13:44:07 -0800 Subject: [PATCH 21/40] delete release_notes from TOC --- doc/index.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/doc/index.rst b/doc/index.rst index 9c89d74d..f6955932 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -23,6 +23,5 @@ Table of Contents UTILITIES CALLERS LIBRARY - RELEASE_NOTES site_indices From db387a6abfa084178f24099fe3f639f8b23d13fe Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Sat, 25 Nov 2023 22:56:58 +0000 Subject: [PATCH 22/40] move pytest and pylint config into pyproject --- .coveragerc | 2 +- .pylintrc | 10 ---------- pyproject.toml | 12 +++++++++++- pytest.ini | 2 -- 4 files changed, 12 insertions(+), 14 deletions(-) delete mode 100644 .pylintrc delete mode 100644 pytest.ini diff --git a/.coveragerc b/.coveragerc index b9e5b326..98c194f9 100644 --- a/.coveragerc +++ b/.coveragerc @@ -1,5 +1,5 @@ [run] -omit = *test*,setup.py +omit = 
*test* [report] exclude_lines = diff --git a/.pylintrc b/.pylintrc deleted file mode 100644 index 9062d1af..00000000 --- a/.pylintrc +++ /dev/null @@ -1,10 +0,0 @@ -[BASIC] - -function-naming-style=PascalCase -method-naming-style=PascalCase - -[MESSAGES CONTROL] - -# not interested in module naming conventions -disable=C0103 - diff --git a/pyproject.toml b/pyproject.toml index 25fb540b..db44d1d9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -55,4 +55,14 @@ simTR = "trtools.simTR:run" Homepage = "https://trtools.readthedocs.org" Documentation = "https://trtools.readthedocs.org" Repository = "https://github.com/gymrek-lab/trtools.git" -Changelog = "https://github.com/gymrek-lab/trtools/blob/master/RELEASE_NOTES.rst" +Changelog = "https://github.com/gymrek-lab/trtools/blob/master/CHANGELOG.md" + +[tool.pytest.ini_options] +addopts = "-p trtools.testsupport.dataloader" + +[tool.pylint.BASIC] +function-naming-style = "PascalCase" +method-naming-style = "PascalCase" + +[tool.pylint.'MESSAGES CONTROL'] +disable = "invalid-name" # not interested in module naming conventions diff --git a/pytest.ini b/pytest.ini deleted file mode 100644 index b405bd46..00000000 --- a/pytest.ini +++ /dev/null @@ -1,2 +0,0 @@ -[pytest] -addopts = -p trtools.testsupport.dataloader From 1c5181bbd2882d1000344a537e7fce1cc41b4fca Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Sat, 25 Nov 2023 22:57:36 +0000 Subject: [PATCH 23/40] update to latest coverage --- .github/workflows/tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index a9e40e99..7a7371f9 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -38,7 +38,7 @@ jobs: - name: Test with pytest shell: bash -el {0} run: | - python -m pytest --cov=. 
--cov-report term-missing --cov-fail-under 89 --doctest-modules --junitxml=junit/test-results-${{ matrix.python-version }}.xml + python -m pytest --cov=. --cov-report term-missing --cov-fail-under 90 --doctest-modules --junitxml=junit/test-results-${{ matrix.python-version }}.xml - name: Test command line shell: bash -el {0} run: | From a8883f1aec81d9df0427cf249859acaad071b011 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Sat, 25 Nov 2023 22:58:24 +0000 Subject: [PATCH 24/40] use stable instead of latest RTD since latest may refer to code that's still in development --- CHANGELOG.md | 2 +- README.rst | 34 +++++++++++++++------------------- trtools/compareSTR/README.rst | 4 ++-- trtools/qcSTR/README.rst | 2 +- 4 files changed, 19 insertions(+), 23 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 88f62f39..77029f1d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -126,7 +126,7 @@ Current limitations: * StatSTR now can calculate the entropy of the allele distribution at each locus with the \--entropy flag * The [TRTools documentation - website](https://trtools.readthedocs.io/en/latest/) now displays the + website](https://trtools.readthedocs.io/en/stable/) now displays the release notes. ### Command line interface changes diff --git a/README.rst b/README.rst index 2a147a19..4c65fbc7 100644 --- a/README.rst +++ b/README.rst @@ -24,7 +24,7 @@ TRTools TRTools includes a variety of utilities for filtering, quality control and analysis of tandem repeats downstream of genotyping them from next-generation sequencing. It supports multiple recent genotyping tools (see below). -See full documentation and examples at https://trtools.readthedocs.io/en/latest/. +See full documentation and examples at https://trtools.readthedocs.io/en/stable/. If you use TRTools in your work, please cite: Nima Mousavi, Jonathan Margoliash, Neha Pusarla, Shubham Saini, Richard Yanicky, Melissa Gymrek. 
(2020) TRTools: a toolkit for genome-wide analysis of tandem repeats. Bioinformatics. (https://doi.org/10.1093/bioinformatics/btaa736) @@ -89,14 +89,14 @@ Tools ----- TRTools includes the following tools. -* `mergeSTR `_: a tool to merge VCF files across multiple samples genotyped using the same tool -* `dumpSTR `_: a tool for filtering VCF files with TR genotypes -* `qcSTR `_: a tool for generating various quality control plots for a TR callset -* `statSTR `_: a tool for computing various statistics on VCF files -* `compareSTR `_: a tool for comparing TR callsets -* `associaTR `_: a tool for testing TR length-phenotype associations (e.g., running a TR GWAS) -* `prancSTR `_: a tool for identifying somatic mosacisim at TRs. Currently only compatible with HipSTR VCF files. (*beta mode*) -* `simTR `_: a tool for simulating next-generation sequencing reads from TR regions. (*beta mode*) +* `mergeSTR `_: a tool to merge VCF files across multiple samples genotyped using the same tool +* `dumpSTR `_: a tool for filtering VCF files with TR genotypes +* `qcSTR `_: a tool for generating various quality control plots for a TR callset +* `statSTR `_: a tool for computing various statistics on VCF files +* `compareSTR `_: a tool for comparing TR callsets +* `associaTR `_: a tool for testing TR length-phenotype associations (e.g., running a TR GWAS) +* `prancSTR `_: a tool for identifying somatic mosacisim at TRs. Currently only compatible with HipSTR VCF files. (*beta mode*) +* `simTR `_: a tool for simulating next-generation sequencing reads from TR regions. (*beta mode*) Type :code:` --help` to see a full set of options. @@ -109,8 +109,8 @@ It additionally includes a python library, :code:`trtools`, which can be accesse Usage ----- -We recommend new users start with the example commands described in the `command-line interface for each tool `_. -We also suggest going through our `vignettes `_ that walk through some example workflows using TRTools. 
+We recommend new users start with the example commands described in the `command-line interface for each tool `_. +We also suggest going through our `vignettes `_ that walk through some example workflows using TRTools. Supported TR Callers -------------------- @@ -122,7 +122,7 @@ TRTools supports VCFs from the following TR genotyping tools: * HipSTR_ * PopSTR_ version 2 or higher -See our description of the `features and example use-cases `_ of each of these tools. +See our description of the `features and example use-cases `_ of each of these tools. .. please ensure this list of links remains the same as the one in the main README @@ -156,12 +156,11 @@ We appreciate contributions to TRTools. If you would like to contribute a fix or #. Make your changes. #. Document your changes. - * Add bullet point(s) to the 'Unreleased Changes' section of :code:`RELEASE_NOTES.rst` describing all the user facing changes you've made (if that section doesn't exist, create it at the top of the file). See prior releases in that file for examples. * Ensure all functions, modules, classes etc. conform to `numpy docstring standards `_. If applicable, update the REAMDEs in the directories of the files you changed with new usage information. - * New doc pages for `the website `_ can be created under :code:`/doc` and linked to as appropriate. + * New doc pages for `the website `_ can be created under :code:`/doc` and linked to as appropriate. * If you have added significant amounts of documentation in any of these ways, build the documentation locally to ensure it looks good. :code:`cd` to the :code:`doc` directory and run :code:`make clean && make html`, then view :code:`doc/_build/html/index.html` and navigate from there @@ -170,13 +169,10 @@ We appreciate contributions to TRTools. If you would like to contribute a fix or * :code:`cd` to the root of the project and run :code:`python -m pytest --cov=. 
--cov-report term-missing` to make sure that (1) all tests pass and (2) any code you have added is covered by tests. (Code coverage may **not** go down). -#. Submit a pull request to the master branch of the central repository with a description of what changes you have made. +#. Submit a pull request (PR) **to the master branch** of the central repository with a description of what changes you have made. Title the PR according to the `conventional commits spec `_. A member of the TRTools team will reply and continue the contribution process from there, possibly asking for additional information/effort on your part. Publishing ---------- -If you are a TRTools maintainer and wish to publish changes from the develop branch into master and distribute them to PyPI and bioconda, -please see PUBLISHING.rst in the root of the git repo. +If you are a TRTools maintainer and wish to publish changes and distribute them to PyPI and bioconda, please see PUBLISHING.rst in the root of the git repo. If you are a community member and would like that to happen, contact us (see above). - - diff --git a/trtools/compareSTR/README.rst b/trtools/compareSTR/README.rst index e66b35f5..1ab09936 100644 --- a/trtools/compareSTR/README.rst +++ b/trtools/compareSTR/README.rst @@ -17,7 +17,7 @@ CompareSTR optionally will stratify results based on a user-specified FORMAT fie Note: CompareSTR is designed to be used as a QC tool. While it may be able to pick up certain biological differences in some applications (e.g. identifying de novo mutations by comparing parent and child callsets or somatic mutations by comparing callsets from different tissues), use-case specific analyses may be better performed by more specialized tools. -Note: CompareSTR has the ability to stratify comparisons based on quality scores. However, beware that quality scores output by different genotypers may not be directly comparable. 
You can use `qcSTR `_ to visualize the distribution of quality scores in each VCF file seprately. +Note: CompareSTR has the ability to stratify comparisons based on quality scores. However, beware that quality scores output by different genotypers may not be directly comparable. You can use `qcSTR `_ to visualize the distribution of quality scores in each VCF file separately. Usage ----- @@ -83,7 +83,7 @@ compareSTR outputs the following text files and plots: * :code:`-samplecompare.tab`: Has columns sample, metric-conc-seq, metric-conc-len, numcalls. One line per sample * :code:`-samplecompare.pdf`: Plots the length concordance metric for each sample considered. -See `Example Commands`_ below for example compareSTR commands for different supported TR genotypers based on example data files in this repository. More detailed use cases are also given in the vignettes https://trtools.readthedocs.io/en/develop/VIGNETTES.html. +See `Example Commands`_ below for example compareSTR commands for different supported TR genotypers based on example data files in this repository. More detailed use cases are also given in the vignettes https://trtools.readthedocs.io/en/stable/VIGNETTES.html Instructions on Compressing and Indexing VCF files -------------------------------------------------- diff --git a/trtools/qcSTR/README.rst b/trtools/qcSTR/README.rst index 538113ea..eec5e1b7 100644 --- a/trtools/qcSTR/README.rst +++ b/trtools/qcSTR/README.rst @@ -68,7 +68,7 @@ metrics. Alternatively, you may specify the type(s) of quality plot(s) you wish the :code:`--quality` option. In that case you will get a file named :code:`-quality-.pdf` for each type of plot you requested. Quality plot examples are shown below. 
To learn more about how qcSTR infers quality scores for VCFs from -different genotypers, see `here `_ +different genotypers, see `here `_ Note: quality score plots are useful when considered in the context of a single genotyper run, and can also be used to compare different invocations of the same genotyper. However, From ff9bec37233b5472a829f5a9d012104355bf9157 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Sat, 16 Dec 2023 18:31:33 +0000 Subject: [PATCH 25/40] mention maintainer of repo in PUBLISHING doc --- PUBLISHING.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PUBLISHING.rst b/PUBLISHING.rst index 426008e0..dd80ea97 100644 --- a/PUBLISHING.rst +++ b/PUBLISHING.rst @@ -30,7 +30,7 @@ Publishing Steps To publish a new version of trtools: 1. First, locate the most recent PR prefixed "chore(main)" created by our Github actions bot -2. List an admin on our repository (currently: @aryarm) as a reviewer of the PR and ask them to merge it +2. List a maintainer of our repository as a reviewer of the PR and ask them to merge it 3. The bot will automatically create a new version on PyPI and tag a release on Github A bioconda bot will automatically open a pull request (within a day?) 
updating the version number From 2941653bfe4d1b3694fd2dee4b23e66c805d6e2f Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Sat, 16 Dec 2023 19:07:31 +0000 Subject: [PATCH 26/40] add min versions for all dependencies my strategy is to use whatever was the current version at the time the package was added as a dependency --- pyproject.toml | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index db44d1d9..489ffb1a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,16 +22,16 @@ classifiers = [ ] dependencies = [ "importlib-metadata", # required as long as we support py<3.8 - "cyvcf2", - "matplotlib", - "numpy", - "pandas", - "pybedtools", - "pysam", - "scikit-learn", - "scipy", - "statsmodels", - "pyfaidx", + "cyvcf2>=0.20.5", + "matplotlib>=3.0.3", + "numpy>=1.17.3", + "pandas>=0.25.3", + "pybedtools>=0.8.0", + "pysam>=0.15.3", + "scikit-learn>=0.23.1", + "scipy>=1.3.3", + "statsmodels>=0.10.2", + "pyfaidx>=0.5.3", ] [tool.setuptools] From c01e4049f5f7693587c25e2cd4dc54d54572cc2a Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Sat, 16 Dec 2023 12:45:40 -0800 Subject: [PATCH 27/40] refer to maintainers in publishing docs --- PUBLISHING.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/PUBLISHING.rst b/PUBLISHING.rst index dd80ea97..72cd3125 100644 --- a/PUBLISHING.rst +++ b/PUBLISHING.rst @@ -1,7 +1,7 @@ Publishing ---------- -Only maintainers of the trtools repository may publish changes to the package. +Only maintainers of the trtools repository (see @gymrek-lab/trtools) may publish changes to the package. If you are a community member and want to contribute new code, see the contributing section in the README. 
If you are a community member and have already contributed new code and want us to publish it now, please contact us (our contact info is in the README) @@ -30,7 +30,7 @@ Publishing Steps To publish a new version of trtools: 1. First, locate the most recent PR prefixed "chore(main)" created by our Github actions bot -2. List a maintainer of our repository as a reviewer of the PR and ask them to merge it +2. List a maintainer of our repository (@gymrek-lab/trtools) as a reviewer of the PR and ask them to merge it 3. The bot will automatically create a new version on PyPI and tag a release on Github A bioconda bot will automatically open a pull request (within a day?) updating the version number From b1f4b2bc1153fbe8142b29a2259bba07f5aa5162 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Tue, 19 Dec 2023 19:28:03 +0000 Subject: [PATCH 28/40] Revert "add min versions for all dependencies" This reverts commit 2b5295fc1fe6bd90863571d3b74fb6ef70d4b266. 
--- pyproject.toml | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 489ffb1a..db44d1d9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,16 +22,16 @@ classifiers = [ ] dependencies = [ "importlib-metadata", # required as long as we support py<3.8 - "cyvcf2>=0.20.5", - "matplotlib>=3.0.3", - "numpy>=1.17.3", - "pandas>=0.25.3", - "pybedtools>=0.8.0", - "pysam>=0.15.3", - "scikit-learn>=0.23.1", - "scipy>=1.3.3", - "statsmodels>=0.10.2", - "pyfaidx>=0.5.3", + "cyvcf2", + "matplotlib", + "numpy", + "pandas", + "pybedtools", + "pysam", + "scikit-learn", + "scipy", + "statsmodels", + "pyfaidx", ] [tool.setuptools] From ac264bc3630d7f80c19eeadec25723d74e0cd13d Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Tue, 19 Dec 2023 20:27:42 +0000 Subject: [PATCH 29/40] clean up after messy merge --- PUBLISHING.rst | 5 ++--- README.rst | 1 - 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/PUBLISHING.rst b/PUBLISHING.rst index 31d94ee1..2673db51 100644 --- a/PUBLISHING.rst +++ b/PUBLISHING.rst @@ -1,11 +1,8 @@ Publishing ---------- -Only maintainers of the trtools repository (see @gymrek-lab/trtools) may publish changes to the package. Only maintainers of the trtools repository (see @gymrek-lab/trtools) may publish changes to the package. If you are a community member and want to contribute new code, see the contributing section in the README. -If you are a community member and have already contributed new code and want us to publish it -now, please contact us (our contact info is in the README) This document explains how trtools maintainers should publish new changes. Maintainers should reach consensus before going ahead with publishing changes. @@ -34,6 +31,8 @@ To publish a new version of trtools: 2. List a maintainer of our repository (@gymrek-lab/trtools) as a reviewer of the PR and ask them to merge it 3. 
The bot will automatically create a new version on PyPI and tag a release on Github +Lastly, the change needs to be published to bioconda. + A bioconda bot will automatically open a pull request (within a day?) updating the version number and the PyPI reference. If there are no new dependencies, no changes to the build, and no new tests that need to be integrated into the build, and all we need to do is mark that PR as okay. diff --git a/README.rst b/README.rst index 9ea494b7..4c65fbc7 100644 --- a/README.rst +++ b/README.rst @@ -24,7 +24,6 @@ TRTools TRTools includes a variety of utilities for filtering, quality control and analysis of tandem repeats downstream of genotyping them from next-generation sequencing. It supports multiple recent genotyping tools (see below). -See full documentation and examples at https://trtools.readthedocs.io/en/stable/. See full documentation and examples at https://trtools.readthedocs.io/en/stable/. If you use TRTools in your work, please cite: Nima Mousavi, Jonathan Margoliash, Neha Pusarla, Shubham Saini, Richard Yanicky, Melissa Gymrek. (2020) TRTools: a toolkit for genome-wide analysis of tandem repeats. Bioinformatics. 
(https://doi.org/10.1093/bioinformatics/btaa736) From f01f7208d6d5a950ce4c571b84f877f648865153 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Wed, 20 Dec 2023 19:04:23 +0000 Subject: [PATCH 30/40] try to add py3.7 to ci --- .github/workflows/tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 7a7371f9..bd957573 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -12,7 +12,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: ["3.8", "3.9", "3.10", "3.11"] + python-version: ["3.7", "3.8", "3.9", "3.10", "3.11"] steps: - uses: actions/checkout@v4 From 4a152e5a76cb941e5b89fb57f6227facbc0e0c0e Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Wed, 20 Dec 2023 19:07:17 +0000 Subject: [PATCH 31/40] remove setuptools-scm from test action --- .github/workflows/tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index bd957573..8feaa447 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -33,7 +33,7 @@ jobs: - name: Upgrade pip and install our package shell: bash -el {0} run: | - python -m pip install --upgrade pip setuptools wheel setuptools-scm + python -m pip install --upgrade pip setuptools wheel pip install -e . 
- name: Test with pytest shell: bash -el {0} From 5838f69af2bca71a6e47e5ecdae45cdcdab6fb10 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Fri, 22 Dec 2023 11:01:00 -0800 Subject: [PATCH 32/40] Delete RELEASE_NOTES.rst --- RELEASE_NOTES.rst | 272 ---------------------------------------------- 1 file changed, 272 deletions(-) delete mode 100644 RELEASE_NOTES.rst diff --git a/RELEASE_NOTES.rst b/RELEASE_NOTES.rst deleted file mode 100644 index 3db2629b..00000000 --- a/RELEASE_NOTES.rst +++ /dev/null @@ -1,272 +0,0 @@ -5.1.1 ------ - -Bug fixes: - -* Remove stray files from source distribution - -5.1.0 ------ - -New features: - -* Added prancSTR for mosaicism detection -* Added simTR for simulating NGS reads with stutter errors at TRs - -5.0.2 ------ - -Bug fixes: - -* MergeSTR now will no longer sometimes emit an alternate allele identical to the ref allele when - dealing with flanking base pairs. - -5.0.1 ------ - -Bug fixes: - -* Remove outdated call in qcSTR to np.float() - -5.0.0 ------ - -New features: - -* associaTR has been released! - -Current limitations: - -* Does not support binary phenotypes yet -* Does not support producing data for plotting individual loci yet -* Values in the output file aside from the p-values, coefficients and - standard errors have not been fully tested - -4.2.1 ------ - -Bug fixes: - -* Fix bioconda build - -4.2.0 ------ - -New features: - -* TRTools can now read VCFs produced by Beagle imputation. - -Bug fixes: - -* MergeSTR now successfully merges files containing multiple chromosomes instead of emitting - a 'stuck in infinite loop' message and crashing -* MergeSTR no longer crashes if run with the --verbose flag and the last position in each of the - VCFs being merged isn't identical. -* StatSTR now errors out if any of the files listed in --samples contain no samples that are present - in the input VCF. 
-* DumpSTR now reads call depth from the LC format field when the DP format field is not present. - This was intended previously but was not happening. - -Doc changes: - -* Clarified in PUBLISHING.rst how to handle dependencies and how to publish to bioconda. -* requirements.txt was unneeded, so delete it and remove the reference to it from PUBLISHING.rst - -4.1.0 ------ - -Functionality Changes: - -* MergeSTR: Flanking basepairs are now removed from HipSTR records before merging. - In particular, records with different flank lengths but the same repeat section will now merge instead of being - marked as incompatible. - -* CompareSTR: the tool now only compares records that start and end at the same position. If a partial overlap in records - is detected, the program will output a warning to the user. This warning contains IDs of the records and their positions. - -Misc: - -* mergeutils: function GetMinHarmonizedRecords was transformed into GetIncrementAndComparability, which allows the caller - to define custom predicate that decides whether records are comparable. - -4.0.2 ------ - -Bug fixes: - -* https://github.com/gymrek-lab/TRTools/issues/146 fixed record positions being compared twice -* CompareSTR: Decision on which records are comparable is now based on data from harmonized TRRecords, - and not from the records directly from VCF readers. Thanks to this, HipSTR records which have different starting positions, - but position of their repeat is at the same position are compared correctly (harmonization step removes this difference). -* MergeSTR failed on mixed ploidy samples (i.e. chrX). Fix one such bug. Note: none of the tools are - fully tested for chrX even with this fix. - - -4.0.1 ------ - -Bug fixes: - -* https://github.com/gymrek-lab/TRTools/issues/143 Fix HipstrMinSuppReads filter when - there are called samples but none have ALLREADS - -4.0.0 ------ - -Features: - -* Underlying libraries now use cyvcf2 instead of PyVCF for VCF parsing. 
- This makes both the underlying VCF reading code and the TRTools code - significantly faster and more memory efficient. For instance, the loading of - VCFs into memory is now > 15x faster for VCFs with many samples. - Some tools will still need further updates to be usable for large datasets, - but those updates should now be possible and much easier. - (e.g. emitting progress reports to stdout as needed, flags to disable - computations that cannot be done at such scale) -* DumpSTR has a new flag --zip to produce a bgzipped and tabix-indexed output VCF -* StatSTR now can calculate the entropy of the allele distribution at each locus with the - --entropy flag -* The `TRTools documentation website `_ now - displays the release notes. - -Command line interface changes: - -* StatSTR's --region option now requires the input VCF to be bgzipped and tabix indexed. -* If DumpSTR is used on an input VCF with unexpectedly typed - INFO fields 'AC', 'REFAC', 'HET', 'HWEP', 'HRUN' or FORMAT field 'FILTER', - it now errors out and asks you to rename those fields before rerunning - DumpSTR. (If they already exist but have the correct number and type DumpSTR - will overwrite them and issue a warning in case that was not intended) -* CompareSTR's docs used to claim that when comparing alleles from different callers - those callers must use the same allele notation (e.g. implying that ExpansionHunter's - '' and GangSTR's 'ACACACACACAC' notation would always mismatch). That statement - was never true for length based comparisons - CompareSTR has always been able to - do length based comparisons regardless of notation. The incorrect claim has been - removed from CompareSTR's docs. -* CompareSTR's docs now explicitly tell the user to order phased calls to - prevent spurious mismatching. If phasing is not desired, use --ignore-phasing -* CompareSTR will now error if at a single locus both files do not have either all - unphased calls, or all phased calls. 
If phasing is not desired, use --ignore-phasing - -Output changes: - -* DumpSTR call level filters now have the value of the filter and the value - which triggered the filter appended to the filter name in the FILTER format field. - (e.g. GangSTRCallMinDepth20_12 because the field had a depth of 12 and that's lower - than the required min depth of 20) -* DumpSTR locus filter HRUN is now written as HRUN and not HRUN0 in the - samplog output file -* When running DumpSTR, loci where all the calls were either already nocalls - or were filtered by call-level filters before the locus-level filters were run are now - marked as 'NO_CALLS_REMAINING' instead of 'PASS'. -* When DumpSTR filters a call and replaces each of its format fields with the no call - '.', fields with more than one value are now represented correctly. For example, - for 2 values '.,.' is used rather than just a single '.' -* MergeSTR header lines are now copied over from the input VCFs instead of - only copying over a few recognized fields (e.g. ID and Length - were the only contig fields that were previously retained, but URL wouldn't be) -* MergeSTR output alt alleles for eh and popstr are now ordered by length. - MergeSTR output alt alleles for advntr, gangstr and hipstr, when there are multiple - alt alleles of the same length, are now ordered alphabetically instead - of arbitrarily. -* CompareSTR no longer outputs the file -callcompare.tab - the existence - of that file was never documented, and besides, all its information could - be seen more easily simply by looking at the input VCFs -* In CompareSTR's overall.tab file, the ranges in the format columns are now written - [a,b) or [a,b] instead of a-b -* CompareSTR's locuscompare.tab file now outputs loci in the order they were - encountered in the input VCfs as opposed to an arbitrary order -* The 'sample' column in CompareSTR's locuscompare.tab file has been renamed to - 'numcalls' to match the other two tab files. 
- -Python interface changes: - -* The trtools.utils.tr_harmonizer module has been reworked to use cyvcf2, - and in doing so a large portion of its interface has changed in small ways. -* The big conceptual change is that instead of repeatedly calling a method - on a TRRecord object like GetStringGenotype for each sample in the VCF, - instead you call the new corresponding method GetStringGenotypes once, - and it returns a numpy array of values where the first axis of the array - ranges over the samples. -* The way missing calls and samples with lower than maximal - ploidy are handled is now tested and documented. These representations - of these genotypes have been aligned with cyvcf2's standards. - For more info, see the docs of the index, length and - string genotype getter methods. - -Bug fixes: - -* The AC, REFAC fields that DumpSTR output used to be incorrect, are now correct -* If you specify --drop-filtered DumpSTR will no longer set all values in the - output .loclog.tab file to zero and instead set them to their proper values - (which are the same as if you had not specified --drop-filtered) -* DumpSTR now correctly adds ##FILTER= - to the header line -* DumpSTR now no longer says HipSTRCallFlankIndels is applied to nocalls -* MergeSTR now outputs the same phase as the input files instead of always outputting - unphased data -* MergeSTR now correctly outputs Number=A, G or R (number of entries in this field equal - to number of alternate alleles at this locus, the number of alleles including the ref, - or the number of unique polyploid genotypes) correctly in INFO and FORMAT fields instead - of outputing Number=-1, -2 or -3 -* CompareSTR claimed it was outputting the square (Pearson) correlation coefficient - but was actually outputting the raw (unsquared) correlation coefficient. It is now - outputting the squared coefficient as documented. -* CompareSTR now correctly compares unphased calls without regard to order in the VCF - (e.g. 
'AAAA/AAA' now matches against 'AAA/AAAA') -* CompareSTR's docs claimed the bubble plots axes were measured in basepair difference - from the reference, but they were actually measured in number of repeats different - from the reference. The behavior has not been changed and the claim has been updated - to match the behavior. -* When using binned format fields in CompareSTR where the range of values did not - evenly divide into the requested binsize, the highest valued bin used to always - be the same size as all the other bins and include values over the - limit specified by the user. Now it caps at that maximum. - E.g. binsizes 0:210:50 used to create the bins - [0,50), [50,100), [100,150), [150, 200), [200, 250) - and now create the bins - [0,50), [50,100), [100,150), [150, 200), [200, 210] -* When using binned format fields in CompareSTR where the range of values - evenly divided into the requested binsize, loci which obtained the requested - maximum would be excluded. They are now included. - E.g. binsizes 0:200:50 used to create the bins - [0,50), [50,100), [100,150), [150, 200) and samples with value 200 would - not fall into any bin. This now creates the bins - [0,50), [50,100), [100,150), [150, 200] and samples with value 200 fall into - the last bin - -Quality of life improvements: - -* StatSTR, when printing output to a file, now prints timing diagnostics to stdout. -* DumpSTR will fail faster if output directory does not exist -* When encountering issues with identifying the caller type for each input VCF, - MergeSTR now prints an error and gracefully returns instead of dying to - an uncaught exception -* MergeSTR incompatible INFO field warnings now specify which locus has an - incompatible field - -Regressions: - -* The --gangstr-require-support filter has been disabled. 
- -Outstanding bugs: - -* The dumpSTR ExpansionHunter ADFL ADIR ADSP filters have never worked -* DumpSTR remains untested on ExpansionHunter filters and files -* DumpSTR remains untested on loci with variable ploidy and/or partially - genotyped samples (e.g. .|2) -* When running CompareSTR with the --stratify options where --stratify-file - is either not specified or is explicitly set to zero, for each format field - all calls where the value of that field in vcf1 does not fall into the same - bin as the value of that field in vcf2 are silently not compared for that format field. - The correct behavior here is probably to create paired bins based on a range - of values from vcf1 and a range from vcf2. Regardless, the behavior here should - be documented. - -3.0.3 ------ - -Bug fixes: - -* Fixed a spot where qcSTR would crash because we passed Pandas a set instead of a list -* MergeSTR now writes out the header for the GT FORMAT field From 4536543c041f219eb7ad6d76d9a3215442730a00 Mon Sep 17 00:00:00 2001 From: gymreklab Date: Wed, 3 Jan 2024 13:01:37 -0800 Subject: [PATCH 33/40] adding new option to mergeSTR to give file with list of VCFs --- test/cmdline_tests.sh | 9 +++++++++ trtools/mergeSTR/README.rst | 3 ++- trtools/mergeSTR/mergeSTR.py | 15 +++++++++++++-- trtools/mergeSTR/tests/test_mergeSTR.py | 15 +++++++++++++++ 4 files changed, 39 insertions(+), 3 deletions(-) diff --git a/test/cmdline_tests.sh b/test/cmdline_tests.sh index 77762115..b0b751cd 100755 --- a/test/cmdline_tests.sh +++ b/test/cmdline_tests.sh @@ -171,6 +171,15 @@ FILE2=${EXDATADIR}/NA12891_chr21_popstr.sorted.vcf.gz FILE3=${EXDATADIR}/NA12892_chr21_popstr.sorted.vcf.gz runcmd_pass "mergeSTR --vcfs ${FILE1},${FILE2},${FILE3} --out ${TMPDIR}/test_merge_popstr --vcftype popstr" +# Test mergeSTR on a file with list of VCFs +FILE1=${EXDATADIR}/NA12878_chr21_hipstr.sorted.vcf.gz +FILE2=${EXDATADIR}/NA12891_chr21_hipstr.sorted.vcf.gz +FILE3=${EXDATADIR}/NA12892_chr21_hipstr.sorted.vcf.gz +echo 
${FILE1} > ${TMPDIR}/vcf.list +echo ${FILE2} >> ${TMPDIR}/vcf.list +echo ${FILE3} >> ${TMPDIR}/vcf.list +runcmd_pass "mergeSTR --vcfs-list ${TMPDIR}/vcf.list --out ${TMPDIR}/test_merge_hipstr_list --vcftype hipstr" + runcmd_pass "statSTR --vcf ${EXDATADIR}/NA12878_chr21_advntr.sorted.vcf.gz --out stdout --afreq" runcmd_pass "statSTR --vcf ${EXDATADIR}/NA12891_chr21_eh.sorted.vcf.gz --out ${TMPDIR}/stats_eh --numcalled" runcmd_pass "statSTR --vcf ${EXDATADIR}/trio_chr21_gangstr.sorted.vcf.gz --out ${TMPDIR}/stats_gangstr --numcalled --mean" diff --git a/trtools/mergeSTR/README.rst b/trtools/mergeSTR/README.rst index b648a07f..8c00b677 100644 --- a/trtools/mergeSTR/README.rst +++ b/trtools/mergeSTR/README.rst @@ -26,7 +26,8 @@ To run mergeSTR use the following command:: Required Parameters: -* :code:`--vcf `: Comma-separated list of VCF files to merge. All must have been created by the same TR genotyper. Must be bgzipped, sorted, and indexed. (See `Instructions on Compressing and Indexing VCF files`_ below) +* :code:`--vcfs `: Comma-separated list of VCF files to merge. All must have been created by the same TR genotyper. Must be bgzipped, sorted, and indexed. (See `Instructions on Compressing and Indexing VCF files`_ below) +* :code:`--vcfs-list `: As an alternative to :code:`--vcfs`, you can provide a file with a list of bgzipped/sorted/indexed VCF files (one filename per line) to merge. * :code:`--vcftype `: Type of VCF files being merged. Default = :code:`auto`. Must be one of: :code:`gangstr`, :code:`advntr`, :code:`hipstr`, :code:`eh`, :code:`popstr`. 
* :code:`--out `: prefix to name output files diff --git a/trtools/mergeSTR/mergeSTR.py b/trtools/mergeSTR/mergeSTR.py index 56b2d319..83224470 100644 --- a/trtools/mergeSTR/mergeSTR.py +++ b/trtools/mergeSTR/mergeSTR.py @@ -537,7 +537,10 @@ def getargs() -> Any: # pragma: no cover req_group = parser.add_argument_group("Required arguments") req_group.add_argument("--vcfs", help="Comma-separated list of VCF files to merge (must be sorted, bgzipped and indexed)", - type=str, required=True) + type=str, required=False) + req_group.add_argument("--vcfs-list", + help="File containing list of VCF files to merge. Must specify either --vcfs or --vcfs-list", + type=str, required=False) req_group.add_argument("--out", help="Prefix to name output files", type=str, required=True) req_group.add_argument("--vcftype", help="Options=%s" % [str(item) for item in trh.VcfTypes.__members__], type=str, default="auto") @@ -579,7 +582,15 @@ def main(args: Any) -> int: "directory".format(args.out)) return 1 - filenames = args.vcfs.split(",") + if args.vcfs is None and args.vcfs_list is None: + common.WARNING("Error: you must specify either --vcfs or --vcfs-list") + return 1 + + if args.vcfs is not None: + filenames = args.vcfs.split(",") + else: + filenames = [item.strip() for item in open(args.vcfs_list, "r").readlines()] + ### Check and Load VCF files ### vcfreaders = utils.LoadReaders(filenames, checkgz=True) if vcfreaders is None: diff --git a/trtools/mergeSTR/tests/test_mergeSTR.py b/trtools/mergeSTR/tests/test_mergeSTR.py index 82d813d8..f9c729d3 100644 --- a/trtools/mergeSTR/tests/test_mergeSTR.py +++ b/trtools/mergeSTR/tests/test_mergeSTR.py @@ -14,6 +14,7 @@ def args(tmpdir): args = argparse.ArgumentParser() args.vcfs = None + args.vcfs_list = None args.out = str(tmpdir / "test") args.update_sample_from_file = False args.quiet = False @@ -46,6 +47,20 @@ def __init__(self, chrom, pos, ref, alts=None, info=None): self.info = info if info is not None else {} self.vcfrecord = 
DummyRecord(chrom, pos, ref, self.alt_alleles, self.info) +# Test file with list of VCFs +def test_FileList(args, mrgvcfdir, tmpdir): + fname1 = os.path.join(mrgvcfdir, "test_file_gangstr1.vcf.gz") + fname2 = os.path.join(mrgvcfdir, "test_file_gangstr2.vcf.gz") + args.vcftype = "gangstr" + listfile = str(tmpdir / "test.list") + f = open(listfile, "w") + f.write(fname1+"\n") + f.write(fname2+"\n") + f.close() + args.vcfs_list = listfile + args.vcfs = None + assert main(args)==0 + # Test right files or directory - GangSTR def test_GangSTRRightFile(args, mrgvcfdir): fname1 = os.path.join(mrgvcfdir, "test_file_gangstr1.vcf.gz") From f07a99dc51eb3811cd702137fe0c8635b7188ba2 Mon Sep 17 00:00:00 2001 From: gymreklab Date: Wed, 3 Jan 2024 13:15:11 -0800 Subject: [PATCH 34/40] adding extra tests to make sure vcfs and vcfs-list not both specified --- test/cmdline_tests.sh | 1 + trtools/mergeSTR/mergeSTR.py | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/test/cmdline_tests.sh b/test/cmdline_tests.sh index b0b751cd..a4d58038 100755 --- a/test/cmdline_tests.sh +++ b/test/cmdline_tests.sh @@ -179,6 +179,7 @@ echo ${FILE1} > ${TMPDIR}/vcf.list echo ${FILE2} >> ${TMPDIR}/vcf.list echo ${FILE3} >> ${TMPDIR}/vcf.list runcmd_pass "mergeSTR --vcfs-list ${TMPDIR}/vcf.list --out ${TMPDIR}/test_merge_hipstr_list --vcftype hipstr" +runcmd_fail "mergeSTR --vcfs ${FILE1},${FILE2},${FILE3} --vcfs-list ${TMPDIR}/vcf.list --out ${TMPDIR}/test_merge_hipstr_list --vcftype hipstr" runcmd_pass "statSTR --vcf ${EXDATADIR}/NA12878_chr21_advntr.sorted.vcf.gz --out stdout --afreq" runcmd_pass "statSTR --vcf ${EXDATADIR}/NA12891_chr21_eh.sorted.vcf.gz --out ${TMPDIR}/stats_eh --numcalled" diff --git a/trtools/mergeSTR/mergeSTR.py b/trtools/mergeSTR/mergeSTR.py index 83224470..30335d63 100644 --- a/trtools/mergeSTR/mergeSTR.py +++ b/trtools/mergeSTR/mergeSTR.py @@ -586,6 +586,10 @@ def main(args: Any) -> int: common.WARNING("Error: you must specify either --vcfs or --vcfs-list") return 1 + if 
args.vcfs is not None and args.vcfs_list is not None: + common.WARNING("Error: you cannot specify both --vcfs and --vcfs-list") + return 1 + if args.vcfs is not None: filenames = args.vcfs.split(",") else: From b8a9843371c839953cfc391b4af0590d09ed32e9 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Wed, 3 Jan 2024 23:37:02 +0000 Subject: [PATCH 35/40] try linking directly to changelog from toc --- doc/index.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/index.rst b/doc/index.rst index f6955932..12cbd8a8 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -23,5 +23,6 @@ Table of Contents UTILITIES CALLERS LIBRARY + `RELEASE_NOTES `_ site_indices From 77cba2304a13609b032c40505135e207f68a775d Mon Sep 17 00:00:00 2001 From: gymreklab Date: Wed, 3 Jan 2024 16:11:00 -0800 Subject: [PATCH 36/40] added test to assert VCF same when reading from filelist --- trtools/mergeSTR/tests/test_mergeSTR.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/trtools/mergeSTR/tests/test_mergeSTR.py b/trtools/mergeSTR/tests/test_mergeSTR.py index f9c729d3..0662339b 100644 --- a/trtools/mergeSTR/tests/test_mergeSTR.py +++ b/trtools/mergeSTR/tests/test_mergeSTR.py @@ -52,6 +52,17 @@ def test_FileList(args, mrgvcfdir, tmpdir): fname1 = os.path.join(mrgvcfdir, "test_file_gangstr1.vcf.gz") fname2 = os.path.join(mrgvcfdir, "test_file_gangstr2.vcf.gz") args.vcftype = "gangstr" + + # Run with files input to vcfs + nolist_outfile = str(tmpdir / "test-gangstr") + args.out = nolist_outfile + args.vcfs = fname1 + "," + fname2 + args.vcfs_list = None + assert main(args)==0 + + # Run with files input as list + list_outfile = str(tmpdir / "test-gangstr-list") + args.out = list_outfile listfile = str(tmpdir / "test.list") f = open(listfile, "w") f.write(fname1+"\n") @@ -60,6 +71,7 @@ def test_FileList(args, mrgvcfdir, tmpdir): args.vcfs_list = listfile args.vcfs = None assert main(args)==0 + assert_same_vcf(nolist_outfile + ".vcf", 
list_outfile + ".vcf") # Test right files or directory - GangSTR def test_GangSTRRightFile(args, mrgvcfdir): From ec80d5d0616321d77bd0971e3d43c05582a18299 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Wed, 3 Jan 2024 16:36:12 -0800 Subject: [PATCH 37/40] fix link to changelog in toc --- doc/index.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/index.rst b/doc/index.rst index 12cbd8a8..a2725832 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -23,6 +23,6 @@ Table of Contents UTILITIES CALLERS LIBRARY - `RELEASE_NOTES `_ + RELEASE_NOTES site_indices From 7a33dee4f160febc039f3a4bce0caa71b53a6c2e Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Wed, 3 Jan 2024 16:43:37 -0800 Subject: [PATCH 38/40] format release notes toc text the way it was before --- doc/index.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/index.rst b/doc/index.rst index a2725832..87264af7 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -23,6 +23,6 @@ Table of Contents UTILITIES CALLERS LIBRARY - RELEASE_NOTES + Release Notes site_indices From c921fe426632897580f599cb935cece1615e5145 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Sun, 7 Jan 2024 16:16:59 +0000 Subject: [PATCH 39/40] add pr checklist --- .github/pull_request_template.md | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 .github/pull_request_template.md diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md new file mode 100644 index 00000000..6fe53dc9 --- /dev/null +++ b/.github/pull_request_template.md @@ -0,0 +1,11 @@ +## Checklist + +* [ ] I've checked to ensure there aren't already other open [pull requests](../../../pulls) for the same update/change +* [ ] I've prefixed the title of my PR according to [the conventional commits specification](https://www.conventionalcommits.org/). 
If your PR fixes a bug, please prefix the PR with `fix: `. Otherwise, if it introduces a new feature, please prefix it with `feat: `. If it introduces a breaking change, please add an exclamation before the colon, like `feat!: `. If the scope of the PR changes because of a revision to it, please update the PR title, since the title will be used in our CHANGELOG. +* [ ] At the top of the PR, I've [listed any open issues that this PR will resolve](https://docs.github.com/en/issues/tracking-your-work-with-issues/linking-a-pull-request-to-an-issue#linking-a-pull-request-to-an-issue-using-a-keyword). For example, "resolves #0" if this PR resolves issue #0 +* [ ] I've explained my changes in a manner that will make it possible for both users and maintainers of TRTools to understand them +* [ ] I've added tests for any new functionality. Or, if this PR fixes a bug, I've added test(s) that replicate it +* [ ] I've updated the relevant READMEs with any new usage information and checked that the newly built documentation is formatted properly +* [ ] All functions, modules, classes etc.
still conform to [numpy docstring standards](https://numpydoc.readthedocs.io/en/latest/format.html) +* [ ] (if applicable) I've updated the pyproject.toml file with any changes I've made to TRTools's dependencies +* [ ] In the body of this PR, I've included a short address to the reviewer highlighting one or two items that might deserve their focus From 7762d8796144597c4bdb5f68f51436bf14420ef2 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Sun, 7 Jan 2024 16:26:30 +0000 Subject: [PATCH 40/40] docs: reviewer and maintainer responsibilities with automated releases and PRs --- PUBLISHING.rst | 7 ++++--- README.rst | 2 ++ 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/PUBLISHING.rst b/PUBLISHING.rst index 2673db51..c7dffee2 100644 --- a/PUBLISHING.rst +++ b/PUBLISHING.rst @@ -27,9 +27,10 @@ Publishing Steps To publish a new version of trtools: -1. First, locate the most recent PR prefixed "chore(main)" created by our Github actions bot -2. List a maintainer of our repository (@gymrek-lab/trtools) as a reviewer of the PR and ask them to merge it -3. The bot will automatically create a new version on PyPI and tag a release on Github +1. First, locate the most recent *release PR* prefixed "chore(main)" created by our Github actions bot +2. List a maintainer of our repository (@gymrek-lab/trtools) as a reviewer of the PR +3. The maintainer may edit the auto-generated text-body of the PR release to incorporate additional details from the underlying PRs, if needed +4. Once merged, the bot will automatically create a new version on PyPI and tag a release on Github Lastly, the change needs to be published to bioconda. diff --git a/README.rst b/README.rst index 4c65fbc7..b0f60a52 100644 --- a/README.rst +++ b/README.rst @@ -172,6 +172,8 @@ We appreciate contributions to TRTools. If you would like to contribute a fix or #. 
Submit a pull request (PR) **to the master branch** of the central repository with a description of what changes you have made. Title the PR according to the `conventional commits spec `_. A member of the TRTools team will reply and continue the contribution process from there, possibly asking for additional information/effort on your part. + * If you are reviewing a pull request, please double-check that the PR addresses each item in `our PR checklist `_ + Publishing ---------- If you are a TRTools maintainer and wish to publish changes and distribute them to PyPI and bioconda, please see PUBLISHING.rst in the root of the git repo.