From 35d1b150249bad3f6b45e9e563ec874fd3e5911f Mon Sep 17 00:00:00 2001 From: Shriram Bhosle Date: Fri, 23 Dec 2022 15:23:38 +0000 Subject: [PATCH 1/2] added functionality to analyse long read data --- CHANGES.md | 5 ++ Dockerfile | 2 +- README.md | 8 +++ analyse_ta/commandline.py | 87 ++++++++++++++++++---- analyse_ta/process_bedcov.py | 135 ++++++++++++++++++++++++++++++----- requirements.txt | 4 +- setup.py | 2 +- test/test_ta.py | 17 ++++- 8 files changed, 223 insertions(+), 37 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index e1cc315..9bc4c14 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,4 +1,9 @@ +## 1.1.0 +* Added option to analyse long read data +* Fixed bug that added 1 to total fragment length, this will slightly increase the fpbm_br and fpbm_nbr values +* added option to add header line to output + ## 1.0.0 * First release to calculate TA repeats FPBM values using samtools bedcoverage data diff --git a/Dockerfile b/Dockerfile index 733e73e..cd1808c 100644 --- a/Dockerfile +++ b/Dockerfile @@ -3,7 +3,7 @@ USER root MAINTAINER cgphelp@sanger.ac.uk -ENV ANALYSE_TA_VER '1.0.0' +ENV ANALYSE_TA_VER '1.1.0' # install system tools RUN apt-get -yq update diff --git a/README.md b/README.md index 3c52350..41b8c6f 100644 --- a/README.md +++ b/README.md @@ -49,6 +49,14 @@ Various exceptions can occur for malformed input files. * ```test_sample 6.12 15.8``` command line output . +### outputFormat with dnovo flag set +* ```Applicable only to Long Read sequencing data ``` +Use of dnovo flag will classify the TA repeat regions into broken and non-broken based on the user defined dnovo_cutoff parmater. +This is applicable for long read data where average length of TA repeat is known for each interval +* ```br:broken``` +* ```nbr:non_broken``` +* ```test_sample 6.12 15.8``` command line output . + ## INSTALL Installing via `pip install`. Simply execute with the path to the compiled 'whl' found on the [release page][analyse_ta-releases]: diff --git a/analyse_ta/commandline.py b/analyse_ta/commandline.py index 33ea839..26fbef6 100644 --- a/analyse_ta/commandline.py +++ b/analyse_ta/commandline.py @@ -3,6 +3,7 @@ import os import argparse import pkg_resources + # load config and reference files.... version = pkg_resources.require("analyse_ta")[0].version @@ -11,20 +12,80 @@ def main(): # pragma: no cover <-- usage = "\n %prog [options] -br input_br.bedcov -nbr input_nbr.bedcov -s " - optParser = argparse.ArgumentParser(prog='analyse_ta', - formatter_class=argparse.ArgumentDefaultsHelpFormatter) + optParser = argparse.ArgumentParser( + prog="analyse_ta", formatter_class=argparse.ArgumentDefaultsHelpFormatter + ) optional = optParser._action_groups.pop() - required = optParser.add_argument_group('required arguments') + required = optParser.add_argument_group("required arguments") + + required.add_argument( + "-br", + "--file_br", + type=str, + dest="file_br", + required=True, + default=None, + help="broken ta repeat bed coverage file", + ) + required.add_argument( + "-nbr", + "--file_nbr", + type=str, + dest="file_nbr", + required=True, + default=None, + help="non broken ta repeat bed coverage file", + ) + + optional.add_argument( + "-s", + "--sample_name", + type=str, + dest="sample_name", + required=False, + default="test_sample", + help="sample name", + ) + + optional.add_argument( + "-dn", + "--dnovo", + action="store_true", + dest="dnovo", + default=False, + help="set flag to analyse dnovo long read data", + ) + + optional.add_argument( + "-ah", + "--add_header", + action="store_true", + dest="add_header", + default=False, + help="set flag to add_header line, useful in batch mode to set for first sample", + ) - required.add_argument("-br", "--file_br", type=str, dest="file_br", required=True, - default=None, help="broken ta repeat bed coverage file") - required.add_argument("-nbr", "--file_nbr", type=str, dest="file_nbr", required=True, - default=None, help="non broken ta repeat bed coverage file") + optional.add_argument( + "-dn_cutoff", + "--dnovo_cutoff", + type=int, + dest="dnovo_cutoff", + required=False, + default=2, + help="cut off length ratio with ref interval to flag as broken region", + ) - optional.add_argument("-s", "--sample_name", type=str, dest="sample_name", required=False, - default='test_sample', help="sample name") - optional.add_argument("-v", "--version", action='version', version='%(prog)s ' + version) - optional.add_argument("-q", "--quiet", action="store_false", dest="verbose", required=False, default=True) + optional.add_argument( + "-v", "--version", action="version", version="%(prog)s " + version + ) + optional.add_argument( + "-q", + "--quiet", + action="store_false", + dest="verbose", + required=False, + default=True, + ) optParser._action_groups.append(optional) if len(sys.argv) == 0: @@ -32,11 +93,11 @@ def main(): # pragma: no cover <-- sys.exit(1) opts = optParser.parse_args() if not opts.file_nbr or not opts.file_br: - sys.exit('\nERROR Arguments required\n\tPlease run: analyse_ta.py --help\n') + sys.exit("\nERROR Arguments required\n\tPlease run: analyse_ta.py --help\n") # vars function returns __dict__ of Namespace instance processed = processcov.processBedCov(**vars(opts)) print(processed.results) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/analyse_ta/process_bedcov.py b/analyse_ta/process_bedcov.py index 35d6417..ebbee9a 100644 --- a/analyse_ta/process_bedcov.py +++ b/analyse_ta/process_bedcov.py @@ -1,44 +1,141 @@ import os import sys import pandas as pd +import numpy as np -''' +""" This class claculates mean coverage for broken and non_broken TA reapeat depth output fron samtools bedcov -''' +""" class processBedCov: """ - Main class , loads user defined parameters and files + Main class , loads user defined parameters and files """ def __init__(self, **kwargs): - self.br_file = kwargs['file_br'] - self.nbr_file = kwargs['file_nbr'] - self.sample = kwargs.get('sample_name', 'test_sample') + self.br_file = kwargs["file_br"] + self.nbr_file = kwargs["file_nbr"] + self.dnovo = kwargs["dnovo"] + self.add_header = kwargs["add_header"] + self.dnovo_cutoff = kwargs["dnovo_cutoff"] + self.sample = kwargs.get("sample_name", "test_sample") # check input data ... self.results = self.process() def process(self): - mydf_br = create_df_to_merge(self.br_file, 'br') - mydf_nbr = create_df_to_merge(self.nbr_file, 'nbr') + mydf_br = create_df_to_merge(self.br_file, "br", self.dnovo_cutoff, self.dnovo) + mydf_nbr = create_df_to_merge( + self.nbr_file, "nbr", self.dnovo_cutoff, self.dnovo + ) merged_df = pd.concat([mydf_nbr, mydf_br]) - mean_fpmb_const = merged_df['frl'].sum(axis=0) - merged_df['fpbm'] = merged_df['frl'] / mean_fpmb_const * (10**6) - br_mean = merged_df.loc[merged_df.ta_type == 'br'] - nbr_mean = merged_df.loc[merged_df.ta_type == 'nbr'] - return f"{self.sample}\t{br_mean['fpbm'].mean(axis=0):.2f}\t{nbr_mean['fpbm'].mean(axis=0):.2f}" + mean_fpmb_const = merged_df["frl"].sum(axis=0) + merged_df["fpbm"] = merged_df["frl"] / mean_fpmb_const * (10**6) + br_mean = merged_df.loc[merged_df.ta_type == "br"] + nbr_mean = merged_df.loc[merged_df.ta_type == "nbr"] + header = f"sample\tfpbm_br\tfpbm_nbr\n" + output = "" + if self.add_header: + output = header + if self.dnovo: + if self.add_header: + output = output.strip() + output = (f"{output}\tref_br\tref_nbr\tmean_fpbm_dnovo_br\tmean_fpbm_dnovo_br\tdnovo_br\tdnovo_nbr" + f"\tdnovo_in_ref_br\tdnovo_in_ref_nbr\tcumulative_fpbm_br" + f"\tcumulative_fpbm_nbr\tjaccard_br\tjaccard_nbr\n") + br_mean_dnovo = merged_df.loc[merged_df.ta_type_dnovo == "br"] + nbr_mean_dnovo = merged_df.loc[merged_df.ta_type_dnovo == "nbr"] + br_cumulative_mean = merged_df.loc[ + (merged_df["ta_type_dnovo"] == "br") | (merged_df["ta_type"] == "br") + ] + nbr_cumulative_mean = merged_df.loc[ + (merged_df["ta_type_dnovo"] == "nbr") | (merged_df["ta_type"] == "nbr") + ] + num_br = len(merged_df[merged_df["ta_type"] == "br"]) + num_nbr = len(merged_df[merged_df["ta_type"] == "nbr"]) + num_br_dnovo = len(merged_df[merged_df["ta_type_dnovo"] == "br"]) + num_nbr_dnovo = len(merged_df[merged_df["ta_type_dnovo"] == "nbr"]) + br_m11 = len( + merged_df[ + (merged_df["ta_type_dnovo"] == "br") & (merged_df["ta_type"] == "br") + ] + ) + br_m01 = len( + merged_df[ + (merged_df["ta_type_dnovo"] != "br") & (merged_df["ta_type"] == "br") + ] + ) + br_m10 = len( + merged_df[ + (merged_df["ta_type_dnovo"] == "br") & (merged_df["ta_type"] != "br") + ] + ) + nbr_m11 = len( + merged_df[ + (merged_df["ta_type_dnovo"] == "nbr") & (merged_df["ta_type"] == "nbr") + ] + ) + nbr_m01 = len( + merged_df[ + (merged_df["ta_type_dnovo"] != "nbr") & (merged_df["ta_type"] == "nbr") + ] + ) + nbr_m10 = len( + merged_df[ + (merged_df["ta_type_dnovo"] == "nbr") & (merged_df["ta_type"] != "nbr") + ] + ) + jindex_br = (br_m11 / (br_m01 + br_m10 + br_m11)) * 100 + jindex_nbr = (nbr_m11 / (nbr_m01 + nbr_m10 + nbr_m11)) * 100 -def create_df_to_merge(infile, ta_type): + return ( + f"{output}{self.sample}\t{br_mean['fpbm'].mean(axis=0):.2f}" + f"\t{nbr_mean['fpbm'].mean(axis=0):.2f}\t{num_br}\t{num_nbr}" + f"\t{br_mean_dnovo['fpbm'].mean(axis=0):.2f}\t{nbr_mean_dnovo['fpbm'].mean(axis=0):.2f}" + f"\t{num_br_dnovo}\t{num_nbr_dnovo}\t{br_m11}\t{nbr_m11}" + f"\t{br_cumulative_mean['fpbm'].mean(axis=0):.2f}" + f"\t{nbr_cumulative_mean['fpbm'].mean(axis=0):.2f}\t{jindex_br:.2f}\t{jindex_nbr:.2f}" + ) + + output = f"{output}{self.sample}\t{br_mean['fpbm'].mean(axis=0):.2f}\t{nbr_mean['fpbm'].mean(axis=0):.2f}" + return output + + +def create_df_to_merge(infile, ta_type, dnovo_cutoff, dnovo=None): """ - create pandas data frame + create pandas data frame """ if not os.path.isfile(infile): + print(f"File not found {infile}") return None - df = pd.read_csv(infile, compression='infer', sep="\t", low_memory=False, - header=None, names=['chr', 'start', 'end', 'coverage']) - df['frl'] = df['coverage'] / (df['end'] - df['start']) + 1 - df['ta_type'] = ta_type + df = pd.read_csv( + infile, + compression="infer", + sep="\t", + low_memory=False, + header=None, + names=["chr", "start", "end", "coverage"], + ) + + if dnovo: + df["frl"] = (df["coverage"] * 2) / ((df["end"] - df["start"]) + 1) + df["ta_type_dnovo"] = np.select( + [df["frl"] <= dnovo_cutoff, df["frl"] > dnovo_cutoff], ["nbr", "br"] + ) + df["ta_type"] = ta_type + + else: + df["frl"] = df["coverage"] / ((df["end"] - df["start"]) + 1) + df["ta_type"] = ta_type return df + + +def _print_df(mydf, out_file): + if out_file: + mydf.to_csv( + out_file, sep="\t", mode="w", header=True, index=True, doublequote=False + ) + else: + sys.exit("Outfile not provided") diff --git a/requirements.txt b/requirements.txt index d5356a6..ff1060a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,9 +6,9 @@ future==0.18.2 importlib-metadata==4.6.3 iniconfig==1.1.1 mando==0.6.4 -numpy==1.21.1 +numpy packaging==21.0 -pandas==1.3.1 +pandas pluggy==0.13.1 py==1.10.0 pyparsing==2.4.7 diff --git a/setup.py b/setup.py index 2b4bd72..5ab51c0 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,7 @@ from setuptools import setup config = { - 'version': '1.0.0', + 'version': '1.1.0', 'name': 'analyse_ta', 'description': 'Tool to analyse TA repeats bed coverage...', 'author': 'Shriram Bhosle', diff --git a/test/test_ta.py b/test/test_ta.py index 84a2d17..f8f2d3f 100755 --- a/test/test_ta.py +++ b/test/test_ta.py @@ -13,11 +13,26 @@ class TestClass(): test_dir = configdir + '/data/' options = {'file_br': test_dir + 'test_br.bedcov', 'file_nbr': test_dir + 'test_nbr.bedcov', + 'dnovo':False, + 'add_header':True, + 'dnovo_cutoff':2, + 'sample_name': 'my_test_sample', + } + options_ta_ext = {'file_br': test_dir + 'test_br.bedcov', + 'file_nbr': test_dir + 'test_nbr.bedcov', + 'dnovo':True, + 'add_header':False, + 'dnovo_cutoff':10, 'sample_name': 'my_test_sample', } processed=processcov.processBedCov(**options) + processed_ta_ext=processcov.processBedCov(**options_ta_ext) # celline output def test_bedcov(self): f=self.processed - assert f.results == "my_test_sample\t35.23\t80.85" + assert f.results == "sample\tfpbm_br\tfpbm_nbr\nmy_test_sample\t33.04\t82.05" + + def test_ta_ext(self): + f_ext=self.processed_ta_ext + assert f_ext.results == "my_test_sample\t33.04\t82.05\t5434\t10000\t79.39\t14.36\t11970\t3464\t2776\t806\t67.53\t67.78\t18.98\t6.37" From 76809dba40f123e677f6018362ccab82cda297c7 Mon Sep 17 00:00:00 2001 From: Shriram Bhosle Date: Fri, 23 Dec 2022 15:27:53 +0000 Subject: [PATCH 2/2] updated requirements and Docker file version --- Dockerfile | 2 +- requirements.txt | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Dockerfile b/Dockerfile index cd1808c..947b6b7 100644 --- a/Dockerfile +++ b/Dockerfile @@ -40,7 +40,7 @@ RUN pip3 install --install-option="--prefix=$CGP_OPT/python-lib" dist/$(ls -1 di FROM ubuntu:20.04 LABEL uk.ac.sanger.cgp="Cancer Genome Project, Wellcome Sanger Institute" \ - version="1.0.0" \ + version="1.1.0" \ description="Tool to perform TA repeat bed coverage analysis" ### security upgrades and cleanup diff --git a/requirements.txt b/requirements.txt index ff1060a..d5356a6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,9 +6,9 @@ future==0.18.2 importlib-metadata==4.6.3 iniconfig==1.1.1 mando==0.6.4 -numpy +numpy==1.21.1 packaging==21.0 -pandas +pandas==1.3.1 pluggy==0.13.1 py==1.10.0 pyparsing==2.4.7