add list paths stoat starting
Plogeur committed Dec 17, 2024
1 parent ed79cc9 commit 9a267ba
Showing 4 changed files with 90 additions and 89 deletions.
65 changes: 12 additions & 53 deletions src/list_snarl_paths.py
@@ -1,5 +1,6 @@
 import bdsg
 import argparse
+import utils
 import re
 from collections import defaultdict
 import time
@@ -223,13 +224,13 @@ def write_output_not_analyse(output_file, snarl_id, reason) :
     with open(output_file, 'a') as outf:
         outf.write('{}\t{}\n'.format(snarl_id, reason))
 
-def loop_over_snarls_write(stree, snarls, pg, output_file, output_snarl_not_analyse, time_threshold=10) :
+def loop_over_snarls_write(stree, snarls, pg, output_file, output_snarl_not_analyse, time_threshold=10, bool_return=True) :
 
     write_header_output(output_file)
     write_header_output_not_analyse(output_snarl_not_analyse)
 
     snarl_paths = defaultdict(list)
-    snarl_number_analysis = 0
+    paths_number_analysis = 0
 
     # for each snarl, lists paths through the netgraph and write to output TSV
     for snarl in snarls:
@@ -254,73 +255,31 @@ def loop_over_snarls_write(stree, snarls, pg, output_file, output_snarl_not_analyse, time_threshold=10, bool_return=True) :
         pretty_paths, type_variants = fill_pretty_paths(stree, pg, finished_paths)
 
         write_output(output_file, snarl_id, pretty_paths, type_variants)
-        snarl_paths[snarl_id].extend(pretty_paths)
-        snarl_number_analysis += len(pretty_paths)
-
-    return snarl_paths, snarl_number_analysis
+        if bool_return :
+            snarl_paths[snarl_id].extend(pretty_paths)
+            paths_number_analysis += len(pretty_paths)
 
-def loop_over_snarls(stree, snarls, pg, output_snarl_not_analyse, threshold=50) :
-
-    write_header_output_not_analyse(output_snarl_not_analyse)
-
-    snarl_paths = defaultdict(list)
-    snarl_number_analysis = 0
-
-    children = [0]
-    def count_children(net):
-        children[0] += 1
-        return (True)
-
-    # for each snarl, lists paths through the netgraph and write to output TSV
-    for snarl in snarls:
-
-        children = [0]
-        snarl_id = find_snarl_id(stree, snarl)
-
-        stree.for_each_child(snarl, count_children)
-        if children[0] > threshold :
-            write_output_not_analyse(output_snarl_not_analyse, snarl_id, "too_many_children")
-            continue
-
-        # we'll traverse the netgraph starting at the left boundary
-        # init unfinished paths to the first boundary node
-        paths = [[stree.get_bound(snarl, False, True)]]
-        finished_paths = []
-        while len(paths) > 0 :
-
-            path = paths.pop()
-
-            if len(finished_paths) > 10000 :
-                write_output_not_analyse(output_snarl_not_analyse, snarl_id, "number_of_paths_too_high")
-                break
-
-            follow_edges(stree, finished_paths, path, paths, pg)
-
-        # prepare path list to output and write each path directly to the file
-        pretty_paths = fill_pretty_paths(stree, pg, finished_paths)
-        snarl_paths[snarl_id].extend(pretty_paths)
-        snarl_number_analysis += len(pretty_paths)
-
-    return snarl_paths, snarl_number_analysis
+    return snarl_paths, paths_number_analysis

 if __name__ == "__main__" :
 
     parser = argparse.ArgumentParser('List paths through the netgraph of each snarl in a pangenome')
-    parser.add_argument('-p', help='the input pangenome .pg file', required=True)
-    parser.add_argument('-d', help='the input distance index .dist file', required=True)
-    parser.add_argument('-t', type=check_threshold, help='time calculation threshold', required=False)
+    parser.add_argument('-p', type=utils.check_file, help='The input pangenome .pg file', required=True)
+    parser.add_argument('-d', type=utils.check_file, help='The input distance index .dist file', required=True)
+    parser.add_argument("-t", type=check_threshold, help='Children threshold', required=False)
     parser.add_argument('-o', help='the output TSV file', type=str, required=True)
     args = parser.parse_args()
 
     stree, pg, root = parse_graph_tree(args.p, args.d)
     snarls = save_snarls(stree, root)
     print(f"Total snarls found: {len(snarls)}")
     print("Saving snarl path decomposition...")
 
     output_snarl_not_analyse = "snarl_not_analyse.tsv"
 
     threshold = args.t if args.t else 10
-    loop_over_snarls_write(stree, snarls, pg, args.o, output_snarl_not_analyse, threshold)
+    _, paths_number_analysis = loop_over_snarls_write(stree, snarls, pg, args.o, output_snarl_not_analyse, threshold, False)
+    print(f"Total paths analysed: {paths_number_analysis}")
 
 # python3 src/list_snarl_paths.py -p /home/mbagarre/Bureau/droso_data/fly/fly.pg -d /home/mbagarre/Bureau/droso_data/fly/fly.dist -o output/test/test_list_snarl.tsv
 # vg find -x ../snarl_data/fly.gbz -r 5176878:5176884 -c 10 | vg view -dp - | dot -Tsvg -o ../snarl_data/subgraph.svg
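
For reference, a hedged sketch of the snarl-paths TSV this script writes (the column layout is inferred from the check_format_list_path validator added in src/utils.py and from the updated fixture in tests/other_files/test_snarl_path.tsv; the rows are illustrative):

snarl	paths	type
>1>3	>1>2>7	SNP
>2>3	>1>2>3>5>6,>1>2>4>6	DEL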
35 changes: 21 additions & 14 deletions src/stoat.py
@@ -20,6 +20,7 @@ def main() :
     parser.add_argument("-t", type=list_snarl_paths.check_threshold, help='Children threshold', required=False)
     parser.add_argument("-v", type=utils.check_format_vcf_file, help="Path to the merged VCF file (.vcf or .vcf.gz)", required=True)
     parser.add_argument("-r", type=utils.check_format_vcf_file, help="Path to the VCF file referencing all snarl positions (.vcf or .vcf.gz)", required=False)
+    parser.add_argument("--listpath", type=utils.check_format_list_path, help="Path to the snarl paths list file", required=False)
 
     group = parser.add_mutually_exclusive_group(required=True)
     group.add_argument("-b", "--binary", type=utils.check_format_pheno, help="Path to the binary group file (.txt or .tsv)")
@@ -74,21 +75,28 @@ def main() :
         pheno = utils.parse_pheno_quantitatif_file(args.quantitative)
     utils.check_mathing(pheno, list_samples, "phenotype")
 
-    # Step 1: Parse the Pangenome Graph and Create Snarl Paths to Test
-    start_time = time.time()
-    logger.info("Starting snarl path decomposition...")
-    stree, pg, root = list_snarl_paths.parse_graph_tree(args.p, args.d)
-    snarls = list_snarl_paths.save_snarls(stree, root)
-    logger.info(f"Total snarls found: {len(snarls)}")
-    logger.info("Saving snarl path decomposition...")
+    if not args.listpath :
+        # Step 1: Parse the Pangenome Graph and Create Snarl Paths to Test
+        start_time = time.time()
+        logger.info("Starting snarl path decomposition...")
+        stree, pg, root = list_snarl_paths.parse_graph_tree(args.p, args.d)
+        snarls = list_snarl_paths.save_snarls(stree, root)
+        logger.info(f"Total snarls found: {len(snarls)}")
+        logger.info("Saving snarl path decomposition...")
 
-    output_snarl_path_not_analyse = os.path.join(output_dir, "snarl_not_analyse.tsv")
-    output_snarl_path = os.path.join(output_dir, "snarl_paths.tsv")
+        output_snarl_path_not_analyse = os.path.join(output_dir, "snarl_not_analyse.tsv")
+        output_snarl_path = os.path.join(output_dir, "snarl_paths.tsv")
 
-    threshold = int(args.t) if args.t else 10
-    snarl_paths, snarl_number_analysis = list_snarl_paths.loop_over_snarls_write(stree, snarls, pg, output_snarl_path, output_snarl_path_not_analyse, threshold)
+        threshold = int(args.t) if args.t else 10
+        snarl_paths, paths_number_analysis = list_snarl_paths.loop_over_snarls_write(stree, snarls, pg, output_snarl_path, output_snarl_path_not_analyse, threshold)
+        logger.info(f"Total paths analysed: {paths_number_analysis}")
 
-    logger.info(f"Total paths analysed: {snarl_number_analysis}")
+    else :
+        if args.p or args.d :
+            logger.info("A snarl paths list was provided; the .pg and .dist files will not be analysed")
+        input_snarl_path = args.listpath
+        snarl_paths, paths_number_analysis = utils.parse_snarl_path_file(input_snarl_path)
+        logger.info(f"Total snarls found: {paths_number_analysis}")
 
     # Step 2: Parse VCF Files and Fill the Matrix
     vcf_object = snarl_analyser.SnarlProcessor(args.v, list_samples)
@@ -116,9 +124,8 @@ def main() :
             p_value_analysis.qq_plot_binary(output_snarl, output_qq)
             p_value_analysis.plot_manhattan_binary(output_snarl, output_manh)
         if gaf :
-            snarl_path_dic = utils.parse_snarl_path_file(output_snarl_path)
             output_gaf = os.path.join(output_dir, "group_paths.gaf")
-            gaf_creator.parse_input_file(output_snarl, snarl_path_dic, pg, output_gaf)
+            gaf_creator.parse_input_file(output_snarl, snarl_paths, pg, output_gaf)
 
     # Handle Quantitative Analysis
     elif args.quantitative:
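
A hedged usage sketch of the new --listpath mode (file names are illustrative; -v, -b and --listpath are the flags confirmed by the hunks above): when --listpath is given, stoat.py loads the precomputed snarl paths instead of decomposing the .pg/.dist graph.

python3 src/stoat.py -v merged.vcf.gz -b pheno_binary.tsv --listpath output/snarl_paths.tsv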
73 changes: 54 additions & 19 deletions src/utils.py
@@ -40,17 +40,18 @@ def parse_snarl_path_file(path_file: str) -> dict:
 
     # Initialize an empty dictionary for the snarl paths
     snarl_paths = defaultdict(list)
+    snarl_number_analysis = 0
 
     # Read the file into a pandas DataFrame
     df = pd.read_csv(path_file, sep='\t', dtype=str)
     df = df[df['paths'].notna()]
     df['paths'] = df['paths'].str.split(',')
 
     # Create the dictionary with snarl as key and paths as values
     for snarl, paths in zip(df['snarl'], df['paths']):
         snarl_paths[snarl].extend(paths)
+        snarl_number_analysis += 1
 
-    return snarl_paths
+    return snarl_paths, snarl_number_analysis
 
 def parse_covariate_file(covar_path : str) -> dict :
 
@@ -74,38 +75,72 @@ def check_mathing(elements, list_samples, file_name) :
     if missing_elements :
         raise ValueError(f"The following sample names from the VCF are not present in the {file_name} file : {missing_elements}")
 
-def check_format_vcf_file(file_path : str) -> str:
+def check_format_list_path(file_path : str) -> str:
     """
-    Function to check if the provided file path is a valid VCF file.
+    Function to check if the provided file path is a valid list path file.
     """
     check_file(file_path)
 
-    if not file_path.lower().endswith('.vcf') and not file_path.lower().endswith('.vcf.gz'):
-        raise argparse.ArgumentTypeError(f"The file {file_path} is not a valid VCF file. It must have a .vcf or .vcf.gz extension.")
+    with open(file_path, 'r') as file:
+        # Read and validate the header
+        first_line = file.readline().strip()
+        expected_header = 'snarl\tpaths\ttype'
+        if first_line != expected_header:
+            raise argparse.ArgumentTypeError(
+                f"The file must start with the following header: '{expected_header}' and be tab-separated"
+            )
+
+        # Validate all other lines
+        for line_number, line in enumerate(file, start=2): # Start at 2 for the first line after the header
+            columns = line.strip().split('\t')
+            if len(columns) != 3:
+                raise argparse.ArgumentTypeError(
+                    f"Line {line_number} must contain exactly 3 columns, but {len(columns)} columns were found."
+                )
+            if not all(isinstance(col, str) and col.strip() for col in columns):
+                raise argparse.ArgumentTypeError(
+                    f"Line {line_number} contains empty or non-string values: {columns}"
+                )
 
     return file_path

-def check_format_group_snarl(file_path : str) -> str :
+def check_format_vcf_file(file_path : str) -> str:
     """
-    Function to check if the provided file path is a valid group file.
+    Function to check if the provided file path is a valid VCF file.
     """
     check_file(file_path)
 
-    if not file_path.lower().endswith('.txt') and not file_path.lower().endswith('.tsv'):
-        raise argparse.ArgumentTypeError(f"The file {file_path} is not a valid group/snarl file. It must have a .txt or .tsv extension.")
-    return file_path
-
-def check_format_pheno(file_path : str) -> str :
+    if not file_path.lower().endswith('.vcf') and not file_path.lower().endswith('.vcf.gz'):
+        raise argparse.ArgumentTypeError(f"The file {file_path} is not a valid VCF file. It must have a .vcf or .vcf.gz extension.")
+    return file_path
+
+def check_format_pheno(file_path: str) -> str:

     check_file(file_path)
 
     with open(file_path, 'r') as file:
         first_line = file.readline().strip()
 
-        header = first_line.split('\t')
-        expected_header = ['FID', 'IID', 'PHENO']
-        if header != expected_header:
-            raise argparse.ArgumentTypeError(f"The file must contain the following headers: {expected_header} and be tab-separated")
-
+        header = first_line.split('\t')
+        expected_header = ['FID', 'IID', 'PHENO']
+        if header != expected_header:
+            raise argparse.ArgumentTypeError(
+                f"The file must contain the following headers: {expected_header} and be tab-separated"
+            )
+
+        # Validate all other lines
+        for line_number, line in enumerate(file, start=2): # Start at 2 for the first line after the header
+            columns = line.strip().split('\t')
+            if len(columns) != 3:
+                raise argparse.ArgumentTypeError(
+                    f"Line {line_number} must contain exactly 3 columns, but {len(columns)} columns were found."
+                )
+            try:
+                float(columns[2]) # Check if the third column is a float or can be converted to one
+            except ValueError:
+                raise argparse.ArgumentTypeError(
+                    f"Line {line_number} contains a non-numeric value in the last column: {columns[2]}"
+                )
 
     return file_path
 
 def check_covariate_file(file_path):
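A minimal sketch of the updated parse_snarl_path_file contract (values are illustrative, assuming a two-row input file): the function now returns the snarl-to-paths dictionary together with the number of snarls parsed.

snarl_paths, n = utils.parse_snarl_path_file("snarl_paths.tsv")
# snarl_paths -> {'>1>3': ['>1>2>7'], '>2>3': ['>1>2>3>5>6', '>1>2>4>6']}
# n -> 2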
6 changes: 3 additions & 3 deletions tests/other_files/test_snarl_path.tsv
@@ -1,3 +1,3 @@
-snarl	paths
->1>3	>1>2>7
->2>3	>1>2>3>5>6,>1>2>4>6
+snarl	paths	type
+>1>3	>1>2>7	SNP
+>2>3	>1>2>3>5>6,>1>2>4>6	DEL
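
A quick, hypothetical check that this fixture satisfies the new check_format_list_path validator (assuming src/ is on the import path):

>>> from utils import check_format_list_path
>>> check_format_list_path("tests/other_files/test_snarl_path.tsv")
'tests/other_files/test_snarl_path.tsv'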
