Skip to content

Commit

Permalink
stable chr pos list path verify
Browse files Browse the repository at this point in the history
  • Loading branch information
Plogeur committed Jan 10, 2025
1 parent 2ff494c commit c3d57c4
Show file tree
Hide file tree
Showing 20 changed files with 13,074 additions and 273 deletions.
Binary file modified .DS_Store
Binary file not shown.
46 changes: 46 additions & 0 deletions report.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# Report comparison stoat vs plink

## Intro
Pour tester les différents outils, nous réalisons 2 types de simulation : binaire et quantitative.

La simulation binaire contient 100 échantillons présentant chacun 1000 variations ; chaque échantillon est simulé pour appartenir à un des 2 groupes. Les probabilités de passage d’un nœud à un autre dans le graphe sont connues pour chacun des groupes.

La simulation quantitative contient 200 échantillons pour 1000 variations ; chaque échantillon est simulé avec un phénotype quantitatif normalisé en 0. Tout comme dans la simulation binaire, les échantillons sont répartis en 2 groupes, mais cette appartenance module ici la valeur du phénotype de chaque échantillon. Les probabilités de passage d’un nœud à un autre dans le graphe sont connues pour chacun des groupes.

Fichiers présents :
- VCFs (sortie du pipeline de calling)
- Probabilité de passage des noeuds
- Information sur le graph pangénomique (.dist, .pg, .gfa)
- Phenotype

## Méthodes
Dans un premier temps, nous voulons tester les différents outils sans preprocessing préalable des données ; pour cela nous mergeons les VCFs avec la commande bcftools merge, puis nous lançons les différents outils (stoat/plink) sur le VCF mergé obtenu.

```bash
bcftools merge $dir_vcf/*.vcf.gz --threads 6 -m none -Oz -o $output/merged_vcf.vcf
```

Puis nous avons décidé de réaliser une décomposition des VCFs avec un pipeline en snakemake afin d’enlever les snarls imbriqués qui ne seraient pas bien analysés par plink. Dans cette étape, les VCFs seront normalisés puis mergés.

```bash
snakemake --cores 8 --snakefile Snakefile --configfile config/config.yaml
```

## Result
### stoat
#### Binaire




#### Quantitatif





### Plink
#### Binaire
#### Quantitatif

Interprétation
10 changes: 5 additions & 5 deletions stoat/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from stoat import snarl_analyser
from stoat import utils
from stoat import p_value_analysis
from stoat import write_position
#from stoat import write_position
from stoat import gaf_creator
import time
import logging
Expand All @@ -20,7 +20,7 @@ def main() :
parser.add_argument("-n", "--name",type=utils.check_file, help='The input chromosome prefix reference file', required=False)
parser.add_argument("-t", "--threshold",type=list_snarl_paths.check_threshold, help='Children threshold', required=False)
parser.add_argument("-v", "--vcf",type=utils.check_format_vcf_file, help="Path to the merged VCF file (.vcf or .vcf.gz)", required=True)
parser.add_argument("-r", "--reference", type=utils.check_format_vcf_file, help="Path to the VCF file referencing all snarl positions (only .vcf)", required=False)
# parser.add_argument("-r", "--reference", type=utils.check_format_vcf_file, help="Path to the VCF file referencing all snarl positions (only .vcf)", required=False)
parser.add_argument("-l", "--listpath", type=utils.check_format_list_path, help="Path to the list paths", required=False)

group = parser.add_mutually_exclusive_group(required=True)
Expand Down Expand Up @@ -118,7 +118,7 @@ def main() :
vcf_object = snarl_analyser.SnarlProcessor(args.vcf, list_samples)
logger.info("Starting fill matrix...")
vcf_object.fill_matrix()
reference_vcf = args.reference if args.reference else args.vcf
# reference_vcf = args.reference if args.reference else args.vcf

# Step 3: P-value Analysis (Binary or Quantitative)
# Handle Binary Analysis
Expand All @@ -127,7 +127,7 @@ def main() :
output_snarl = os.path.join(output_dir, "binary_analysis.tsv")
logger.info("Binary table creation...")
vcf_object.binary_table(snarl_paths, pheno, kinship_matrix, covar, gaf, output_snarl)
logger.info("Writing position...")
# logger.info("Writing position...")
# write_position.write_pos_snarl(reference_vcf, output_snarl, "binary")

output_manh = os.path.join(output_dir, "manhattan_plot_binary.png")
Expand All @@ -147,7 +147,7 @@ def main() :
output_file = os.path.join(output_dir, "quantitative_analysis.tsv")
logger.info("Quantitative table creation...")
vcf_object.quantitative_table(snarl_paths, pheno, kinship_matrix, covar, output_file)
logger.info("Writing position...")
# logger.info("Writing position...")
# write_position.write_pos_snarl(reference_vcf, output_file, "quantitatif")

output_manh = os.path.join(output_dir, "manhattan_plot_quantitative.png")
Expand Down
96 changes: 68 additions & 28 deletions stoat/list_snarl_paths.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import bdsg # type: ignore
import argparse
from stoat import utils
import re
import time
import os

Expand Down Expand Up @@ -130,32 +129,68 @@ def add_to_path(next_child) :
# from the last thing in the path
stree.follow_net_edges(path[-1], pg, False, lambda n: add_to_path(n))

def save_snarls(stree, root, pg, reference, pp_overlay):

def save_snarls(stree, root, pg, ref_paths, ppo) :
snarls = []
chr_pos = []
snarls_pos = {}

def save_snarl_tree_node(net):
# given a node handle (dist index) return a position on a reference path
def get_node_position(node):
node_h = stree.get_handle(node, pg)
ret_pos = []

def step_callback(step_handle):
path_handle = pg.get_path_handle_of_step(step_handle)
path_name = pg.get_path_name(path_handle)
if path_name in reference :
position = pp_overlay.get_position_of_step(step_handle)
chr_pos.append((path_name, position))
if path_name in ref_paths:
position = ppo.get_position_of_step(step_handle)
ret_pos.append(path_name)
ret_pos.append(position)
return False
return True

if stree.is_snarl(net):
snarls.append([net, chr_pos[-1][0], chr_pos[-1][1]])

if stree.is_node(net):
handle_t = stree.get_handle(net, pg)
pg.for_each_step_on_handle(handle_t, step_callback)
pg.for_each_step_on_handle(node_h, step_callback)
return (ret_pos)

# Visit one net of the snarl tree: record its reference position, collect it
# if it is a snarl, and recurse into its children.
def save_snarl_tree_node(net):
    # check if we can find a position for this net on the reference path
    snarl_pos = get_net_start_position(net)
    # if we couldn't find a position, use the parent's, which was saved
    # on an earlier (shallower) visit
    if len(snarl_pos) == 0:
        par_net = stree.get_parent(net)
        snarl_pos = snarls_pos[stree.net_handle_as_string(par_net)]
    # save this position so children can inherit it
    snarls_pos[stree.net_handle_as_string(net)] = snarl_pos
    # only actual snarls are collected for downstream analysis
    if stree.is_snarl(net):
        snarls.append([net, snarl_pos[0], snarl_pos[1]])
    # explore children; nodes and sentinels have none
    if not stree.is_node(net) and not stree.is_sentinel(net):
        stree.for_each_child(net, save_snarl_tree_node)
    # FIX: the scraped diff carried two trailing returns ("return True" and
    # "return (True)"); a single return keeps the traversal going
    return True

# Given a net handle (node, snarl, chain, ...), return its "start" position on
# a reference path as [path_name, position], or [] when none of its boundary
# nodes lies on a reference path.
def get_net_start_position(net):
    # a plain node carries its own position
    if stree.is_node(net):
        return (get_node_position(net))
    # otherwise check the two boundary nodes
    bnode1 = stree.get_bound(net, True, False)
    bnode1_p = get_node_position(bnode1)
    bnode2 = stree.get_bound(net, False, False)
    bnode2_p = get_node_position(bnode2)
    # if one of the boundaries is not on a reference path, return the empty
    # result so the caller falls back to the parent's position
    if len(bnode1_p) == 0:
        return (bnode1_p)
    if len(bnode2_p) == 0:
        return (bnode2_p)
    # BUG FIX: the original asserted bnode1_p[0] == bnode1_p[0] (always true);
    # the intent is to check both boundaries sit on the SAME reference path
    assert bnode1_p[0] == bnode2_p[0], 'boundary nodes on different reference paths'
    # as "start" position, return the smallest of the two boundary positions
    if bnode1_p[1] < bnode2_p[1]:
        return (bnode1_p)
    else:
        return (bnode2_p)

stree.for_each_child(root, save_snarl_tree_node)
return snarls

Expand Down Expand Up @@ -200,8 +235,12 @@ def fill_pretty_paths(stree, pg, finished_paths) :

elif stree.is_chain(net) :
# if it's a chain, we need to write someting like ">Nl>*>Nr"
nodl = stree.get_bound(net, False, True)
nodr = stree.get_bound(net, True, False)
if stree.starts_at_start(net):
nodl = stree.get_bound(net, False, True)
nodr = stree.get_bound(net, True, False)
else:
nodl = stree.get_bound(net, True, True)
nodr = stree.get_bound(net, False, False)
ppath.addNodeHandle(nodl, stree)
ppath.addNode('*', '>')
ppath.addNodeHandle(nodr, stree)
Expand All @@ -217,10 +256,6 @@ def fill_pretty_paths(stree, pg, finished_paths) :
assert len(type_variants) == len(pretty_paths)
return pretty_paths, type_variants

def write_output_not_analyse(output_file, snarl_id, reason) :
with open(output_file, 'a') as outf:
outf.write('{}\t{}\n'.format(snarl_id, reason))

def loop_over_snarls_write(stree, snarls, pg, output_file, output_snarl_not_analyse, children_treshold=50, bool_return=True) :

with open(output_file, 'w') as out_snarl, open(output_snarl_not_analyse, 'w') as out_fail:
Expand All @@ -237,8 +272,9 @@ def count_children(net):
return (True)

# for each snarl, lists paths through the netgraph and write to output TSV
for snarl, chr, pos in snarls:

for snarl_path_pos in snarls:

snarl = snarl_path_pos[0]
snarl_time = time.time()
snarl_id = find_snarl_id(stree, snarl)
not_break = True
Expand Down Expand Up @@ -267,10 +303,10 @@ def count_children(net):

# prepare path list to output and write each path directly to the file
pretty_paths, type_variants = fill_pretty_paths(stree, pg, finished_paths)
out_snarl.write('{}\t{}\t{}\t{}\t{}\n'.format(snarl_id, ','.join(pretty_paths), ','.join(type_variants), chr, pos))
out_snarl.write('{}\t{}\t{}\t{}\t{}\n'.format(snarl_id, ','.join(pretty_paths), ','.join(type_variants), snarl_path_pos[1], snarl_path_pos[2]))

if bool_return :
snarl_paths.append((snarl_id, pretty_paths, ','.join(type_variants), chr, pos))
snarl_paths.append((snarl_id, pretty_paths, ','.join(type_variants), snarl_path_pos[1], snarl_path_pos[2]))

paths_number_analysis += len(pretty_paths)

Expand All @@ -291,8 +327,8 @@ def count_children(net):
os.makedirs(output_dir, exist_ok=True)
output = os.path.join(output_dir, "list_snarl_paths.tsv")
output_snarl_not_analyse = os.path.join(output_dir, "snarl_not_analyse.tsv")

stree, pg, root, pp_overlay = parse_graph_tree(args.p, args.d)

snarls = save_snarls(stree, root, pg, reference, pp_overlay)
print(f"Total of snarls found : {len(snarls)}")
print("Saving snarl path decomposition...")
Expand All @@ -301,7 +337,11 @@ def count_children(net):
_, paths_number_analysis = loop_over_snarls_write(stree, snarls, pg, output, output_snarl_not_analyse, threshold, False)
print(f"Total of paths analyse : {paths_number_analysis}")

# python3 stoat/list_snarl_paths.py -p /home/mbagarre/Bureau/droso_data/fly/fly.pg -d /home/mbagarre/Bureau/droso_data/fly/fly.dist -o output/test/test_list_snarl.tsv
# python3 stoat/list_snarl_paths.py -p /home/mbagarre/Bureau/droso_data/fly/fly.pg -d /home/mbagarre/Bureau/droso_data/fly/fly.dist -o output/test
# vg find -x ../snarl_data/fly.gbz -r 5176878:5176884 -c 10 | vg view -dp - | dot -Tsvg -o ../snarl_data/subgraph.svg

# python3 stoat/list_snarl_paths.py -p tests/simulation/binary_data/pg.full.pg -d tests/simulation/binary_data/pg.dist -c tests/simulation/binary_data/pg.chromosome -o output/test/test_list_snarl.tsv
# binary
# python3 stoat/list_snarl_paths.py -p tests/simulation/binary_data/pg.full.pg -d tests/simulation/binary_data/pg.dist -c tests/simulation/binary_data/pg.chromosome -o output/test

# quantitative
# python3 stoat/list_snarl_paths.py -p tests/simulation/quantitative_data/pg.pg -d tests/simulation/quantitative_data/pg.dist -c tests/simulation/quantitative_data/pg.chromosome -o output/test
10 changes: 4 additions & 6 deletions stoat/snarl_analyser.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,7 @@ def binary_table(self, snarls:list, binary_groups:tuple[dict, dict], kinship_mat
"""

common_headers = (
"CHR\tPOS\tSNARL\tTYPE\tREF\tALT\tP_FISHER\tP_CHI2\tALLELE_NUM\tMIN_ROW_INDEX\tNUM_COLUM\tINTER_GROUP\tAVERAGE"
"CHR\tPOS\tSNARL\tTYPE\tP_FISHER\tP_CHI2\tALLELE_NUM\tMIN_ROW_INDEX\tNUM_COLUM\tINTER_GROUP\tAVERAGE"
)
headers = f"{common_headers}\tGROUP_PATHS\n" if gaf else f"{common_headers}\n"

Expand All @@ -162,9 +162,8 @@ def binary_table(self, snarls:list, binary_groups:tuple[dict, dict], kinship_mat

# Perform statistical tests and compute descriptive statistics
fisher_p_value, chi2_p_value, allele_number, min_sample, numb_colum, inter_group, average, group_paths = self.binary_stat_test(df, gaf)
ref = alt = "NA"
common_data = (
f"{chromosome}\t{position}\t{snarl}\t{type_var}\t{ref}\t{alt}\t"
f"{chromosome}\t{position}\t{snarl}\t{type_var}\t"
f"{fisher_p_value}\t{chi2_p_value}\t{allele_number}\t{min_sample}\t"
f"{numb_colum}\t{inter_group}\t{average}")
data = f"{common_data}\t{group_paths}\n" if gaf else f"{common_data}\n"
Expand All @@ -174,14 +173,13 @@ def binary_table(self, snarls:list, binary_groups:tuple[dict, dict], kinship_mat
def quantitative_table(self, snarls:list, quantitative_dict:dict, kinship_matrix:pd.DataFrame=None, covar:Optional[dict]=None, output:str="output/quantitative_output.tsv") :

with open(output, 'wb') as outf:
headers = 'CHR\tPOS\tSNARL\tTYPE\tREF\tALT\tRSQUARED\tBETA\tSE\tP\tALLELE_NUM\n'
headers = 'CHR\tPOS\tSNARL\tTYPE\tRSQUARED\tBETA\tSE\tP\tALLELE_NUM\n'
outf.write(headers.encode('utf-8'))
for snarl_info in snarls:
snarl, list_snarl, type_var, chromosome, position = snarl_info
df, allele_number = self.create_quantitative_table(list_snarl)
rsquared, beta, se, pvalue = self.linear_regression(df, quantitative_dict)
ref = alt = "NA"
data = '{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(chromosome, position, snarl, type_var, ref, alt, rsquared, beta, se, pvalue, allele_number)
data = '{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(chromosome, position, snarl, type_var, rsquared, beta, se, pvalue, allele_number)
outf.write(data.encode('utf-8'))

def identify_correct_path(self, decomposed_snarl:list, idx_srr_save:list) -> list:
Expand Down
Binary file modified tests/.DS_Store
Binary file not shown.
1 change: 0 additions & 1 deletion tests/match_position.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@ def load_file(file, threshold=None) :
total_succes = 0 # Track total significant entries
total_fail = 0
with open(file, 'r') as f:
#chrom_idx, pos_idx, pvalue_idx = identify_header(f)
chrom_idx, pos_idx, pvalue_idx = identify_header(f)

for line in f:
Expand Down
Loading

0 comments on commit c3d57c4

Please sign in to comment.