Skip to content

Commit

Permalink
stable chr pos list path verify
Browse files Browse the repository at this point in the history
  • Loading branch information
Plogeur committed Jan 10, 2025
1 parent 2ff494c commit c3d57c4
Show file tree
Hide file tree
Showing 20 changed files with 13,074 additions and 273 deletions.
Binary file modified .DS_Store
Binary file not shown.
46 changes: 46 additions & 0 deletions report.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# Report comparison stoat vs plink

## Intro
Pour tester les différents outils, nous réalisons 2 types de simulation : binaire et quantitative.

La simulation binaire contient 100 échantillons présentant chacun 1000 variations ; chaque échantillon est simulé pour appartenir à un des 2 groupes. Les probabilités de passage d’un nœud à un autre dans le graphe sont connues pour chacun des groupes.

La simulation quantitative contient 200 échantillons pour 1000 variations ; chaque échantillon est simulé avec un phénotype quantitatif normalisé en 0. Tout comme dans la simulation binaire, les échantillons sont répartis en 2 groupes, mais cette appartenance module ici la valeur du phénotype de chaque échantillon. Les probabilités de passage d’un nœud à un autre dans le graphe sont connues pour chacun des groupes.

Fichiers présents :
- VCFs (sortie du pipeline de calling)
- Probabilité de passage des noeuds
- Information sur le graph pangénomique (.dist, .pg, .gfa)
- Phenotype

## Méthodes
Dans un premier temps, nous voulons tester les différents outils sans preprocessing préalable des données ; pour cela nous mergeons les VCFs avec la commande bcftools merge, puis nous lançons les différents outils (stoat/plink) sur le VCF mergé obtenu.

```bash
bcftools merge $dir_vcf/*.vcf.gz --threads 6 -m none -Oz -o $output/merged_vcf.vcf
```

Puis nous avons décidé de réaliser une décomposition des VCFs avec un pipeline en snakemake afin d’enlever les snarls imbriqués qui ne seraient pas bien analysés par plink. Dans cette étape, les VCFs seront normalisés puis mergés.

```bash
snakemake --cores 8 --snakefile Snakefile --configfile config/config.yaml
```

## Result
### stoat
#### Binaire




#### Quantitatif





### Plink
#### Binaire
#### Quantitatif

Interprétation
10 changes: 5 additions & 5 deletions stoat/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from stoat import snarl_analyser
from stoat import utils
from stoat import p_value_analysis
from stoat import write_position
#from stoat import write_position
from stoat import gaf_creator
import time
import logging
Expand All @@ -20,7 +20,7 @@ def main() :
parser.add_argument("-n", "--name",type=utils.check_file, help='The input chromosome prefix reference file', required=False)
parser.add_argument("-t", "--threshold",type=list_snarl_paths.check_threshold, help='Children threshold', required=False)
parser.add_argument("-v", "--vcf",type=utils.check_format_vcf_file, help="Path to the merged VCF file (.vcf or .vcf.gz)", required=True)
parser.add_argument("-r", "--reference", type=utils.check_format_vcf_file, help="Path to the VCF file referencing all snarl positions (only .vcf)", required=False)
# parser.add_argument("-r", "--reference", type=utils.check_format_vcf_file, help="Path to the VCF file referencing all snarl positions (only .vcf)", required=False)
parser.add_argument("-l", "--listpath", type=utils.check_format_list_path, help="Path to the list paths", required=False)

group = parser.add_mutually_exclusive_group(required=True)
Expand Down Expand Up @@ -118,7 +118,7 @@ def main() :
vcf_object = snarl_analyser.SnarlProcessor(args.vcf, list_samples)
logger.info("Starting fill matrix...")
vcf_object.fill_matrix()
reference_vcf = args.reference if args.reference else args.vcf
# reference_vcf = args.reference if args.reference else args.vcf

# Step 3: P-value Analysis (Binary or Quantitative)
# Handle Binary Analysis
Expand All @@ -127,7 +127,7 @@ def main() :
output_snarl = os.path.join(output_dir, "binary_analysis.tsv")
logger.info("Binary table creation...")
vcf_object.binary_table(snarl_paths, pheno, kinship_matrix, covar, gaf, output_snarl)
logger.info("Writing position...")
# logger.info("Writing position...")
# write_position.write_pos_snarl(reference_vcf, output_snarl, "binary")

output_manh = os.path.join(output_dir, "manhattan_plot_binary.png")
Expand All @@ -147,7 +147,7 @@ def main() :
output_file = os.path.join(output_dir, "quantitative_analysis.tsv")
logger.info("Quantitative table creation...")
vcf_object.quantitative_table(snarl_paths, pheno, kinship_matrix, covar, output_file)
logger.info("Writing position...")
# logger.info("Writing position...")
# write_position.write_pos_snarl(reference_vcf, output_file, "quantitatif")

output_manh = os.path.join(output_dir, "manhattan_plot_quantitative.png")
Expand Down
96 changes: 68 additions & 28 deletions stoat/list_snarl_paths.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import bdsg # type: ignore
import argparse
from stoat import utils
import re
import time
import os

Expand Down Expand Up @@ -130,32 +129,68 @@ def add_to_path(next_child) :
# from the last thing in the path
stree.follow_net_edges(path[-1], pg, False, lambda n: add_to_path(n))

def save_snarls(stree, root, pg, reference, pp_overlay):

def save_snarls(stree, root, pg, ref_paths, ppo) :
snarls = []
chr_pos = []
snarls_pos = {}

def save_snarl_tree_node(net):
# given a node handle (dist index) return a position on a reference path
def get_node_position(node):
node_h = stree.get_handle(node, pg)
ret_pos = []

def step_callback(step_handle):
path_handle = pg.get_path_handle_of_step(step_handle)
path_name = pg.get_path_name(path_handle)
if path_name in reference :
position = pp_overlay.get_position_of_step(step_handle)
chr_pos.append((path_name, position))
if path_name in ref_paths:
position = ppo.get_position_of_step(step_handle)
ret_pos.append(path_name)
ret_pos.append(position)
return False
return True

if stree.is_snarl(net):
snarls.append([net, chr_pos[-1][0], chr_pos[-1][1]])

if stree.is_node(net):
handle_t = stree.get_handle(net, pg)
pg.for_each_step_on_handle(handle_t, step_callback)
pg.for_each_step_on_handle(node_h, step_callback)
return (ret_pos)

# Visit one net of the snarl tree: record its reference position, collect it
# if it is a snarl, and recurse into its children.
def save_snarl_tree_node(net):
    # check if we can find a position for this net on the reference path
    snarl_pos = get_net_start_position(net)
    # if we couldn't find a position, use the parent's, which was saved
    # on an earlier (shallower) visit
    if len(snarl_pos) == 0:
        par_net = stree.get_parent(net)
        snarl_pos = snarls_pos[stree.net_handle_as_string(par_net)]
    # save this position so children can inherit it
    snarls_pos[stree.net_handle_as_string(net)] = snarl_pos
    # only actual snarls are collected for downstream analysis
    if stree.is_snarl(net):
        snarls.append([net, snarl_pos[0], snarl_pos[1]])
    # explore children; nodes and sentinels have none
    if not stree.is_node(net) and not stree.is_sentinel(net):
        stree.for_each_child(net, save_snarl_tree_node)
    # FIX: the scraped diff carried two trailing returns ("return True" and
    # "return (True)"); a single return keeps the traversal going
    return True

# Given a net handle (node, snarl, chain, ...), return its "start" position on
# a reference path as [path_name, position], or [] when none of its boundary
# nodes lies on a reference path.
def get_net_start_position(net):
    # a plain node carries its own position
    if stree.is_node(net):
        return (get_node_position(net))
    # otherwise check the two boundary nodes
    bnode1 = stree.get_bound(net, True, False)
    bnode1_p = get_node_position(bnode1)
    bnode2 = stree.get_bound(net, False, False)
    bnode2_p = get_node_position(bnode2)
    # if one of the boundaries is not on a reference path, return the empty
    # result so the caller falls back to the parent's position
    if len(bnode1_p) == 0:
        return (bnode1_p)
    if len(bnode2_p) == 0:
        return (bnode2_p)
    # BUG FIX: the original asserted bnode1_p[0] == bnode1_p[0] (always true);
    # the intent is to check both boundaries sit on the SAME reference path
    assert bnode1_p[0] == bnode2_p[0], 'boundary nodes on different reference paths'
    # as "start" position, return the smallest of the two boundary positions
    if bnode1_p[1] < bnode2_p[1]:
        return (bnode1_p)
    else:
        return (bnode2_p)

stree.for_each_child(root, save_snarl_tree_node)
return snarls

Expand Down Expand Up @@ -200,8 +235,12 @@ def fill_pretty_paths(stree, pg, finished_paths) :

elif stree.is_chain(net) :
# if it's a chain, we need to write someting like ">Nl>*>Nr"
nodl = stree.get_bound(net, False, True)
nodr = stree.get_bound(net, True, False)
if stree.starts_at_start(net):
nodl = stree.get_bound(net, False, True)
nodr = stree.get_bound(net, True, False)
else:
nodl = stree.get_bound(net, True, True)
nodr = stree.get_bound(net, False, False)
ppath.addNodeHandle(nodl, stree)
ppath.addNode('*', '>')
ppath.addNodeHandle(nodr, stree)
Expand All @@ -217,10 +256,6 @@ def fill_pretty_paths(stree, pg, finished_paths) :
assert len(type_variants) == len(pretty_paths)
return pretty_paths, type_variants

def write_output_not_analyse(output_file, snarl_id, reason) :
with open(output_file, 'a') as outf:
outf.write('{}\t{}\n'.format(snarl_id, reason))

def loop_over_snarls_write(stree, snarls, pg, output_file, output_snarl_not_analyse, children_treshold=50, bool_return=True) :

with open(output_file, 'w') as out_snarl, open(output_snarl_not_analyse, 'w') as out_fail:
Expand All @@ -237,8 +272,9 @@ def count_children(net):
return (True)

# for each snarl, lists paths through the netgraph and write to output TSV
for snarl, chr, pos in snarls:

for snarl_path_pos in snarls:

snarl = snarl_path_pos[0]
snarl_time = time.time()
snarl_id = find_snarl_id(stree, snarl)
not_break = True
Expand Down Expand Up @@ -267,10 +303,10 @@ def count_children(net):

# prepare path list to output and write each path directly to the file
pretty_paths, type_variants = fill_pretty_paths(stree, pg, finished_paths)
out_snarl.write('{}\t{}\t{}\t{}\t{}\n'.format(snarl_id, ','.join(pretty_paths), ','.join(type_variants), chr, pos))
out_snarl.write('{}\t{}\t{}\t{}\t{}\n'.format(snarl_id, ','.join(pretty_paths), ','.join(type_variants), snarl_path_pos[1], snarl_path_pos[2]))

if bool_return :
snarl_paths.append((snarl_id, pretty_paths, ','.join(type_variants), chr, pos))
snarl_paths.append((snarl_id, pretty_paths, ','.join(type_variants), snarl_path_pos[1], snarl_path_pos[2]))

paths_number_analysis += len(pretty_paths)

Expand All @@ -291,8 +327,8 @@ def count_children(net):
os.makedirs(output_dir, exist_ok=True)
output = os.path.join(output_dir, "list_snarl_paths.tsv")
output_snarl_not_analyse = os.path.join(output_dir, "snarl_not_analyse.tsv")

stree, pg, root, pp_overlay = parse_graph_tree(args.p, args.d)

snarls = save_snarls(stree, root, pg, reference, pp_overlay)
print(f"Total of snarls found : {len(snarls)}")
print("Saving snarl path decomposition...")
Expand All @@ -301,7 +337,11 @@ def count_children(net):
_, paths_number_analysis = loop_over_snarls_write(stree, snarls, pg, output, output_snarl_not_analyse, threshold, False)
print(f"Total of paths analyse : {paths_number_analysis}")

# python3 stoat/list_snarl_paths.py -p /home/mbagarre/Bureau/droso_data/fly/fly.pg -d /home/mbagarre/Bureau/droso_data/fly/fly.dist -o output/test/test_list_snarl.tsv
# python3 stoat/list_snarl_paths.py -p /home/mbagarre/Bureau/droso_data/fly/fly.pg -d /home/mbagarre/Bureau/droso_data/fly/fly.dist -o output/test
# vg find -x ../snarl_data/fly.gbz -r 5176878:5176884 -c 10 | vg view -dp - | dot -Tsvg -o ../snarl_data/subgraph.svg

# python3 stoat/list_snarl_paths.py -p tests/simulation/binary_data/pg.full.pg -d tests/simulation/binary_data/pg.dist -c tests/simulation/binary_data/pg.chromosome -o output/test/test_list_snarl.tsv
# binary
# python3 stoat/list_snarl_paths.py -p tests/simulation/binary_data/pg.full.pg -d tests/simulation/binary_data/pg.dist -c tests/simulation/binary_data/pg.chromosome -o output/test

# quantitative
# python3 stoat/list_snarl_paths.py -p tests/simulation/quantitative_data/pg.pg -d tests/simulation/quantitative_data/pg.dist -c tests/simulation/quantitative_data/pg.chromosome -o output/test
10 changes: 4 additions & 6 deletions stoat/snarl_analyser.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,7 @@ def binary_table(self, snarls:list, binary_groups:tuple[dict, dict], kinship_mat
"""

common_headers = (
"CHR\tPOS\tSNARL\tTYPE\tREF\tALT\tP_FISHER\tP_CHI2\tALLELE_NUM\tMIN_ROW_INDEX\tNUM_COLUM\tINTER_GROUP\tAVERAGE"
"CHR\tPOS\tSNARL\tTYPE\tP_FISHER\tP_CHI2\tALLELE_NUM\tMIN_ROW_INDEX\tNUM_COLUM\tINTER_GROUP\tAVERAGE"
)
headers = f"{common_headers}\tGROUP_PATHS\n" if gaf else f"{common_headers}\n"

Expand All @@ -162,9 +162,8 @@ def binary_table(self, snarls:list, binary_groups:tuple[dict, dict], kinship_mat

# Perform statistical tests and compute descriptive statistics
fisher_p_value, chi2_p_value, allele_number, min_sample, numb_colum, inter_group, average, group_paths = self.binary_stat_test(df, gaf)
ref = alt = "NA"
common_data = (
f"{chromosome}\t{position}\t{snarl}\t{type_var}\t{ref}\t{alt}\t"
f"{chromosome}\t{position}\t{snarl}\t{type_var}\t"
f"{fisher_p_value}\t{chi2_p_value}\t{allele_number}\t{min_sample}\t"
f"{numb_colum}\t{inter_group}\t{average}")
data = f"{common_data}\t{group_paths}\n" if gaf else f"{common_data}\n"
Expand All @@ -174,14 +173,13 @@ def binary_table(self, snarls:list, binary_groups:tuple[dict, dict], kinship_mat
def quantitative_table(self, snarls:list, quantitative_dict:dict, kinship_matrix:pd.DataFrame=None, covar:Optional[dict]=None, output:str="output/quantitative_output.tsv") :

with open(output, 'wb') as outf:
headers = 'CHR\tPOS\tSNARL\tTYPE\tREF\tALT\tRSQUARED\tBETA\tSE\tP\tALLELE_NUM\n'
headers = 'CHR\tPOS\tSNARL\tTYPE\tRSQUARED\tBETA\tSE\tP\tALLELE_NUM\n'
outf.write(headers.encode('utf-8'))
for snarl_info in snarls:
snarl, list_snarl, type_var, chromosome, position = snarl_info
df, allele_number = self.create_quantitative_table(list_snarl)
rsquared, beta, se, pvalue = self.linear_regression(df, quantitative_dict)
ref = alt = "NA"
data = '{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(chromosome, position, snarl, type_var, ref, alt, rsquared, beta, se, pvalue, allele_number)
data = '{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(chromosome, position, snarl, type_var, rsquared, beta, se, pvalue, allele_number)
outf.write(data.encode('utf-8'))

def identify_correct_path(self, decomposed_snarl:list, idx_srr_save:list) -> list:
Expand Down
Binary file modified tests/.DS_Store
Binary file not shown.
1 change: 0 additions & 1 deletion tests/match_position.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@ def load_file(file, threshold=None) :
total_succes = 0 # Track total significant entries
total_fail = 0
with open(file, 'r') as f:
#chrom_idx, pos_idx, pvalue_idx = identify_header(f)
chrom_idx, pos_idx, pvalue_idx = identify_header(f)

for line in f:
Expand Down
Loading

0 comments on commit c3d57c4

Please sign in to comment.