add list paths stoat starting
Plogeur committed Dec 17, 2024
1 parent ed79cc9 commit 9a267ba
Showing 4 changed files with 90 additions and 89 deletions.
65 changes: 12 additions & 53 deletions src/list_snarl_paths.py
@@ -1,5 +1,6 @@
 import bdsg
 import argparse
+import utils
 import re
 from collections import defaultdict
 import time
@@ -223,13 +224,13 @@ def write_output_not_analyse(output_file, snarl_id, reason) :
     with open(output_file, 'a') as outf:
         outf.write('{}\t{}\n'.format(snarl_id, reason))
 
-def loop_over_snarls_write(stree, snarls, pg, output_file, output_snarl_not_analyse, time_threshold=10) :
+def loop_over_snarls_write(stree, snarls, pg, output_file, output_snarl_not_analyse, time_threshold=10, bool_return=True) :
 
     write_header_output(output_file)
     write_header_output_not_analyse(output_snarl_not_analyse)
 
     snarl_paths = defaultdict(list)
-    snarl_number_analysis = 0
+    paths_number_analysis = 0
 
     # for each snarl, lists paths through the netgraph and write to output TSV
     for snarl in snarls:
@@ -254,73 +255,31 @@ def loop_over_snarls_write(stree, snarls, pg, output_file, output_snarl_not_analyse, time_threshold=10, bool_return=True) :
         pretty_paths, type_variants = fill_pretty_paths(stree, pg, finished_paths)
 
         write_output(output_file, snarl_id, pretty_paths, type_variants)
-        snarl_paths[snarl_id].extend(pretty_paths)
-        snarl_number_analysis += len(pretty_paths)
-
-    return snarl_paths, snarl_number_analysis
+        if bool_return :
+            snarl_paths[snarl_id].extend(pretty_paths)
+            paths_number_analysis += len(pretty_paths)
 
-def loop_over_snarls(stree, snarls, pg, output_snarl_not_analyse, threshold=50) :
-
-    write_header_output_not_analyse(output_snarl_not_analyse)
-
-    snarl_paths = defaultdict(list)
-    snarl_number_analysis = 0
-
-    children = [0]
-    def count_children(net):
-        children[0] += 1
-        return (True)
-
-    # for each snarl, lists paths through the netgraph and write to output TSV
-    for snarl in snarls:
-
-        children = [0]
-        snarl_id = find_snarl_id(stree, snarl)
-
-        stree.for_each_child(snarl, count_children)
-        if children[0] > threshold :
-            write_output_not_analyse(output_snarl_not_analyse, snarl_id, "too_many_children")
-            continue
-
-        # we'll traverse the netgraph starting at the left boundary
-        # init unfinished paths to the first boundary node
-        paths = [[stree.get_bound(snarl, False, True)]]
-        finished_paths = []
-        while len(paths) > 0 :
-
-            path = paths.pop()
-
-            if len(finished_paths) > 10000 :
-                write_output_not_analyse(output_snarl_not_analyse, snarl_id, "number_of_paths_too_high")
-                break
-
-            follow_edges(stree, finished_paths, path, paths, pg)
-
-        # prepare path list to output and write each path directly to the file
-        pretty_paths = fill_pretty_paths(stree, pg, finished_paths)
-        snarl_paths[snarl_id].extend(pretty_paths)
-        snarl_number_analysis += len(pretty_paths)
-
-    return snarl_paths, snarl_number_analysis
+    return snarl_paths, paths_number_analysis

 if __name__ == "__main__" :
 
     parser = argparse.ArgumentParser('List paths through the netgraph of each snarl in a pangenome')
-    parser.add_argument('-p', help='the input pangenome .pg file', required=True)
-    parser.add_argument('-d', help='the input distance index .dist file', required=True)
-    parser.add_argument('-t', type=check_threshold, help='time calculation threshold', required=False)
+    parser.add_argument('-p', type=utils.check_file, help='The input pangenome .pg file', required=True)
+    parser.add_argument('-d', type=utils.check_file, help='The input distance index .dist file', required=True)
+    parser.add_argument("-t", type=check_threshold, help='Children threshold', required=False)
     parser.add_argument('-o', help='the output TSV file', type=str, required=True)
     args = parser.parse_args()
 
     stree, pg, root = parse_graph_tree(args.p, args.d)
     snarls = save_snarls(stree, root)
     print(f"Total snarls found: {len(snarls)}")
     print("Saving snarl path decomposition...")
 
     output_snarl_not_analyse = "snarl_not_analyse.tsv"
 
     threshold = args.t if args.t else 10
-    loop_over_snarls_write(stree, snarls, pg, args.o, output_snarl_not_analyse, threshold)
+    _, paths_number_analysis = loop_over_snarls_write(stree, snarls, pg, args.o, output_snarl_not_analyse, threshold, False)
+    print(f"Total paths analysed: {paths_number_analysis}")
 
 # python3 src/list_snarl_paths.py -p /home/mbagarre/Bureau/droso_data/fly/fly.pg -d /home/mbagarre/Bureau/droso_data/fly/fly.dist -o output/test/test_list_snarl.tsv
 # vg find -x ../snarl_data/fly.gbz -r 5176878:5176884 -c 10 | vg view -dp - | dot -Tsvg -o ../snarl_data/subgraph.svg
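
For reference, a hedged sketch of the snarl-paths TSV this script writes (the column layout is inferred from the check_format_list_path validator added in src/utils.py and from the updated fixture in tests/other_files/test_snarl_path.tsv; the rows are illustrative):

snarl	paths	type
>1>3	>1>2>7	SNP
>2>3	>1>2>3>5>6,>1>2>4>6	DEL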
35 changes: 21 additions & 14 deletions src/stoat.py
@@ -20,6 +20,7 @@ def main() :
     parser.add_argument("-t", type=list_snarl_paths.check_threshold, help='Children threshold', required=False)
     parser.add_argument("-v", type=utils.check_format_vcf_file, help="Path to the merged VCF file (.vcf or .vcf.gz)", required=True)
     parser.add_argument("-r", type=utils.check_format_vcf_file, help="Path to the VCF file referencing all snarl positions (.vcf or .vcf.gz)", required=False)
+    parser.add_argument("--listpath", type=utils.check_format_list_path, help="Path to the snarl paths list file", required=False)
 
     group = parser.add_mutually_exclusive_group(required=True)
     group.add_argument("-b", "--binary", type=utils.check_format_pheno, help="Path to the binary group file (.txt or .tsv)")
@@ -74,21 +75,28 @@ def main() :
         pheno = utils.parse_pheno_quantitatif_file(args.quantitative)
     utils.check_mathing(pheno, list_samples, "phenotype")
 
-    # Step 1: Parse the Pangenome Graph and Create Snarl Paths to Test
-    start_time = time.time()
-    logger.info("Starting snarl path decomposition...")
-    stree, pg, root = list_snarl_paths.parse_graph_tree(args.p, args.d)
-    snarls = list_snarl_paths.save_snarls(stree, root)
-    logger.info(f"Total snarls found: {len(snarls)}")
-    logger.info("Saving snarl path decomposition...")
+    if not args.listpath :
+        # Step 1: Parse the Pangenome Graph and Create Snarl Paths to Test
+        start_time = time.time()
+        logger.info("Starting snarl path decomposition...")
+        stree, pg, root = list_snarl_paths.parse_graph_tree(args.p, args.d)
+        snarls = list_snarl_paths.save_snarls(stree, root)
+        logger.info(f"Total snarls found: {len(snarls)}")
+        logger.info("Saving snarl path decomposition...")
 
-    output_snarl_path_not_analyse = os.path.join(output_dir, "snarl_not_analyse.tsv")
-    output_snarl_path = os.path.join(output_dir, "snarl_paths.tsv")
+        output_snarl_path_not_analyse = os.path.join(output_dir, "snarl_not_analyse.tsv")
+        output_snarl_path = os.path.join(output_dir, "snarl_paths.tsv")
 
-    threshold = int(args.t) if args.t else 10
-    snarl_paths, snarl_number_analysis = list_snarl_paths.loop_over_snarls_write(stree, snarls, pg, output_snarl_path, output_snarl_path_not_analyse, threshold)
+        threshold = int(args.t) if args.t else 10
+        snarl_paths, paths_number_analysis = list_snarl_paths.loop_over_snarls_write(stree, snarls, pg, output_snarl_path, output_snarl_path_not_analyse, threshold)
+        logger.info(f"Total paths analysed: {paths_number_analysis}")
 
-    logger.info(f"Total paths analysed: {snarl_number_analysis}")
+    else :
+        if args.p or args.d :
+            logger.info("A snarl paths list was provided; the .pg and .dist files will not be analysed")
+        input_snarl_path = args.listpath
+        snarl_paths, paths_number_analysis = utils.parse_snarl_path_file(input_snarl_path)
+        logger.info(f"Total snarls found: {paths_number_analysis}")
 
     # Step 2: Parse VCF Files and Fill the Matrix
     vcf_object = snarl_analyser.SnarlProcessor(args.v, list_samples)
@@ -116,9 +124,8 @@ def main() :
             p_value_analysis.qq_plot_binary(output_snarl, output_qq)
             p_value_analysis.plot_manhattan_binary(output_snarl, output_manh)
         if gaf :
-            snarl_path_dic = utils.parse_snarl_path_file(output_snarl_path)
             output_gaf = os.path.join(output_dir, "group_paths.gaf")
-            gaf_creator.parse_input_file(output_snarl, snarl_path_dic, pg, output_gaf)
+            gaf_creator.parse_input_file(output_snarl, snarl_paths, pg, output_gaf)
 
     # Handle Quantitative Analysis
     elif args.quantitative:
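
A hedged usage sketch of the new --listpath mode (file names are illustrative; -v, -b and --listpath are the flags confirmed by the hunks above): when --listpath is given, stoat.py loads the precomputed snarl paths instead of decomposing the .pg/.dist graph.

python3 src/stoat.py -v merged.vcf.gz -b pheno_binary.tsv --listpath output/snarl_paths.tsv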
73 changes: 54 additions & 19 deletions src/utils.py
@@ -40,17 +40,18 @@ def parse_snarl_path_file(path_file: str) -> dict:
 
     # Initialize an empty dictionary for the snarl paths
     snarl_paths = defaultdict(list)
+    snarl_number_analysis = 0
 
     # Read the file into a pandas DataFrame
     df = pd.read_csv(path_file, sep='\t', dtype=str)
     df = df[df['paths'].notna()]
     df['paths'] = df['paths'].str.split(',')
 
     # Create the dictionary with snarl as key and paths as values
     for snarl, paths in zip(df['snarl'], df['paths']):
         snarl_paths[snarl].extend(paths)
+        snarl_number_analysis += 1
 
-    return snarl_paths
+    return snarl_paths, snarl_number_analysis
 
 def parse_covariate_file(covar_path : str) -> dict :
 
@@ -74,38 +75,72 @@ def check_mathing(elements, list_samples, file_name) :
     if missing_elements :
         raise ValueError(f"The following sample names from the VCF are not present in the {file_name} file : {missing_elements}")
 
-def check_format_vcf_file(file_path : str) -> str:
+def check_format_list_path(file_path : str) -> str:
     """
-    Function to check if the provided file path is a valid VCF file.
+    Function to check if the provided file path is a valid list path file.
     """
     check_file(file_path)
 
-    if not file_path.lower().endswith('.vcf') and not file_path.lower().endswith('.vcf.gz'):
-        raise argparse.ArgumentTypeError(f"The file {file_path} is not a valid VCF file. It must have a .vcf or .vcf.gz extension.")
+    with open(file_path, 'r') as file:
+        # Read and validate the header
+        first_line = file.readline().strip()
+        expected_header = 'snarl\tpaths\ttype'
+        if first_line != expected_header:
+            raise argparse.ArgumentTypeError(
+                f"The file must start with the following header: '{expected_header}' and be tab-separated"
+            )
+
+        # Validate all other lines
+        for line_number, line in enumerate(file, start=2): # Start at 2 for the first line after the header
+            columns = line.strip().split('\t')
+            if len(columns) != 3:
+                raise argparse.ArgumentTypeError(
+                    f"Line {line_number} must contain exactly 3 columns, but {len(columns)} columns were found."
+                )
+            if not all(isinstance(col, str) and col.strip() for col in columns):
+                raise argparse.ArgumentTypeError(
+                    f"Line {line_number} contains empty or non-string values: {columns}"
+                )
 
     return file_path

-def check_format_group_snarl(file_path : str) -> str :
+def check_format_vcf_file(file_path : str) -> str:
     """
-    Function to check if the provided file path is a valid group file.
+    Function to check if the provided file path is a valid VCF file.
     """
     check_file(file_path)
 
-    if not file_path.lower().endswith('.txt') and not file_path.lower().endswith('.tsv'):
-        raise argparse.ArgumentTypeError(f"The file {file_path} is not a valid group/snarl file. It must have a .txt or .tsv extension.")
-    return file_path
-
-def check_format_pheno(file_path : str) -> str :
+    if not file_path.lower().endswith('.vcf') and not file_path.lower().endswith('.vcf.gz'):
+        raise argparse.ArgumentTypeError(f"The file {file_path} is not a valid VCF file. It must have a .vcf or .vcf.gz extension.")
+    return file_path
+
+def check_format_pheno(file_path: str) -> str:

     check_file(file_path)
 
     with open(file_path, 'r') as file:
         first_line = file.readline().strip()
 
-        header = first_line.split('\t')
-        expected_header = ['FID', 'IID', 'PHENO']
-        if header != expected_header:
-            raise argparse.ArgumentTypeError(f"The file must contain the following headers: {expected_header} and be tab-separated")
-
+        header = first_line.split('\t')
+        expected_header = ['FID', 'IID', 'PHENO']
+        if header != expected_header:
+            raise argparse.ArgumentTypeError(
+                f"The file must contain the following headers: {expected_header} and be tab-separated"
+            )
+
+        # Validate all other lines
+        for line_number, line in enumerate(file, start=2): # Start at 2 for the first line after the header
+            columns = line.strip().split('\t')
+            if len(columns) != 3:
+                raise argparse.ArgumentTypeError(
+                    f"Line {line_number} must contain exactly 3 columns, but {len(columns)} columns were found."
+                )
+            try:
+                float(columns[2]) # Check if the third column is a float or can be converted to one
+            except ValueError:
+                raise argparse.ArgumentTypeError(
+                    f"Line {line_number} contains a non-numeric value in the last column: {columns[2]}"
+                )
 
     return file_path
 
 def check_covariate_file(file_path):
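A minimal sketch of the updated parse_snarl_path_file contract (values are illustrative, assuming a two-row input file): the function now returns the snarl-to-paths dictionary together with the number of snarls parsed.

snarl_paths, n = utils.parse_snarl_path_file("snarl_paths.tsv")
# snarl_paths -> {'>1>3': ['>1>2>7'], '>2>3': ['>1>2>3>5>6', '>1>2>4>6']}
# n -> 2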
6 changes: 3 additions & 3 deletions tests/other_files/test_snarl_path.tsv
@@ -1,3 +1,3 @@
-snarl	paths
->1>3	>1>2>7
->2>3	>1>2>3>5>6,>1>2>4>6
+snarl	paths	type
+>1>3	>1>2>7	SNP
+>2>3	>1>2>3>5>6,>1>2>4>6	DEL
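
A quick, hypothetical check that this fixture satisfies the new check_format_list_path validator (assuming src/ is on the import path):

>>> from utils import check_format_list_path
>>> check_format_list_path("tests/other_files/test_snarl_path.tsv")
'tests/other_files/test_snarl_path.tsv'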
