Skip to content

Commit

Permalink
Merge pull request #315 from bacpop/assign-output-qfile
Browse files Browse the repository at this point in the history
Update cluster assign to write qcreport on qc failures
  • Loading branch information
johnlees authored Jul 3, 2024
2 parents 27982a5 + db1bce4 commit f7e9416
Show file tree
Hide file tree
Showing 3 changed files with 42 additions and 12 deletions.
2 changes: 1 addition & 1 deletion PopPUNK/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

'''PopPUNK (POPulation Partitioning Using Nucleotide Kmers)'''

__version__ = '2.6.5'
__version__ = '2.6.6'

# Minimum sketchlib version
SKETCHLIB_MAJOR = 2
Expand Down
9 changes: 5 additions & 4 deletions PopPUNK/assign.py
Original file line number Diff line number Diff line change
Expand Up @@ -372,7 +372,7 @@ def assign_query_hdf5(dbFuncs,
from .plot import writeClusterCsv

from .qc import qcDistMat, qcQueryAssignments, prune_distance_matrix, \
prune_query_distance_matrix
prune_query_distance_matrix, write_qc_failure_report

from .sketchlib import addRandom

Expand Down Expand Up @@ -489,12 +489,13 @@ def assign_query_hdf5(dbFuncs,
# QC distance matrix
if qc_dict['run_qc']:
sys.stderr.write("Running QC on distance matrix\n")
seq_names_passing = \
frozenset(qcDistMat(qrDistMat, rNames, qNames, ref_db, qc_dict)[0])
failed_samples = frozenset(qNames) - seq_names_passing
seq_names_passing, failed_samples_dict = qcDistMat(qrDistMat, rNames, qNames, ref_db, qc_dict)
failed_samples = frozenset(qNames) - frozenset(seq_names_passing)
if len(failed_samples) > 0:
sys.stderr.write(f"{len(failed_samples)} samples failed:\n"
f"{','.join(failed_samples)}\n")
write_qc_failure_report(failed_samples, [failed_samples_dict], output)

if len(failed_samples) == len(qNames):
sys.exit(1)
else:
Expand Down
43 changes: 36 additions & 7 deletions PopPUNK/qc.py
Original file line number Diff line number Diff line change
Expand Up @@ -472,14 +472,43 @@ def remove_qc_fail(qc_dict, names, passed, fail_dicts, ref_db, distMat, prefix,
overwrite=True, threads=threads)

# write failing & reasons
with open(f"{prefix}/{os.path.basename(prefix)}_qcreport.txt", 'w') as qc_file:
for sample in failed:
reasons = []
for fail_test in fail_dicts:
if sample in fail_test:
reasons += (fail_test[sample])
qc_file.write(f"{sample}\t{','.join(reasons)}\n")
write_qc_failure_report(failed, fail_dicts, prefix)

def write_qc_failure_report(failed_samples, fail_dicts, output_prefix):
    """Writes a report of the samples which failed QC and why.

    One line is written per failed sample: the sample, a tab, then the
    comma-separated failure reasons collected by get_failure_reasons().
    The report is written to
    <output_prefix>/<basename of output_prefix>_qcreport.txt, replacing
    any existing file at that path.

    Args:
        failed_samples (iterable)
            Samples which failed QC. A set or frozenset is written in
            sorted order so the report is reproducible between runs; any
            other iterable is written in the order given.
        fail_dicts (list of dict)
            Dictionaries, each mapping a sample to its failure reasons
        output_prefix (str)
            Directory to write the report into; its basename is also used
            as the prefix of the report's file name
    """
    # Iteration order of a set of strings changes from run to run (hash
    # randomisation), so impose an order to keep reports comparable.
    # key=str avoids a TypeError should sample names not be mutually orderable.
    if isinstance(failed_samples, (set, frozenset)):
        failed_samples = sorted(failed_samples, key=str)

    # Build every line before opening the file, so an error while
    # collecting reasons cannot leave a truncated report behind
    report_lines = [
        f"{sample}\t{','.join(get_failure_reasons(sample, fail_dicts))}\n"
        for sample in failed_samples
    ]
    report_path = f"{output_prefix}/{os.path.basename(output_prefix)}_qcreport.txt"
    with open(report_path, 'w') as qc_file:
        qc_file.writelines(report_lines)

def get_failure_reasons(sample, fail_dicts):
    """Collects every failure reason recorded for one sample.

    Args:
        sample
            The sample to look up
        fail_dicts (list of dict)
            Dictionaries, each mapping a sample to its failure reasons

    Returns:
        reasons (list)
            The reasons for the sample, concatenated in the order the
            dictionaries are given. Empty if no dictionary contains it.
    """
    reasons = []
    for qc_test_failures in fail_dicts:
        # Only dictionaries which actually recorded this sample contribute
        if sample not in qc_test_failures:
            continue
        reasons.extend(qc_test_failures[sample])
    return reasons

def pickTypeIsolate(prefix, refList):
"""Selects a type isolate as that with a minimal proportion
of missing data.
Expand Down

0 comments on commit f7e9416

Please sign in to comment.