Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix/allow vcf row overlap #36

Merged
merged 5 commits into from
Jul 8, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
1.3.3
1.3.4

15 changes: 10 additions & 5 deletions bin/to_piezo_catalogue.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,14 +21,18 @@ def mutations_to_catalogue(mutations, filename, ref_genome, catalogue):
ref_genome (str): Name of the genome
catalogue (str): Name of the catalogue
"""
header = "GENBANK_REFERENCE,CATALOGUE_NAME,CATALOGUE_VERSION,CATALOGUE_GRAMMAR,PREDICTION_VALUES,DRUG,MUTATION,PREDICTION,SOURCE,EVIDENCE,OTHER\n"
header = (
"GENBANK_REFERENCE,CATALOGUE_NAME,CATALOGUE_VERSION,CATALOGUE_GRAMMAR,"
"PREDICTION_VALUES,DRUG,MUTATION,PREDICTION,SOURCE,EVIDENCE,OTHER\n"
)
common_all = f"{ref_genome},{catalogue},1.0,GARC1,RUS,NAN,"

# Get all genes within the mutations
genes = sorted(list(set([mutation.split("@")[0] for mutation in mutations])))
with open(filename, "w") as f:
f.write(header)
# Piezo requires all prediction values are in the catalogue, and all valid `PREDICTION_VALUES` include `R`
# Piezo requires all prediction values are in the catalogue,
# and all valid `PREDICTION_VALUES` include `R`
# So add a dummy record with R as S and U are used elsewhere
f.write(common_all + "NOTAGENE@A1!,R,{},{},{}\n")
# Add general rules for genes
Expand All @@ -42,9 +46,10 @@ def mutations_to_catalogue(mutations, filename, ref_genome, catalogue):


if __name__ == "__main__":
assert (
len(sys.argv) == 4
), "Incorrect usage. Try: python to_piezo_catalogue.py <genome_path> <vcf_path> <output_filename>"
assert len(sys.argv) == 4, (
"Incorrect usage. Try: python to_piezo_catalogue.py "
"<genome_path> <vcf_path> <output_filename>"
)
genome_path = sys.argv[1]
vcf_path = sys.argv[2]
output_filename = sys.argv[3]
Expand Down
53 changes: 25 additions & 28 deletions gumpy/genome.py
Original file line number Diff line number Diff line change
Expand Up @@ -999,34 +999,31 @@ def __add__(self, vcf: VCFFile):

# use the calls dict to change the nucleotide indicies in the copy of the genome
for idx, type_ in tqdm(vcf.calls.keys(), disable=(not self.show_progress_bar)):
genome.vcf_evidence[genome.nucleotide_index[idx - 1]] = vcf.calls[
(idx, type_)
]["original_vcf_row"]

# deal with changes at a single nucleotide site
if type_ in ["snp", "null", "het"]:
# only set values if the idx is to a single nucleotide
genome.nucleotide_sequence[idx - 1] = vcf.calls[(idx, type_)]["call"]

# deal with insertions and deletions
elif type_ == "indel":
genome.is_indel[idx - 1] = True
genome.indel_nucleotides[idx - 1] = vcf.calls[(idx, type_)]["call"][1]

if vcf.calls[(idx, type_)]["call"][0] == "ins":
genome.indel_length[idx - 1] = len(
vcf.calls[(idx, type_)]["call"][1]
)
else:
genome.indel_length[idx - 1] = -1 * len(
vcf.calls[(idx, type_)]["call"][1]
)

elif type_ == "ref":
# These only exist due to reference calls
# They only made it this far as they are required to pull out minors at
# these positions
pass
for item in vcf.calls[(idx, type_)]:
genome.vcf_evidence[genome.nucleotide_index[idx - 1]] = item[
"original_vcf_row"
]

# deal with changes at a single nucleotide site
if type_ in ["snp", "null", "het"]:
# only set values if the idx is to a single nucleotide
genome.nucleotide_sequence[idx - 1] = item["call"]

# deal with insertions and deletions
elif type_ == "indel":
genome.is_indel[idx - 1] = True
genome.indel_nucleotides[idx - 1] = item["call"][1]

if item["call"][0] == "ins":
genome.indel_length[idx - 1] = len(item["call"][1])
else:
genome.indel_length[idx - 1] = -1 * len(item["call"][1])

elif type_ == "ref":
# These only exist due to reference calls
# They only made it this far as they are required to pull
# out minors at these positions
pass

genome.minor_populations = []

Expand Down
Loading
Loading