Merge pull request #36 from oxfordmmm/fix/allow-vcf-row-overlap

Fix/allow vcf row overlap
oxfordmmm · Jul 8, 2024 · f3116b5 · f3116b5
2 parents 7160454 + 4e6b95c
commit f3116b5
Show file tree

Hide file tree

Showing 7 changed files with 232 additions and 210 deletions.
diff --git a/VERSION b/VERSION
@@ -1,2 +1,2 @@
-1.3.3
+1.3.4
 
diff --git a/bin/to_piezo_catalogue.py b/bin/to_piezo_catalogue.py
@@ -21,14 +21,18 @@ def mutations_to_catalogue(mutations, filename, ref_genome, catalogue):
         ref_genome (str): Name of the genome
         catalogue (str): Name of the catalogue
     """
-    header = "GENBANK_REFERENCE,CATALOGUE_NAME,CATALOGUE_VERSION,CATALOGUE_GRAMMAR,PREDICTION_VALUES,DRUG,MUTATION,PREDICTION,SOURCE,EVIDENCE,OTHER\n"
+    header = (
+        "GENBANK_REFERENCE,CATALOGUE_NAME,CATALOGUE_VERSION,CATALOGUE_GRAMMAR,"
+        "PREDICTION_VALUES,DRUG,MUTATION,PREDICTION,SOURCE,EVIDENCE,OTHER\n"
+    )
     common_all = f"{ref_genome},{catalogue},1.0,GARC1,RUS,NAN,"
 
     # Get all genes within the mutations
     genes = sorted(list(set([mutation.split("@")[0] for mutation in mutations])))
     with open(filename, "w") as f:
         f.write(header)
-        # Piezo requires all prediction values are in the catalogue, and all valid `PREDICTION_VALUES` include `R`
+        # Piezo requires all prediction values are in the catalogue,
+        # and all valid `PREDICTION_VALUES` include `R`
         # So add a dummy record with R as S and U are used elsewhere
         f.write(common_all + "NOTAGENE@A1!,R,{},{},{}\n")
         # Add general rules for genes
@@ -42,9 +46,10 @@ def mutations_to_catalogue(mutations, filename, ref_genome, catalogue):
 
 
 if __name__ == "__main__":
-    assert (
-        len(sys.argv) == 4
-    ), "Incorrect usage. Try: python to_piezo_catalogue.py <genome_path> <vcf_path> <output_filename>"
+    assert len(sys.argv) == 4, (
+        "Incorrect usage. Try: python to_piezo_catalogue.py "
+        "<genome_path> <vcf_path> <output_filename>"
+    )
     genome_path = sys.argv[1]
     vcf_path = sys.argv[2]
     output_filename = sys.argv[3]

diff --git a/gumpy/genome.py b/gumpy/genome.py
@@ -999,34 +999,31 @@ def __add__(self, vcf: VCFFile):
 
         # use the calls dict to change the nucleotide indicies in the copy of the genome
         for idx, type_ in tqdm(vcf.calls.keys(), disable=(not self.show_progress_bar)):
-            genome.vcf_evidence[genome.nucleotide_index[idx - 1]] = vcf.calls[
-                (idx, type_)
-            ]["original_vcf_row"]
-
-            # deal with changes at a single nucleotide site
-            if type_ in ["snp", "null", "het"]:
-                # only set values if the idx is to a single nucleotide
-                genome.nucleotide_sequence[idx - 1] = vcf.calls[(idx, type_)]["call"]
-
-            # deal with insertions and deletions
-            elif type_ == "indel":
-                genome.is_indel[idx - 1] = True
-                genome.indel_nucleotides[idx - 1] = vcf.calls[(idx, type_)]["call"][1]
-
-                if vcf.calls[(idx, type_)]["call"][0] == "ins":
-                    genome.indel_length[idx - 1] = len(
-                        vcf.calls[(idx, type_)]["call"][1]
-                    )
-                else:
-                    genome.indel_length[idx - 1] = -1 * len(
-                        vcf.calls[(idx, type_)]["call"][1]
-                    )
-
-            elif type_ == "ref":
-                # These only exist due to reference calls
-                # They only made it this far as they are required to pull out minors at
-                #   these positions
-                pass
+            for item in vcf.calls[(idx, type_)]:
+                genome.vcf_evidence[genome.nucleotide_index[idx - 1]] = item[
+                    "original_vcf_row"
+                ]
+
+                # deal with changes at a single nucleotide site
+                if type_ in ["snp", "null", "het"]:
+                    # only set values if the idx is to a single nucleotide
+                    genome.nucleotide_sequence[idx - 1] = item["call"]
+
+                # deal with insertions and deletions
+                elif type_ == "indel":
+                    genome.is_indel[idx - 1] = True
+                    genome.indel_nucleotides[idx - 1] = item["call"][1]
+
+                    if item["call"][0] == "ins":
+                        genome.indel_length[idx - 1] = len(item["call"][1])
+                    else:
+                        genome.indel_length[idx - 1] = -1 * len(item["call"][1])
+
+                elif type_ == "ref":
+                    # These only exist due to reference calls
+                    # They only made it this far as they are required to pull
+                    # out minors at these positions
+                    pass
 
         genome.minor_populations = []