Name searching #6

Merged
merged 3 commits on Apr 27, 2021
Changes from all commits
File renamed without changes.
Empty file added bbqs_master/bin/__init__.py
Empty file.
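The empty __init__.py presumably marks bbqs_master/bin as a Python package, so the scripts below can be imported as modules instead of only being run directly (build_bed_names.py, added further down, imports bed_to_json this way). A minimal sketch of such an import, assuming the repository root is on sys.path; the input filename is just the script's own default:

# Sketch only: import the converter through the new package and parse the default BED file.
from bbqs_master.bin import bed_to_json as bed

bed_data = bed.read_bed("islandData.bed")  # {seq_id: [formatted records]}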
79 changes: 43 additions & 36 deletions bbqs_master/bin/bed-to-json.py
@@ -1,14 +1,15 @@
"""
This file puts a bed file in the format

gene start end type prediction
start end strand id name details phase seq_id replicon source type

into the json format expected by jbrowse for display
of genomic islands.
"""
import argparse
import json


def read_bed(filepath):
bed_records = {}
ids = []
@@ -33,7 +34,7 @@ def build_formatted_record(record, curr):
record[2] = -1
if record[2] == "+":
record[2] = 1

record[0] = int(record[0])
record[1] = int(record[1])

@@ -46,15 +47,15 @@ def to_json_format(bed_data, chunk_name):
def to_json_format(bed_data, chunk_name):
returning = {}
lazy_loader = {}
for replicon in bed_data.keys():
returning[replicon] = {}
lazy_loader[replicon] = bed_data[replicon]
nc_list = [[1, bed_data[replicon][0][1], bed_data[replicon][-1][2], 0]]
returning[replicon]["featureCount"] = len(bed_data[replicon])
returning[replicon]["formatVersion"] = 1
returning[replicon]["intervals"] = {}
returning[replicon]["intervals"]["classes"] = [
{ "attributes":
for seq_id in bed_data.keys():
returning[seq_id] = {}
lazy_loader[seq_id] = bed_data[seq_id]
nc_list = [[1, bed_data[seq_id][0][1], bed_data[seq_id][-1][2], 0]]
returning[seq_id]["featureCount"] = len(bed_data[seq_id])
returning[seq_id]["formatVersion"] = 1
returning[seq_id]["intervals"] = {}
returning[seq_id]["intervals"]["classes"] = [
{"attributes":
[
"Start",
"End",
@@ -63,48 +64,53 @@ def to_json_format(bed_data, chunk_name):
"Name",
"Note",
"Phase",
"Replicon",
"Seq_id",
"Replicon",
"Source",
"Type"
],
"isArrayAttr": {}
},
{ "attributes": [
"Start",
"End",
"Chunk"
],
"isArrayAttr": {}
}
]
returning[replicon]["intervals"]["count"] = len(bed_data[replicon])
returning[replicon]["intervals"]["maxEnd"] = bed_data[replicon][-1][2]
returning[replicon]["intervals"]["minStart"] = bed_data[replicon][0][1]
returning[replicon]["intervals"]["lazyClass"] = 1
returning[replicon]["intervals"]["nclist"] = nc_list
returning[replicon]["intervals"]["urlTemplate"] = chunk_name + "-{Chunk}.json"
"isArrayAttr": {}
},
{"attributes": [
"Start",
"End",
"Chunk"
],
"isArrayAttr": {}
}
]
returning[seq_id]["intervals"]["count"] = len(bed_data[seq_id])
returning[seq_id]["intervals"]["maxEnd"] = bed_data[seq_id][-1][2]
returning[seq_id]["intervals"]["minStart"] = bed_data[seq_id][0][1]
returning[seq_id]["intervals"]["lazyClass"] = 1
returning[seq_id]["intervals"]["nclist"] = nc_list
returning[seq_id]["intervals"]["urlTemplate"] = chunk_name + \
"-{Chunk}.json"

return lazy_loader, returning


def write_json(track_data, out_name, chunk_name, strain):
for replicon in track_data[1].keys():
json.dump(track_data[1][replicon], open(f"data/tracks/{strain}/{replicon}/{out_name}", 'w'))
json.dump(track_data[0][replicon], open(f"data/tracks/{strain}/{replicon}/{chunk_name}-0.json", 'w'))
for seq_id in track_data[1].keys():
json.dump(track_data[1][seq_id], open(
f"data/tracks/{strain}/{seq_id}/{out_name}", 'w'))
json.dump(track_data[0][seq_id], open(
f"data/tracks/{strain}/{seq_id}/{chunk_name}-0.json", 'w'))


def write_bed(filename, bed_data):
with open(filename+".fixed", "w") as f:
f.write("start\tend\tstrand\tid\tname\tnote\tphase\treplicon\tseq_id\tsource\ttype\n")
f.write(
"start\tend\tstrand\tid\tname\tnote\tphase\tseq_id\treplicon\tsource\ttype\n")
for record in bed_data:
f.write("\t".join(record)+"\n")

return


if __name__ == "__main__":
parser = argparse.ArgumentParser(description='Process a BED file into JSON format.')
parser = argparse.ArgumentParser(
description='Process a BED file into JSON format.')
parser.add_argument('--in', "-i", type=str, required=False, default="islandData.bed",
help='the name of the BED file to be processed (default: %(default)s)')
parser.add_argument('--out', "-o", type=str, required=False, default="trackData.json",
@@ -115,5 +121,6 @@ def write_bed(filename, bed_data):
help='the strain on which we are operating (default: %(default)s)')

args = vars(parser.parse_args())
write_json(to_json_format(read_bed(args["in"]), args["chunk"]), args["out"], args["chunk"], args["strain"])
#write_bed(args["in"], read_bed(args["in"]))
write_json(to_json_format(read_bed(
args["in"]), args["chunk"]), args["out"], args["chunk"], args["strain"])
#write_bed(args["in"], read_bed(args["in"]))
126 changes: 126 additions & 0 deletions bbqs_master/bin/bed_to_json.py
@@ -0,0 +1,126 @@
"""
This file puts a bed file in the format

start end strand id name details phase seq_id replicon source type

into the json format expected by jbrowse for display
of genomic islands.
"""
import argparse
import json


def read_bed(filepath):
bed_records = {}
ids = []
curr = 0
with open(filepath) as bed:
for line in bed:
line = list(filter(None, line.split("\t")))
if line[3] in ids:
continue
if line[7] in bed_records.keys():
bed_records[line[7]].append(build_formatted_record(line, curr))
else:
bed_records[line[8]] = [build_formatted_record(line, curr)]
ids.append(line[4])
curr += 1

return bed_records


def build_formatted_record(record, curr):
if record[2] == "-":
record[2] = -1
if record[2] == "+":
record[2] = 1

record[0] = int(record[0])
record[1] = int(record[1])

record.insert(0, 0)
record[-1].strip("\n")

return record


def to_json_format(bed_data, chunk_name):
returning = {}
lazy_loader = {}
for seq_id in bed_data.keys():
returning[seq_id] = {}
lazy_loader[seq_id] = bed_data[seq_id]
nc_list = [[1, bed_data[seq_id][0][1], bed_data[seq_id][-1][2], 0]]
returning[seq_id]["featureCount"] = len(bed_data[seq_id])
returning[seq_id]["formatVersion"] = 1
returning[seq_id]["intervals"] = {}
returning[seq_id]["intervals"]["classes"] = [
{"attributes":
[
"Start",
"End",
"Strand",
"Id",
"Name",
"Note",
"Phase",
"Seq_id",
"Replicon",
"Source",
"Type"
],
"isArrayAttr": {}
},
{"attributes": [
"Start",
"End",
"Chunk"
],
"isArrayAttr": {}
}
]
returning[seq_id]["intervals"]["count"] = len(bed_data[seq_id])
returning[seq_id]["intervals"]["maxEnd"] = bed_data[seq_id][-1][2]
returning[seq_id]["intervals"]["minStart"] = bed_data[seq_id][0][1]
returning[seq_id]["intervals"]["lazyClass"] = 1
returning[seq_id]["intervals"]["nclist"] = nc_list
returning[seq_id]["intervals"]["urlTemplate"] = chunk_name + \
"-{Chunk}.json"

return lazy_loader, returning


def write_json(track_data, out_name, chunk_name, strain):
for seq_id in track_data[1].keys():
json.dump(track_data[1][seq_id], open(
f"data/tracks/{strain}/{seq_id}/{out_name}", 'w'))
json.dump(track_data[0][seq_id], open(
f"data/tracks/{strain}/{seq_id}/{chunk_name}-0.json", 'w'))


def write_bed(filename, bed_data):
with open(filename+".fixed", "w") as f:
f.write(
"start\tend\tstrand\tid\tname\tnote\tphase\tseq_id\treplicon\tsource\ttype\n")
for record in bed_data:
f.write("\t".join(record)+"\n")

return


if __name__ == "__main__":
parser = argparse.ArgumentParser(
description='Process a BED file into JSON format.')
parser.add_argument('--in', "-i", type=str, required=False, default="islandData.bed",
help='the name of the BED file to be processed (default: %(default)s)')
parser.add_argument('--out', "-o", type=str, required=False, default="trackData.json",
help='the name of the JSON file to write (default: %(default)s)')
parser.add_argument('--chunk', "-c", type=str, required=False, default="bed",
help='the name of the chunk files to write to (default: %(default)s)')
parser.add_argument('--strain', "-s", type=str, required=False, default="BBQS859",
help='the strain on which we are operating (default: %(default)s)')

args = vars(parser.parse_args())
write_json(to_json_format(read_bed(
args["in"]), args["chunk"]), args["out"], args["chunk"], args["strain"])
#write_bed(args["in"], read_bed(args["in"]))
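Since bed_to_json.py is now importable, its three steps can also be driven from Python. A minimal sketch of the same pipeline the __main__ block runs, using the script's own argparse defaults and assuming the data/tracks/<strain>/<seq_id>/ directories already exist:

import bed_to_json as bed

bed_data = bed.read_bed("islandData.bed")           # {seq_id: [formatted records]}
track_data = bed.to_json_format(bed_data, "bed")    # (lazy chunk payloads, trackData payloads)
bed.write_json(track_data, "trackData.json", "bed", "BBQS859")
# writes data/tracks/BBQS859/<seq_id>/trackData.json and data/tracks/BBQS859/<seq_id>/bed-0.json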
45 changes: 45 additions & 0 deletions bbqs_master/bin/build_bed_names.py
@@ -0,0 +1,45 @@
"""
appends to a name text file with searchable names for bed generated tracks
"""
import argparse
import build_gff_names as gff
import bed_to_json as bed


def build_names(bed_data, strain):
"""
Builds name record in the form below and returns a list of all such records
in the bed data representation list passed in

[[searchable, attributes, here], strain, id/name, seq_id, start, end]
"""
names = []

for seq_id in bed_data:
for record in bed_data[seq_id]:
temp = []
temp.append([record[4],record[5],record[11].strip("\n")])
temp.append(strain)
temp.append(record[4])
temp.append(record[8])
temp.append(str(record[1]))
temp.append(str(record[2]))
names.append(temp)

return names


if __name__ == "__main__":
parser = argparse.ArgumentParser(
description='Build a names text file from a gff file.')
parser.add_argument('--in', "-i", type=str, required=False, default="features.bed",
help='the name of the BED file to be processed (default: %(default)s)')
parser.add_argument('--out', "-o", type=str, required=False, default="names.txt",
help='the name of the names file to write. NOTE: This should not be the full path and this program will prepend "data/tracks/strain/seq_id/" to the chosen name (default: %(default)s)')
parser.add_argument('--strain', "-s", type=str, required=False, default="BBQS859",
help='the strain on which we are operating (default: %(default)s)')

args = vars(parser.parse_args())
bed_data = bed.read_bed(args["in"])
names = build_names(bed_data, args["strain"])
gff.write_names(args["out"], args["strain"], names, "a")
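To make the record layout concrete, here is a hypothetical example (all values invented) of what build_names returns for a single formatted BED record, where build_formatted_record has already prepended a 0 so start/end/strand/id/name/note/phase/seq_id/replicon/source/type sit at indices 1 through 11:

import build_bed_names as names_mod

record = [0, 1200, 4800, 1, "island_3", "GI-3", "predicted island", ".",
          "chromosome", "chromosome", "islandpath", "genomic_island"]
names = names_mod.build_names({"chromosome": [record]}, "BBQS859")
# names == [[["island_3", "GI-3", "genomic_island"],   # searchable attributes
#            "BBQS859", "island_3", "chromosome", "1200", "4800"]]

gff.write_names then appends these records to the names file, which, per the --out help text, is written under data/tracks/strain/seq_id/.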