Name searching #6

Merged
merged 3 commits on Apr 27, 2021
Changes from all commits
File renamed without changes.
Empty file added bbqs_master/bin/__init__.py
Empty file.
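The empty __init__.py presumably marks bbqs_master/bin as a Python package, so the scripts below can be imported as modules instead of only being run directly (build_bed_names.py, added further down, imports bed_to_json this way). A minimal sketch of such an import, assuming the repository root is on sys.path; the input filename is just the script's own default:

# Sketch only: import the converter through the new package and parse the default BED file.
from bbqs_master.bin import bed_to_json as bed

bed_data = bed.read_bed("islandData.bed")  # {seq_id: [formatted records]}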
79 changes: 43 additions & 36 deletions bbqs_master/bin/bed-to-json.py
@@ -1,14 +1,15 @@
"""
This file puts a bed file in the format

gene start end type prediction
start end strand id name details phase seq_id replicon source type

into the json format expected by jbrowse for display
of genomic islands.
"""
import argparse
import json


def read_bed(filepath):
bed_records = {}
ids = []
@@ -33,7 +34,7 @@ def build_formatted_record(record, curr):
record[2] = -1
if record[2] == "+":
record[2] = 1

record[0] = int(record[0])
record[1] = int(record[1])

@@ -46,15 +47,15 @@ def to_json_format(bed_data, chunk_name):
def to_json_format(bed_data, chunk_name):
returning = {}
lazy_loader = {}
for replicon in bed_data.keys():
returning[replicon] = {}
lazy_loader[replicon] = bed_data[replicon]
nc_list = [[1, bed_data[replicon][0][1], bed_data[replicon][-1][2], 0]]
returning[replicon]["featureCount"] = len(bed_data[replicon])
returning[replicon]["formatVersion"] = 1
returning[replicon]["intervals"] = {}
returning[replicon]["intervals"]["classes"] = [
{ "attributes":
for seq_id in bed_data.keys():
returning[seq_id] = {}
lazy_loader[seq_id] = bed_data[seq_id]
nc_list = [[1, bed_data[seq_id][0][1], bed_data[seq_id][-1][2], 0]]
returning[seq_id]["featureCount"] = len(bed_data[seq_id])
returning[seq_id]["formatVersion"] = 1
returning[seq_id]["intervals"] = {}
returning[seq_id]["intervals"]["classes"] = [
{"attributes":
[
"Start",
"End",
@@ -63,48 +64,53 @@ def to_json_format(bed_data, chunk_name):
"Name",
"Note",
"Phase",
"Replicon",
"Seq_id",
"Replicon",
"Source",
"Type"
],
"isArrayAttr": {}
},
{ "attributes": [
"Start",
"End",
"Chunk"
],
"isArrayAttr": {}
}
]
returning[replicon]["intervals"]["count"] = len(bed_data[replicon])
returning[replicon]["intervals"]["maxEnd"] = bed_data[replicon][-1][2]
returning[replicon]["intervals"]["minStart"] = bed_data[replicon][0][1]
returning[replicon]["intervals"]["lazyClass"] = 1
returning[replicon]["intervals"]["nclist"] = nc_list
returning[replicon]["intervals"]["urlTemplate"] = chunk_name + "-{Chunk}.json"
"isArrayAttr": {}
},
{"attributes": [
"Start",
"End",
"Chunk"
],
"isArrayAttr": {}
}
]
returning[seq_id]["intervals"]["count"] = len(bed_data[seq_id])
returning[seq_id]["intervals"]["maxEnd"] = bed_data[seq_id][-1][2]
returning[seq_id]["intervals"]["minStart"] = bed_data[seq_id][0][1]
returning[seq_id]["intervals"]["lazyClass"] = 1
returning[seq_id]["intervals"]["nclist"] = nc_list
returning[seq_id]["intervals"]["urlTemplate"] = chunk_name + \
"-{Chunk}.json"

return lazy_loader, returning


def write_json(track_data, out_name, chunk_name, strain):
for replicon in track_data[1].keys():
json.dump(track_data[1][replicon], open(f"data/tracks/{strain}/{replicon}/{out_name}", 'w'))
json.dump(track_data[0][replicon], open(f"data/tracks/{strain}/{replicon}/{chunk_name}-0.json", 'w'))
for seq_id in track_data[1].keys():
json.dump(track_data[1][seq_id], open(
f"data/tracks/{strain}/{seq_id}/{out_name}", 'w'))
json.dump(track_data[0][seq_id], open(
f"data/tracks/{strain}/{seq_id}/{chunk_name}-0.json", 'w'))


def write_bed(filename, bed_data):
with open(filename+".fixed", "w") as f:
f.write("start\tend\tstrand\tid\tname\tnote\tphase\treplicon\tseq_id\tsource\ttype\n")
f.write(
"start\tend\tstrand\tid\tname\tnote\tphase\tseq_id\treplicon\tsource\ttype\n")
for record in bed_data:
f.write("\t".join(record)+"\n")

return


if __name__ == "__main__":
parser = argparse.ArgumentParser(description='Process a BED file into JSON format.')
parser = argparse.ArgumentParser(
description='Process a BED file into JSON format.')
parser.add_argument('--in', "-i", type=str, required=False, default="islandData.bed",
help='the name of the BED file to be processed (default: %(default)s)')
parser.add_argument('--out', "-o", type=str, required=False, default="trackData.json",
@@ -115,5 +121,6 @@ def write_bed(filename, bed_data):
help='the strain on which we are operating (default: %(default)s)')

args = vars(parser.parse_args())
write_json(to_json_format(read_bed(args["in"]), args["chunk"]), args["out"], args["chunk"], args["strain"])
#write_bed(args["in"], read_bed(args["in"]))
write_json(to_json_format(read_bed(
args["in"]), args["chunk"]), args["out"], args["chunk"], args["strain"])
#write_bed(args["in"], read_bed(args["in"]))
126 changes: 126 additions & 0 deletions bbqs_master/bin/bed_to_json.py
@@ -0,0 +1,126 @@
"""
This file puts a bed file in the format

start end strand id name details phase seq_id replicon source type

into the json format expected by jbrowse for display
of genomic islands.
"""
import argparse
import json


def read_bed(filepath):
bed_records = {}
ids = []
curr = 0
with open(filepath) as bed:
for line in bed:
line = list(filter(None, line.split("\t")))
if line[3] in ids:
continue
if line[7] in bed_records.keys():
bed_records[line[7]].append(build_formatted_record(line, curr))
else:
bed_records[line[8]] = [build_formatted_record(line, curr)]
ids.append(line[4])
curr += 1

return bed_records


def build_formatted_record(record, curr):
if record[2] == "-":
record[2] = -1
if record[2] == "+":
record[2] = 1

record[0] = int(record[0])
record[1] = int(record[1])

record.insert(0, 0)
record[-1].strip("\n")

return record


def to_json_format(bed_data, chunk_name):
returning = {}
lazy_loader = {}
for seq_id in bed_data.keys():
returning[seq_id] = {}
lazy_loader[seq_id] = bed_data[seq_id]
nc_list = [[1, bed_data[seq_id][0][1], bed_data[seq_id][-1][2], 0]]
returning[seq_id]["featureCount"] = len(bed_data[seq_id])
returning[seq_id]["formatVersion"] = 1
returning[seq_id]["intervals"] = {}
returning[seq_id]["intervals"]["classes"] = [
{"attributes":
[
"Start",
"End",
"Strand",
"Id",
"Name",
"Note",
"Phase",
"Seq_id",
"Replicon",
"Source",
"Type"
],
"isArrayAttr": {}
},
{"attributes": [
"Start",
"End",
"Chunk"
],
"isArrayAttr": {}
}
]
returning[seq_id]["intervals"]["count"] = len(bed_data[seq_id])
returning[seq_id]["intervals"]["maxEnd"] = bed_data[seq_id][-1][2]
returning[seq_id]["intervals"]["minStart"] = bed_data[seq_id][0][1]
returning[seq_id]["intervals"]["lazyClass"] = 1
returning[seq_id]["intervals"]["nclist"] = nc_list
returning[seq_id]["intervals"]["urlTemplate"] = chunk_name + \
"-{Chunk}.json"

return lazy_loader, returning


def write_json(track_data, out_name, chunk_name, strain):
for seq_id in track_data[1].keys():
json.dump(track_data[1][seq_id], open(
f"data/tracks/{strain}/{seq_id}/{out_name}", 'w'))
json.dump(track_data[0][seq_id], open(
f"data/tracks/{strain}/{seq_id}/{chunk_name}-0.json", 'w'))


def write_bed(filename, bed_data):
with open(filename+".fixed", "w") as f:
f.write(
"start\tend\tstrand\tid\tname\tnote\tphase\tseq_id\treplicon\tsource\ttype\n")
for record in bed_data:
f.write("\t".join(record)+"\n")

return


if __name__ == "__main__":
parser = argparse.ArgumentParser(
description='Process a BED file into JSON format.')
parser.add_argument('--in', "-i", type=str, required=False, default="islandData.bed",
help='the name of the BED file to be processed (default: %(default)s)')
parser.add_argument('--out', "-o", type=str, required=False, default="trackData.json",
help='the name of the JSON file to write (default: %(default)s)')
parser.add_argument('--chunk', "-c", type=str, required=False, default="bed",
help='the name of the chunk files to write to (default: %(default)s)')
parser.add_argument('--strain', "-s", type=str, required=False, default="BBQS859",
help='the strain on which we are operating (default: %(default)s)')

args = vars(parser.parse_args())
write_json(to_json_format(read_bed(
args["in"]), args["chunk"]), args["out"], args["chunk"], args["strain"])
#write_bed(args["in"], read_bed(args["in"]))
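Since bed_to_json.py is now importable, its three steps can also be driven from Python. A minimal sketch of the same pipeline the __main__ block runs, using the script's own argparse defaults and assuming the data/tracks/<strain>/<seq_id>/ directories already exist:

import bed_to_json as bed

bed_data = bed.read_bed("islandData.bed")           # {seq_id: [formatted records]}
track_data = bed.to_json_format(bed_data, "bed")    # (lazy chunk payloads, trackData payloads)
bed.write_json(track_data, "trackData.json", "bed", "BBQS859")
# writes data/tracks/BBQS859/<seq_id>/trackData.json and data/tracks/BBQS859/<seq_id>/bed-0.json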
45 changes: 45 additions & 0 deletions bbqs_master/bin/build_bed_names.py
@@ -0,0 +1,45 @@
"""
appends to a name text file with searchable names for bed generated tracks
"""
import argparse
import build_gff_names as gff
import bed_to_json as bed


def build_names(bed_data, strain):
"""
Builds name record in the form below and returns a list of all such records
in the bed data representation list passed in

[[searchable, attributes, here], strain, id/name, seq_id, start, end]
"""
names = []

for seq_id in bed_data:
for record in bed_data[seq_id]:
temp = []
temp.append([record[4],record[5],record[11].strip("\n")])
temp.append(strain)
temp.append(record[4])
temp.append(record[8])
temp.append(str(record[1]))
temp.append(str(record[2]))
names.append(temp)

return names


if __name__ == "__main__":
parser = argparse.ArgumentParser(
description='Build a names text file from a gff file.')
parser.add_argument('--in', "-i", type=str, required=False, default="features.bed",
help='the name of the BED file to be processed (default: %(default)s)')
parser.add_argument('--out', "-o", type=str, required=False, default="names.txt",
help='the name of the names file to write. NOTE: This should not be the full path and this program will prepend "data/tracks/strain/seq_id/" to the chosen name (default: %(default)s)')
parser.add_argument('--strain', "-s", type=str, required=False, default="BBQS859",
help='the strain on which we are operating (default: %(default)s)')

args = vars(parser.parse_args())
bed_data = bed.read_bed(args["in"])
names = build_names(bed_data, args["strain"])
gff.write_names(args["out"], args["strain"], names, "a")
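To make the record layout concrete, here is a hypothetical example (all values invented) of what build_names returns for a single formatted BED record, where build_formatted_record has already prepended a 0 so start/end/strand/id/name/note/phase/seq_id/replicon/source/type sit at indices 1 through 11:

import build_bed_names as names_mod

record = [0, 1200, 4800, 1, "island_3", "GI-3", "predicted island", ".",
          "chromosome", "chromosome", "islandpath", "genomic_island"]
names = names_mod.build_names({"chromosome": [record]}, "BBQS859")
# names == [[["island_3", "GI-3", "genomic_island"],   # searchable attributes
#            "BBQS859", "island_3", "chromosome", "1200", "4800"]]

gff.write_names then appends these records to the names file, which, per the --out help text, is written under data/tracks/strain/seq_id/.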