From d43a1ace10e6b94b6e8a10a12b27600dcb773bb4 Mon Sep 17 00:00:00 2001 From: Matt Ruffalo Date: Tue, 5 May 2020 14:04:22 -0400 Subject: [PATCH] Reformat code with black (after autopep8 to fix tabs in docstrings) --- bin/snaptools | 3 +- setup.py | 64 +- snaptools/__init__.py | 2 +- snaptools/add_bmat.py | 195 +++-- snaptools/add_gmat.py | 137 ++-- snaptools/add_pmat.py | 158 ++-- snaptools/alignment.py | 559 +++++++------ snaptools/call_peak.py | 137 ++-- snaptools/dex_fastq.py | 97 ++- snaptools/dump_barcode.py | 101 +-- snaptools/global_var.py | 241 +++++- snaptools/gtf.py | 36 +- snaptools/louvain.py | 36 +- snaptools/parser.py | 1606 ++++++++++++++++++++----------------- snaptools/snap.py | 687 ++++++++-------- snaptools/snap_del.py | 67 +- snaptools/snap_pre.py | 625 +++++++++------ snaptools/utilities.py | 249 +++--- test.py | 3 +- 19 files changed, 2875 insertions(+), 2128 deletions(-) diff --git a/bin/snaptools b/bin/snaptools index 9760419..9e69ff5 100755 --- a/bin/snaptools +++ b/bin/snaptools @@ -35,5 +35,4 @@ Description: snaptools - A toolkit for single nuclues ATAC-seq analysis. from snaptools.parser import parse_args if __name__ == '__main__': - parse_args() - + parse_args() diff --git a/setup.py b/setup.py index 0e6f44e..55d33d7 100755 --- a/setup.py +++ b/setup.py @@ -1,37 +1,39 @@ from setuptools import setup -snaptools_version = '1.4.8' +snaptools_version = "1.4.8" setup( - name='snaptools', - version=snaptools_version, - author='Rongxin Fang', - author_email='r4fang@gmail.com', - license='LICENSE', - packages=['snaptools'], - description='A module for working with snap files in Python', - url='https://github.com/r3fang/SnapTools.git', - python_requires='>=2.7', - - install_requires=[ - "pysam", - "h5py", - "numpy", - "pybedtools>=0.7", - "python-louvain", - "future" - ], - keywords = ["Bioinformatics pipeline", - "Single cell analysis", - "Epigenomics", - "Epigenetics", - "ATAC-seq", - "Chromatin Accessibility", - "Functional genomics"], - scripts = ["bin/snaptools"], - zip_safe=False) + name="snaptools", + version=snaptools_version, + author="Rongxin Fang", + author_email="r4fang@gmail.com", + license="LICENSE", + packages=["snaptools"], + description="A module for working with snap files in Python", + url="https://github.com/r3fang/SnapTools.git", + python_requires=">=2.7", + install_requires=[ + "pysam", + "h5py", + "numpy", + "pybedtools>=0.7", + "python-louvain", + "future", + ], + keywords=[ + "Bioinformatics pipeline", + "Single cell analysis", + "Epigenomics", + "Epigenetics", + "ATAC-seq", + "Chromatin Accessibility", + "Functional genomics", + ], + scripts=["bin/snaptools"], + zip_safe=False, +) -if __name__ == '__main__': - f = open("snaptools/__init__.py",'w') - f.write("__version__ = \'"+snaptools_version+"\'"+"\n") +if __name__ == "__main__": + f = open("snaptools/__init__.py", "w") + f.write("__version__ = '" + snaptools_version + "'" + "\n") f.close() diff --git a/snaptools/__init__.py b/snaptools/__init__.py index fedbd84..4963389 100755 --- a/snaptools/__init__.py +++ b/snaptools/__init__.py @@ -1 +1 @@ -__version__ = '1.4.8' +__version__ = "1.4.8" diff --git a/snaptools/add_bmat.py b/snaptools/add_bmat.py index 8499258..915ab68 100755 --- a/snaptools/add_bmat.py +++ b/snaptools/add_bmat.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -""" +""" The MIT License @@ -25,7 +25,8 @@ """ -import sys, os +import sys +import os import collections import gzip import operator @@ -76,106 +77,166 @@ sys.exit(1) -def snap_bmat(snap_file, - bin_size_list, - tmp_folder, - verbose): - +def snap_bmat(snap_file, bin_size_list, tmp_folder, verbose): """ Pre-processing to create a snap file from a bam that contains alignments or a bed file that contains fragments. Args: -------- - snap_file: + snap_file: a snap format file. - + Optional -------- - bin_size_list: + bin_size_list: a list object contains all bin sizes [5000] - - verbose: + + verbose: a boolen variable indicates whether to output the progress [True]; """ if not os.path.exists(snap_file): - print(('error: ' + snap_file + ' does not exist!')); - sys.exit(1); - + print(("error: " + snap_file + " does not exist!")) + sys.exit(1) + # check if snap_file is a snap-format file - file_format = snaptools.utilities.checkFileFormat(snap_file); + file_format = snaptools.utilities.checkFileFormat(snap_file) if file_format != "snap": - print(("error: input file %s is not a snap file!" % snap_file)); - sys.exit(1); - + print(("error: input file %s is not a snap file!" % snap_file)) + sys.exit(1) + # create the bin list - f = h5py.File(snap_file, "a", libver='earliest'); + f = h5py.File(snap_file, "a", libver="earliest") if "AM" in f: - print("error: AM - cell x bin accessibility matrix already exists, delete it first using snap-del ") + print( + "error: AM - cell x bin accessibility matrix already exists, delete it first using snap-del " + ) sys.exit(1) - - try: - genome_dict = dict(list(zip([item.decode() for item in f["HD"]["SQ"]["SN"][:]], f["HD"]["SQ"]["SL"][:]))) + + try: + genome_dict = dict( + list( + zip( + [item.decode() for item in f["HD"]["SQ"]["SN"][:]], + f["HD"]["SQ"]["SL"][:], + ) + ) + ) except KeyError: print("error: unable to read genome information") sys.exit(1) - + # extract the barcodes - barcode_dict = snaptools.snap.getBarcodesFromSnap(snap_file); + barcode_dict = snaptools.snap.getBarcodesFromSnap(snap_file) - bin_dict_list = collections.defaultdict(dict); + bin_dict_list = collections.defaultdict(dict) for bin_size in bin_size_list: - bin_dict = snaptools.utilities.getBinsFromGenomeSize(genome_dict, bin_size); - bin_dict_list[bin_size] = bin_dict; + bin_dict = snaptools.utilities.getBinsFromGenomeSize(genome_dict, bin_size) + bin_dict_list[bin_size] = bin_dict - num_barcode = len(barcode_dict); + num_barcode = len(barcode_dict) if verbose: - print("===== reading the barcodes and bins ======"); - print(("@AM\tnBinSize:%d"%len(list(bin_dict_list.keys())))); - print("@AM\tbinSizeList: %s" % str(list(bin_dict_list.keys()))); + print("===== reading the barcodes and bins ======") + print(("@AM\tnBinSize:%d" % len(list(bin_dict_list.keys())))) + print("@AM\tbinSizeList: %s" % str(list(bin_dict_list.keys()))) for bin_size in list(bin_dict_list.keys()): - print(("@AM\tbinSize:%d\tnBin:%d"%(bin_size, len(bin_dict_list[bin_size])))); - - idxList = collections.defaultdict(list); # barcode index list - idyList = collections.defaultdict(list); # bin index list - countList = collections.defaultdict(list); # number of count + print( + ("@AM\tbinSize:%d\tnBin:%d" % (bin_size, len(bin_dict_list[bin_size]))) + ) + + idxList = collections.defaultdict(list) # barcode index list + idyList = collections.defaultdict(list) # bin index list + countList = collections.defaultdict(list) # number of count barcode_id = 0 - for barcode in f["BD"]["name"]: - _chroms = f["FM"]["fragChrom"][(f["FM"]["barcodePos"][barcode_id] - 1):(f["FM"]["barcodePos"][barcode_id] + f["FM"]["barcodeLen"][barcode_id] - 1)]; - _chroms = [item.decode() for item in _chroms]; - _start = f["FM"]["fragStart"][(f["FM"]["barcodePos"][barcode_id] - 1):(f["FM"]["barcodePos"][barcode_id] + f["FM"]["barcodeLen"][barcode_id] - 1)] - _len = f["FM"]["fragLen"][(f["FM"]["barcodePos"][barcode_id] - 1):(f["FM"]["barcodePos"][barcode_id] + f["FM"]["barcodeLen"][barcode_id] - 1)] - frag_list_uniq = list(zip(_chroms, _start, _start + _len)); + for barcode in f["BD"]["name"]: + _chroms = f["FM"]["fragChrom"][ + (f["FM"]["barcodePos"][barcode_id] - 1) : ( + f["FM"]["barcodePos"][barcode_id] + + f["FM"]["barcodeLen"][barcode_id] + - 1 + ) + ] + _chroms = [item.decode() for item in _chroms] + _start = f["FM"]["fragStart"][ + (f["FM"]["barcodePos"][barcode_id] - 1) : ( + f["FM"]["barcodePos"][barcode_id] + + f["FM"]["barcodeLen"][barcode_id] + - 1 + ) + ] + _len = f["FM"]["fragLen"][ + (f["FM"]["barcodePos"][barcode_id] - 1) : ( + f["FM"]["barcodePos"][barcode_id] + + f["FM"]["barcodeLen"][barcode_id] + - 1 + ) + ] + frag_list_uniq = list(zip(_chroms, _start, _start + _len)) for bin_size in bin_dict_list: - bin_dict = bin_dict_list[bin_size]; - bins = collections.defaultdict(lambda : 0); + bin_dict = bin_dict_list[bin_size] + bins = collections.defaultdict(lambda: 0) for item in frag_list_uniq: - bin_chr = item[0]; - for bin_pos in set([int(item[1]/bin_size) * bin_size + 1, int(item[2]/bin_size) * bin_size + 1]): - bins[(bin_chr, bin_pos, bin_pos + bin_size - 1)] += 1; - + bin_chr = item[0] + for bin_pos in set( + [ + int(item[1] / bin_size) * bin_size + 1, + int(item[2] / bin_size) * bin_size + 1, + ] + ): + bins[(bin_chr, bin_pos, bin_pos + bin_size - 1)] += 1 + for key in bins: if key in bin_dict and barcode.decode() in barcode_dict: - idyList[bin_size].append(bin_dict[key]); - countList[bin_size].append(bins[key]); - idxList[bin_size].append(barcode_dict[barcode.decode()].id); - - barcode_id += 1; - del bin_dict, bins, frag_list_uniq; - - dt = h5py.special_dtype(vlen=bytes) - f.create_dataset("AM/nBinSize", data=len(bin_dict_list), dtype="uint32"); - f.create_dataset("AM/binSizeList", data=list(bin_dict_list.keys()), dtype="uint32"); - + idyList[bin_size].append(bin_dict[key]) + countList[bin_size].append(bins[key]) + idxList[bin_size].append(barcode_dict[barcode.decode()].id) + + barcode_id += 1 + del bin_dict, bins, frag_list_uniq + + dt = h5py.special_dtype(vlen=bytes) + f.create_dataset("AM/nBinSize", data=len(bin_dict_list), dtype="uint32") + f.create_dataset("AM/binSizeList", data=list(bin_dict_list.keys()), dtype="uint32") + for bin_size in bin_dict_list: - f.create_dataset("AM/"+str(bin_size)+"/binChrom",data=[np.string_(key[0]) for key in bin_dict_list[bin_size]], dtype=h5py.special_dtype(vlen=bytes), compression="gzip", compression_opts=9); - f.create_dataset("AM/"+str(bin_size)+"/binStart",data=[key[1] for key in bin_dict_list[bin_size]], dtype="uint32", compression="gzip", compression_opts=9); - f.create_dataset("AM/"+str(bin_size)+"/idx", data=idxList[bin_size], dtype="uint32", compression="gzip", compression_opts=9); - f.create_dataset("AM/"+str(bin_size)+"/idy", data=idyList[bin_size], dtype="uint32", compression="gzip", compression_opts=9); - f.create_dataset("AM/"+str(bin_size)+"/count", data=countList[bin_size], dtype="uint8", compression="gzip", compression_opts=9); - f.close() + f.create_dataset( + "AM/" + str(bin_size) + "/binChrom", + data=[np.string_(key[0]) for key in bin_dict_list[bin_size]], + dtype=h5py.special_dtype(vlen=bytes), + compression="gzip", + compression_opts=9, + ) + f.create_dataset( + "AM/" + str(bin_size) + "/binStart", + data=[key[1] for key in bin_dict_list[bin_size]], + dtype="uint32", + compression="gzip", + compression_opts=9, + ) + f.create_dataset( + "AM/" + str(bin_size) + "/idx", + data=idxList[bin_size], + dtype="uint32", + compression="gzip", + compression_opts=9, + ) + f.create_dataset( + "AM/" + str(bin_size) + "/idy", + data=idyList[bin_size], + dtype="uint32", + compression="gzip", + compression_opts=9, + ) + f.create_dataset( + "AM/" + str(bin_size) + "/count", + data=countList[bin_size], + dtype="uint8", + compression="gzip", + compression_opts=9, + ) + f.close() return 0 diff --git a/snaptools/add_gmat.py b/snaptools/add_gmat.py index c8c1521..cdb16c2 100755 --- a/snaptools/add_gmat.py +++ b/snaptools/add_gmat.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -""" +""" The MIT License @@ -25,7 +25,8 @@ """ -import sys, os +import sys +import os import collections import gzip import operator @@ -74,23 +75,19 @@ print("Package pybedtools not installed!") sys.exit(1) -def snap_gmat(snap_file, - gene_file, - buffer_size, - tmp_folder, - verbose): - + +def snap_gmat(snap_file, gene_file, buffer_size, tmp_folder, verbose): """ Create a cell x peak matrix from snap file. Required: -------- - snap_file: + snap_file: a snap format file; - gene_file: + gene_file: a bed format file contains gene locations; - + Optional: -------- tmp_folder: @@ -101,73 +98,89 @@ def snap_gmat(snap_file, """ if not os.path.exists(snap_file): - print(('error: ' + snap_file + ' does not exist!')) - sys.exit(1); - + print(("error: " + snap_file + " does not exist!")) + sys.exit(1) + # check if snap_file is a snap-format file - file_format = snaptools.utilities.checkFileFormat(snap_file); + file_format = snaptools.utilities.checkFileFormat(snap_file) if file_format != "snap": - print(("error: input file %s is not a snap file!" % snap_file)); - + print(("error: input file %s is not a snap file!" % snap_file)) + if not os.path.exists(gene_file): - print(('error: ' + gene_file + ' does not exist!')); - sys.exit(1); - + print(("error: " + gene_file + " does not exist!")) + sys.exit(1) + # check if GM session already exists - f = h5py.File(snap_file, "r", libver='earliest'); + f = h5py.File(snap_file, "r", libver="earliest") if "GM" in f: - print(("error: cell x gene matrix already exists, delete it first using snap-del")); - sys.exit(1); - f.close(); - + print( + ("error: cell x gene matrix already exists, delete it first using snap-del") + ) + sys.exit(1) + f.close() + # check if snap_file is a snap-format file - file_format = snaptools.utilities.checkFileFormat(gene_file); + file_format = snaptools.utilities.checkFileFormat(gene_file) if file_format != "bed": - print(("error: input file %s is not a bed file!" % snap_file)); + print(("error: input file %s is not a bed file!" % snap_file)) # check if gene file is a bed file with the last column as gene - gene_list = set([str(item.name) for item in pybedtools.BedTool(gene_file)]); - #gene_dict = collections.OrderedDict(list(zip(gene_list, range(1, len(gene_list) + 1)))); - gene_dict = collections.OrderedDict(list(zip(gene_list, list(range(1, len(gene_list) + 1))))); - + gene_list = set([str(item.name) for item in pybedtools.BedTool(gene_file)]) + # gene_dict = collections.OrderedDict(list(zip(gene_list, range(1, len(gene_list) + 1)))); + gene_dict = collections.OrderedDict( + list(zip(gene_list, list(range(1, len(gene_list) + 1)))) + ) + # extract the barcodes - barcode_dict = getBarcodesFromSnap(snap_file); - + barcode_dict = getBarcodesFromSnap(snap_file) + # first cut the fragments into small piecies, write them down - fout_frag = tempfile.NamedTemporaryFile(delete=False, dir=tmp_folder); - dump_read(snap_file, fout_frag.name, buffer_size, None, tmp_folder, True); + fout_frag = tempfile.NamedTemporaryFile(delete=False, dir=tmp_folder) + dump_read(snap_file, fout_frag.name, buffer_size, None, tmp_folder, True) # in parallel find the overlap cell and peaks - frag_bt = pybedtools.BedTool(fout_frag.name); - peak_bt = pybedtools.BedTool(gene_file); - - # count for frequency - cell_peak_arr = collections.defaultdict(list); + frag_bt = pybedtools.BedTool(fout_frag.name) + peak_bt = pybedtools.BedTool(gene_file) + + # count for frequency + cell_peak_arr = collections.defaultdict(list) for item in frag_bt.intersect(peak_bt, wa=True, wb=True): - key = str(item.fields[7]); + key = str(item.fields[7]) if key in gene_dict: - idy = gene_dict[key]; - barcode = item.name.split(":")[0]; - idx = barcode_dict[barcode].id; - cell_peak_arr[idx].append(idy); - - IDX_LIST = []; - IDY_LIST = []; - VAL_LIST = []; + idy = gene_dict[key] + barcode = item.name.split(":")[0] + idx = barcode_dict[barcode].id + cell_peak_arr[idx].append(idy) + + IDX_LIST = [] + IDY_LIST = [] + VAL_LIST = [] for barcode_id in cell_peak_arr: - d = collections.Counter(cell_peak_arr[barcode_id]); - IDX_LIST += [barcode_id] * len(d); + d = collections.Counter(cell_peak_arr[barcode_id]) + IDX_LIST += [barcode_id] * len(d) for peak_id in d: - IDY_LIST.append(peak_id); - VAL_LIST.append(d[peak_id]); - - f = h5py.File(snap_file, "a", libver='earliest'); - dt = h5py.special_dtype(vlen=bytes); - f.create_dataset("GM/name", data=[np.string_(item) for item in gene_dict.keys()], dtype=h5py.special_dtype(vlen=bytes), compression="gzip", compression_opts=9); - f.create_dataset("GM/idx", data=IDX_LIST, dtype="uint32", compression="gzip", compression_opts=9); - f.create_dataset("GM/idy", data=IDY_LIST, dtype="uint32", compression="gzip", compression_opts=9); - f.create_dataset("GM/count", data=VAL_LIST, dtype="uint8", compression="gzip", compression_opts=9); - f.close() + IDY_LIST.append(peak_id) + VAL_LIST.append(d[peak_id]) + + f = h5py.File(snap_file, "a", libver="earliest") + dt = h5py.special_dtype(vlen=bytes) + f.create_dataset( + "GM/name", + data=[np.string_(item) for item in gene_dict.keys()], + dtype=h5py.special_dtype(vlen=bytes), + compression="gzip", + compression_opts=9, + ) + f.create_dataset( + "GM/idx", data=IDX_LIST, dtype="uint32", compression="gzip", compression_opts=9 + ) + f.create_dataset( + "GM/idy", data=IDY_LIST, dtype="uint32", compression="gzip", compression_opts=9 + ) + f.create_dataset( + "GM/count", data=VAL_LIST, dtype="uint8", compression="gzip", compression_opts=9 + ) + f.close() # remove the temporary files - subprocess.check_call(["rm", fout_frag.name]); + subprocess.check_call(["rm", fout_frag.name]) return 0 diff --git a/snaptools/add_pmat.py b/snaptools/add_pmat.py index 8f01e52..b30c8ca 100755 --- a/snaptools/add_pmat.py +++ b/snaptools/add_pmat.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -""" +""" The MIT License @@ -25,7 +25,8 @@ """ -import sys, os +import sys +import os import collections import gzip import operator @@ -75,28 +76,24 @@ print("Package pybedtools not installed!") sys.exit(1) -def snap_pmat(snap_file, - peak_file, - buffer_size, - tmp_folder, - verbose): - + +def snap_pmat(snap_file, peak_file, buffer_size, tmp_folder, verbose): """ Create a cell x peak matrix from snap file. Required: -------- - snap_file: + snap_file: a snap format file; - - peak_file: + + peak_file: a bed format file contains peak regions; - + Optional: -------- buffer_size: - Number of barcodes to store in the memory - + Number of barcodes to store in the memory + tmp_folder: folder to store temporarily created files; @@ -105,78 +102,103 @@ def snap_pmat(snap_file, """ if not os.path.exists(snap_file): - print(('error: ' + snap_file + ' does not exist!')); - sys.exit(1); - + print(("error: " + snap_file + " does not exist!")) + sys.exit(1) + # check if snap_file is a snap-format file - file_format = snaptools.utilities.checkFileFormat(snap_file); + file_format = snaptools.utilities.checkFileFormat(snap_file) if file_format != "snap": - print(("error: input file %s is not a snap file!" % snap_file)); - + print(("error: input file %s is not a snap file!" % snap_file)) + if not os.path.exists(peak_file): - print(('error: ' + peak_file + ' does not exist!')); - sys.exit(1); - + print(("error: " + peak_file + " does not exist!")) + sys.exit(1) + # check if snap_file is a snap-format file - file_format = snaptools.utilities.checkFileFormat(peak_file); + file_format = snaptools.utilities.checkFileFormat(peak_file) if file_format != "bed": - print(("error: input file %s is not a bed file!" % peak_file)); + print(("error: input file %s is not a bed file!" % peak_file)) sys.exit(1) - + # check if PM session already exists - f = h5py.File(snap_file, "r", libver='earliest'); + f = h5py.File(snap_file, "r", libver="earliest") if "PM" in f: - print("error: cell x peak matrix already exists, delete it using snap-del first"); - sys.exit(1); + print( + "error: cell x peak matrix already exists, delete it using snap-del first" + ) + sys.exit(1) f.close() - + # extract the barcodes - barcode_dict = getBarcodesFromSnap(snap_file); + barcode_dict = getBarcodesFromSnap(snap_file) # create the peaks; - peak_dict = collections.OrderedDict(); - i = 1; + peak_dict = collections.OrderedDict() + i = 1 for item in pybedtools.BedTool(peak_file): - peak_dict[(str(item.chrom), item.start, item.end)] = i; - i += 1; - + peak_dict[(str(item.chrom), item.start, item.end)] = i + i += 1 + # first cut the fragments into small piecies, write them down - fout_frag = tempfile.NamedTemporaryFile(delete=False, dir=tmp_folder); - dump_read(snap_file, fout_frag.name, buffer_size, None, tmp_folder, True); + fout_frag = tempfile.NamedTemporaryFile(delete=False, dir=tmp_folder) + dump_read(snap_file, fout_frag.name, buffer_size, None, tmp_folder, True) # in parallel find the overlap cell and peaks - frag_bt = pybedtools.BedTool(fout_frag.name); - peak_bt = pybedtools.BedTool(peak_file); - - # count for frequency - cell_peak_arr = collections.defaultdict(list); + frag_bt = pybedtools.BedTool(fout_frag.name) + peak_bt = pybedtools.BedTool(peak_file) + + # count for frequency + cell_peak_arr = collections.defaultdict(list) for item in frag_bt.intersect(peak_bt, wa=True, wb=True): - key = (str(item.fields[4]), int(item.fields[5]), int(item.fields[6])); - idy = peak_dict[key]; - barcode = item.name.split(":")[0]; - idx = barcode_dict[barcode].id; - cell_peak_arr[idx].append(idy); - - IDX_LIST = []; - IDY_LIST = []; - VAL_LIST = []; + key = (str(item.fields[4]), int(item.fields[5]), int(item.fields[6])) + idy = peak_dict[key] + barcode = item.name.split(":")[0] + idx = barcode_dict[barcode].id + cell_peak_arr[idx].append(idy) + + IDX_LIST = [] + IDY_LIST = [] + VAL_LIST = [] for barcode_id in cell_peak_arr: - d = collections.Counter(cell_peak_arr[barcode_id]); - IDX_LIST += [barcode_id] * len(d); + d = collections.Counter(cell_peak_arr[barcode_id]) + IDX_LIST += [barcode_id] * len(d) for peak_id in d: - IDY_LIST.append(peak_id); - VAL_LIST.append(d[peak_id]); - - f = h5py.File(snap_file, "a", libver='earliest'); - dt = h5py.special_dtype(vlen=bytes); - f.create_dataset("PM/peakChrom", data=[np.string_(key[0]) for key in peak_dict], dtype=h5py.special_dtype(vlen=bytes), compression="gzip", compression_opts=9); - f.create_dataset("PM/peakStart", data=[key[1] for key in peak_dict], dtype="uint32", compression="gzip", compression_opts=9); - f.create_dataset("PM/peakEnd", data=[key[2] for key in peak_dict], dtype="uint32", compression="gzip", compression_opts=9); - f.create_dataset("PM/idx", data=IDX_LIST, dtype="uint32", compression="gzip", compression_opts=9); - f.create_dataset("PM/idy", data=IDY_LIST, dtype="uint32", compression="gzip", compression_opts=9); - f.create_dataset("PM/count", data=VAL_LIST, dtype="uint8", compression="gzip", compression_opts=9); - f.close() + IDY_LIST.append(peak_id) + VAL_LIST.append(d[peak_id]) + + f = h5py.File(snap_file, "a", libver="earliest") + dt = h5py.special_dtype(vlen=bytes) + f.create_dataset( + "PM/peakChrom", + data=[np.string_(key[0]) for key in peak_dict], + dtype=h5py.special_dtype(vlen=bytes), + compression="gzip", + compression_opts=9, + ) + f.create_dataset( + "PM/peakStart", + data=[key[1] for key in peak_dict], + dtype="uint32", + compression="gzip", + compression_opts=9, + ) + f.create_dataset( + "PM/peakEnd", + data=[key[2] for key in peak_dict], + dtype="uint32", + compression="gzip", + compression_opts=9, + ) + f.create_dataset( + "PM/idx", data=IDX_LIST, dtype="uint32", compression="gzip", compression_opts=9 + ) + f.create_dataset( + "PM/idy", data=IDY_LIST, dtype="uint32", compression="gzip", compression_opts=9 + ) + f.create_dataset( + "PM/count", data=VAL_LIST, dtype="uint8", compression="gzip", compression_opts=9 + ) + f.close() # remove the temporary files - subprocess.check_call(["rm", fout_frag.name]); + subprocess.check_call(["rm", fout_frag.name]) return 0 - diff --git a/snaptools/alignment.py b/snaptools/alignment.py index 0126e68..f5fa339 100755 --- a/snaptools/alignment.py +++ b/snaptools/alignment.py @@ -9,41 +9,44 @@ import bz2 import collections + def count_barcode_cov_from_fastq(fname): """ Count barcode coverage from fastq file args: ----- - fname: + fname: a fastq file, support .gz, .txt, .bz2 file output: ----- - a dictionary contains barode and its coverage - """ + a dictionary contains barode and its coverage + """ if file_type(fname) == "gz": - fin = gzip.open(fname, 'rb'); + fin = gzip.open(fname, "rb") elif file_type(fname) == "bz2": - fin = bz2.BZ2File(fname, 'r'); + fin = bz2.BZ2File(fname, "r") elif file_type(fname) == "txt": - fin = open(fname, 'r'); + fin = open(fname, "r") else: - print("error: unrecoginized fastq file format, only supports .gz, .bz2, .fastq") - sys.exit(1) - - barcode_cov = collections.defaultdict(lambda : 0) + print("error: unrecoginized fastq file format, only supports .gz, .bz2, .fastq") + sys.exit(1) + + barcode_cov = collections.defaultdict(lambda: 0) while True: cur_name = fin.readline().strip()[1:] cur_read = fin.readline().strip() cur_plus = fin.readline().strip() cur_qual = fin.readline().strip() - if cur_name == "": break + if cur_name == "": + break cur_barcode = cur_name.split(":")[0] barcode_cov[cur_barcode] += 1 - + fin.close() - return(barcode_cov); + return barcode_cov + def filter_fastq(fname, barcodes, tmp_folder): """ @@ -51,44 +54,51 @@ def filter_fastq(fname, barcodes, tmp_folder): args: ------ - fname: + fname: a fastq file, support .gz, .txt, .bz2 file barcodes: - a list contains selected barcodes - + a list contains selected barcodes + tmp_folder: folder to store temp file - + output: ------ temporary file name """ if file_type(fname) == "gz": - fin = gzip.open(fname, 'rb'); + fin = gzip.open(fname, "rb") elif file_type(fname) == "bz2": - fin = bz2.BZ2File(fname, 'r'); + fin = bz2.BZ2File(fname, "r") elif file_type(fname) == "txt": - fin = open(fname, 'r'); + fin = open(fname, "r") else: - print(("error: unrecoginized fastq " + fname + " file format, only supports .gz, .bz2, .fastq")); - sys.exit(1) - + print( + ( + "error: unrecoginized fastq " + + fname + + " file format, only supports .gz, .bz2, .fastq" + ) + ) + sys.exit(1) + if len(barcodes) == 0: - print("error: no barcode is selected") - sys.exit(1) + print("error: no barcode is selected") + sys.exit(1) else: - barcodes = set(barcodes) - + barcodes = set(barcodes) + fout = tempfile.NamedTemporaryFile(delete=False, dir=tmp_folder) - fout_name = fout.name; + fout_name = fout.name while True: cur_name = fin.readline() cur_read = fin.readline() cur_plus = fin.readline() cur_qual = fin.readline() - if cur_name == "": break + if cur_name == "": + break cur_barcode = cur_name.split(":")[0][1:] if cur_barcode in barcodes: fout.write(cur_name) @@ -97,15 +107,12 @@ def filter_fastq(fname, barcodes, tmp_folder): fout.write(cur_qual) fin.close() fout.close() - return(fout_name) - -def index_ref(input_fasta, - output_prefix, - path_to_aligner, - aligner, - num_threads): + return fout_name + + +def index_ref(input_fasta, output_prefix, path_to_aligner, aligner, num_threads): """ - Index sequences in the FASTA format + Index sequences in the FASTA format Required -------- @@ -113,25 +120,25 @@ def index_ref(input_fasta, Optional -------- - + output_prefix: prefix of output index (optional) - + path_to_aligner: directory path access to the aligner aligner: aligner name "bwa", "bowtie", "bowtie2" or "minimap2" num_threads: number of indexing threads [3]; """ - + # if the aligner path given, need to check the existance of the aligner if path_to_aligner != None: - path_to_aligner+="/" + path_to_aligner += "/" if not os.path.isdir(path_to_aligner): - print('Error: path_to_aligner is not a folder') - sys.exit(1); - if not os.path.exists(path_to_aligner+aligner): - print('Error: aligner does not exist') - sys.exit(1); + print("Error: path_to_aligner is not a folder") + sys.exit(1) + if not os.path.exists(path_to_aligner + aligner): + print("Error: aligner does not exist") + sys.exit(1) else: try: # pipe output to /dev/null for silence @@ -139,118 +146,125 @@ def index_ref(input_fasta, subprocess.Popen(aligner, stdout=null, stderr=null) null.close() except OSError as e: - print(('Error: ' + aligner + ' does not exist!')); - sys.exit(1); - path_to_aligner="" - + print(("Error: " + aligner + " does not exist!")) + sys.exit(1) + path_to_aligner = "" + if not os.path.exists(input_fasta): - print(('Error: ' + input_fasta + ' does not exist!')); - sys.exit(1); - + print(("Error: " + input_fasta + " does not exist!")) + sys.exit(1) + if output_prefix == None: output_prefix = os.path.splitext(input_fasta)[0] - + aligner = aligner.lower() if aligner not in ["bwa", "bowtie", "bowtie2", "minimap2"]: - print('Error: only support bwa, bowtie, bowtie2, minimap2') - sys.exit(1); + print("Error: only support bwa, bowtie, bowtie2, minimap2") + sys.exit(1) # minimap2 - if aligner.lower() == "minimap2": - subprocess.check_call([path_to_aligner+"minimap2", - "-t",str(num_threads), - "-d",output_prefix+".mmi", - input_fasta]) + if aligner.lower() == "minimap2": + subprocess.check_call( + [ + path_to_aligner + "minimap2", + "-t", + str(num_threads), + "-d", + output_prefix + ".mmi", + input_fasta, + ] + ) # bowtie2 if aligner.lower() == "bowtie2": - subprocess.check_call([path_to_aligner + "bowtie2-build", - "-f", input_fasta, - output_prefix]) + subprocess.check_call( + [path_to_aligner + "bowtie2-build", "-f", input_fasta, output_prefix] + ) # bowtie if aligner.lower() == "bowtie": - subprocess.check_call([path_to_aligner + "bowtie-build", - "-f", input_fasta, - output_prefix]) + subprocess.check_call( + [path_to_aligner + "bowtie-build", "-f", input_fasta, output_prefix] + ) # bwa if aligner.lower() == "bwa": - subprocess.check_call([path_to_aligner+"bwa", - "index", input_fasta]) + subprocess.check_call([path_to_aligner + "bwa", "index", input_fasta]) return 0 -def run_align_pe(input_reference, - input_fastq1, - input_fastq2, - output_bam, - aligner, - path_to_aligner, - read_fastq_command, - num_threads, - min_cov, - aligner_options, - if_sort, - tmp_folder, - overwrite, - verbose): +def run_align_pe( + input_reference, + input_fastq1, + input_fastq2, + output_bam, + aligner, + path_to_aligner, + read_fastq_command, + num_threads, + min_cov, + aligner_options, + if_sort, + tmp_folder, + overwrite, + verbose, +): """ Map paired-end single-cell ATAC-seq reads args: ----- - input_reference: + input_reference: reference genome file generated by index_reference - input_fastq1: + input_fastq1: a fastq file contains R1 reads, supports .fq, .fastq, .gz, .bz2 - input_fastq2: + input_fastq2: a fastq file contains R2 reads, supports .fq, .fastq, .gz, .bz2 - output_bam: + output_bam: a master bam file contains alignments - - path_to_aligner: + + path_to_aligner: directory path access to the aligner - aligner: + aligner: aligner name "bwa", "bowtie", "bowtie2" or "minimap2" - + aligner_options: a list of strings indicating options you'd like passed to aligner. (default for bowtie2) (default for bwa: "mem") (default for minimap2: "-ax sr") (default for bowtie: "-S -k 2 --best --strata --chunkmbs 64 -n 1") - - num_threads: + + num_threads: number of mapping threads [1]; - if_sort: + if_sort: if sort the alignment based on read name [True]; - tmp_folder: + tmp_folder: where to store the temporary files [None]; - - min_cov: - barcodes of fragments fewer than min_cov will be filtered before alingment; - read_fastq_command: + min_cov: + barcodes of fragments fewer than min_cov will be filtered before alingment; + + read_fastq_command: command to uncompress a compressed fastq file i.e. 'zcat', 'bzcat' [None]; overwrite: whether to overwrite the output file if it already exists [False]; - + verbose: - a boolen variable indicates whether to output the progress [True]; + a boolen variable indicates whether to output the progress [True]; """ # if the aligner path given, need to check the existance of the aligner if path_to_aligner != None: - path_to_aligner+="/" + path_to_aligner += "/" if not os.path.isdir(path_to_aligner): - print('Error: path_to_aligner is not a folder') - sys.exit(1); - if not os.path.exists(path_to_aligner+aligner): - print('Error: aligner does not exist') - sys.exit(1); + print("Error: path_to_aligner is not a folder") + sys.exit(1) + if not os.path.exists(path_to_aligner + aligner): + print("Error: aligner does not exist") + sys.exit(1) else: try: # pipe output to /dev/null for silence @@ -258,153 +272,172 @@ def run_align_pe(input_reference, subprocess.Popen(aligner, stdout=null, stderr=null) null.close() except OSError as e: - print(('Error: ' + aligner + ' does not exist!')); - sys.exit(1); - path_to_aligner="" - - if(tmp_folder!=None): + print(("Error: " + aligner + " does not exist!")) + sys.exit(1) + path_to_aligner = "" + + if tmp_folder != None: if not os.path.isdir(tmp_folder): - print('Error: tmp_folder is not a folder or does not exist') - sys.exit(1); - + print("Error: tmp_folder is not a folder or does not exist") + sys.exit(1) + # check the existance of input and output files if not os.path.exists(input_fastq1): - sys.exit('Error: ' + input_fastq1 + ' does not exist!'); + sys.exit("Error: " + input_fastq1 + " does not exist!") if not os.path.exists(input_fastq2): - sys.exit('Error: ' + input_fastq2 + ' does not exist!'); - - if os.path.isfile(output_bam): + sys.exit("Error: " + input_fastq2 + " does not exist!") + + if os.path.isfile(output_bam): if overwrite: - subprocess.check_call(["rm", output_bam]); + subprocess.check_call(["rm", output_bam]) else: - sys.exit("error: \'%s\' already exists, remove it first" % output_bam); + sys.exit("error: '%s' already exists, remove it first" % output_bam) if input_fastq1 == input_fastq2: - sys.exit("error: --input_fastq1 and --input_fastq2 are same file"); - + sys.exit("error: --input_fastq1 and --input_fastq2 are same file") + # check if can create the output_bam file try: with open(output_bam, "w") as outfile: - outfile.write('Hello World') - subprocess.check_call(["rm", output_bam]); + outfile.write("Hello World") + subprocess.check_call(["rm", output_bam]) except IOError: - print(("error: could not create %s, check if the folder exists." % output_bam)); + print(("error: could not create %s, check if the folder exists." % output_bam)) sys.exit(1) - + if min_cov > 0: - barcode_dict = count_barcode_cov_from_fastq(input_fastq1); - barcode_sel = set([key for key in barcode_dict if barcode_dict[key] > min_cov]); + barcode_dict = count_barcode_cov_from_fastq(input_fastq1) + barcode_sel = set([key for key in barcode_dict if barcode_dict[key] > min_cov]) if len(barcode_sel) == 0: - print("error: no barcode contains fragments more than --min-cov, lower --min-cov and try it again!") + print( + "error: no barcode contains fragments more than --min-cov, lower --min-cov and try it again!" + ) sys.exit(1) - input_fastq1 = filter_fastq(input_fastq1, barcode_sel, tmp_folder); - input_fastq2 = filter_fastq(input_fastq2, barcode_sel, tmp_folder); + input_fastq1 = filter_fastq(input_fastq1, barcode_sel, tmp_folder) + input_fastq2 = filter_fastq(input_fastq2, barcode_sel, tmp_folder) read_fastq_command = "cat" - + # check validity of aligner aligner = aligner.lower() if aligner not in ["bwa", "bowtie", "bowtie2", "minimap2"]: - sys.exit('Error: only support bwa, bowtie, bowtie2, minimap2'); - + sys.exit("Error: only support bwa, bowtie, bowtie2, minimap2") + # default aligner option if aligner_options is None: if aligner.lower() == "minimap2": - aligner_options = ["-ax","sr"] + aligner_options = ["-ax", "sr"] elif aligner.lower() == "bowtie": - aligner_options = ["-X 1000", "-S", "-k 1", "-m 1", "--best", "--strata", - "--chunkmbs 3072", "-n 1", "-e 100"] + aligner_options = [ + "-X 1000", + "-S", + "-k 1", + "-m 1", + "--best", + "--strata", + "--chunkmbs 3072", + "-n 1", + "-e 100", + ] aligner_options.append("--phred33-quals") - elif aligner.lower() == "bowtie2": # bowtie2 + elif aligner.lower() == "bowtie2": # bowtie2 aligner_options = [] aligner_options.append("--phred33-quals") - elif aligner.lower() == "bwa": # bowtie2 + elif aligner.lower() == "bwa": # bowtie2 aligner_options = ["mem"] options = aligner_options - + # update num_threads if it is given in the aligner_options if aligner in ["bowtie", "bowtie2"]: if " ".join(options).find(" -p ") == -1: - options.append("-p "+str(num_threads)) + options.append("-p " + str(num_threads)) elif aligner in ["minimap2", "bwa"]: if " ".join(options).find(" -t ") == -1: - options.append("-t "+str(num_threads)) + options.append("-t " + str(num_threads)) else: - sys.exit('Error: only support bwa, bowtie, bowtie2, minimap2'); - + sys.exit("Error: only support bwa, bowtie, bowtie2, minimap2") + # if cat_cmd is not given, automatically detect file type and choose cat_cmd if read_fastq_command == None: if file_type(input_fastq1) == "gz": read_fastq_command = "zcat" elif file_type(input_fastq1) == "bz2": read_fastq_command = "bzcat" - elif file_type(input_fastq1) == "txt": # .fq or fastq file + elif file_type(input_fastq1) == "txt": # .fq or fastq file read_fastq_command = "cat" else: - sys.exit('Error: unrecoganized fastq file, supports .fq, .fastq, .gz, .bz2 file'); + sys.exit( + "Error: unrecoganized fastq file, supports .fq, .fastq, .gz, .bz2 file" + ) - # mapping and write the alignments into a temporary file + # mapping and write the alignments into a temporary file if aligner.lower() == "minimap2": - args = [path_to_aligner+"minimap2"] + args = [path_to_aligner + "minimap2"] args.extend(options) args.append(input_reference) - args.append("<(" + read_fastq_command + " " + input_fastq1 + ")") - args.append("<(" + read_fastq_command + " " + input_fastq2 + ")") + args.append("<(" + read_fastq_command + " " + input_fastq1 + ")") + args.append("<(" + read_fastq_command + " " + input_fastq2 + ")") elif aligner.lower() == "bowtie": - args = [path_to_aligner+"bowtie"] + args = [path_to_aligner + "bowtie"] args.extend(options) args.append(input_reference) - args.append("-1 " + "<(" + read_fastq_command + " " + input_fastq1 + ")") - args.append("-2 " + "<(" + read_fastq_command + " " + input_fastq2 + ")") - elif aligner.lower() == "bowtie2": # bowtie2 - args = [path_to_aligner+"bowtie2"] + args.append("-1 " + "<(" + read_fastq_command + " " + input_fastq1 + ")") + args.append("-2 " + "<(" + read_fastq_command + " " + input_fastq2 + ")") + elif aligner.lower() == "bowtie2": # bowtie2 + args = [path_to_aligner + "bowtie2"] args.extend(options) - args.append("-x " + input_reference) - args.append("-1 " + "<(" + read_fastq_command + " " + input_fastq1 + ")") - args.append("-2 " + "<(" + read_fastq_command + " " + input_fastq2 + ")") + args.append("-x " + input_reference) + args.append("-1 " + "<(" + read_fastq_command + " " + input_fastq1 + ")") + args.append("-2 " + "<(" + read_fastq_command + " " + input_fastq2 + ")") else: - args = [path_to_aligner+"bwa"] + args = [path_to_aligner + "bwa"] args.extend(options) args.append(input_reference) - args.append("<(" + read_fastq_command + " " + input_fastq1 + ")") - args.append("<(" + read_fastq_command + " " + input_fastq2 + ")") - + args.append("<(" + read_fastq_command + " " + input_fastq1 + ")") + args.append("<(" + read_fastq_command + " " + input_fastq2 + ")") + ftmp = tempfile.NamedTemporaryFile(delete=False, dir=tmp_folder) try: - subprocess.check_call(" ".join(args), stdout=ftmp, shell=True, executable='/bin/bash'); + subprocess.check_call( + " ".join(args), stdout=ftmp, shell=True, executable="/bin/bash" + ) except subprocess.CalledProcessError as e: - sys.exit('error: fail to run alignment, check if aligner and reference genome is correct!'); - ftmp.close(); + sys.exit( + "error: fail to run alignment, check if aligner and reference genome is correct!" + ) + ftmp.close() - if(if_sort): - pysam.sort("-n", "-@", str(num_threads), "-o", output_bam, ftmp.name); + if if_sort: + pysam.sort("-n", "-@", str(num_threads), "-o", output_bam, ftmp.name) else: samfile = pysam.AlignmentFile(ftmp.name, "r") fout = pysam.AlignmentFile(output_bam, "wb", template=samfile) for read in samfile.fetch(): fout.write(read) fout.close() - samfile.close() - subprocess.check_call(["rm", ftmp.name]); - + samfile.close() + subprocess.check_call(["rm", ftmp.name]) + # remove tmp fastq file after alignment if min_cov > 0: - subprocess.check_call(["rm", input_fastq1]); - subprocess.check_call(["rm", input_fastq2]); + subprocess.check_call(["rm", input_fastq1]) + subprocess.check_call(["rm", input_fastq2]) return 0 -def run_align_se(input_reference, - input_fastq1, - output_bam, - aligner, - path_to_aligner, - read_fastq_command, - num_threads, - min_cov, - aligner_options, - if_sort, - tmp_folder, - overwrite): +def run_align_se( + input_reference, + input_fastq1, + output_bam, + aligner, + path_to_aligner, + read_fastq_command, + num_threads, + min_cov, + aligner_options, + if_sort, + tmp_folder, + overwrite, +): """ Map single-cell ATAC-seq reads in single-end mode @@ -415,7 +448,7 @@ def run_align_se(input_reference, input_fastq1: a fastq file contains R1 reads, supports .fq, .fastq, .gz, .bz2 output_bam: a bam file contains alignments - + Optional -------- path_to_aligner: directory path access to the aligner @@ -427,29 +460,29 @@ def run_align_se(input_reference, (default for bowtie: "-X 1000 -S -k 1 -m 1 --best --strata --chunkmbs 64 -n 1") (default for bwa: "mem") (default for minimap2: "-ax sr --secondary=no") - + num_threads: number of mapping threads [3]; if_sort: if sort the alignment based on read name [True]; tmp_folder: where to store the temporary files [None]; - - min_cov: - barcodes of fragments fewer than min_cov will be filtered before alingment; - + + min_cov: + barcodes of fragments fewer than min_cov will be filtered before alingment; + read_fastq_command: command to uncompress a compressed fastq file i.e. 'zcat', 'bzcat' [None]; overwrite: whether to overwrite the output file if it already exists [False]; """ # if the aligner path given, need to check the existance of the aligner if path_to_aligner != None: - path_to_aligner+="/" + path_to_aligner += "/" if not os.path.isdir(path_to_aligner): - print('Error: path_to_aligner is not a folder') - sys.exit(1); - if not os.path.exists(path_to_aligner+aligner): - print('Error: aligner does not exist') - sys.exit(1); + print("Error: path_to_aligner is not a folder") + sys.exit(1) + if not os.path.exists(path_to_aligner + aligner): + print("Error: aligner does not exist") + sys.exit(1) else: try: # pipe output to /dev/null for silence @@ -457,125 +490,141 @@ def run_align_se(input_reference, subprocess.Popen(aligner, stdout=null, stderr=null) null.close() except OSError as e: - print(('Error: ' + aligner + ' does not exist!')); - sys.exit(1); - path_to_aligner="" - - if(tmp_folder!=None): + print(("Error: " + aligner + " does not exist!")) + sys.exit(1) + path_to_aligner = "" + + if tmp_folder != None: if not os.path.isdir(tmp_folder): - print('Error: tmp_folder is not a folder or does not exist') - sys.exit(1); - + print("Error: tmp_folder is not a folder or does not exist") + sys.exit(1) + # check the existance of input and output files if not os.path.exists(input_fastq1): - sys.exit('Error: ' + input_fastq1 + ' does not exist!'); - - if os.path.isfile(output_bam): + sys.exit("Error: " + input_fastq1 + " does not exist!") + + if os.path.isfile(output_bam): if overwrite: - subprocess.check_call(["rm", output_bam]); + subprocess.check_call(["rm", output_bam]) else: - sys.exit("error: \'%s\' already exists, remove it first" % output_bam); - + sys.exit("error: '%s' already exists, remove it first" % output_bam) + # check if can create the output_bam file try: with open(output_bam, "w") as outfile: - outfile.write('') - subprocess.check_call(["rm", output_bam]); + outfile.write("") + subprocess.check_call(["rm", output_bam]) except IOError: - print(("error: could not create %s, check if the folder exists." % output_bam)); + print(("error: could not create %s, check if the folder exists." % output_bam)) sys.exit(1) - + if min_cov > 0: - barcode_dict = count_barcode_cov_from_fastq(input_fastq1); - barcode_sel = set([key for key in barcode_dict if barcode_dict[key] > min_cov]); + barcode_dict = count_barcode_cov_from_fastq(input_fastq1) + barcode_sel = set([key for key in barcode_dict if barcode_dict[key] > min_cov]) if len(barcode_sel) == 0: - print("error: no barcode contains fragments more than --min-cov, lower --min-cov and try it again!") + print( + "error: no barcode contains fragments more than --min-cov, lower --min-cov and try it again!" + ) sys.exit(1) - input_fastq1 = filter_fastq(input_fastq1, barcode_sel, tmp_folder); + input_fastq1 = filter_fastq(input_fastq1, barcode_sel, tmp_folder) read_fastq_command = "cat" - + # check validity of aligner aligner = aligner.lower() if aligner not in ["bwa", "bowtie", "bowtie2", "minimap2"]: - sys.exit('Error: only support bwa, bowtie, bowtie2, minimap2'); - + sys.exit("Error: only support bwa, bowtie, bowtie2, minimap2") + # default aligner option if aligner_options is None: if aligner.lower() == "minimap2": - aligner_options = ["-ax","sr","--secondary=no"] + aligner_options = ["-ax", "sr", "--secondary=no"] elif aligner.lower() == "bowtie": - aligner_options = ["-S", "-k 1", "-m 1", "--best", "--strata", - "--chunkmbs 3072", "-n 1", "-e 100"] + aligner_options = [ + "-S", + "-k 1", + "-m 1", + "--best", + "--strata", + "--chunkmbs 3072", + "-n 1", + "-e 100", + ] aligner_options.append("--phred33-quals") - elif aligner.lower() == "bowtie2": # bowtie2 + elif aligner.lower() == "bowtie2": # bowtie2 aligner_options = [] aligner_options.append("--phred33-quals") - elif aligner.lower() == "bwa": # bowtie2 + elif aligner.lower() == "bwa": # bowtie2 aligner_options = ["mem"] options = aligner_options - + # update num_threads if it is given in the aligner_options if aligner in ["bowtie", "bowtie2"]: if " ".join(options).find(" -p ") == -1: - options.append("-p "+str(num_threads)) + options.append("-p " + str(num_threads)) elif aligner in ["minimap2", "bwa"]: if " ".join(options).find(" -t ") == -1: - options.append("-t "+str(num_threads)) + options.append("-t " + str(num_threads)) else: - sys.exit('Error: only support bwa, bowtie, bowtie2, minimap2'); - + sys.exit("Error: only support bwa, bowtie, bowtie2, minimap2") + # if cat_cmd is not given, automatically detect file type and choose cat_cmd if read_fastq_command == None: if file_type(input_fastq1) == "gz": read_fastq_command = "zcat" elif file_type(input_fastq1) == "bz2": read_fastq_command = "bzcat" - elif file_type(input_fastq1) == "txt": # .fq or fastq file + elif file_type(input_fastq1) == "txt": # .fq or fastq file read_fastq_command = "cat" else: - sys.exit('Error: unrecoganized fastq file, supports .fq, .fastq, .gz, .bz2 file'); + sys.exit( + "Error: unrecoganized fastq file, supports .fq, .fastq, .gz, .bz2 file" + ) - # mapping and write the alignments into a temporary file + # mapping and write the alignments into a temporary file if aligner.lower() == "minimap2": - args = [path_to_aligner+"minimap2"] + args = [path_to_aligner + "minimap2"] args.extend(options) args.append(input_reference) - args.append("<(" + read_fastq_command + " " + input_fastq1 + ")") + args.append("<(" + read_fastq_command + " " + input_fastq1 + ")") elif aligner.lower() == "bowtie": - args = [path_to_aligner+"bowtie"] + args = [path_to_aligner + "bowtie"] args.extend(options) args.append(input_reference) - args.append("-1 " + "<(" + read_fastq_command + " " + input_fastq1 + ")") - elif aligner.lower() == "bowtie2": # bowtie2 - args = [path_to_aligner+"bowtie2"] + args.append("-1 " + "<(" + read_fastq_command + " " + input_fastq1 + ")") + elif aligner.lower() == "bowtie2": # bowtie2 + args = [path_to_aligner + "bowtie2"] args.extend(options) - args.append("-x " + input_reference) - args.append("-1 " + "<(" + read_fastq_command + " " + input_fastq1 + ")") + args.append("-x " + input_reference) + args.append("-1 " + "<(" + read_fastq_command + " " + input_fastq1 + ")") else: - args = [path_to_aligner+"bwa"] + args = [path_to_aligner + "bwa"] args.extend(options) args.append(input_reference) - args.append("<(" + read_fastq_command + " " + input_fastq1 + ")") - + args.append("<(" + read_fastq_command + " " + input_fastq1 + ")") + ftmp = tempfile.NamedTemporaryFile(delete=False, dir=tmp_folder) try: - subprocess.check_call(" ".join(args), stdout=ftmp, shell=True, executable='/bin/bash'); + subprocess.check_call( + " ".join(args), stdout=ftmp, shell=True, executable="/bin/bash" + ) except subprocess.CalledProcessError as e: - sys.exit('Error: failed to run alignment, check if aligner and reference genome is correct!'); - ftmp.close(); + sys.exit( + "Error: failed to run alignment, check if aligner and reference genome is correct!" + ) + ftmp.close() - if(if_sort): - pysam.sort("-n", "-@", str(num_threads), "-o", output_bam, ftmp.name); + if if_sort: + pysam.sort("-n", "-@", str(num_threads), "-o", output_bam, ftmp.name) else: samfile = pysam.AlignmentFile(ftmp.name, "r") fout = pysam.AlignmentFile(output_bam, "wb", template=samfile) for read in samfile.fetch(): fout.write(read) fout.close() - samfile.close() - subprocess.check_call(["rm", ftmp.name]); - + samfile.close() + subprocess.check_call(["rm", ftmp.name]) + # remove tmp fastq file after alignment if min_cov > 0: - subprocess.check_call(["rm", input_fastq1]); + subprocess.check_call(["rm", input_fastq1]) return 0 diff --git a/snaptools/call_peak.py b/snaptools/call_peak.py index b768d1c..bd750b3 100755 --- a/snaptools/call_peak.py +++ b/snaptools/call_peak.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -""" +""" The MIT License @@ -25,7 +25,8 @@ """ -import sys, os +import sys +import os import collections import gzip import operator @@ -75,37 +76,39 @@ print("Package pybedtools not installed!") sys.exit(1) -def call_peak(snap_file, - output_prefix, - barcode_file, - gsize, - path_to_macs, - buffer_size, - macs_options, - tmp_folder): - + +def call_peak( + snap_file, + output_prefix, + barcode_file, + gsize, + path_to_macs, + buffer_size, + macs_options, + tmp_folder, +): """ Call peaks using selected barcodes. Required: -------- - snap_file: + snap_file: a snap format file; - barcode_file: + barcode_file: a txt file contains selected barcodes as the first column; - output_prefix: + output_prefix: experiment name, which will be used to generate output; - path_to_macs: + path_to_macs: a path to the folder contains excutable file macs2; Optional: -------- buffer_size: max number of barcodes to be stored in the memory - + macs_options: a list of strings indicating options you'd like passed to aligner. (default: "--nomodel --qval 1e-2 -B --SPMR --call-summits --keep-dup all"); @@ -113,16 +116,16 @@ def call_peak(snap_file, tmp_folder: folder to store intermedia files; """ - + # if the path_to_macs path given, need to check the existance of MACS1 if path_to_macs != None: - path_to_macs+="/" + path_to_macs += "/" if not os.path.isdir(path_to_macs): - print(('Error: ' + path_to_macs + ' is not a folder')); - sys.exit(1); - if not os.path.exists(path_to_macs+"macs2"): - print('Error: macs2 does not exist') - sys.exit(1); + print(("Error: " + path_to_macs + " is not a folder")) + sys.exit(1) + if not os.path.exists(path_to_macs + "macs2"): + print("Error: macs2 does not exist") + sys.exit(1) else: try: # pipe output to /dev/null for silence @@ -130,69 +133,79 @@ def call_peak(snap_file, subprocess.Popen(macs2, stdout=null, stderr=null) null.close() except OSError as e: - print('Error: macs2 does not exist!'); - sys.exit(1); - path_to_macs="" + print("Error: macs2 does not exist!") + sys.exit(1) + path_to_macs = "" # check temp folder - if(tmp_folder!=None): + if tmp_folder != None: if not os.path.isdir(tmp_folder): print("Error: 'tmp_folder' is not a folder or does not exist") - sys.exit(1); - + sys.exit(1) + # check wheather snap file exists if not os.path.exists(snap_file): - print(('error: ' + snap_file + ' does not exist!')); - sys.exit(1); + print(("error: " + snap_file + " does not exist!")) + sys.exit(1) # check if snap_file is a snap-format file - file_format = snaptools.utilities.checkFileFormat(snap_file); + file_format = snaptools.utilities.checkFileFormat(snap_file) if file_format != "snap": - print(("Error: input file %s is not a snap file!" % snap_file)); + print(("Error: input file %s is not a snap file!" % snap_file)) if not os.path.exists(barcode_file): - print(('error: ' + barcode_file + ' does not exist!')); - sys.exit(1); - + print(("error: " + barcode_file + " does not exist!")) + sys.exit(1) + # default aligner option if macs_options is None: - macs_options = ["--nomodel", "--qval 1e-2", "-B", "--SPMR", "--call-summits", "--keep-dup all"]; + macs_options = [ + "--nomodel", + "--qval 1e-2", + "-B", + "--SPMR", + "--call-summits", + "--keep-dup all", + ] + + # read barcode from barcode file + barcode_sel = snaptools.snap.getBarcodesFromTxt(barcode_file) + if len(barcode_sel) == 0: + print(("Error: input file %s has zero barcodes!" % barcode_file)) - # read barcode from barcode file - barcode_sel = snaptools.snap.getBarcodesFromTxt(barcode_file); - if(len(barcode_sel) == 0): - print(("Error: input file %s has zero barcodes!" % barcode_file)); - # extract the barcodes - barcode_dict = snaptools.snap.getBarcodesFromSnapSimple(snap_file); + barcode_dict = snaptools.snap.getBarcodesFromSnapSimple(snap_file) # compare selected barcodes and total barcodes in snap file if len(list(set(barcode_sel.keys()) & set(barcode_dict.keys()))) == 0: - print('Error: selected barcodes does not exist in the snap file!') - sys.exit(1); - + print("Error: selected barcodes does not exist in the snap file!") + sys.exit(1) + # first cut the fragments into small piecies, write them down - fout_frag = tempfile.NamedTemporaryFile(delete=False, dir=tmp_folder); - snaptools.snap.dump_read(snap_file, fout_frag.name, buffer_size, barcode_file, tmp_folder, True); + fout_frag = tempfile.NamedTemporaryFile(delete=False, dir=tmp_folder) + snaptools.snap.dump_read( + snap_file, fout_frag.name, buffer_size, barcode_file, tmp_folder, True + ) # call peaks using macs - args = [path_to_macs+"macs2"]; - args.append("callpeak"); - args.append("-f AUTO"); - args.append("-t " + fout_frag.name); - args.append("-n " + output_prefix); - args.append("-g " + gsize); - args.extend(macs_options); + args = [path_to_macs + "macs2"] + args.append("callpeak") + args.append("-f AUTO") + args.append("-t " + fout_frag.name) + args.append("-n " + output_prefix) + args.append("-g " + gsize) + args.extend(macs_options) ftmp = tempfile.NamedTemporaryFile(delete=False, dir=tmp_folder) try: - subprocess.check_call(" ".join(args), stdout=ftmp, shell=True, executable='/bin/bash'); + subprocess.check_call( + " ".join(args), stdout=ftmp, shell=True, executable="/bin/bash" + ) except subprocess.CalledProcessError as e: - sys.exit('error: fail to run macs2!'); - ftmp.close(); - + sys.exit("error: fail to run macs2!") + ftmp.close() + # remove the temporary files - subprocess.check_call(["rm", fout_frag.name]); - subprocess.check_call(["rm", ftmp.name]); + subprocess.check_call(["rm", fout_frag.name]) + subprocess.check_call(["rm", ftmp.name]) return 0 - diff --git a/snaptools/dex_fastq.py b/snaptools/dex_fastq.py index 3a65563..5f9ff32 100644 --- a/snaptools/dex_fastq.py +++ b/snaptools/dex_fastq.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -""" +""" The MIT License @@ -26,87 +26,84 @@ """ import os.path -import sys -import gzip +import sys +import gzip import collections from snaptools.utilities import file_type import bz2 -def dex_fastq(input_fastq, - output_fastq, - index_fastq_list - ): - + +def dex_fastq(input_fastq, output_fastq, index_fastq_list): """ De-multiplex fastq files by adding barcode to the beginning of each read name. Required: -------- - input_fastq: + input_fastq: a fastq format file that contains the sequencing reads; - output_fastq: + output_fastq: a fastq file contains output fastq file; - index_fastq_list: + index_fastq_list: a list of fastq files that contains the barcode """ # check wheather snap file exists if not os.path.exists(input_fastq): - print(('error: ' + input_fastq + ' does not exist!')); - sys.exit(1); + print(("error: " + input_fastq + " does not exist!")) + sys.exit(1) if os.path.exists(output_fastq): - print(('error: ' + output_fastq + ' already exists, remove it first!')); - sys.exit(1); - + print(("error: " + output_fastq + " already exists, remove it first!")) + sys.exit(1) + for index_fastq in index_fastq_list: if not os.path.exists(index_fastq): - print(('error: ' + index_fastq + ' does not exist!')); - sys.exit(1); - + print(("error: " + index_fastq + " does not exist!")) + sys.exit(1) + if file_type(input_fastq) == "gz": - fr1 = gzip.open(input_fastq, 'rb') + fr1 = gzip.open(input_fastq, "rb") elif file_type(input_fastq) == "bz2": - fr1 = bz2.BZ2File(input_fastq, 'r') + fr1 = bz2.BZ2File(input_fastq, "r") elif file_type(input_fastq) == "txt": - fr1 = open(input_fastq, 'r') - - index_files = [] + fr1 = open(input_fastq, "r") + + index_files = [] for index_fastq in index_fastq_list: if file_type(index_fastq) == "gz": - fix = gzip.open(index_fastq, 'rb') + fix = gzip.open(index_fastq, "rb") elif file_type(index_fastq) == "bz2": - fix = bz2.BZ2File(index_fastq, 'r') + fix = bz2.BZ2File(index_fastq, "r") elif file_type(index_fastq) == "txt": - fix = open(index_fastq, 'r') + fix = open(index_fastq, "r") index_files.append(fix) - if output_fastq.endswith("gz"): - fout = gzip.open(output_fastq, 'wb') + fout = gzip.open(output_fastq, "wb") elif output_fastq.endswith("bz2"): - fout = bz2.BZ2File(output_fastq, 'w') + fout = bz2.BZ2File(output_fastq, "w") else: - fout = open(output_fastq, 'w') - + fout = open(output_fastq, "w") + while True: cur_r1_name = fr1.readline().strip()[1:] if type(cur_r1_name) is bytes: - cur_r1_name = cur_r1_name.decode(); - if cur_r1_name == "": break + cur_r1_name = cur_r1_name.decode() + if cur_r1_name == "": + break cur_r1_read = fr1.readline().strip() cur_r1_plus = fr1.readline().strip() cur_r1_qual = fr1.readline().strip() if type(cur_r1_read) is bytes: - cur_r1_read = cur_r1_read.decode(); + cur_r1_read = cur_r1_read.decode() if type(cur_r1_qual) is bytes: - cur_r1_qual = cur_r1_qual.decode(); + cur_r1_qual = cur_r1_qual.decode() if type(cur_r1_plus) is bytes: - cur_r1_plus = cur_r1_plus.decode(); - + cur_r1_plus = cur_r1_plus.decode() + cur_idex_list = [] for fix in index_files: cur_name = fix.readline().strip()[1:] @@ -114,24 +111,24 @@ def dex_fastq(input_fastq, cur_plus = fix.readline().strip() cur_qual = fix.readline().strip() if type(cur_name) is bytes: - cur_name = cur_name.decode(); + cur_name = cur_name.decode() if type(cur_read) is bytes: - cur_read = cur_read.decode(); + cur_read = cur_read.decode() if type(cur_plus) is bytes: - cur_plus = cur_plus.decode(); + cur_plus = cur_plus.decode() if type(cur_qual) is bytes: - cur_qual = cur_qual.decode(); + cur_qual = cur_qual.decode() cur_idex_list.append(cur_read) - + cur_barcode = "".join(cur_idex_list) - - if not (cur_name.split()[0] == cur_r1_name.split()[0]): sys.exit("read name does not match") - fout.write(('@' + cur_barcode + ':' + cur_r1_name+"\n").encode('utf-8')) - fout.write((cur_r1_read+"\n").encode('utf-8')) - fout.write(("+\n").encode('utf-8')) - fout.write((cur_r1_qual+"\n").encode('utf-8')) + + if not (cur_name.split()[0] == cur_r1_name.split()[0]): + sys.exit("read name does not match") + fout.write(("@" + cur_barcode + ":" + cur_r1_name + "\n").encode("utf-8")) + fout.write((cur_r1_read + "\n").encode("utf-8")) + fout.write(("+\n").encode("utf-8")) + fout.write((cur_r1_qual + "\n").encode("utf-8")) fout.close() fr1.close() for fix in index_files: fix.close() - diff --git a/snaptools/dump_barcode.py b/snaptools/dump_barcode.py index da516bc..a7d7ba2 100755 --- a/snaptools/dump_barcode.py +++ b/snaptools/dump_barcode.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -""" +""" The MIT License @@ -25,7 +25,8 @@ """ -import sys, os +import sys +import os import collections import gzip import operator @@ -75,84 +76,86 @@ sys.exit(1) -def dump_barcode(snap_file, - output_file, - barcode_file, - tmp_folder, - overwrite): - +def dump_barcode(snap_file, output_file, barcode_file, tmp_folder, overwrite): """ Dump reads from snap file into a bed file (of selected barcodes) Required Args: -------- - snap_file: + snap_file: a snap format file. - output_file: + output_file: a txt format file contains attributes of selected barcodes. - + Optional Args: -------- - + barcode_file: a txt file contains selected barcodes. - + overwrite: a boolen variable indicates whether to overwrite a file if it already exists """ # check if input snap file exists if not os.path.exists(snap_file): - print(('error: %s does not exist!' % snap_file)); - sys.exit(1); - + print(("error: %s does not exist!" % snap_file)) + sys.exit(1) + # check if snap_file is a snap-format file - file_format = snaptools.utilities.checkFileFormat(snap_file); + file_format = snaptools.utilities.checkFileFormat(snap_file) if file_format != "snap": - print(("error: input file %s is not a snap file!" % snap_file)); - + print(("error: input file %s is not a snap file!" % snap_file)) + # check the output bed file exists if os.path.exists(output_file): if overwrite == True: - subprocess.check_call(["rm", output_file]); + subprocess.check_call(["rm", output_file]) else: - print(('error: %s already exists, change --overwrite or remove it first!' % output_file)); - sys.exit(1); - + print( + ( + "error: %s already exists, change --overwrite or remove it first!" + % output_file + ) + ) + sys.exit(1) + # check if BD session exists - f = h5py.File(snap_file, "r", libver='earliest'); + f = h5py.File(snap_file, "r", libver="earliest") if "BD" not in f: - print("error: BD session does not exit in the snap file!"); - sys.exit(1); - f.close(); - - barcode_dict = getBarcodesFromSnap(snap_file); - + print("error: BD session does not exit in the snap file!") + sys.exit(1) + f.close() + + barcode_dict = getBarcodesFromSnap(snap_file) + # identify the barcodes if barcode_file is None: - barcode_list = list(barcode_dict.keys()); + barcode_list = list(barcode_dict.keys()) else: - barcode_list = list(getBarcodesFromTxt(barcode_file).keys()); - + barcode_list = list(getBarcodesFromTxt(barcode_file).keys()) + # write fragments down - fout = open(output_file, "w"); - res = ["Barcode", "TN", "UM", "SE", "SA", "PE", "PP", "PL", "US", "UQ", "CM", "\n"]; - fout.write("\t".join(map(str, res))) + fout = open(output_file, "w") + res = ["Barcode", "TN", "UM", "SE", "SA", "PE", "PP", "PL", "US", "UQ", "CM", "\n"] + fout.write("\t".join(map(str, res))) for barcode in barcode_list: if barcode in barcode_dict: - res = [barcode, barcode_dict[barcode].total, \ - barcode_dict[barcode].mapped, \ - barcode_dict[barcode].single, \ - barcode_dict[barcode].secondary, \ - barcode_dict[barcode].proper_paired, \ - barcode_dict[barcode].proper_flen, \ - barcode_dict[barcode].usable, \ - barcode_dict[barcode].uniq, \ - barcode_dict[barcode].chrM, \ - "\n"] + res = [ + barcode, + barcode_dict[barcode].total, + barcode_dict[barcode].mapped, + barcode_dict[barcode].single, + barcode_dict[barcode].secondary, + barcode_dict[barcode].proper_paired, + barcode_dict[barcode].proper_flen, + barcode_dict[barcode].usable, + barcode_dict[barcode].uniq, + barcode_dict[barcode].chrM, + "\n", + ] else: - res = [barcode] + ["0"] * 9 - fout.write("\t".join(map(str, res))) + res = [barcode] + ["0"] * 9 + fout.write("\t".join(map(str, res))) return 0 - diff --git a/snaptools/global_var.py b/snaptools/global_var.py index a53fbcb..ac9a645 100755 --- a/snaptools/global_var.py +++ b/snaptools/global_var.py @@ -1,39 +1,206 @@ -MIN_BIN_SIZE=1000 -MIN_FRAGMENT_LEN=0 -MAX_FRAGMENT_LEN=2000 -MIN_MAPQ=0 -MAGIC_STRING="SNAP" +MIN_BIN_SIZE = 1000 +MIN_FRAGMENT_LEN = 0 +MAX_FRAGMENT_LEN = 2000 +MIN_MAPQ = 0 +MAGIC_STRING = "SNAP" # this is a predfined genome list -GENOMELIST=["hg38", "hg19", "hg18", "hg17", "hg16", "mm10", "mm9", "mm8", "mm7", - "vicPac2", "vicPac1", "xenLae2", "allMis1", "dasNov3", "gadMor1", - "papAnu2", "papHam1", "bisBis1", "panPan2", "panPan1", "aptMan1", - "melUnd1", "otoGar3", "felCat8", "felCat5", "felCat4", "felCat3", - "felCat3", "galGal5", "galGal4", "galGal3", "galGal2", "panTro5", - "panTro4", "panTro3", "panTro2", "panTro1", "criGri1", "criGriChoV1", - "manPen1", "latCha1", "bosTau8", "bosTau7", "bosTau6", "bosTau4", - "bosTau3", "bosTau2", "macFas5", "canFam3", "canFam2", "canFam1", - "turTru2", "loxAfr3", "loxAfr1", "calMil1", "musFur1", "fr3", - "fr2", "fr1", "nomLeu3", "nomLeu2", "nomLeu1", "aquChr2", "rhiRox1", - "gorGor5", "gorGor4", "gorGor3", "chlSab2", "cavPor3", "eriEur2", - "eriEur1", "equCab2", "equCab1", "dipOrd1", "petMar3", "petMar2", - "petMar1", "braFlo1", "anoCar2", "anoCar1", "galVar1", "triMan1", - "calJac3", "calJac1", "oryLat2", "geoFor1", "pteVam1", "myoLuc2", - "balAcu1", "micMur2", "micMur1", "hetGla2", "hetGla1", "oreNil2", - "monDom5", "monDom4", "monDom1", "ponAbe2", "chrPic1", "ailMel1", - "susScr11", "susScr3", "susScr2", "ochPri3", "ochPri2", "ornAna2", - "ornAna1", "nasLar1", "oryCun2", "rn6", "rn5", "rn4", "rn3", - "rheMac8", "rheMac3", "rheMac2", "proCap1", "oviAri3", "oviAri1", - "sorAra2", "sorAra1", "choHof1", "speTri2", "saiBol1", "gasAcu1", - "tarSyr2", "tarSyr1", "sarHar1", "echTel2", "echTel1", "tetNig2", - "tetNig1", "nanPar1", "tupBel1", "melGal5", "melGal1", "macEug2", - "cerSim1", "xenTro9", "xenTro7", "xenTro3", "xenTro2", "xenTro1", - "taeGut2", "taeGut1", "danRer11", "danRer10", "danRer7", "danRer7", - "danRer6", "danRer5", "danRer4", "danRer3", "ci3", "ci2", "ci1", - "strPur2", "strPur1", "anoGam1", "apiMel3", "apiMel2", "apiMel1", - "droAna2", "droAna1", "droEre1", "droGri1", "dm6", "dm3", "dm2", - "dm1", "droMoj2", "droMoj1", "droPer1", "dp3", "dp2", "droSec1", "droSim1", - "droVir2", "droVir1", "droYak2", "droYak1", "caePb2", "caePb1", - "cb3", "ce10", "ce6", "ce4", "ce2", "caeJap1", "caeRem3", "caeRem2", - "priPac1", "sacCer3", "sacCer2", "sacCer1", "aplCal1", "eboVir3"] - +GENOMELIST = [ + "hg38", + "hg19", + "hg18", + "hg17", + "hg16", + "mm10", + "mm9", + "mm8", + "mm7", + "vicPac2", + "vicPac1", + "xenLae2", + "allMis1", + "dasNov3", + "gadMor1", + "papAnu2", + "papHam1", + "bisBis1", + "panPan2", + "panPan1", + "aptMan1", + "melUnd1", + "otoGar3", + "felCat8", + "felCat5", + "felCat4", + "felCat3", + "felCat3", + "galGal5", + "galGal4", + "galGal3", + "galGal2", + "panTro5", + "panTro4", + "panTro3", + "panTro2", + "panTro1", + "criGri1", + "criGriChoV1", + "manPen1", + "latCha1", + "bosTau8", + "bosTau7", + "bosTau6", + "bosTau4", + "bosTau3", + "bosTau2", + "macFas5", + "canFam3", + "canFam2", + "canFam1", + "turTru2", + "loxAfr3", + "loxAfr1", + "calMil1", + "musFur1", + "fr3", + "fr2", + "fr1", + "nomLeu3", + "nomLeu2", + "nomLeu1", + "aquChr2", + "rhiRox1", + "gorGor5", + "gorGor4", + "gorGor3", + "chlSab2", + "cavPor3", + "eriEur2", + "eriEur1", + "equCab2", + "equCab1", + "dipOrd1", + "petMar3", + "petMar2", + "petMar1", + "braFlo1", + "anoCar2", + "anoCar1", + "galVar1", + "triMan1", + "calJac3", + "calJac1", + "oryLat2", + "geoFor1", + "pteVam1", + "myoLuc2", + "balAcu1", + "micMur2", + "micMur1", + "hetGla2", + "hetGla1", + "oreNil2", + "monDom5", + "monDom4", + "monDom1", + "ponAbe2", + "chrPic1", + "ailMel1", + "susScr11", + "susScr3", + "susScr2", + "ochPri3", + "ochPri2", + "ornAna2", + "ornAna1", + "nasLar1", + "oryCun2", + "rn6", + "rn5", + "rn4", + "rn3", + "rheMac8", + "rheMac3", + "rheMac2", + "proCap1", + "oviAri3", + "oviAri1", + "sorAra2", + "sorAra1", + "choHof1", + "speTri2", + "saiBol1", + "gasAcu1", + "tarSyr2", + "tarSyr1", + "sarHar1", + "echTel2", + "echTel1", + "tetNig2", + "tetNig1", + "nanPar1", + "tupBel1", + "melGal5", + "melGal1", + "macEug2", + "cerSim1", + "xenTro9", + "xenTro7", + "xenTro3", + "xenTro2", + "xenTro1", + "taeGut2", + "taeGut1", + "danRer11", + "danRer10", + "danRer7", + "danRer7", + "danRer6", + "danRer5", + "danRer4", + "danRer3", + "ci3", + "ci2", + "ci1", + "strPur2", + "strPur1", + "anoGam1", + "apiMel3", + "apiMel2", + "apiMel1", + "droAna2", + "droAna1", + "droEre1", + "droGri1", + "dm6", + "dm3", + "dm2", + "dm1", + "droMoj2", + "droMoj1", + "droPer1", + "dp3", + "dp2", + "droSec1", + "droSim1", + "droVir2", + "droVir1", + "droYak2", + "droYak1", + "caePb2", + "caePb1", + "cb3", + "ce10", + "ce6", + "ce4", + "ce2", + "caeJap1", + "caeRem3", + "caeRem2", + "priPac1", + "sacCer3", + "sacCer2", + "sacCer1", + "aplCal1", + "eboVir3", +] diff --git a/snaptools/gtf.py b/snaptools/gtf.py index ebf8028..0d2d5f0 100755 --- a/snaptools/gtf.py +++ b/snaptools/gtf.py @@ -2,11 +2,20 @@ import gzip import re -GTF_HEADER = ['seqname', 'source', 'feature', 'start', 'end', 'score', - 'strand', 'frame'] -R_SEMICOLON = re.compile(r'\s*;\s*') -R_COMMA = re.compile(r'\s*,\s*') -R_KEYVALUE = re.compile(r'(\s+|\s*=\s*)') +GTF_HEADER = [ + "seqname", + "source", + "feature", + "start", + "end", + "score", + "strand", + "frame", +] +R_SEMICOLON = re.compile(r"\s*;\s*") +R_COMMA = re.compile(r"\s*,\s*") +R_KEYVALUE = re.compile(r"(\s+|\s*=\s*)") + def readGTF(filename): """Open an optionally gzipped GTF file and return a pandas.DataFrame. @@ -27,14 +36,15 @@ def readGTF(filename): return result + def lines(filename): """Open an optionally gzipped GTF file and generate a dict for each line. """ - fn_open = gzip.open if filename.endswith('.gz') else open + fn_open = gzip.open if filename.endswith(".gz") else open with fn_open(filename) as fh: for line in fh: - if line.startswith('#'): + if line.startswith("#"): continue else: yield parse(line) @@ -45,7 +55,7 @@ def parse(line): """ result = {} - fields = line.rstrip().split('\t') + fields = line.rstrip().split("\t") for i, col in enumerate(GTF_HEADER): result[col] = _get_value(fields[i]) @@ -59,7 +69,7 @@ def parse(line): key, _, value = re.split(R_KEYVALUE, info, 1) # But sometimes it is just "value". except ValueError: - key = 'INFO{}'.format(i) + key = "INFO{}".format(i) value = info # Ignore the field if there is no value. if value: @@ -73,13 +83,13 @@ def _get_value(value): return None # Strip double and single quotes. - value = value.strip('"\'') + value = value.strip("\"'") # Return a list if the value has a comma. - if ',' in value: + if "," in value: value = re.split(R_COMMA, value) # These values are equivalent to None. - elif value in ['', '.', 'NA']: + elif value in ["", ".", "NA"]: return None - return value \ No newline at end of file + return value diff --git a/snaptools/louvain.py b/snaptools/louvain.py index a8cb778..0ee2149 100644 --- a/snaptools/louvain.py +++ b/snaptools/louvain.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -""" +""" The MIT License @@ -31,27 +31,23 @@ import os import sys -def louvain( - edge_file, - output_file, - resolution - ): - - ################################################################################################ + +def louvain(edge_file, output_file, resolution): + + ################################################################################################ if not os.path.exists(edge_file): - print(('Error: ' + edge_file + ' does not exist!')); - sys.exit(1); - - ################################################################################################ - G = nx.Graph(); + print(("Error: " + edge_file + " does not exist!")) + sys.exit(1) + + ################################################################################################ + G = nx.Graph() with open(edge_file) as fin: for line in fin: - elems = line.split(); - G.add_edge(elems[0],elems[1], weight=float(elems[2])) - - partition = community.best_partition(G, resolution = resolution) - + elems = line.split() + G.add_edge(elems[0], elems[1], weight=float(elems[2])) + + partition = community.best_partition(G, resolution=resolution) + with open(output_file, "w") as fout: for node in list(partition.keys()): - fout.write(str(node) + " " + str(partition[node] + 1) + "\n") - + fout.write(str(node) + " " + str(partition[node] + 1) + "\n") diff --git a/snaptools/parser.py b/snaptools/parser.py index 3779ab3..ff62c97 100755 --- a/snaptools/parser.py +++ b/snaptools/parser.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -""" +""" The MIT License @@ -30,823 +30,999 @@ import snaptools from snaptools.utilities import str2bool + def parse_args(): # create the top-level parser parser = argparse.ArgumentParser( - formatter_class=argparse.RawDescriptionHelpFormatter, - description = "Program: snaptools (A module for working with snap files in Python)\n" - + "Version: " + snaptools.__version__ + "\n" - + "Contact: Rongxin Fang" + "\n" - + "E-mail: r4fang@gmail.com" + formatter_class=argparse.RawDescriptionHelpFormatter, + description="Program: snaptools (A module for working with snap files in Python)\n" + + "Version: " + + snaptools.__version__ + + "\n" + + "Contact: Rongxin Fang" + + "\n" + + "E-mail: r4fang@gmail.com", ) # create the sub-level parser - subparsers = parser.add_subparsers( - title="functions", - dest="command", - metavar="") + subparsers = parser.add_subparsers(title="functions", dest="command", metavar="") add_fastq_dex_subparser(subparsers) - add_index_ref_subparser(subparsers); - add_align_pe_subparser(subparsers); - add_align_se_subparser(subparsers); - add_snap_pre_subparser(subparsers); - add_snap_bmat_subparser(subparsers); - add_snap_pmat_subparser(subparsers); - add_snap_gmat_subparser(subparsers); - add_dump_read_subparser(subparsers); - add_dump_barcode_subparser(subparsers); - add_call_peak_subparser(subparsers); - add_louvain_subparser(subparsers); - add_snap_del_subparser(subparsers); - + add_index_ref_subparser(subparsers) + add_align_pe_subparser(subparsers) + add_align_se_subparser(subparsers) + add_snap_pre_subparser(subparsers) + add_snap_bmat_subparser(subparsers) + add_snap_pmat_subparser(subparsers) + add_snap_gmat_subparser(subparsers) + add_dump_read_subparser(subparsers) + add_dump_barcode_subparser(subparsers) + add_call_peak_subparser(subparsers) + add_louvain_subparser(subparsers) + add_snap_del_subparser(subparsers) + if len(sys.argv) > 1: - ## print out version - if (sys.argv[1] == '--version' or sys.argv[1] == '-v'): - print((snaptools.__version__)); - exit() - ## all functions - args = parser.parse_args() + # print out version + if sys.argv[1] == "--version" or sys.argv[1] == "-v": + print((snaptools.__version__)) + exit() + # all functions + args = parser.parse_args() else: - args = parser.parse_args(["-h"]) - exit() + args = parser.parse_args(["-h"]) + exit() if args.command == "dex-fastq": - from snaptools.dex_fastq import dex_fastq - dex_fastq(input_fastq=args.input_fastq, - output_fastq=args.output_fastq, - index_fastq_list=args.index_fastq_list - ) + from snaptools.dex_fastq import dex_fastq + + dex_fastq( + input_fastq=args.input_fastq, + output_fastq=args.output_fastq, + index_fastq_list=args.index_fastq_list, + ) if args.command == "index-genome": - from snaptools.alignment import index_ref - index_ref(input_fasta=args.input_fasta, - output_prefix=args.output_prefix, - aligner=args.aligner, - path_to_aligner=args.path_to_aligner, - num_threads=args.num_threads) - + from snaptools.alignment import index_ref + + index_ref( + input_fasta=args.input_fasta, + output_prefix=args.output_prefix, + aligner=args.aligner, + path_to_aligner=args.path_to_aligner, + num_threads=args.num_threads, + ) + if args.command == "align-paired-end": - from snaptools.alignment import run_align_pe - run_align_pe(input_reference=args.input_reference, - input_fastq1=args.input_fastq1, - input_fastq2=args.input_fastq2, - output_bam=args.output_bam, - aligner=args.aligner, - path_to_aligner=args.path_to_aligner, - num_threads=args.num_threads, - aligner_options=args.aligner_options, - if_sort=args.if_sort, - min_cov=args.min_cov, - read_fastq_command=args.read_fastq_command, - tmp_folder=args.tmp_folder, - overwrite=args.overwrite, - verbose=args.verbose) - + from snaptools.alignment import run_align_pe + + run_align_pe( + input_reference=args.input_reference, + input_fastq1=args.input_fastq1, + input_fastq2=args.input_fastq2, + output_bam=args.output_bam, + aligner=args.aligner, + path_to_aligner=args.path_to_aligner, + num_threads=args.num_threads, + aligner_options=args.aligner_options, + if_sort=args.if_sort, + min_cov=args.min_cov, + read_fastq_command=args.read_fastq_command, + tmp_folder=args.tmp_folder, + overwrite=args.overwrite, + verbose=args.verbose, + ) + if args.command == "align-single-end": - from snaptools.alignment import run_align_se - run_align_se(input_reference=args.input_reference, - input_fastq1=args.input_fastq1, - output_bam=args.output_bam, - aligner=args.aligner, - path_to_aligner=args.path_to_aligner, - num_threads=args.num_threads, - aligner_options=args.aligner_options, - if_sort=args.if_sort, - min_cov=args.min_cov, - read_fastq_command=args.read_fastq_command, - tmp_folder=args.tmp_folder, - overwrite=args.overwrite) - + from snaptools.alignment import run_align_se + + run_align_se( + input_reference=args.input_reference, + input_fastq1=args.input_fastq1, + output_bam=args.output_bam, + aligner=args.aligner, + path_to_aligner=args.path_to_aligner, + num_threads=args.num_threads, + aligner_options=args.aligner_options, + if_sort=args.if_sort, + min_cov=args.min_cov, + read_fastq_command=args.read_fastq_command, + tmp_folder=args.tmp_folder, + overwrite=args.overwrite, + ) + if args.command == "snap-view-head": from snaptools.snap import run_snap_view_head + run_snap_view_head(input_snap=args.input_snap) - + if args.command == "snap-pre": - from snaptools.snap_pre import snap_pre - snap_pre(input_file=args.input_file, - output_snap=args.output_snap, - genome_name=args.genome_name, - genome_size=args.genome_size, - min_mapq=args.min_mapq, - min_flen=args.min_flen, - max_flen=args.max_flen, - min_cov=args.min_cov, - max_num=args.max_num, - barcode_file=args.barcode_file, - keep_chrm=args.keep_chrm, - keep_single=args.keep_single, - keep_secondary=args.keep_secondary, - keep_discordant=args.keep_discordant, - tmp_folder=args.tmp_folder, - overwrite=args.overwrite, - qc_file=args.qc_file, - verbose=args.verbose) + from snaptools.snap_pre import snap_pre + + snap_pre( + input_file=args.input_file, + output_snap=args.output_snap, + genome_name=args.genome_name, + genome_size=args.genome_size, + min_mapq=args.min_mapq, + min_flen=args.min_flen, + max_flen=args.max_flen, + min_cov=args.min_cov, + max_num=args.max_num, + barcode_file=args.barcode_file, + keep_chrm=args.keep_chrm, + keep_single=args.keep_single, + keep_secondary=args.keep_secondary, + keep_discordant=args.keep_discordant, + tmp_folder=args.tmp_folder, + overwrite=args.overwrite, + qc_file=args.qc_file, + verbose=args.verbose, + ) if args.command == "dump-fragment": - from snaptools.snap import dump_read - dump_read(snap_file=args.snap_file, - output_file=args.output_file, - buffer_size=args.buffer_size, - barcode_file=args.barcode_file, - tmp_folder=args.tmp_folder, - overwrite=args.overwrite) + from snaptools.snap import dump_read + + dump_read( + snap_file=args.snap_file, + output_file=args.output_file, + buffer_size=args.buffer_size, + barcode_file=args.barcode_file, + tmp_folder=args.tmp_folder, + overwrite=args.overwrite, + ) if args.command == "dump-barcode": - from snaptools.dump_barcode import dump_barcode - dump_barcode(snap_file=args.snap_file, - output_file=args.output_file, - barcode_file=args.barcode_file, - tmp_folder=args.tmp_folder, - overwrite=args.overwrite) + from snaptools.dump_barcode import dump_barcode + + dump_barcode( + snap_file=args.snap_file, + output_file=args.output_file, + barcode_file=args.barcode_file, + tmp_folder=args.tmp_folder, + overwrite=args.overwrite, + ) if args.command == "snap-add-bmat": - from snaptools.add_bmat import snap_bmat - snap_bmat(snap_file=args.snap_file, - bin_size_list=args.bin_size_list, - tmp_folder=args.tmp_folder, - verbose=args.verbose) - + from snaptools.add_bmat import snap_bmat + + snap_bmat( + snap_file=args.snap_file, + bin_size_list=args.bin_size_list, + tmp_folder=args.tmp_folder, + verbose=args.verbose, + ) + if args.command == "snap-add-pmat": - from snaptools.add_pmat import snap_pmat - snap_pmat(snap_file=args.snap_file, - peak_file=args.peak_file, - buffer_size=args.buffer_size, - tmp_folder=args.tmp_folder, - verbose=args.verbose) + from snaptools.add_pmat import snap_pmat + + snap_pmat( + snap_file=args.snap_file, + peak_file=args.peak_file, + buffer_size=args.buffer_size, + tmp_folder=args.tmp_folder, + verbose=args.verbose, + ) if args.command == "snap-add-gmat": - from snaptools.add_gmat import snap_gmat - snap_gmat(snap_file=args.snap_file, - gene_file=args.gene_file, - buffer_size=args.buffer_size, - tmp_folder=args.tmp_folder, - verbose=args.verbose) + from snaptools.add_gmat import snap_gmat + + snap_gmat( + snap_file=args.snap_file, + gene_file=args.gene_file, + buffer_size=args.buffer_size, + tmp_folder=args.tmp_folder, + verbose=args.verbose, + ) if args.command == "call-peak": - from snaptools.call_peak import call_peak - call_peak(snap_file=args.snap_file, - barcode_file=args.barcode_file, - gsize=args.gsize, - output_prefix=args.output_prefix, - path_to_macs=args.path_to_macs, - buffer_size=args.buffer_size, - macs_options=args.macs_options, - tmp_folder=args.tmp_folder) - + from snaptools.call_peak import call_peak + + call_peak( + snap_file=args.snap_file, + barcode_file=args.barcode_file, + gsize=args.gsize, + output_prefix=args.output_prefix, + path_to_macs=args.path_to_macs, + buffer_size=args.buffer_size, + macs_options=args.macs_options, + tmp_folder=args.tmp_folder, + ) + if args.command == "louvain": - from snaptools.louvain import louvain - louvain(edge_file=args.edge_file, - output_file=args.output_file, - resolution=args.resolution) + from snaptools.louvain import louvain + + louvain( + edge_file=args.edge_file, + output_file=args.output_file, + resolution=args.resolution, + ) if args.command == "snap-del": - from snaptools.snap_del import snap_del - snap_del(snap_file=args.snap_file, - session_name=args.session_name) + from snaptools.snap_del import snap_del + + snap_del(snap_file=args.snap_file, session_name=args.session_name) + def add_fastq_dex_subparser(subparsers): - parser_build = subparsers.add_parser( - "dex-fastq", - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - help="De-multicomplex fastq file.") - - # add options - parser_build_req = parser_build.add_argument_group("required inputs") - parser_build_req.add_argument("--input-fastq", - type=str, - required=True, - help="fastq file contains the sequencing reads") - - parser_build_req.add_argument("--output-fastq", - type=str, - required=True, - help="output decomplexed fastq file") - - parser_build_req.add_argument("--index-fastq-list", - type=str, - required=True, - nargs="+", - help="a list of fastq files that contain the cell indices.") - + parser_build = subparsers.add_parser( + "dex-fastq", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + help="De-multicomplex fastq file.", + ) + + # add options + parser_build_req = parser_build.add_argument_group("required inputs") + parser_build_req.add_argument( + "--input-fastq", + type=str, + required=True, + help="fastq file contains the sequencing reads", + ) + + parser_build_req.add_argument( + "--output-fastq", type=str, required=True, help="output decomplexed fastq file" + ) + + parser_build_req.add_argument( + "--index-fastq-list", + type=str, + required=True, + nargs="+", + help="a list of fastq files that contain the cell indices.", + ) + + def add_index_ref_subparser(subparsers): - # create the parser for the "DMRfind" command - parser_build = subparsers.add_parser( - "index-genome", - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - help="Index reference genome." - ) - - # add options - parser_build_req = parser_build.add_argument_group("required inputs") - parser_build_req.add_argument("--input-fasta", - type=str, - required=True, - help="genome fasta file to build the index from") - - parser_build_opt = parser_build.add_argument_group("optional inputs") - parser_build_opt.add_argument("--output-prefix", - type=str, - default=None, - help="prefix of indexed file") - - parser_build_opt.add_argument("--aligner", - type=str, - default="bwa", - help="aligner to use. Currently, snaptools supports bwa, bowtie, bowtie2 and minimap2.") - - parser_build_opt.add_argument("--path-to-aligner", - type=str, - default=None, - help="path to fold that contains bwa") - - parser_build_opt.add_argument("--num-threads", - type=int, - default=3, - help="=number of indexing threads") + # create the parser for the "DMRfind" command + parser_build = subparsers.add_parser( + "index-genome", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + help="Index reference genome.", + ) + + # add options + parser_build_req = parser_build.add_argument_group("required inputs") + parser_build_req.add_argument( + "--input-fasta", + type=str, + required=True, + help="genome fasta file to build the index from", + ) + + parser_build_opt = parser_build.add_argument_group("optional inputs") + parser_build_opt.add_argument( + "--output-prefix", type=str, default=None, help="prefix of indexed file" + ) + + parser_build_opt.add_argument( + "--aligner", + type=str, + default="bwa", + help="aligner to use. Currently, snaptools supports bwa, bowtie, bowtie2 and minimap2.", + ) + + parser_build_opt.add_argument( + "--path-to-aligner", + type=str, + default=None, + help="path to fold that contains bwa", + ) + + parser_build_opt.add_argument( + "--num-threads", type=int, default=3, help="=number of indexing threads" + ) + def add_align_pe_subparser(subparsers): - parser_build = subparsers.add_parser( - "align-paired-end", - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - help="Align paired-end reads.") - - # add options - parser_build_req = parser_build.add_argument_group("required inputs") - parser_build_req.add_argument("--input-reference", - type=str, - required=True, - help="reference genome file contains the reference genome that reads are mapped against, " - + "the genome index must be under the same folder") - - parser_build_req.add_argument("--input-fastq1", - type=str, - required=True, - help="fastq file contains R1 reads, currently supports fastq, gz, bz2 file") - - parser_build_req.add_argument("--input-fastq2", - type=str, - required=True, - help="fastq file contains R2 reads, currently supports fastq, gz, bz2 file") - - parser_build_req.add_argument("--output-bam", - type=str, - required=True, - help="output bam file contains unfiltered alignments") - - parser_build_opt = parser_build.add_argument_group("optional inputs") - parser_build_opt.add_argument("--aligner", - type=str, - default="bwa", - help="aligner to use. Currently, snaptools supports bwa, bowtie, bowtie2 and minimap2.") - - parser_build_opt.add_argument("--path-to-aligner", - type=str, - default=None, - help="path to fold that contains bwa") - - parser_build_opt.add_argument("--aligner-options", - type=str, - nargs="+", - help="list of strings indicating options you would like passed to aligner" - +"strongly do not recommand to change unless you know what you are doing. " - +"the default is to align reads without filteration.") - - parser_build_opt.add_argument("--read-fastq-command", - type=str, - default=None, - help="command line to execute for each of the input file. This command " - +"should generate FASTQ text and send it to stdout. For example, " - +"--read-fastq-command should be zcat, bzcat and cat for .gz, .bz2 and " - +"plain fastq file respectively") - - parser_build_opt.add_argument("--min-cov", - type=int, - default=0, - help="min number of fragments per barcode. barcodes of total fragments less than " - +"--min-cov will be filreted before alingment. Note: though this feature is included, " - +"we found it barely benefit anything, recommand to set it 0.") - - parser_build_opt.add_argument("--num-threads", - type=int, - default=1, - help="number of alignment threads, also number of threads for sorting " - +"a bam file.") - - parser_build_opt.add_argument("--if-sort", - type=str2bool, - default=True, - help="weather to sort the bam file based on the read name") - - parser_build_opt.add_argument("--tmp-folder", - type=str, - default=None, - help="directory to store temporary files. If not given, snaptools will automatically" - + "generate a temporary location to store temporary files") - - parser_build_opt.add_argument("--overwrite", - type=str2bool, - default=False, - help="whether to overwrite the output file if it already exists") - - parser_build_opt.add_argument("--verbose", - type=str2bool, - default=True, - help="a boolen tag indicates output the progress.") + parser_build = subparsers.add_parser( + "align-paired-end", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + help="Align paired-end reads.", + ) + + # add options + parser_build_req = parser_build.add_argument_group("required inputs") + parser_build_req.add_argument( + "--input-reference", + type=str, + required=True, + help="reference genome file contains the reference genome that reads are mapped against, " + + "the genome index must be under the same folder", + ) + + parser_build_req.add_argument( + "--input-fastq1", + type=str, + required=True, + help="fastq file contains R1 reads, currently supports fastq, gz, bz2 file", + ) + + parser_build_req.add_argument( + "--input-fastq2", + type=str, + required=True, + help="fastq file contains R2 reads, currently supports fastq, gz, bz2 file", + ) + + parser_build_req.add_argument( + "--output-bam", + type=str, + required=True, + help="output bam file contains unfiltered alignments", + ) + + parser_build_opt = parser_build.add_argument_group("optional inputs") + parser_build_opt.add_argument( + "--aligner", + type=str, + default="bwa", + help="aligner to use. Currently, snaptools supports bwa, bowtie, bowtie2 and minimap2.", + ) + + parser_build_opt.add_argument( + "--path-to-aligner", + type=str, + default=None, + help="path to fold that contains bwa", + ) + + parser_build_opt.add_argument( + "--aligner-options", + type=str, + nargs="+", + help="list of strings indicating options you would like passed to aligner" + + "strongly do not recommand to change unless you know what you are doing. " + + "the default is to align reads without filteration.", + ) + + parser_build_opt.add_argument( + "--read-fastq-command", + type=str, + default=None, + help="command line to execute for each of the input file. This command " + + "should generate FASTQ text and send it to stdout. For example, " + + "--read-fastq-command should be zcat, bzcat and cat for .gz, .bz2 and " + + "plain fastq file respectively", + ) + + parser_build_opt.add_argument( + "--min-cov", + type=int, + default=0, + help="min number of fragments per barcode. barcodes of total fragments less than " + + "--min-cov will be filreted before alingment. Note: though this feature is included, " + + "we found it barely benefit anything, recommand to set it 0.", + ) + + parser_build_opt.add_argument( + "--num-threads", + type=int, + default=1, + help="number of alignment threads, also number of threads for sorting " + + "a bam file.", + ) + + parser_build_opt.add_argument( + "--if-sort", + type=str2bool, + default=True, + help="weather to sort the bam file based on the read name", + ) + + parser_build_opt.add_argument( + "--tmp-folder", + type=str, + default=None, + help="directory to store temporary files. If not given, snaptools will automatically" + + "generate a temporary location to store temporary files", + ) + + parser_build_opt.add_argument( + "--overwrite", + type=str2bool, + default=False, + help="whether to overwrite the output file if it already exists", + ) + + parser_build_opt.add_argument( + "--verbose", + type=str2bool, + default=True, + help="a boolen tag indicates output the progress.", + ) + def add_align_se_subparser(subparsers): parser_build = subparsers.add_parser( - "align-single-end", - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - help="Align single-end reads.") - + "align-single-end", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + help="Align single-end reads.", + ) + # add options parser_build_req = parser_build.add_argument_group("required inputs") - parser_build_req.add_argument("--input-reference", - type=str, - required=True, - help="reference genome file contains the reference genome that reads are mapped against, " - + "the genome index must be under the same folder") - - parser_build_req.add_argument("--input-fastq1", - type=str, - required=True, - help="fastq file contains R1 reads, currently supports fastq, gz, bz2 file") - - parser_build_req.add_argument("--output-bam", - type=str, - required=True, - help="output bam file contains unfiltered alignments") - + parser_build_req.add_argument( + "--input-reference", + type=str, + required=True, + help="reference genome file contains the reference genome that reads are mapped against, " + + "the genome index must be under the same folder", + ) + + parser_build_req.add_argument( + "--input-fastq1", + type=str, + required=True, + help="fastq file contains R1 reads, currently supports fastq, gz, bz2 file", + ) + + parser_build_req.add_argument( + "--output-bam", + type=str, + required=True, + help="output bam file contains unfiltered alignments", + ) + parser_build_opt = parser_build.add_argument_group("optional inputs") - parser_build_opt.add_argument("--aligner", - type=str, - default="bwa", - help="aligner to use. Currently, snaptools supports bwa, bowtie, bowtie2 and minimap2.") - - parser_build_opt.add_argument("--path-to-aligner", - type=str, - default=None, - help="path to fold that contains bwa") - - parser_build_opt.add_argument("--aligner-options", - type=str, - nargs="+", - help="list of strings indicating options you would like passed to aligner" - + "strongly do not recommand to change unless you know what you are doing.") - - parser_build_opt.add_argument("--read-fastq-command", - type=str, - default=None, - help="command line to execute for each of the input file. This command" - +"should generate FASTA or FASTQ text and send it to stdout") - - parser_build_opt.add_argument("--num-threads", - type=int, - default=1, - help="number of alignment threads") - - parser_build_opt.add_argument("--min-cov", - type=int, - default=0, - help="min number of fragments per barcode. barcodes of total fragments less than " - +"--min-cov will be filreted before alingment. Note: though this feature is included, " - +"we found it barely speed up the process, so recommand to set it to be 0.") - - parser_build_opt.add_argument("--if-sort", - type=str2bool, - default=True, - help="weather to sort the bam file based on the read name") - - parser_build_opt.add_argument("--tmp-folder", - type=str, - default=None, - help="directory to store temporary files. If not given, snaptools will automatically" - + "generate a temporary location to store temporary files") - - parser_build_opt.add_argument("--overwrite", - type=str2bool, - default=False, - help="whether to overwrite the output file if it already exists") + parser_build_opt.add_argument( + "--aligner", + type=str, + default="bwa", + help="aligner to use. Currently, snaptools supports bwa, bowtie, bowtie2 and minimap2.", + ) + + parser_build_opt.add_argument( + "--path-to-aligner", + type=str, + default=None, + help="path to fold that contains bwa", + ) + + parser_build_opt.add_argument( + "--aligner-options", + type=str, + nargs="+", + help="list of strings indicating options you would like passed to aligner" + + "strongly do not recommand to change unless you know what you are doing.", + ) + + parser_build_opt.add_argument( + "--read-fastq-command", + type=str, + default=None, + help="command line to execute for each of the input file. This command" + + "should generate FASTA or FASTQ text and send it to stdout", + ) + + parser_build_opt.add_argument( + "--num-threads", type=int, default=1, help="number of alignment threads" + ) + + parser_build_opt.add_argument( + "--min-cov", + type=int, + default=0, + help="min number of fragments per barcode. barcodes of total fragments less than " + + "--min-cov will be filreted before alingment. Note: though this feature is included, " + + "we found it barely speed up the process, so recommand to set it to be 0.", + ) + + parser_build_opt.add_argument( + "--if-sort", + type=str2bool, + default=True, + help="weather to sort the bam file based on the read name", + ) + + parser_build_opt.add_argument( + "--tmp-folder", + type=str, + default=None, + help="directory to store temporary files. If not given, snaptools will automatically" + + "generate a temporary location to store temporary files", + ) + + parser_build_opt.add_argument( + "--overwrite", + type=str2bool, + default=False, + help="whether to overwrite the output file if it already exists", + ) def add_snap_pre_subparser(subparsers): parser_build = subparsers.add_parser( - "snap-pre", - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - help="Create a snap file from bam or bed file.") - + "snap-pre", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + help="Create a snap file from bam or bed file.", + ) + # add options parser_build_req = parser_build.add_argument_group("required inputs") - parser_build_req.add_argument("--input-file", - type=str, - required=True, - help="input bam, bed or bed.gz file.") - - parser_build_req.add_argument("--output-snap", - type=str, - required=True, - help="output snap file.") - - parser_build_req.add_argument("--genome-name", - type=str, - required=True, - help="genome identifier (i.e. hg19, mm10). This tag does not change anything " - +"unless merge or compare multiple snap files.") - - parser_build_req.add_argument("--genome-size", - type=str, - required=True, - help="a txt file contains corresponding genome sizes. It must be in the following " - +"format with the first column the chromsome name and the second column as chromsome " - +"length. This tag does not change anything unless merge or compare multiple snap files.") - + parser_build_req.add_argument( + "--input-file", type=str, required=True, help="input bam, bed or bed.gz file." + ) + + parser_build_req.add_argument( + "--output-snap", type=str, required=True, help="output snap file." + ) + + parser_build_req.add_argument( + "--genome-name", + type=str, + required=True, + help="genome identifier (i.e. hg19, mm10). This tag does not change anything " + + "unless merge or compare multiple snap files.", + ) + + parser_build_req.add_argument( + "--genome-size", + type=str, + required=True, + help="a txt file contains corresponding genome sizes. It must be in the following " + + "format with the first column the chromsome name and the second column as chromsome " + + "length. This tag does not change anything unless merge or compare multiple snap files.", + ) + parser_build_opt = parser_build.add_argument_group("optional inputs") - parser_build_opt.add_argument("--barcode-file", - type=str, - default=None, - help="a txt file contains pre-selected cell barcodes. " - +"If --barcode-file is given, snaptools will ignore any barcodes not present in the --barcode-file. " - +"If it is None, snaptools will automatically identify barcodes from bam file. " - +"The first column of --barcode-file must be the selected barcodes and the other columns " - +"could be any attributes of the barcode as desired (`ATGCTCTACTAC attr1 att2`). The " - +"attributes, however, will not be kept in the snap file. This tag will be ignored if the output " - +"snap file already exists.") - - parser_build_opt.add_argument("--min-mapq", - type=int, - default=10, - help="min mappability score. Fargments with mappability score less than --min-mapq will be filtered.") - - parser_build_opt.add_argument("--min-flen", - type=int, - default=0, - help="min fragment length. Fragments of length shorted than --min-flen will be filtered.") - - parser_build_opt.add_argument("--max-flen", - type=int, - default=1000, - help="max fragment length. Fragments of length longer than --max-flen will be filtered.") - - parser_build_opt.add_argument("--min-cov", - type=int, - default=0, - help="min number of fragments per barcode. barcodes of total fragments fewer than --min-cov will be considered " - +"when creating the cell x bin count matrix. Note: because the vast majority of barcodes contains very few reads, " - +"we found by setting --min-cov, one can remove barcodes of low coverage without wasting time and storage. " - +"Please note that this is not selection of good barcodes for downstream clustering analysis, it is only filteration" - +"of very low-quality barcodes.") - - parser_build_opt.add_argument("--max-num", - type=int, - default=1000000, - help="max number of barcodes to store. Barcodes are sorted based on the coverage and only the top --max-num barcodes " - +"will be stored.") - - parser_build_opt.add_argument("--keep-chrm", - type=str2bool, - default=True, - help="a boolen tag indicates whether to keep fragments mapped to chrM. If set Fasle, fragments " - +"aligned to the mitochondrial sequence will be filtered.") - - parser_build_opt.add_argument("--keep-single", - type=str2bool, - default=True, - help="a boolen tag indicates whether to keep those reads whose mates are not mapped or missing. " - +"If False, unpaired reads will be filtered. If True, unpaired reads will be simply treated " - +"as a fragment. Note: for single-end such as scTHS-seq, --keep-single must be True.") - - parser_build_opt.add_argument("--keep-secondary", - type=str2bool, - default=False, - help="a boolen tag indicates whether to keep secondary alignments. If False, " - +"secondary alignments will be filtered. If True, a secondary alignments will " - +"be treated as fragments just single-end.") - - parser_build_opt.add_argument("--keep-discordant", - type=str2bool, - default=False, - help="a boolen tag indicates whether to keep discordant read pairs.") - - parser_build_opt.add_argument("--tmp-folder", - type=str, - default=None, - help="a directory to store temporary files. If not given, snaptools will automatically " - + "generate a temporary location to store temporary files.") - - parser_build_opt.add_argument("--overwrite", - type=str2bool, - default=False, - help="a boolen tag indicates whether to overwrite the matrix session if it already exists.") - - parser_build_opt.add_argument("--qc-file", - type=str2bool, - default=True, - help="a boolen tag indicates whether to create a master qc file. This .qc file contains " - +"basic quality control metrics at the bulk level. Quality control is only estimated by " - +"selected barcodes only.") - - parser_build_opt.add_argument("--verbose", - type=str2bool, - default=True, - help="a boolen tag indicates output the progress.") + parser_build_opt.add_argument( + "--barcode-file", + type=str, + default=None, + help="a txt file contains pre-selected cell barcodes. " + + "If --barcode-file is given, snaptools will ignore any barcodes not present in the --barcode-file. " + + "If it is None, snaptools will automatically identify barcodes from bam file. " + + "The first column of --barcode-file must be the selected barcodes and the other columns " + + "could be any attributes of the barcode as desired (`ATGCTCTACTAC attr1 att2`). The " + + "attributes, however, will not be kept in the snap file. This tag will be ignored if the output " + + "snap file already exists.", + ) + + parser_build_opt.add_argument( + "--min-mapq", + type=int, + default=10, + help="min mappability score. Fargments with mappability score less than --min-mapq will be filtered.", + ) + + parser_build_opt.add_argument( + "--min-flen", + type=int, + default=0, + help="min fragment length. Fragments of length shorted than --min-flen will be filtered.", + ) + + parser_build_opt.add_argument( + "--max-flen", + type=int, + default=1000, + help="max fragment length. Fragments of length longer than --max-flen will be filtered.", + ) + + parser_build_opt.add_argument( + "--min-cov", + type=int, + default=0, + help="min number of fragments per barcode. barcodes of total fragments fewer than --min-cov will be considered " + + "when creating the cell x bin count matrix. Note: because the vast majority of barcodes contains very few reads, " + + "we found by setting --min-cov, one can remove barcodes of low coverage without wasting time and storage. " + + "Please note that this is not selection of good barcodes for downstream clustering analysis, it is only filteration" + + "of very low-quality barcodes.", + ) + + parser_build_opt.add_argument( + "--max-num", + type=int, + default=1000000, + help="max number of barcodes to store. Barcodes are sorted based on the coverage and only the top --max-num barcodes " + + "will be stored.", + ) + + parser_build_opt.add_argument( + "--keep-chrm", + type=str2bool, + default=True, + help="a boolen tag indicates whether to keep fragments mapped to chrM. If set Fasle, fragments " + + "aligned to the mitochondrial sequence will be filtered.", + ) + + parser_build_opt.add_argument( + "--keep-single", + type=str2bool, + default=True, + help="a boolen tag indicates whether to keep those reads whose mates are not mapped or missing. " + + "If False, unpaired reads will be filtered. If True, unpaired reads will be simply treated " + + "as a fragment. Note: for single-end such as scTHS-seq, --keep-single must be True.", + ) + + parser_build_opt.add_argument( + "--keep-secondary", + type=str2bool, + default=False, + help="a boolen tag indicates whether to keep secondary alignments. If False, " + + "secondary alignments will be filtered. If True, a secondary alignments will " + + "be treated as fragments just single-end.", + ) + + parser_build_opt.add_argument( + "--keep-discordant", + type=str2bool, + default=False, + help="a boolen tag indicates whether to keep discordant read pairs.", + ) + + parser_build_opt.add_argument( + "--tmp-folder", + type=str, + default=None, + help="a directory to store temporary files. If not given, snaptools will automatically " + + "generate a temporary location to store temporary files.", + ) + + parser_build_opt.add_argument( + "--overwrite", + type=str2bool, + default=False, + help="a boolen tag indicates whether to overwrite the matrix session if it already exists.", + ) + + parser_build_opt.add_argument( + "--qc-file", + type=str2bool, + default=True, + help="a boolen tag indicates whether to create a master qc file. This .qc file contains " + + "basic quality control metrics at the bulk level. Quality control is only estimated by " + + "selected barcodes only.", + ) + + parser_build_opt.add_argument( + "--verbose", + type=str2bool, + default=True, + help="a boolen tag indicates output the progress.", + ) + def add_dump_read_subparser(subparsers): parser_build = subparsers.add_parser( - "dump-fragment", - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - #help="Dump fragments of selected barcodes from a snap file.", - add_help=False - ) - + "dump-fragment", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + # help="Dump fragments of selected barcodes from a snap file.", + add_help=False, + ) + # add options parser_build_req = parser_build.add_argument_group("required inputs") - parser_build_req.add_argument("--snap-file", - type=str, - required=True, - help="input snap file.") + parser_build_req.add_argument( + "--snap-file", type=str, required=True, help="input snap file." + ) - parser_build_req.add_argument("--output-file", - type=str, - required=True, - help="output bed file (supports .bed, .gz and .bz2 format).") + parser_build_req.add_argument( + "--output-file", + type=str, + required=True, + help="output bed file (supports .bed, .gz and .bz2 format).", + ) parser_build_opt = parser_build.add_argument_group("optional inputs") - parser_build_opt.add_argument("--barcode-file", - type=str, - required=False, - help="a txt file of selected barcodes. If --barcode-file is given, only fragments of " - + "selected barcodes are writen in the output file. Default None, all fragments " - + "will be dumped into the output bed file.") - - parser_build_opt.add_argument("--buffer-size", - type=int, - default=100, - required=False, - help="max number of barcodes be stored in the memory.") - - parser_build_opt.add_argument("--tmp-folder", - type=str, - default=None, - help="a directory to store potential temporary files. If not given, snaptools will automatically " - + "generate a temporary location to store temporary files.") - - parser_build_opt.add_argument("--overwrite", - type=str2bool, - default=False, - help="overwrite the output file if it already exists.") + parser_build_opt.add_argument( + "--barcode-file", + type=str, + required=False, + help="a txt file of selected barcodes. If --barcode-file is given, only fragments of " + + "selected barcodes are writen in the output file. Default None, all fragments " + + "will be dumped into the output bed file.", + ) + + parser_build_opt.add_argument( + "--buffer-size", + type=int, + default=100, + required=False, + help="max number of barcodes be stored in the memory.", + ) + + parser_build_opt.add_argument( + "--tmp-folder", + type=str, + default=None, + help="a directory to store potential temporary files. If not given, snaptools will automatically " + + "generate a temporary location to store temporary files.", + ) + + parser_build_opt.add_argument( + "--overwrite", + type=str2bool, + default=False, + help="overwrite the output file if it already exists.", + ) + def add_dump_barcode_subparser(subparsers): parser_build = subparsers.add_parser( - "dump-barcode", - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - #help="Dump barcodes from a snap file.", - add_help=False - ) - + "dump-barcode", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + # help="Dump barcodes from a snap file.", + add_help=False, + ) + # add options parser_build_req = parser_build.add_argument_group("required inputs") - parser_build_req.add_argument("--snap-file", - type=str, - required=True, - help="input snap file.") + parser_build_req.add_argument( + "--snap-file", type=str, required=True, help="input snap file." + ) - parser_build_req.add_argument("--output-file", - type=str, - required=True, - help="output txt file.") + parser_build_req.add_argument( + "--output-file", type=str, required=True, help="output txt file." + ) parser_build_opt = parser_build.add_argument_group("optional inputs") - parser_build_opt.add_argument("--barcode-file", - type=str, - required=False, - help="a txt file of selected barcodes. If --barcode-file is given, only metadata of " - + "selected barcodes are writen in the output file. Default None, meta data of all " - + "barcodes will be dumped into the output file.") - - parser_build_opt.add_argument("--tmp-folder", - type=str, - default=None, - help="a directory to store potential temporary files. If not given, snaptools will automatically " - + "generate a temporary location to store temporary files.") - - parser_build_opt.add_argument("--overwrite", - type=str2bool, - default=False, - help="overwrite the output file if it already exists.") + parser_build_opt.add_argument( + "--barcode-file", + type=str, + required=False, + help="a txt file of selected barcodes. If --barcode-file is given, only metadata of " + + "selected barcodes are writen in the output file. Default None, meta data of all " + + "barcodes will be dumped into the output file.", + ) + + parser_build_opt.add_argument( + "--tmp-folder", + type=str, + default=None, + help="a directory to store potential temporary files. If not given, snaptools will automatically " + + "generate a temporary location to store temporary files.", + ) + + parser_build_opt.add_argument( + "--overwrite", + type=str2bool, + default=False, + help="overwrite the output file if it already exists.", + ) + def add_snap_bmat_subparser(subparsers): parser_build = subparsers.add_parser( - "snap-add-bmat", - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - help="Add cell x bin count matrix to snap file.") - + "snap-add-bmat", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + help="Add cell x bin count matrix to snap file.", + ) + # add options parser_build_req = parser_build.add_argument_group("required inputs") - parser_build_req.add_argument("--snap-file", - type=str, - required=True, - help="snap file.") + parser_build_req.add_argument( + "--snap-file", type=str, required=True, help="snap file." + ) parser_build_opt = parser_build.add_argument_group("optional inputs") - parser_build_opt.add_argument("--bin-size-list", - type=int, - nargs="+", - default=[5000], - help="a list of bin size(s) to create the cell-by-bin count matrix. The genome " - +"will be divided into bins of the equal size of --bin-size-list to create the " - +"cell x bin count matrix. If more than one bin size are given, snaptools will " - +"generate a list of cell x bin matrices of different resolutions and stored in " - +"the same snap file.") - - parser_build_opt.add_argument("--tmp-folder", - type=str, - default=None, - help="a directory to store temporary files. If not given, snaptools will automatically " - + "generate a temporary location to store temporary files.") - - parser_build_opt.add_argument("--verbose", - type=str2bool, - default=True, - help="a boolen tag indicates output the progress.") + parser_build_opt.add_argument( + "--bin-size-list", + type=int, + nargs="+", + default=[5000], + help="a list of bin size(s) to create the cell-by-bin count matrix. The genome " + + "will be divided into bins of the equal size of --bin-size-list to create the " + + "cell x bin count matrix. If more than one bin size are given, snaptools will " + + "generate a list of cell x bin matrices of different resolutions and stored in " + + "the same snap file.", + ) + + parser_build_opt.add_argument( + "--tmp-folder", + type=str, + default=None, + help="a directory to store temporary files. If not given, snaptools will automatically " + + "generate a temporary location to store temporary files.", + ) + + parser_build_opt.add_argument( + "--verbose", + type=str2bool, + default=True, + help="a boolen tag indicates output the progress.", + ) + def add_snap_pmat_subparser(subparsers): parser_build = subparsers.add_parser( - "snap-add-pmat", - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - help="Add cell x peak count matrix to snap file.") - + "snap-add-pmat", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + help="Add cell x peak count matrix to snap file.", + ) + # add options parser_build_req = parser_build.add_argument_group("required inputs") - parser_build_req.add_argument("--snap-file", - type=str, - required=True, - help="snap file.") - - parser_build_req.add_argument("--peak-file", - type=str, - required=True, - help="bed file contains peaks.") - - parser_build_opt = parser_build.add_argument_group("optional inputs") - parser_build_opt.add_argument("--buffer-size", - type=int, - default=1000, - required=False, - help="max number of barcodes be stored in the memory.") - - parser_build_opt.add_argument("--tmp-folder", - type=str, - default=None, - help="a directory to store temporary files. If not given, snaptools will automatically " - + "generate a temporary location to store temporary files.") - - parser_build_opt.add_argument("--verbose", - type=str2bool, - default=True, - help="a boolen tag indicates output the progress.") + parser_build_req.add_argument( + "--snap-file", type=str, required=True, help="snap file." + ) + + parser_build_req.add_argument( + "--peak-file", type=str, required=True, help="bed file contains peaks." + ) + + parser_build_opt = parser_build.add_argument_group("optional inputs") + parser_build_opt.add_argument( + "--buffer-size", + type=int, + default=1000, + required=False, + help="max number of barcodes be stored in the memory.", + ) + + parser_build_opt.add_argument( + "--tmp-folder", + type=str, + default=None, + help="a directory to store temporary files. If not given, snaptools will automatically " + + "generate a temporary location to store temporary files.", + ) + + parser_build_opt.add_argument( + "--verbose", + type=str2bool, + default=True, + help="a boolen tag indicates output the progress.", + ) + def add_snap_gmat_subparser(subparsers): parser_build = subparsers.add_parser( - "snap-add-gmat", - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - help="Add cell x gene count matrix to snap file.") - + "snap-add-gmat", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + help="Add cell x gene count matrix to snap file.", + ) + # add options parser_build_req = parser_build.add_argument_group("required inputs") - parser_build_req.add_argument("--snap-file", - type=str, - required=True, - help="snap file.") - - parser_build_req.add_argument("--gene-file", - type=str, - required=True, - help="bed file contains genes.") - - parser_build_opt = parser_build.add_argument_group("optional inputs") - parser_build_opt.add_argument("--buffer-size", - type=int, - default=1000, - required=False, - help="max number of barcodes be stored in the memory.") - - parser_build_opt.add_argument("--tmp-folder", - type=str, - default=None, - help="a directory to store temporary files. If not given, snaptools will automatically " - + "generate a temporary location to store temporary files.") - - parser_build_opt.add_argument("--verbose", - type=str2bool, - default=True, - help="a boolen tag indicates output the progress.") + parser_build_req.add_argument( + "--snap-file", type=str, required=True, help="snap file." + ) + + parser_build_req.add_argument( + "--gene-file", type=str, required=True, help="bed file contains genes." + ) + + parser_build_opt = parser_build.add_argument_group("optional inputs") + parser_build_opt.add_argument( + "--buffer-size", + type=int, + default=1000, + required=False, + help="max number of barcodes be stored in the memory.", + ) + + parser_build_opt.add_argument( + "--tmp-folder", + type=str, + default=None, + help="a directory to store temporary files. If not given, snaptools will automatically " + + "generate a temporary location to store temporary files.", + ) + + parser_build_opt.add_argument( + "--verbose", + type=str2bool, + default=True, + help="a boolen tag indicates output the progress.", + ) def add_call_peak_subparser(subparsers): parser_build = subparsers.add_parser( - "call-peak", - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - #help="Call peak using selected barcodes.", - add_help=False - ) - + "call-peak", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + # help="Call peak using selected barcodes.", + add_help=False, + ) + # add options parser_build_req = parser_build.add_argument_group("required inputs") - parser_build_req.add_argument("--snap-file", - type=str, - required=True, - help="snap file.") - - parser_build_req.add_argument("--barcode-file", - type=str, - required=True, - help="a file contains selected barcodes.") - - parser_build_req.add_argument("--output-prefix", - type=str, - required=True, - help="prefix of output files.") - - parser_build_req.add_argument("--path-to-macs", - type=str, - required=True, - help="path to fold that contains macs2.") - - parser_build_req.add_argument("--gsize", - type=str, - required=True, - help="effective genome size. 'hs' for human, 'mm' for mouse, 'ce' for C. elegans, 'dm' for fruitfly") + parser_build_req.add_argument( + "--snap-file", type=str, required=True, help="snap file." + ) + + parser_build_req.add_argument( + "--barcode-file", + type=str, + required=True, + help="a file contains selected barcodes.", + ) + + parser_build_req.add_argument( + "--output-prefix", type=str, required=True, help="prefix of output files." + ) + + parser_build_req.add_argument( + "--path-to-macs", + type=str, + required=True, + help="path to fold that contains macs2.", + ) + + parser_build_req.add_argument( + "--gsize", + type=str, + required=True, + help="effective genome size. 'hs' for human, 'mm' for mouse, 'ce' for C. elegans, 'dm' for fruitfly", + ) parser_build_opt = parser_build.add_argument_group("optional inputs") - parser_build_opt.add_argument("--buffer-size", - type=int, - default=1000, - required=False, - help="max number of barcodes be stored in the memory.") - - parser_build_opt.add_argument("--macs-options", - type=str, - nargs="+", - help="list of strings indicating options you would like passed to macs2" - +"strongly do not recommand to change unless you know what you are doing. " - +"the default is '--nomodel --shift 37 --ext 73 --qval 1e-2 -B --SPMR -call-summits'.") - - parser_build_opt.add_argument("--tmp-folder", - type=str, - default=None, - help="a directory to store temporary files. If not given, snaptools will automatically " - + "generate a temporary location to store temporary files.") - + parser_build_opt.add_argument( + "--buffer-size", + type=int, + default=1000, + required=False, + help="max number of barcodes be stored in the memory.", + ) + + parser_build_opt.add_argument( + "--macs-options", + type=str, + nargs="+", + help="list of strings indicating options you would like passed to macs2" + + "strongly do not recommand to change unless you know what you are doing. " + + "the default is '--nomodel --shift 37 --ext 73 --qval 1e-2 -B --SPMR -call-summits'.", + ) + + parser_build_opt.add_argument( + "--tmp-folder", + type=str, + default=None, + help="a directory to store temporary files. If not given, snaptools will automatically " + + "generate a temporary location to store temporary files.", + ) + def add_louvain_subparser(subparsers): parser_build = subparsers.add_parser( - "louvain", - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - add_help=False - #help="Louvain communities finding.", - ) - + "louvain", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + add_help=False + # help="Louvain communities finding.", + ) + # add options parser_build_req = parser_build.add_argument_group("required inputs") - parser_build_req.add_argument("--edge-file", - type=str, - required=True, - help="txt file contains edges and weights.") + parser_build_req.add_argument( + "--edge-file", + type=str, + required=True, + help="txt file contains edges and weights.", + ) - parser_build_req.add_argument("--output-file", - type=str, - required=True, - help="output file name.") + parser_build_req.add_argument( + "--output-file", type=str, required=True, help="output file name." + ) parser_build_opt = parser_build.add_argument_group("optional inputs") - parser_build_opt.add_argument("--resolution", - type=float, - default=1.0, - required=False, - help="resolution for finding communities.") - + parser_build_opt.add_argument( + "--resolution", + type=float, + default=1.0, + required=False, + help="resolution for finding communities.", + ) + def add_snap_del_subparser(subparsers): parser_build = subparsers.add_parser( - "snap-del", - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - add_help=True, - help="Delete a session.", - ) - + "snap-del", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + add_help=True, + help="Delete a session.", + ) + # add options parser_build_req = parser_build.add_argument_group("required inputs") - parser_build_req.add_argument("--snap-file", - type=str, - required=True, - help="snap file." - ) - - parser_build_req.add_argument("--session-name", - type=str, - required=True, - help="session to be deleted in snap file. 'AM': cell-by-bin matrix. All cell-by-bin " - +"matrices will be removed. 'PM': cell-by-peak matrix. 'GM': cell-by-gene matrix." - ) - + parser_build_req.add_argument( + "--snap-file", type=str, required=True, help="snap file." + ) + + parser_build_req.add_argument( + "--session-name", + type=str, + required=True, + help="session to be deleted in snap file. 'AM': cell-by-bin matrix. All cell-by-bin " + + "matrices will be removed. 'PM': cell-by-peak matrix. 'GM': cell-by-gene matrix.", + ) diff --git a/snaptools/snap.py b/snaptools/snap.py index f244f4f..875ed9b 100755 --- a/snaptools/snap.py +++ b/snaptools/snap.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -""" +""" The MIT License @@ -25,7 +25,8 @@ """ -import sys, os +import sys +import os import collections import gzip import operator @@ -51,7 +52,7 @@ sys.exit(1) try: - import numpy as np + import numpy as np except Exception: print("Package numpy not installed!") sys.exit(1) @@ -80,28 +81,29 @@ def getBinsFromGenomeSize(genome_dict, bin_size): """Create a dictionary contains all bins of the same size across the genome - + Attributes: binSize: bin size (i.e. 5000) - + genomeDict: a dictionary contains chromosome sizes - + Return: A dictionary contains all bins and its index (start from 1) """ - bin_dict = collections.OrderedDict(); - i = 1; + bin_dict = collections.OrderedDict() + i = 1 for _chrom in genome_dict: - for _start in range(1, genome_dict[_chrom], bin_size): - _end = min(_start + bin_size - 1, genome_dict[_chrom]); - _binId = (_chrom , _start, _end); - bin_dict[_binId] = i; - i = i +1; - return bin_dict; + for _start in range(1, genome_dict[_chrom], bin_size): + _end = min(_start + bin_size - 1, genome_dict[_chrom]) + _binId = (_chrom, _start, _end) + bin_dict[_binId] = i + i = i + 1 + return bin_dict + def getBarcodesFromSnap(fname): """Read barcodes from a snap file - + Attributes: fname - a snap-format file @@ -109,35 +111,36 @@ def getBarcodesFromSnap(fname): a dictionary contains all barcode in the snap file """ try: - f = h5py.File(fname, 'r'); + f = h5py.File(fname, "r") except IOError: print("error: unable to open fname, check if it is a snap file") sys.exit(1) - barcode_dict = collections.OrderedDict(); - i = 1; + barcode_dict = collections.OrderedDict() + i = 1 for item in f["BD/name"]: - item = item.decode(); - barcode_dict[item] = qc(); - barcode_dict[item].id = i; - barcode_dict[item].total = f["BD"]["TN"][i-1] - barcode_dict[item].mapped = f["BD"]["UM"][i-1] - barcode_dict[item].single = f["BD"]["SE"][i-1] - barcode_dict[item].secondary = f["BD"]["SA"][i-1] - barcode_dict[item].paired = f["BD"]["PE"][i-1] - barcode_dict[item].proper_paired = f["BD"]["PP"][i-1] - barcode_dict[item].proper_flen = f["BD"]["PL"][i-1] - barcode_dict[item].usable = f["BD"]["US"][i-1] - barcode_dict[item].uniq = f["BD"]["UQ"][i-1] - barcode_dict[item].chrM = f["BD"]["CM"][i-1] - barcode_dict[item].final = f["BD"]["UQ"][i-1] - f["BD"]["CM"][i-1] - i = i + 1; + item = item.decode() + barcode_dict[item] = qc() + barcode_dict[item].id = i + barcode_dict[item].total = f["BD"]["TN"][i - 1] + barcode_dict[item].mapped = f["BD"]["UM"][i - 1] + barcode_dict[item].single = f["BD"]["SE"][i - 1] + barcode_dict[item].secondary = f["BD"]["SA"][i - 1] + barcode_dict[item].paired = f["BD"]["PE"][i - 1] + barcode_dict[item].proper_paired = f["BD"]["PP"][i - 1] + barcode_dict[item].proper_flen = f["BD"]["PL"][i - 1] + barcode_dict[item].usable = f["BD"]["US"][i - 1] + barcode_dict[item].uniq = f["BD"]["UQ"][i - 1] + barcode_dict[item].chrM = f["BD"]["CM"][i - 1] + barcode_dict[item].final = f["BD"]["UQ"][i - 1] - f["BD"]["CM"][i - 1] + i = i + 1 f.close() return barcode_dict + def getBarcodesFromSnapSimple(fname): """Read barcodes from a snap file - + Attributes: fname - a snap-format file @@ -145,140 +148,151 @@ def getBarcodesFromSnapSimple(fname): a dictionary contains barcode without qc """ try: - f = h5py.File(fname, 'r'); + f = h5py.File(fname, "r") except IOError: print("error: unable to open fname, check if it is a snap file") sys.exit(1) - barcode_dict = collections.OrderedDict(); - i = 1; + barcode_dict = collections.OrderedDict() + i = 1 for item in f["BD/name"]: - item = item.decode(); - barcode_dict[item] = i; - i = i + 1; + item = item.decode() + barcode_dict[item] = i + i = i + 1 f.close() return barcode_dict - + + def getBarcodesFromInput(fname, ftype): """Read barcodes from a given input file (bam or bed) - + Attributes: - fname: + fname: A bam or bed file ftype: File format, bed or bam Returns: a dictionary contains all barcode present in the input file - """ + """ # automatically determine the file format if ftype == "bam": - return getBarcodesFromBam(fname); + return getBarcodesFromBam(fname) if ftype == "bed": - return getBarcodesFromBed(fname); + return getBarcodesFromBed(fname) else: - print(("error: unrecognized file format for %s, only support bam or bed file" % ftype)); + print( + ( + "error: unrecognized file format for %s, only support bam or bed file" + % ftype + ) + ) sys.exit(1) + def getBarcodesFromTxt(fname): """Read barcodes from a given txt file - + Attributes: fname: A txt file that contains pre-defined barcodes - + Returns: a dictionary contains barcode in the txt file - """ - barcode_list = []; + """ + barcode_list = [] with open(fname) as fin: for line in fin: - if line.startswith("#"): continue; + if line.startswith("#"): + continue if type(line) is bytes: - line = line.decode(); - barcode = line.split()[0].upper(); - barcode_list.append(barcode); - barocde_num = len(set(barcode_list)); + line = line.decode() + barcode = line.split()[0].upper() + barcode_list.append(barcode) + barocde_num = len(set(barcode_list)) if barocde_num < len(barcode_list): print("warning: duplicate barcodes identified, remove it first") sys.exit(1) - barcode_dict = collections.OrderedDict(list(zip(sorted(set(barcode_list)), \ - list(range(1,(barocde_num+1))) \ - ))); + barcode_dict = collections.OrderedDict( + list(zip(sorted(set(barcode_list)), list(range(1, (barocde_num + 1))))) + ) return barcode_dict + def getBarcodesFromBam(input_bam): """Identify unique barcodes from the bam file - + Args: input_bam: a bam file Returns: A dictionary contains all barcodes, otherwise None """ - - barcode_dict = collections.OrderedDict(); - samfile = pysam.AlignmentFile(input_bam, "rb"); + + barcode_dict = collections.OrderedDict() + samfile = pysam.AlignmentFile(input_bam, "rb") i = 1 for _read in samfile: - barcode = _read.qname.split(":")[0].upper(); + barcode = _read.qname.split(":")[0].upper() if barcode not in barcode_dict: barcode_dict[barcode] = i i = i + 1 - samfile.close(); - return barcode_dict; + samfile.close() + return barcode_dict + def getBarcodesFromBed(input_bed): """Identify unique barcodes from a bed file - + Args: input_bed: a bed file. - + Returns: A dictionary contains all barcodes in the bed file. - + """ - + # check the file type - barcode_dict = collections.OrderedDict(); + barcode_dict = collections.OrderedDict() if snaptools.utilities.file_type(input_bed) == "gz": - fin = gzip.open(input_bed, 'rb'); + fin = gzip.open(input_bed, "rb") elif snaptools.utilities.file_type(input_bed) == "bz2": - fin = bz2.BZ2File(input_bed, 'rb'); + fin = bz2.BZ2File(input_bed, "rb") elif snaptools.utilities.file_type(input_bed) == "txt": - fin = open(input_bed, 'r'); + fin = open(input_bed, "r") else: - print("error: unrecoginized bed file format, only supports .gz, .bz2, .bed") - sys.exit(1) - + print("error: unrecoginized bed file format, only supports .gz, .bz2, .bed") + sys.exit(1) + i = 1 for _read in fin: if type(_read) is bytes: - _read = _read.decode(); - barcode = _read.split()[3].split(":")[0].upper(); + _read = _read.decode() + barcode = _read.split()[3].split(":")[0].upper() if barcode not in barcode_dict: - barcode_dict[barcode] = i; - i = i + 1; + barcode_dict[barcode] = i + i = i + 1 + + fin.close() + return barcode_dict - fin.close(); - return barcode_dict; def getBarcodeCov(barcode_list, input_file, file_format): """ Get barcode coverage from a given input file - + Args: - barcode_dict: + barcode_dict: a list of pre-defined barcodes - input_file: + input_file: a bam or bed file - file_format: + file_format: file format bed or bam - + Returns: a dictionary contains barcode coverage - """ - + """ + if file_format == "bam": return get_barcode_cov_from_bam(barcode_list, input_file) elif file_format == "bed": @@ -286,170 +300,180 @@ def getBarcodeCov(barcode_list, input_file, file_format): else: print(("error: unrecognized file format %s " % file_format)) sys.exit(1) - + + def get_barcode_cov_from_bam(barcode_list, input_bam): """ Get barcode coverage from bam file - + Args: barcode_dict: a list of pre-defined barcodes input_bam: a bam file - + Returns: a dictionary contains barcode coverage - """ - + """ + if not snaptools.utilities.is_bam(input_bam): - print(("error: @get_barcode_cov_from_bam: " + input_bam + " is not a bam file!")); + print( + ("error: @get_barcode_cov_from_bam: " + input_bam + " is not a bam file!") + ) sys.exit(1) - + if len(barcode_list) == 0: - print("error: @get_barcode_cov_from_bam: barcode_list is empty!"); + print("error: @get_barcode_cov_from_bam: barcode_list is empty!") sys.exit(1) - - barcode_dict = collections.defaultdict(lambda : 0); - samfile = pysam.AlignmentFile(input_bam, "rb"); + + barcode_dict = collections.defaultdict(lambda: 0) + samfile = pysam.AlignmentFile(input_bam, "rb") for _read in samfile: - barcode = _read.qname.split(":")[0].upper(); + barcode = _read.qname.split(":")[0].upper() # approximate counting, a read is half fragment - barcode_dict[barcode] += 0.5; - return barcode_dict; + barcode_dict[barcode] += 0.5 + return barcode_dict + def get_barcode_cov_from_bed(barcode_list, input_bed): """ Get barcode coverage from bed file - + Args: ----- - barcode_dict: + barcode_dict: a list of pre-defined barcodes - input_bed: + input_bed: a bed file - + Returns: ------ a dictionary contains barcode coverage - """ - + """ + if len(barcode_list) == 0: - print("error: @get_barcode_cov_from_bam: barcode_list is empty!"); + print("error: @get_barcode_cov_from_bam: barcode_list is empty!") sys.exit(1) - + if file_type(input_bed) == "gz": - fin = gzip.open(input_bed, 'rb'); - barcode_dict = collections.defaultdict(lambda : 0); + fin = gzip.open(input_bed, "rb") + barcode_dict = collections.defaultdict(lambda: 0) for _read in fin: - barcode = _read.decode().split()[3].split(":")[0].upper(); + barcode = _read.decode().split()[3].split(":")[0].upper() # approximate counting, a read is half fragment - barcode_dict[barcode] += 1; + barcode_dict[barcode] += 1 elif file_type(input_bed) == "bz2": - fin = bz2.BZ2File(input_bed, 'r'); - barcode_dict = collections.defaultdict(lambda : 0); + fin = bz2.BZ2File(input_bed, "r") + barcode_dict = collections.defaultdict(lambda: 0) for _read in fin: - barcode = _read.decode().split()[3].split(":")[0].upper(); + barcode = _read.decode().split()[3].split(":")[0].upper() # approximate counting, a read is half fragment - barcode_dict[barcode] += 1; + barcode_dict[barcode] += 1 elif file_type(input_bed) == "txt": - fin = open(input_bed, 'r'); - barcode_dict = collections.defaultdict(lambda : 0); + fin = open(input_bed, "r") + barcode_dict = collections.defaultdict(lambda: 0) for _read in fin: - barcode = _read.split()[3].split(":")[0].upper(); + barcode = _read.split()[3].split(":")[0].upper() # approximate counting, a read is half fragment - barcode_dict[barcode] += 1; + barcode_dict[barcode] += 1 else: - print("error: unrecoginized bed file format, only supports .gz, .bz2, .fastq"); - sys.exit(1) - + print("error: unrecoginized bed file format, only supports .gz, .bz2, .fastq") + sys.exit(1) + fin.close() - return barcode_dict; + return barcode_dict + def group_reads_by_barcode(input_file, file_format): if file_format == "bam": - return group_reads_by_barcode_bam(input_file); + return group_reads_by_barcode_bam(input_file) elif file_format == "bed": - return group_reads_by_barcode_bed(input_file); + return group_reads_by_barcode_bed(input_file) else: - print(("error: unrecoganized file format for %s " % file_format)); - sys.exit(1); - + print(("error: unrecoganized file format for %s " % file_format)) + sys.exit(1) + + def group_reads_by_barcode_bam(input_bam): """ Group reads based on the barcodes - + Args: input_bam: a bam file Returns: Generator that contains reads sharing the same barcode """ - if not os.path.exists(input_bam): - print(("Error @group_reads_by_barcode_bam: " + input_bam + " does not exist!")); - - if not snaptools.utilities.is_bam(input_bam): - print(("Error @group_reads_by_barcode_bam: " + input_bam + " is not a bam file!")); - - read_group_list = []; - pre_barcode = ""; - samfile = pysam.AlignmentFile(input_bam, "rb"); + if not os.path.exists(input_bam): + print(("Error @group_reads_by_barcode_bam: " + input_bam + " does not exist!")) + + if not snaptools.utilities.is_bam(input_bam): + print( + ("Error @group_reads_by_barcode_bam: " + input_bam + " is not a bam file!") + ) + + read_group_list = [] + pre_barcode = "" + samfile = pysam.AlignmentFile(input_bam, "rb") for cur_read in samfile: - cur_barcode = cur_read.qname.split(":")[0]; + cur_barcode = cur_read.qname.split(":")[0] if cur_barcode == pre_barcode: read_group_list.append(cur_read) else: if pre_barcode != "": # return read group yield (x for x in read_group_list) - read_group_list = [cur_read] # add the first read + read_group_list = [cur_read] # add the first read pre_barcode = cur_barcode # reads from the last barcode yield (x for x in read_group_list) + def group_reads_by_barcode_bed(input_bed): """ Group fargments based on the barcodes - + Args: input_bed: a bed file Returns: Generator that contains reads sharing the same barcode """ - if not os.path.exists(input_bed): - print(("Error @group_reads_by_barcode_bam: " + input_bed + " does not exist!")); - - read_group_list = []; - pre_barcode = ""; - + if not os.path.exists(input_bed): + print(("Error @group_reads_by_barcode_bam: " + input_bed + " does not exist!")) + + read_group_list = [] + pre_barcode = "" + if file_type(input_bed) == "gz": - fin = gzip.open(input_bed, 'rb'); + fin = gzip.open(input_bed, "rb") elif file_type(input_bed) == "bz2": - fin = bz2.BZ2File(input_bed, 'r'); + fin = bz2.BZ2File(input_bed, "r") elif file_type(input_bed) == "txt": - fin = open(input_bed, 'r'); + fin = open(input_bed, "r") else: - print("error: unrecoginized fastq file format, only supports .gz, .bz2, .fastq") - sys.exit(1) - + print("error: unrecoginized fastq file format, only supports .gz, .bz2, .fastq") + sys.exit(1) + for cur_read in fin: if type(cur_read) is bytes: - cur_read = cur_read.decode(); + cur_read = cur_read.decode() - cur_barcode = cur_read.split()[3].split(":")[0].upper(); + cur_barcode = cur_read.split()[3].split(":")[0].upper() if cur_barcode == pre_barcode: read_group_list.append(cur_read) else: if pre_barcode != "": # return read group yield (x for x in read_group_list) - read_group_list = [cur_read] # add the first read + read_group_list = [cur_read] # add the first read pre_barcode = cur_barcode # reads from the last barcode yield (x for x in read_group_list) fin.close() + def pairReadsByName(read_list): """ Pair reads based on read names - + Args: read_list: a list of reads that share the same barcode @@ -457,46 +481,47 @@ def pairReadsByName(read_list): Generator contains read pairs from the same fragment and a boolen variable indicates whether it is supplementary alignment """ - # pair up + # pair up for read1 in read_list: # read until read1 is not a supplementary alignment - while(read1.is_supplementary): + while read1.is_supplementary: yield (read1, None, False, True) try: - #print "Warning: skipping ", read1.qname; - read1 = next(read_list); + # print "Warning: skipping ", read1.qname; + read1 = next(read_list) except: - break + break try: - read2 = next(read_list); + read2 = next(read_list) except: - break - while(read2.is_supplementary): + break + while read2.is_supplementary: yield (read2, None, False, True) try: - #print "Warning: skipping ", read2.qname; - read2 = next(read_list); + # print "Warning: skipping ", read2.qname; + read2 = next(read_list) except: - break - if(read1.qname != read2.qname): - while (read1.qname != read2.qname): - yield(read1, None, False, False); - read1 = read2; + break + if read1.qname != read2.qname: + while read1.qname != read2.qname: + yield (read1, None, False, False) + read1 = read2 try: - read2 = next(read_list); - while(read2.is_supplementary): + read2 = next(read_list) + while read2.is_supplementary: try: - #print "Warning: skipping ", read2.qname; - read2 = next(read_list); + # print "Warning: skipping ", read2.qname; + read2 = next(read_list) except: - break + break except: - break; + break yield (read1, read2, True, False) - + + def readPairToFragment(read1, read2, is_secondary): """ convert read pairs to fragments - + Args: read1: R1 read @@ -505,35 +530,45 @@ def readPairToFragment(read1, read2, is_secondary): Generator that contains a fragment object """ try: - read1.qname; - read2.qname; + read1.qname + read2.qname if read1.qname != read2.qname: - sys.exit('read_pair_to_fragment: read1 and read2 name does not match!'); + sys.exit("read_pair_to_fragment: read1 and read2 name does not match!") except ValueError as e: - sys.exit('read_pair_to_fragment: can not get read1 or read2 name!'); - barcode = read1.qname.split(":")[0]; - mapq = min(read1.mapq, read2.mapq); + sys.exit("read_pair_to_fragment: can not get read1 or read2 name!") + barcode = read1.qname.split(":")[0] + mapq = min(read1.mapq, read2.mapq) try: - chrom1 = read1.reference_name; - start1 = read1.reference_start; - strand1 = "-" if read1.is_reverse else "+"; - chrom2 = read2.reference_name; - start2 = read2.reference_start; - strand2 = "-" if read1.is_reverse else "+"; - # it is possible that flen1 is None + chrom1 = read1.reference_name + start1 = read1.reference_start + strand1 = "-" if read1.is_reverse else "+" + chrom2 = read2.reference_name + start2 = read2.reference_start + strand2 = "-" if read1.is_reverse else "+" + # it is possible that flen1 is None flen1 = read1.reference_length if read1.reference_length != None else 0 flen2 = read2.reference_length if read2.reference_length != None else 0 - end1 = start1 + flen1; - end2 = start2 + flen2; - start = min(start1, end1, start2, end2); - end = max(start1, end1, start2, end2); - return fragment(read1.qname, chrom1, start, abs(start - end), mapq, False, is_secondary, read1.is_proper_pair); + end1 = start1 + flen1 + end2 = start2 + flen2 + start = min(start1, end1, start2, end2) + end = max(start1, end1, start2, end2) + return fragment( + read1.qname, + chrom1, + start, + abs(start - end), + mapq, + False, + is_secondary, + read1.is_proper_pair, + ) except ValueError as e: - return fragment(read1.qname, None, None, None, mapq, False, is_secondary, False); + return fragment(read1.qname, None, None, None, mapq, False, is_secondary, False) + def readToFragment(read1, is_secondary): """ convert a single read to fragment - + Args: read1: a single read @@ -541,25 +576,35 @@ def readToFragment(read1, is_secondary): Generator contains a fragment object """ try: - read1.qname; + read1.qname except ValueError as e: - sys.exit('readto_fragment: can not get read1 name!'); + sys.exit("readto_fragment: can not get read1 name!") - barcode = read1.qname.split(":")[0]; - mapq = read1.mapq; + barcode = read1.qname.split(":")[0] + mapq = read1.mapq try: - chrom1 = read1.reference_name; - start1 = read1.reference_start; - strand1 = "-" if read1.is_reverse else "+"; - # it is possible that flen1 is None + chrom1 = read1.reference_name + start1 = read1.reference_start + strand1 = "-" if read1.is_reverse else "+" + # it is possible that flen1 is None flen1 = read1.reference_length if read1.reference_length != None else 0 - end1 = start1 + flen1; - start = min(start1, end1); - end = max(start1, end1); - #(qname, chrom, pos, flen, mapq, is_single, is_secondary, is_proper_pair): - return fragment(read1.qname, chrom1, start, abs(start - end), mapq, True, is_secondary, False); + end1 = start1 + flen1 + start = min(start1, end1) + end = max(start1, end1) + # (qname, chrom, pos, flen, mapq, is_single, is_secondary, is_proper_pair): + return fragment( + read1.qname, + chrom1, + start, + abs(start - end), + mapq, + True, + is_secondary, + False, + ) except ValueError as e: - return fragment(read1.qname, None, None, None, mapq, True, is_secondary, False); + return fragment(read1.qname, None, None, None, mapq, True, is_secondary, False) + class qc(object): """A quality control object that has the following attributes: @@ -583,6 +628,7 @@ class qc(object): isize: average insert size distribution. """ + def __init__(self): """Return a qc object""" self.id = 0 @@ -598,6 +644,7 @@ def __init__(self): self.chrM = 0 self.final = 0 + class fragment(object): """A fragment object that has the following attributes: @@ -618,7 +665,10 @@ class fragment(object): flen: fragment length """ - def __init__(self, qname, chrom, pos, flen, mapq, is_single, is_secondary, is_proper_pair): + + def __init__( + self, qname, chrom, pos, flen, mapq, is_single, is_secondary, is_proper_pair + ): """Return a qc object""" self.qname = qname self.chrom = chrom @@ -644,24 +694,24 @@ def run_snap_view_head(input_snap): # 4. read the header info # 5. read the alignment info # 5. read the reference genome info - # 4. print the header session on the screen - + # 4. print the header session on the screen + if not os.path.exists(input_snap): - print(('error: ' + input_snap + ' does not exist!')); - sys.exit(1); + print(("error: " + input_snap + " does not exist!")) + sys.exit(1) - #if not snaptools.utilities.is_snap(input_snap): + # if not snaptools.utilities.is_snap(input_snap): # print 'error: ' + input_snap + ' is not a snap format file!' # sys.exit(1); - - f = h5py.File(input_snap, 'r'); - + + f = h5py.File(input_snap, "r") + if "HD" not in f: - print((input_snap + " HD session does not exist!")); + print((input_snap + " HD session does not exist!")) sys.exit(1) - + # read the header info - header = collections.defaultdict(); + header = collections.defaultdict() header["MG"] = f["HD"]["MG"].value header["DT"] = f["HD"]["DT"].value header["VN"] = f["HD"]["VN"].value @@ -669,29 +719,30 @@ def run_snap_view_head(input_snap): header["CW"] = f["HD"]["CW"].value # read the alignment info - align_dict = collections.defaultdict(); + align_dict = collections.defaultdict() align_dict["PN"] = f["HD"]["AL"]["PN"].value align_dict["ID"] = f["HD"]["AL"]["ID"].value align_dict["VN"] = f["HD"]["AL"]["VN"].value align_dict["CL"] = f["HD"]["AL"]["CL"].value - + # read the genome info genome_name = f["HD"]["SQ"]["ID"].value - genome_dict = dict(list(zip(f["HD"]["SQ"]["SN"], f["HD"]["SQ"]["SL"]))); - - print(("@HD\tMG:%s"%header["MG"] )); - print(("@HD\tDT:%s"%header["DT"] )); - print(("@HD\tVN:%s"%header["VN"] )); - print(("@HD\tCL:%s"%header["CL"] )); - print(("@HD\tCW:%s"%header["CW"] )); - print(("@AL\tPN:%s"%align_dict["PN"] )); - print(("@AL\tID:%s"%align_dict["ID"] )); - print(("@AL\tVN:%s"%align_dict["VN"] )); - print(("@AL\tVN:%s"%align_dict["CL"] )); - print(("@SQ\tID:%s"%(genome_name))); + genome_dict = dict(list(zip(f["HD"]["SQ"]["SN"], f["HD"]["SQ"]["SL"]))) + + print(("@HD\tMG:%s" % header["MG"])) + print(("@HD\tDT:%s" % header["DT"])) + print(("@HD\tVN:%s" % header["VN"])) + print(("@HD\tCL:%s" % header["CL"])) + print(("@HD\tCW:%s" % header["CW"])) + print(("@AL\tPN:%s" % align_dict["PN"])) + print(("@AL\tID:%s" % align_dict["ID"])) + print(("@AL\tVN:%s" % align_dict["VN"])) + print(("@AL\tVN:%s" % align_dict["CL"])) + print(("@SQ\tID:%s" % (genome_name))) for chr_name in genome_dict: - print(("@SQ\tSN:%s\tSL:%d"%(chr_name, genome_dict[chr_name]))); - f.close(); + print(("@SQ\tSN:%s\tSL:%d" % (chr_name, genome_dict[chr_name]))) + f.close() + def getFragFromBarcode(fname, barcode_list, barcode_dict): """Extract fragments of given barcodes @@ -702,141 +753,145 @@ def getFragFromBarcode(fname, barcode_list, barcode_dict): barcode_list: a list of selected barcodes barcode_dict: a dictionary contains barcodes and attributes - + Return: a list of fragments """ # get the fragments of selected barcodes; - barcode_pos_list = []; - barcode_len_list = []; - f = h5py.File(fname, "r", libver='earliest'); + barcode_pos_list = [] + barcode_len_list = [] + f = h5py.File(fname, "r", libver="earliest") for barcode in barcode_list: - if barcode not in barcode_dict: - print (('warning: barcode %s does not exist in the snap file!' % barcode)); - continue; - barcode_id = barcode_dict[barcode].id; - barcode_pos = f["FM"]["barcodePos"][barcode_id-1] - 1; - barcode_len = f["FM"]["barcodeLen"][barcode_id-1]; - barcode_pos_list.append(range(barcode_pos, barcode_pos + barcode_len)); - barcode_len_list.append(barcode_len); - barcode_pos_list = list(itertools.chain.from_iterable(barcode_pos_list)); - _chroms = [item.decode() for item in f["FM"]["fragChrom"][barcode_pos_list]]; - _chroms = [item for item in f["FM"]["fragChrom"][barcode_pos_list]]; - _start = f["FM"]["fragStart"][barcode_pos_list]; - _end = _start + f["FM"]["fragLen"][barcode_pos_list]; - _barcode = [[barcode] * num for (barcode, num) in zip(barcode_list, barcode_len_list)]; - _barcode = list(itertools.chain.from_iterable(_barcode)); - f.close(); - return(list(zip(_chroms, _start, _end, _barcode))); - -def dump_read(snap_file, - output_file, - buffer_size, - barcode_file, - tmp_folder, - overwrite): + if barcode not in barcode_dict: + print(("warning: barcode %s does not exist in the snap file!" % barcode)) + continue + barcode_id = barcode_dict[barcode].id + barcode_pos = f["FM"]["barcodePos"][barcode_id - 1] - 1 + barcode_len = f["FM"]["barcodeLen"][barcode_id - 1] + barcode_pos_list.append(range(barcode_pos, barcode_pos + barcode_len)) + barcode_len_list.append(barcode_len) + barcode_pos_list = list(itertools.chain.from_iterable(barcode_pos_list)) + _chroms = [item.decode() for item in f["FM"]["fragChrom"][barcode_pos_list]] + _chroms = [item for item in f["FM"]["fragChrom"][barcode_pos_list]] + _start = f["FM"]["fragStart"][barcode_pos_list] + _end = _start + f["FM"]["fragLen"][barcode_pos_list] + _barcode = [ + [barcode] * num for (barcode, num) in zip(barcode_list, barcode_len_list) + ] + _barcode = list(itertools.chain.from_iterable(_barcode)) + f.close() + return list(zip(_chroms, _start, _end, _barcode)) + +def dump_read(snap_file, output_file, buffer_size, barcode_file, tmp_folder, overwrite): """ Dump reads from snap file into a bed file (of selected barcodes) Required Args: -------- - snap_file: + snap_file: a snap format file. - output_file: + output_file: a bed format file contains reads of selected barcodes. - + Optional Args: -------- buffer_size: max number of barcodes whose reads are stored before dumping. - + barcode_file: a txt file contains selected barcodes. - + tmp_folder: a tmp folder that stores potential temoprial files. - + overwrite: a boolen variable indicates whether to overwrite a file if it already exists """ # check if input snap file exists if not os.path.exists(snap_file): - print (('error: %s does not exist!' % snap_file)); - sys.exit(1); + print(("error: %s does not exist!" % snap_file)) + sys.exit(1) # check if snap_file is a snap-format file - file_format = snaptools.utilities.checkFileFormat(snap_file); + file_format = snaptools.utilities.checkFileFormat(snap_file) if file_format != "snap": - print(("error: input file %s is not a snap file!" % snap_file)); + print(("error: input file %s is not a snap file!" % snap_file)) # check the output bed file exists if os.path.exists(output_file): if overwrite == True: - subprocess.check_call(["rm", output_file]); + subprocess.check_call(["rm", output_file]) else: - print(('error: %s already exists, change --overwrite or remove it first!' % output_file)); - sys.exit(1); + print( + ( + "error: %s already exists, change --overwrite or remove it first!" + % output_file + ) + ) + sys.exit(1) # identify the barcodes - barcode_dict_ref = getBarcodesFromSnap(snap_file); - + barcode_dict_ref = getBarcodesFromSnap(snap_file) + if barcode_file is not None: - barcode_dict = getBarcodesFromTxt(barcode_file); + barcode_dict = getBarcodesFromTxt(barcode_file) else: - barcode_dict = barcode_dict_ref; - + barcode_dict = barcode_dict_ref + # check if tmp_folder is given - if(tmp_folder!=None): - if(not os.path.isdir(tmp_folder)): - print(('error: %s does not exist!' % tmp_folder)) - sys.exit(1); - + if tmp_folder != None: + if not os.path.isdir(tmp_folder): + print(("error: %s does not exist!" % tmp_folder)) + sys.exit(1) + # check if FM session exists - f = h5py.File(snap_file, "r", libver='earliest'); + f = h5py.File(snap_file, "r", libver="earliest") if "FM" not in f: - print(("error: FM session does not exit in the snap file!")); - sys.exit(1); - f.close(); - + print(("error: FM session does not exit in the snap file!")) + sys.exit(1) + f.close() + # cut barcodes into small chunks # force it to be capital - barcode_list = snaptools.utilities.chunks([x.upper() for x in list(barcode_dict.keys())], buffer_size); - barcode_list = [list(barcode_chunk) for barcode_chunk in barcode_list]; - nChunk = len(barcode_list); - + barcode_list = snaptools.utilities.chunks( + [x.upper() for x in list(barcode_dict.keys())], buffer_size + ) + barcode_list = [list(barcode_chunk) for barcode_chunk in barcode_list] + nChunk = len(barcode_list) + # write fragments down if output_file.endswith(".gz"): fout = gzip.open(output_file, "wb") # cut the barcode into chunks and write down seperately for i in range(nChunk): # extract the fragment list of given barcodes - frag_list = getFragFromBarcode(snap_file, barcode_list[i], barcode_dict_ref); + frag_list = getFragFromBarcode(snap_file, barcode_list[i], barcode_dict_ref) for item in frag_list: - if(len(item) > 0): - fout.write(("\t".join(map(str, item)) + "\n").encode('utf-8')); + if len(item) > 0: + fout.write(("\t".join(map(str, item)) + "\n").encode("utf-8")) del frag_list elif output_file.endswith(".bz2"): fout = gzip.open(output_file, "wb") # cut the barcode into chunks and write down seperately for i in range(nChunk): # extract the fragment list of given barcodes - frag_list = getFragFromBarcode(snap_file, barcode_list[i], barcode_dict_ref); + frag_list = getFragFromBarcode(snap_file, barcode_list[i], barcode_dict_ref) for item in frag_list: - if(len(item) > 0): - fout.write(("\t".join(map(str, item)) + "\n").encode('utf-8')); + if len(item) > 0: + fout.write(("\t".join(map(str, item)) + "\n").encode("utf-8")) del frag_list else: fout = open(output_file, "w") # cut the barcode into chunks and write down seperately for i in range(nChunk): # extract the fragment list of given barcodes - frag_list = getFragFromBarcode(snap_file, barcode_list[i], barcode_dict_ref); + frag_list = getFragFromBarcode(snap_file, barcode_list[i], barcode_dict_ref) for item in frag_list: - if(len(item) > 0): - fout.write(("\t".join(map(str, item)) + "\n")); + if len(item) > 0: + fout.write(("\t".join(map(str, item)) + "\n")) del frag_list fout.close() return 0 diff --git a/snaptools/snap_del.py b/snaptools/snap_del.py index 2331787..6687d2f 100644 --- a/snaptools/snap_del.py +++ b/snaptools/snap_del.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -""" +""" The MIT License @@ -25,7 +25,8 @@ """ -import sys, os +import sys +import os import collections import gzip import operator @@ -76,50 +77,50 @@ sys.exit(1) -def snap_del(snap_file, - session_name - ): - +def snap_del(snap_file, session_name): """ Delete an existing attribute in a snap file. Args: -------- - snap_file: + snap_file: a snap format file. - session_name: + session_name: attribute to delete ["AM", "GM", "PM", "FM"]. - + """ - + if not os.path.exists(snap_file): - print(('error: ' + snap_file + ' does not exist!')); - sys.exit(1); - + print(("error: " + snap_file + " does not exist!")) + sys.exit(1) + # check if snap_file is a snap-format file - file_format = snaptools.utilities.checkFileFormat(snap_file); + file_format = snaptools.utilities.checkFileFormat(snap_file) if file_format != "snap": - print(("error: input file %s is not a snap file!" % snap_file)); - sys.exit(1); - - fin = h5py.File(snap_file, "r", libver='earliest'); - + print(("error: input file %s is not a snap file!" % snap_file)) + sys.exit(1) + + fin = h5py.File(snap_file, "r", libver="earliest") + if session_name not in list(fin.keys()): - print(("error: --session-name %s does not exist in %s" % (session_name, snap_file))); - sys.exit(1); - - fout_name = tempfile.NamedTemporaryFile(delete=False, dir=None); - fout = h5py.File(fout_name.name, "a", libver='earliest'); - - session_name_list = list(fin.keys()); - session_name_list.remove(session_name); - + print( + ( + "error: --session-name %s does not exist in %s" + % (session_name, snap_file) + ) + ) + sys.exit(1) + + fout_name = tempfile.NamedTemporaryFile(delete=False, dir=None) + fout = h5py.File(fout_name.name, "a", libver="earliest") + + session_name_list = list(fin.keys()) + session_name_list.remove(session_name) + for group_name in session_name_list: - fout.copy(fin[group_name],group_name,shallow=False) - + fout.copy(fin[group_name], group_name, shallow=False) + fin.close() fout.close() - subprocess.check_call("\t".join(["mv", fout_name.name, snap_file]), shell=True); - - + subprocess.check_call("\t".join(["mv", fout_name.name, snap_file]), shell=True) diff --git a/snaptools/snap_pre.py b/snaptools/snap_pre.py index f6b32d1..6399908 100755 --- a/snaptools/snap_pre.py +++ b/snaptools/snap_pre.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -""" +""" The MIT License @@ -25,7 +25,8 @@ """ -import sys, os +import sys +import os import collections import gzip import operator @@ -75,54 +76,56 @@ print("Package pybedtools not installed!") sys.exit(1) -def snap_pre(input_file, - output_snap, - genome_name, - genome_size, - min_mapq, - min_flen, - max_flen, - min_cov, - max_num, - barcode_file, - keep_chrm, - keep_single, - keep_secondary, - keep_discordant, - tmp_folder, - overwrite, - qc_file, - verbose): +def snap_pre( + input_file, + output_snap, + genome_name, + genome_size, + min_mapq, + min_flen, + max_flen, + min_cov, + max_num, + barcode_file, + keep_chrm, + keep_single, + keep_secondary, + keep_discordant, + tmp_folder, + overwrite, + qc_file, + verbose, +): """ Pre-processing to create a snap file from a bam or bed that contains alignments or a bed file that contains fragments. Args: -------- - input_file: - a bed or bam file contains fragments - - output_snap: + input_file: + a bed or bam file contains fragments + + output_snap: output snap file. - + Optional -------- - min_mapq: + min_mapq: min mappability [10]. fragments with mappability less than 10 will be filtered - - min_flen: + + min_flen: min fragment size [0]. fragments of length shorter than min_flen will be filtered - max_flen: + max_flen: max fragment size [1000]. fragments of length bigger than min_flen will be filtered min_cov: - min coverage per barcode. barcodes with sequencing fragments less than min_cov will be filtered before processed + min coverage per barcode. barcodes with sequencing fragments less than min_cov will be filtered before processed max_num: max number of barcodes to be stored. Based on the coverage, top max_barcode barcodes are selected and stored. - - barcode_file: + + barcode_file: a txt file that contains selected barcodes to create count matrix [None] keep_chrm: @@ -136,298 +139,446 @@ def snap_pre(input_file, keep_discordant: a boolen variable indicates whether to keep discordant alingments [False] - - tmp_folder: + + tmp_folder: where to store the temporary files [None]; - - qc_file: + + qc_file: a boolen variable indicates whether to create a master qc file [True]; - verbose: + verbose: a boolen variable indicates whether to output the progress [True]; """ - + if not os.path.exists(input_file): - print(('error: ' + input_file + ' does not exist!')); - sys.exit(1); + print(("error: " + input_file + " does not exist!")) + sys.exit(1) if min_flen < snaptools.global_var.MIN_FRAGMENT_LEN: - print(('error: --min-flen can not be smaller than %s ' % str(snaptools.global_var.MIN_FRAGMENT_LEN))); - sys.exit(1); - + print( + ( + "error: --min-flen can not be smaller than %s " + % str(snaptools.global_var.MIN_FRAGMENT_LEN) + ) + ) + sys.exit(1) + if max_flen > snaptools.global_var.MAX_FRAGMENT_LEN: - print(('error: --max-flen can not be smaller than %s ' % str(snaptools.global_var.MAX_FRAGMENT_LEN))); - sys.exit(1); + print( + ( + "error: --max-flen can not be smaller than %s " + % str(snaptools.global_var.MAX_FRAGMENT_LEN) + ) + ) + sys.exit(1) if barcode_file != None: if not os.path.exists(barcode_file): - print(('error: --barcode-file %s does not exist!' % barcode_file)); - sys.exit(1); - + print(("error: --barcode-file %s does not exist!" % barcode_file)) + sys.exit(1) + if os.path.exists(output_snap): if overwrite == True: - subprocess.check_call(["rm", output_snap]); + subprocess.check_call(["rm", output_snap]) else: - print(('error: %s already exists, change --overwrite!' % output_snap)); - sys.exit(1); - - if(tmp_folder!=None): - if(not os.path.isdir(tmp_folder)): - print(('error: %s does not exist!' % tmp_folder)); - sys.exit(1); + print(("error: %s already exists, change --overwrite!" % output_snap)) + sys.exit(1) + + if tmp_folder != None: + if not os.path.isdir(tmp_folder): + print(("error: %s does not exist!" % tmp_folder)) + sys.exit(1) # automatically determine the file format - file_format = snaptools.utilities.checkFileFormat(input_file); - + file_format = snaptools.utilities.checkFileFormat(input_file) + if file_format != "bed" and file_format != "bam": - print(('error: unrecognized input format %s, only supports bed and bam file!' % input_file)); - sys.exit(1); - + print( + ( + "error: unrecognized input format %s, only supports bed and bam file!" + % input_file + ) + ) + sys.exit(1) + if not os.path.exists(genome_size): - print(('error: ' + genome_size + ' does not exist!')) - sys.exit(1); - + print(("error: " + genome_size + " does not exist!")) + sys.exit(1) + if genome_name not in snaptools.global_var.GENOMELIST: - print(('error: --genome-name unrecoginized genome identifier %s' % str(genome_name))); - print(("Do you mean: %s ?" % ",".join(snaptools.utilities.minEditDistanceString(genome_name, snaptools.global_var.GENOMELIST)))); + print( + ( + "error: --genome-name unrecoginized genome identifier %s" + % str(genome_name) + ) + ) + print( + ( + "Do you mean: %s ?" + % ",".join( + snaptools.utilities.minEditDistanceString( + genome_name, snaptools.global_var.GENOMELIST + ) + ) + ) + ) sys.exit(1) - + # read the reference genome info from given genome size txt file - genome_dict = snaptools.utilities.readGenomeSizeFromTxt(genome_size); - + genome_dict = snaptools.utilities.readGenomeSizeFromTxt(genome_size) + # enforce the following parameters if it is a bed file, assuming the bed file has been filtered if file_format == "bed": - keep_single=False; - keep_secondary=False; - keep_discordant=False; - min_mapq=0; - + keep_single = False + keep_secondary = False + keep_discordant = False + min_mapq = 0 + # check if a file is sorted by read name or queryname if not snaptools.utilities.isQuerynameSorted(input_file, file_format): - print(('error: %s is not a sorted by read name!' % input_file)) - sys.exit(1); - + print(("error: %s is not a sorted by read name!" % input_file)) + sys.exit(1) + # create the header session - header = collections.defaultdict(); - header["MG"] = snaptools.global_var.MAGIC_STRING; - header["DT"] = str(datetime.datetime.now()); - header["VN"] = snaptools.__version__; - header["CL"] = "\t".join(map(str, sys.argv)); - header["CW"] = os.getcwd(); + header = collections.defaultdict() + header["MG"] = snaptools.global_var.MAGIC_STRING + header["DT"] = str(datetime.datetime.now()) + header["VN"] = snaptools.__version__ + header["CL"] = "\t".join(map(str, sys.argv)) + header["CW"] = os.getcwd() # bed file does not contain any alignment information, force it to "NA" - align_dict = snaptools.utilities.readAlignmentInfo(input_file, file_format); - + align_dict = snaptools.utilities.readAlignmentInfo(input_file, file_format) + if barcode_file != None: - barcode_dict = snaptools.snap.getBarcodesFromTxt(barcode_file); + barcode_dict = snaptools.snap.getBarcodesFromTxt(barcode_file) else: - barcode_dict = snaptools.snap.getBarcodesFromInput(input_file, file_format); - barcode_cov = snaptools.snap.getBarcodeCov(list(barcode_dict.keys()), input_file, file_format); - barcode_cov = sorted(list(barcode_cov.items()), key=operator.itemgetter(1), reverse=True); - barcode_cov = barcode_cov[0:min(len(barcode_cov), max_num)]; - barcodes = [key for [key, value] in barcode_cov if value > min_cov]; - barcode_dict = collections.OrderedDict(list(zip(sorted(barcodes), list(range(1,(len(barcodes)+1)))))); - - #if min_cov == 0 and max_num == 10000000: + barcode_dict = snaptools.snap.getBarcodesFromInput(input_file, file_format) + barcode_cov = snaptools.snap.getBarcodeCov( + list(barcode_dict.keys()), input_file, file_format + ) + barcode_cov = sorted( + list(barcode_cov.items()), key=operator.itemgetter(1), reverse=True + ) + barcode_cov = barcode_cov[0 : min(len(barcode_cov), max_num)] + barcodes = [key for [key, value] in barcode_cov if value > min_cov] + barcode_dict = collections.OrderedDict( + list(zip(sorted(barcodes), list(range(1, (len(barcodes) + 1))))) + ) + + # if min_cov == 0 and max_num == 10000000: # barcode_dict = barcode_dict; - #else: + # else: # barcode_cov = snaptools.snap.getBarcodeCov(list(barcode_dict.keys()), input_file, file_format); # barcode_cov = sorted(list(barcode_cov.items()), key=operator.itemgetter(1), reverse=True); # barcode_cov = barcode_cov[0:min(len(barcode_cov), max_num)]; # barcodes = [key for [key, value] in barcode_cov if value > min_cov]; - # barcode_dict = collections.OrderedDict(list(zip(sorted(barcodes), list(range(1,(len(barcodes)+1)))))); - - f = h5py.File(output_snap, "w", libver='earliest'); - f.create_group("HD"); - f.create_dataset("HD/MG", data=np.string_(header["MG"])); - f.create_dataset("HD/VN", data=np.string_(header["VN"])); - f.create_dataset("HD/DT", data=np.string_(header["DT"])); - f.create_dataset("HD/CL", data=np.string_(header["CL"])); - f.create_dataset("HD/CW", data=np.string_(header["CW"])); - + # barcode_dict = collections.OrderedDict(list(zip(sorted(barcodes), list(range(1,(len(barcodes)+1)))))); + + f = h5py.File(output_snap, "w", libver="earliest") + f.create_group("HD") + f.create_dataset("HD/MG", data=np.string_(header["MG"])) + f.create_dataset("HD/VN", data=np.string_(header["VN"])) + f.create_dataset("HD/DT", data=np.string_(header["DT"])) + f.create_dataset("HD/CL", data=np.string_(header["CL"])) + f.create_dataset("HD/CW", data=np.string_(header["CW"])) + # header/alignment - f.create_dataset("HD/AL/PN", data=np.string_(align_dict["PN"])); - f.create_dataset("HD/AL/ID", data=np.string_(align_dict["ID"])); - f.create_dataset("HD/AL/VN", data=np.string_(align_dict["VN"])); - f.create_dataset("HD/AL/CL", data=np.string_(align_dict["CL"])); - - fragment_chrom_ds = f.create_dataset("FM/fragChrom", (0,), - dtype=h5py.special_dtype(vlen=bytes), - maxshape=(None,), chunks=(10**4,), - compression="gzip", compression_opts=9); - - fragment_start_ds = f.create_dataset("FM/fragStart", (0,), - dtype="uint32", - maxshape=(None,), chunks=(10**4,), - compression="gzip", compression_opts=9); - - fragment_len_ds = f.create_dataset("FM/fragLen", (0,), - dtype="uint16", - maxshape=(None,), chunks=(10**4,), - compression="gzip", compression_opts=9); - - fragment_chrom_ds[:] = []; - fragment_start_ds[:] = []; - fragment_len_ds[:] = []; - - check_point = 0; - start_time = time.time(); - qc_dict = collections.defaultdict(qc); - num_barcode = len(barcode_dict); - + f.create_dataset("HD/AL/PN", data=np.string_(align_dict["PN"])) + f.create_dataset("HD/AL/ID", data=np.string_(align_dict["ID"])) + f.create_dataset("HD/AL/VN", data=np.string_(align_dict["VN"])) + f.create_dataset("HD/AL/CL", data=np.string_(align_dict["CL"])) + + fragment_chrom_ds = f.create_dataset( + "FM/fragChrom", + (0,), + dtype=h5py.special_dtype(vlen=bytes), + maxshape=(None,), + chunks=(10 ** 4,), + compression="gzip", + compression_opts=9, + ) + + fragment_start_ds = f.create_dataset( + "FM/fragStart", + (0,), + dtype="uint32", + maxshape=(None,), + chunks=(10 ** 4,), + compression="gzip", + compression_opts=9, + ) + + fragment_len_ds = f.create_dataset( + "FM/fragLen", + (0,), + dtype="uint16", + maxshape=(None,), + chunks=(10 ** 4,), + compression="gzip", + compression_opts=9, + ) + + fragment_chrom_ds[:] = [] + fragment_start_ds[:] = [] + fragment_len_ds[:] = [] + + check_point = 0 + start_time = time.time() + qc_dict = collections.defaultdict(qc) + num_barcode = len(barcode_dict) + for read_group in snaptools.snap.group_reads_by_barcode(input_file, file_format): - frag_list = []; + frag_list = [] if file_format == "bam": - for (read1, read2, is_paired, is_secondary) in pairReadsByName(read_group): + for (read1, read2, is_paired, is_secondary) in pairReadsByName(read_group): if is_paired: - frag = snaptools.snap.readPairToFragment(read1, read2, is_secondary); - else: # supplementary alignments or unmated reads; - frag = snaptools.snap.readToFragment(read1, is_secondary); + frag = snaptools.snap.readPairToFragment(read1, read2, is_secondary) + else: # supplementary alignments or unmated reads; + frag = snaptools.snap.readToFragment(read1, is_secondary) # extract the barcode - barcode = frag.qname.split(":")[0].upper(); + barcode = frag.qname.split(":")[0].upper() # only for printing the progress - check_point += 1; - if verbose and check_point%100000 == 0: - print(("%d\ttags, %s seconds " % (check_point, time.time() - start_time))); - # skip this barcode if it is not in the barcode list - if barcode not in barcode_dict: break + check_point += 1 + if verbose and check_point % 100000 == 0: + print( + ( + "%d\ttags, %s seconds " + % (check_point, time.time() - start_time) + ) + ) + # skip this barcode if it is not in the barcode list + if barcode not in barcode_dict: + break # total number of sequencing fragments (exclude supplementary alignments) if frag.is_secondary == False: - qc_dict[barcode].total += 1; - ## 1. Filter non-uniquely mapped fragments - if frag.mapq < min_mapq: continue; - qc_dict[barcode].mapped += 1; + qc_dict[barcode].total += 1 + # 1. Filter non-uniquely mapped fragments + if frag.mapq < min_mapq: + continue + qc_dict[barcode].mapped += 1 # 2. if it is single read, keep it only if keep_unpaired is true # if it is paired, keep it only if it is approperly paired - if frag.is_single == True: + if frag.is_single == True: if frag.is_secondary: qc_dict[barcode].secondary += 1 - if not keep_secondary: continue; + if not keep_secondary: + continue else: - qc_dict[barcode].single += 1; - if not keep_single: continue; - else: # paired-end reads - qc_dict[barcode].paired += 1; + qc_dict[barcode].single += 1 + if not keep_single: + continue + else: # paired-end reads + qc_dict[barcode].paired += 1 if frag.is_proper_pair: - qc_dict[barcode].proper_paired += 1; + qc_dict[barcode].proper_paired += 1 else: - if not keep_discordant: continue; + if not keep_discordant: + continue # 3. check fragment size if frag.flen > min_flen and frag.flen < max_flen: - qc_dict[barcode].proper_flen += 1; + qc_dict[barcode].proper_flen += 1 else: continue # 4. combine single and paired as fragments - frag_list.append((frag.chrom, frag.pos, frag.pos+frag.flen, barcode)); + frag_list.append((frag.chrom, frag.pos, frag.pos + frag.flen, barcode)) else: for _read in read_group: - elems = _read.split(); - frag = fragment("NA", elems[0], int(elems[1]), int(elems[2]) - int(elems[1]), 60, False, False, True) + elems = _read.split() + frag = fragment( + "NA", + elems[0], + int(elems[1]), + int(elems[2]) - int(elems[1]), + 60, + False, + False, + True, + ) # extract the barcode - barcode = elems[3].split(":")[0].upper(); + barcode = elems[3].split(":")[0].upper() # only for printing the progress - check_point += 1; - if verbose and check_point%100000 == 0: - print(("%d\ttags, %s seconds " % (check_point, time.time() - start_time))); - # skip this barcode if it is not in the barcode list - if barcode not in barcode_dict: break; + check_point += 1 + if verbose and check_point % 100000 == 0: + print( + ( + "%d\ttags, %s seconds " + % (check_point, time.time() - start_time) + ) + ) + # skip this barcode if it is not in the barcode list + if barcode not in barcode_dict: + break # total number of sequencing fragments (exclude supplementary alignments) if frag.is_secondary == False: - qc_dict[barcode].total += 1; - ## 1. Filter non-uniquely mapped fragments - if frag.mapq < min_mapq: continue; - qc_dict[barcode].mapped += 1; + qc_dict[barcode].total += 1 + # 1. Filter non-uniquely mapped fragments + if frag.mapq < min_mapq: + continue + qc_dict[barcode].mapped += 1 # 2. if it is single read, keep it only if keep_unpaired is true # if it is paired, keep it only if it is approperly paired - if frag.is_single == True: + if frag.is_single == True: if frag.is_secondary: qcDict[barcode].secondary += 1 - if not keep_secondary: continue; + if not keep_secondary: + continue else: - qc_dict[barcode].single += 1; - if not keep_single: continue; - else: # paired-end reads - qc_dict[barcode].paired += 1; + qc_dict[barcode].single += 1 + if not keep_single: + continue + else: # paired-end reads + qc_dict[barcode].paired += 1 if frag.is_proper_pair: - qc_dict[barcode].proper_paired += 1; + qc_dict[barcode].proper_paired += 1 else: - if not keep_discordant: continue; + if not keep_discordant: + continue # 3. check fragment size if frag.flen > min_flen and frag.flen < max_flen: - qc_dict[barcode].proper_flen += 1; + qc_dict[barcode].proper_flen += 1 else: continue # 4. combine single and paired as fragments - frag_list.append((frag.chrom, frag.pos, frag.pos+frag.flen, barcode)); - - if barcode not in barcode_dict: continue + frag_list.append((frag.chrom, frag.pos, frag.pos + frag.flen, barcode)) + + if barcode not in barcode_dict: + continue # 5. remove duplicate fragments - qc_dict[barcode].usable = len(frag_list); - frag_list_uniq = set(frag_list); # remove duplicated fragments - qc_dict[barcode].uniq = len(frag_list_uniq); - - ## 6. keep mt reads if keep_chrm is true - if(len(frag_list_uniq) == 0): + qc_dict[barcode].usable = len(frag_list) + frag_list_uniq = set(frag_list) # remove duplicated fragments + qc_dict[barcode].uniq = len(frag_list_uniq) + + # 6. keep mt reads if keep_chrm is true + if len(frag_list_uniq) == 0: qc_dict[barcode].chrM = 0 continue for item in frag_list_uniq: if item[0] == "chrM": - qc_dict[barcode].chrM += 1 + qc_dict[barcode].chrM += 1 if not keep_chrm: - frag_list_uniq = [item for item in frag_list_uniq if item[0] != "chrM"]; + frag_list_uniq = [item for item in frag_list_uniq if item[0] != "chrM"] # just in case the only fragments are chrM - qc_dict[barcode].final = len(frag_list_uniq); - if len(frag_list_uniq) == 0: continue; - - fragment_chrom_ds.resize(fragment_chrom_ds.shape[0]+len(frag_list_uniq), axis=0); - fragment_start_ds.resize(fragment_start_ds.shape[0]+len(frag_list_uniq), axis=0); - fragment_len_ds.resize(fragment_len_ds.shape[0]+len(frag_list_uniq), axis=0); - - fragment_chrom_ds[-len(frag_list_uniq):] = [item[0] for item in frag_list_uniq]; - fragment_start_ds[-len(frag_list_uniq):] = [item[1] for item in frag_list_uniq]; - fragment_len_ds[-len(frag_list_uniq):] = [item[2] - item[1] for item in frag_list_uniq]; - - del frag_list, frag_list_uniq; - + qc_dict[barcode].final = len(frag_list_uniq) + if len(frag_list_uniq) == 0: + continue + + fragment_chrom_ds.resize( + fragment_chrom_ds.shape[0] + len(frag_list_uniq), axis=0 + ) + fragment_start_ds.resize( + fragment_start_ds.shape[0] + len(frag_list_uniq), axis=0 + ) + fragment_len_ds.resize(fragment_len_ds.shape[0] + len(frag_list_uniq), axis=0) + + fragment_chrom_ds[-len(frag_list_uniq) :] = [item[0] for item in frag_list_uniq] + fragment_start_ds[-len(frag_list_uniq) :] = [item[1] for item in frag_list_uniq] + fragment_len_ds[-len(frag_list_uniq) :] = [ + item[2] - item[1] for item in frag_list_uniq + ] + + del frag_list, frag_list_uniq + # genome information - f.create_dataset("HD/SQ/ID", data = genome_name); - f.create_dataset("HD/SQ/SN", data = [np.string_(item) for item in list(genome_dict.keys())]); - f.create_dataset("HD/SQ/SL", data = list(genome_dict.values())); + f.create_dataset("HD/SQ/ID", data=genome_name) + f.create_dataset( + "HD/SQ/SN", data=[np.string_(item) for item in list(genome_dict.keys())] + ) + f.create_dataset("HD/SQ/SL", data=list(genome_dict.values())) # barcode - dt = h5py.special_dtype(vlen=bytes); - f.create_dataset("BD/name",data=[np.string_(item) for item in list(barcode_dict.keys())], dtype=h5py.special_dtype(vlen=bytes), compression="gzip", compression_opts=9); - f.create_dataset("BD/TN", data=[qc_dict[key].total for key in barcode_dict], dtype="uint32"); - f.create_dataset("BD/UM", data=[qc_dict[key].mapped for key in barcode_dict], dtype="uint32"); - f.create_dataset("BD/SE", data=[qc_dict[key].single for key in barcode_dict], dtype="uint32"); - f.create_dataset("BD/SA", data=[qc_dict[key].secondary for key in barcode_dict], dtype="uint32"); - f.create_dataset("BD/PE", data=[qc_dict[key].paired for key in barcode_dict], dtype="uint32"); - f.create_dataset("BD/PP", data=[qc_dict[key].proper_paired for key in barcode_dict], dtype="uint32"); - f.create_dataset("BD/PL", data=[qc_dict[key].proper_flen for key in barcode_dict], dtype="uint32"); - f.create_dataset("BD/US", data=[qc_dict[key].usable for key in barcode_dict], dtype="uint32"); - f.create_dataset("BD/UQ", data=[qc_dict[key].uniq for key in barcode_dict], dtype="uint32"); - f.create_dataset("BD/CM", data=[qc_dict[key].chrM for key in barcode_dict], dtype="uint32"); - barcode_frag_num = [qc_dict[barcode].final for barcode in barcode_dict]; - barcode_frag_start = np.cumsum(barcode_frag_num) - barcode_frag_num; - f.create_dataset("FM/barcodePos", data=barcode_frag_start + 1, dtype="uint32"); - f.create_dataset("FM/barcodeLen", data=barcode_frag_num, dtype="uint32"); - f.close() + dt = h5py.special_dtype(vlen=bytes) + f.create_dataset( + "BD/name", + data=[np.string_(item) for item in list(barcode_dict.keys())], + dtype=h5py.special_dtype(vlen=bytes), + compression="gzip", + compression_opts=9, + ) + f.create_dataset( + "BD/TN", data=[qc_dict[key].total for key in barcode_dict], dtype="uint32" + ) + f.create_dataset( + "BD/UM", data=[qc_dict[key].mapped for key in barcode_dict], dtype="uint32" + ) + f.create_dataset( + "BD/SE", data=[qc_dict[key].single for key in barcode_dict], dtype="uint32" + ) + f.create_dataset( + "BD/SA", data=[qc_dict[key].secondary for key in barcode_dict], dtype="uint32" + ) + f.create_dataset( + "BD/PE", data=[qc_dict[key].paired for key in barcode_dict], dtype="uint32" + ) + f.create_dataset( + "BD/PP", + data=[qc_dict[key].proper_paired for key in barcode_dict], + dtype="uint32", + ) + f.create_dataset( + "BD/PL", data=[qc_dict[key].proper_flen for key in barcode_dict], dtype="uint32" + ) + f.create_dataset( + "BD/US", data=[qc_dict[key].usable for key in barcode_dict], dtype="uint32" + ) + f.create_dataset( + "BD/UQ", data=[qc_dict[key].uniq for key in barcode_dict], dtype="uint32" + ) + f.create_dataset( + "BD/CM", data=[qc_dict[key].chrM for key in barcode_dict], dtype="uint32" + ) + barcode_frag_num = [qc_dict[barcode].final for barcode in barcode_dict] + barcode_frag_start = np.cumsum(barcode_frag_num) - barcode_frag_num + f.create_dataset("FM/barcodePos", data=barcode_frag_start + 1, dtype="uint32") + f.create_dataset("FM/barcodeLen", data=barcode_frag_num, dtype="uint32") + f.close() if qc_file: - with open(output_snap+".qc", "w") as fout: - fout.write("Total number of unique barcodes: %d\n"%num_barcode) - fout.write("TN - Total number of fragments: %d\n"%sum([qc_dict[key].total for key in qc_dict])) - fout.write("UM - Total number of uniquely mapped: %d\n"%sum([qc_dict[key].mapped for key in qc_dict])) - fout.write("SE - Total number of single ends: %d\n"%sum([qc_dict[key].single for key in qc_dict])) - fout.write("SA - Total number of secondary alignments: %d\n"%sum([qc_dict[key].secondary for key in qc_dict])) - fout.write("PE - Total number of paired ends: %d\n"%sum([qc_dict[key].paired for key in qc_dict])) - fout.write("PP - Total number of proper paired: %d\n"%sum([qc_dict[key].proper_paired for key in qc_dict])) - fout.write("PL - Total number of proper frag len: %d\n"%sum([qc_dict[key].proper_flen for key in qc_dict])) - fout.write("US - Total number of usable fragments: %d\n"%sum([qc_dict[key].usable for key in qc_dict])) - fout.write("UQ - Total number of unique fragments: %d\n"%sum([qc_dict[key].uniq for key in qc_dict])) - fout.write("CM - Total number of chrM fragments: %d\n"%sum([qc_dict[key].chrM for key in qc_dict])) + with open(output_snap + ".qc", "w") as fout: + fout.write( + "Total number of unique barcodes: %d\n" % num_barcode + ) + fout.write( + "TN - Total number of fragments: %d\n" + % sum([qc_dict[key].total for key in qc_dict]) + ) + fout.write( + "UM - Total number of uniquely mapped: %d\n" + % sum([qc_dict[key].mapped for key in qc_dict]) + ) + fout.write( + "SE - Total number of single ends: %d\n" + % sum([qc_dict[key].single for key in qc_dict]) + ) + fout.write( + "SA - Total number of secondary alignments: %d\n" + % sum([qc_dict[key].secondary for key in qc_dict]) + ) + fout.write( + "PE - Total number of paired ends: %d\n" + % sum([qc_dict[key].paired for key in qc_dict]) + ) + fout.write( + "PP - Total number of proper paired: %d\n" + % sum([qc_dict[key].proper_paired for key in qc_dict]) + ) + fout.write( + "PL - Total number of proper frag len: %d\n" + % sum([qc_dict[key].proper_flen for key in qc_dict]) + ) + fout.write( + "US - Total number of usable fragments: %d\n" + % sum([qc_dict[key].usable for key in qc_dict]) + ) + fout.write( + "UQ - Total number of unique fragments: %d\n" + % sum([qc_dict[key].uniq for key in qc_dict]) + ) + fout.write( + "CM - Total number of chrM fragments: %d\n" + % sum([qc_dict[key].chrM for key in qc_dict]) + ) return 0 - diff --git a/snaptools/utilities.py b/snaptools/utilities.py index 2c8f104..aa98ef1 100755 --- a/snaptools/utilities.py +++ b/snaptools/utilities.py @@ -1,55 +1,68 @@ import os import sys -import gzip +import gzip import bz2 import pysam import argparse import pybedtools import re -import collections +import collections + def file_type(filename): - if filename.endswith('.gz'): + if filename.endswith(".gz"): return "gz" - if filename.endswith('.bz2'): + if filename.endswith(".bz2"): return "bz2" return "txt" - + + def str2bool(v): - if v.lower() in ('yes', 'true', 't', 'y', '1'): + if v.lower() in ("yes", "true", "t", "y", "1"): return True - elif v.lower() in ('no', 'false', 'f', 'n', '0'): + elif v.lower() in ("no", "false", "f", "n", "0"): return False else: - raise argparse.ArgumentTypeError('Boolean value expected.') + raise argparse.ArgumentTypeError("Boolean value expected.") + def checkFileFormat(fname): """Check if a file format based on its extension. - + Args: fname: file name Return: file format bed, bam or snap """ - - if fname.endswith('.bed') or fname.endswith('.bed.gz') or fname.endswith('.bed.bz2'): + + if ( + fname.endswith(".bed") + or fname.endswith(".bed.gz") + or fname.endswith(".bed.bz2") + ): if is_bed(fname): - return "bed"; - elif fname.endswith('.bam') or fname.endswith('.sam'): + return "bed" + elif fname.endswith(".bam") or fname.endswith(".sam"): if is_bam(fname): - return "bam"; - elif fname.endswith('.snap'): - return "snap"; - elif fname.endswith('.gtf.gz') or fname.endswith('.gtf') or fname.endswith('.gff') or fname.endswith('.gff.gz'): + return "bam" + elif fname.endswith(".snap"): + return "snap" + elif ( + fname.endswith(".gtf.gz") + or fname.endswith(".gtf") + or fname.endswith(".gff") + or fname.endswith(".gff.gz") + ): if is_gff(fname): - return "gtf"; + return "gtf" else: return "NA" - + + def is_bed(fname): """Check if a given file is a bed file - + Args: fname: a bed file name @@ -58,7 +71,7 @@ def is_bed(fname): """ try: # open fname using pysam - fileType = pybedtools.BedTool(fname).file_type; + fileType = pybedtools.BedTool(fname).file_type if fileType == "bed": return True except IndexError as e: @@ -66,9 +79,10 @@ def is_bed(fname): return False return False + def is_gff(fname): """Check if a given file is a gff file - + Args: fname: a gff file name @@ -77,7 +91,7 @@ def is_gff(fname): """ try: # open fname using pysam - fileType = pybedtools.BedTool(fname).file_type; + fileType = pybedtools.BedTool(fname).file_type if fileType == "gff": return True except IndexError as e: @@ -85,9 +99,10 @@ def is_gff(fname): return False return False + def is_bam(fname): """Check if a given file is a bam file - + Args: fname: file name @@ -95,7 +110,7 @@ def is_bam(fname): True if file is a bam file, otherwise False """ try: - fileType = pybedtools.BedTool(fname).file_type; + fileType = pybedtools.BedTool(fname).file_type if fileType == "bam": return True except ValueError as e: @@ -103,9 +118,10 @@ def is_bam(fname): return False return True -def isQuerynameSorted(fname, file_type): + +def isQuerynameSorted(fname, file_type): """Check if a given file file is sorted based on the queryname. - + Args: fname: a input file name @@ -113,16 +129,17 @@ def isQuerynameSorted(fname, file_type): True if fname is a bam file sorted by read name, otherwise False """ if file_type == "bam": - return(isBamQuerynameSorted(fname)) + return isBamQuerynameSorted(fname) elif file_type == "bed": - return(isBedQuerynameSorted(fname)) + return isBedQuerynameSorted(fname) else: - print(('error: %s is not a supported file format!' % file_type)); + print(("error: %s is not a supported file format!" % file_type)) sys.exit(1) - + + def isBamQuerynameSorted(fname): """Check if a given bam file is sorted based on the queryname. - + Args: fname: bam file name @@ -130,53 +147,56 @@ def isBamQuerynameSorted(fname): True if fname is a bam file sorted by read name, otherwise False """ if not is_bam(fname): - print(("Error @is_sorted_queryname: " + fname + " is not a bam/sam file!")); - return False; - samfile = pysam.AlignmentFile(fname, "rb"); + print(("Error @is_sorted_queryname: " + fname + " is not a bam/sam file!")) + return False + samfile = pysam.AlignmentFile(fname, "rb") flag = False - header = samfile.header; - if("HD" in header): - if("SO" in header["HD"]): - if(header["HD"]["SO"] == "queryname"): + header = samfile.header + if "HD" in header: + if "SO" in header["HD"]: + if header["HD"]["SO"] == "queryname": flag = True - samfile.close(); + samfile.close() return flag + def isBedQuerynameSorted(fname): """Check if a given bed file is sorted based on the queryname (the 4th column). - + Args: ----- fname: a bed file name Returns: ----- - True if fname is a bed file sorted by read name, otherwise False + True if fname is a bed file sorted by read name, otherwise False """ - + if not is_bed(fname): - print(("Error @is_bed_queryname_sorted: " + fname + " is not a bed file!")); - return False; - + print(("Error @is_bed_queryname_sorted: " + fname + " is not a bed file!")) + return False + # read the first 1 million reads and see if the read name is sorted - fin = pybedtools.BedTool(fname); - flag = True; + fin = pybedtools.BedTool(fname) + flag = True num_instance = 0 for item in fin: - cur_barcode = str(item).split()[3]; + cur_barcode = str(item).split()[3] if num_instance == 0: - prev_barcode = cur_barcode; - num_instance += 1; - if num_instance >= 1000000: break; - if cur_barcode < prev_barcode: - flag = False; - break; + prev_barcode = cur_barcode + num_instance += 1 + if num_instance >= 1000000: + break + if cur_barcode < prev_barcode: + flag = False + break del fin return flag + def read_genome_size_from_bam(fname): """Read genome information from bam file header. - + Args: fname: bam file @@ -184,29 +204,31 @@ def read_genome_size_from_bam(fname): A dictionary contains SQ as key and SL as value, otherwise None """ # first check if fname is a bam file - if not is_bam(fname): - print(("Error @read_genome_size_from_bam: " + fname + " is not a bam file!")); - return None; - samfile = pysam.AlignmentFile(fname, "rb"); - header = samfile.header; - res = None; + if not is_bam(fname): + print(("Error @read_genome_size_from_bam: " + fname + " is not a bam file!")) + return None + samfile = pysam.AlignmentFile(fname, "rb") + header = samfile.header + res = None try: - header["SQ"]; - res = dict([(item["SN"], item["LN"]) for item in header["SQ"]]); + header["SQ"] + res = dict([(item["SN"], item["LN"]) for item in header["SQ"]]) except KeyError as e: res = None samfile.close() return res + def chunks(l, n): """Yield successive n-sized chunks from l.""" for i in range(0, len(l), n): - yield l[i:i + n] + yield l[i : i + n] + def readGenomeSizeFromTxt(fname): """ Read genome information. - + Args: ----- fname: a txt file contains genome information @@ -216,30 +238,31 @@ def readGenomeSizeFromTxt(fname): A dictionary contains SQ as key and SL as value, otherwise None """ # first check if fname is a bam file - res = dict(); + res = dict() with open(fname) as fin: for line in fin: - elems = line.split(); - chr_name = elems[0]; - chr_len = int(elems[1]); - res[chr_name] = chr_len; + elems = line.split() + chr_name = elems[0] + chr_len = int(elems[1]) + res[chr_name] = chr_len return res + def readAlignmentInfo(fname, file_type): """ Read the alingment information from a bam or bed file. - + Args: ----- fname: a bam or bed file. - + file_type: file format. - + Returns: ----- - A dictionary contains alingment details. + A dictionary contains alingment details. """ - + if file_type == "bam": res = readAlignmentInfoFromBam(fname) elif file_type == "bed": @@ -249,17 +272,22 @@ def readAlignmentInfo(fname, file_type): res["VN"] = "NA" res["CL"] = "NA" else: - print(('error: %s is not a supported file format!' % file_type)); + print(("error: %s is not a supported file format!" % file_type)) sys.exit(1) - if "PN" not in res: res["PN"] = "NA"; - if "ID" not in res: res["ID"] = "NA"; - if "VN" not in res: res["VN"] = "NA"; - if "CL" not in res: res["CL"] = "NA"; + if "PN" not in res: + res["PN"] = "NA" + if "ID" not in res: + res["ID"] = "NA" + if "VN" not in res: + res["VN"] = "NA" + if "CL" not in res: + res["CL"] = "NA" return res + def readAlignmentInfoFromBam(fname): """Read alingment info from bam file header. - + Args: ----- fname: a bam file @@ -268,25 +296,26 @@ def readAlignmentInfoFromBam(fname): ----- A dictionary contains alingment info, otherwise None """ - if not is_bam(fname): - print(("Error @read_alignment_info: " + fname + " is not a bam file!")); - return None; + if not is_bam(fname): + print(("Error @read_alignment_info: " + fname + " is not a bam file!")) + return None - samfile = pysam.AlignmentFile(fname, "rb"); - header = samfile.header; - res = None; + samfile = pysam.AlignmentFile(fname, "rb") + header = samfile.header + res = None try: - header["PG"]; + header["PG"] res = header["PG"][0] except KeyError as e: - print(("error: failed to read alingment info from bam file")); + print(("error: failed to read alingment info from bam file")) sys.exit(1) samfile.close() return res - + + def minEditDistanceString(s1, S): """ Find string in list S that has the smallest edit distance to s1 - + Args: s1: inqury string S: A list of strings @@ -299,40 +328,42 @@ def minEditDistanceString(s1, S): for s2 in S: d = editDistance(s1, s2) if d < d_min: - d_min = d; + d_min = d # second find all strings in S that have d_min - s_min = []; + s_min = [] for s2 in S: d = editDistance(s1, s2) if d == d_min: s_min.append(s2) return s_min + def getBinsFromGenomeSize(genome_dict, bin_size): """Create a dictionary contains all bins of the same size across the genome - + Attributes: binSize: bin size (i.e. 5000) - + genomeDict: a dictionary contains chromosome sizes - + Return: A dictionary contains all bins and its index (start from 1) """ - bin_dict = collections.OrderedDict(); - i = 1; + bin_dict = collections.OrderedDict() + i = 1 for _chrom in genome_dict: - for _start in range(1, genome_dict[_chrom], bin_size): - _end = min(_start + bin_size - 1, genome_dict[_chrom]); - _binId = (_chrom , _start, _end); - bin_dict[_binId] = i; - i = i +1; - return bin_dict; + for _start in range(1, genome_dict[_chrom], bin_size): + _end = min(_start + bin_size - 1, genome_dict[_chrom]) + _binId = (_chrom, _start, _end) + bin_dict[_binId] = i + i = i + 1 + return bin_dict + def editDistance(s1, s2): """ Calculate edit distance between 2 strings - + Args: s1: string 1 s2: string 2 @@ -348,11 +379,13 @@ def editDistance(s1, s2): distances = list(range(len(s1) + 1)) for i2, c2 in enumerate(s2): - distances_ = [i2+1] + distances_ = [i2 + 1] for i1, c1 in enumerate(s1): if c1 == c2: distances_.append(distances[i1]) else: - distances_.append(1 + min((distances[i1], distances[i1 + 1], distances_[-1]))) + distances_.append( + 1 + min((distances[i1], distances[i1 + 1], distances_[-1])) + ) distances = distances_ return distances[-1] diff --git a/test.py b/test.py index 6545eee..a4e2663 100644 --- a/test.py +++ b/test.py @@ -1,5 +1,4 @@ import snaptools -if __name__ == '__main__': +if __name__ == "__main__": print "Success" -