From f1474e472ce2ac84d35c8aa1a0185c227e351a23 Mon Sep 17 00:00:00 2001 From: huangyh09 Date: Sat, 5 Oct 2019 12:20:10 +0100 Subject: [PATCH] v0.1.7: support donor VCF with different formats for variants --- doc/manual.rst | 9 +++++++-- doc/release.rst | 4 ++++ vireoSNP/utils/vcf_utils.py | 22 ++++++++++++---------- vireoSNP/version.py | 2 +- vireoSNP/vireo.py | 4 ++-- 5 files changed, 26 insertions(+), 15 deletions(-) diff --git a/doc/manual.rst b/doc/manual.rst index fbc36ce..565f5be 100644 --- a/doc/manual.rst +++ b/doc/manual.rst @@ -32,7 +32,8 @@ to demultiplex scRNA-seq data. vireo -c $CELL_DATA -N $n_donor -o $OUT_DIR -2) with genotype for all samples (GT, GP, or PL) +2) with genotype for all samples (genoTag: GT, GP, or PL; default is PL, please + choose a tag that exists in the donor VCF) :: @@ -41,6 +42,10 @@ to demultiplex scRNA-seq data. Optionally, `-N` can be provided if it is samller than that in DONOR_GT_FILE for finding the relevant subset of donors. + **Note** For efficient loading of the donor VCF file, we recommend subsetting it with + ``bcftools view donor.vcf.gz -R cellSNP.cells.vcf.gz -Oz -o sub.vcf.gz``. + Also, add ``-s`` or ``-S`` for subsetting samples. 
+ 3) with genotype for part of the samples (n_donor is larger than that in DONOR_GT_FILE) @@ -63,7 +68,7 @@ Viroe supports the cell data in three formats: Vireo full arguments -==================== +-------------------- Type ``vireo -h`` for details of all arguments: diff --git a/doc/release.rst b/doc/release.rst index b4e1445..b7e3b04 100644 --- a/doc/release.rst +++ b/doc/release.rst @@ -2,6 +2,10 @@ History ======= +Release v0.1.7 (05/10/2019) +=========================== +* Support donor genotype vcf file with different FORMAT for different variants + Release v0.1.6 (05/10/2019) =========================== * Fix a bug when variants in donor genotype are not in cell vcf file diff --git a/vireoSNP/utils/vcf_utils.py b/vireoSNP/utils/vcf_utils.py index 284a7c6..199697d 100644 --- a/vireoSNP/utils/vcf_utils.py +++ b/vireoSNP/utils/vcf_utils.py @@ -8,7 +8,7 @@ import subprocess import numpy as np -def parse_sample_info(sample_dat, sparse=True): +def parse_sample_info(sample_dat, sparse=True, format_list=None): """ Parse genotype information for each sample Note, it requires the format for each variants to @@ -19,18 +19,19 @@ def parse_sample_info(sample_dat, sparse=True): # require the same format for all variants format_all = [x[0].split(":") for x in sample_dat] - format_list = format_all[0] + if format_list is None: + format_list = format_all[0] - ## sparse matrix requires all keys - format_set_all = [set(x) for x in format_all] - if format_set_all.count(set(format_all[0])) != len(format_all): - print("Error: require the same format for all variants.") - exit() - RV = {} for _key in format_list: RV[_key] = [] if sparse: + ## sparse matrix requires all keys + format_set_all = [set(x) for x in format_all] + if format_set_all.count(set(format_list)) != len(format_all): + print("Error: require the same format for all variants.") + exit() + RV['indices'] = [] RV['indptr'] = [0] RV['shape'] = (len(sample_dat[0][1:]), len(sample_dat)) @@ -64,7 +65,8 @@ def 
parse_sample_info(sample_dat, sparse=True): return RV -def load_VCF(vcf_file, biallelic_only=False, load_sample=True, sparse=True): +def load_VCF(vcf_file, biallelic_only=False, load_sample=True, sparse=True, + format_list=None): """ Load whole VCF file ------------------- @@ -121,7 +123,7 @@ def load_VCF(vcf_file, biallelic_only=False, load_sample=True, sparse=True): RV["comments"] = comment_lines if load_sample: RV["samples"] = obs_ids - RV["GenoINFO"] = parse_sample_info(obs_dat, sparse=sparse) + RV["GenoINFO"] = parse_sample_info(obs_dat, sparse, format_list) return RV diff --git a/vireoSNP/version.py b/vireoSNP/version.py index 32efefd..283b03a 100644 --- a/vireoSNP/version.py +++ b/vireoSNP/version.py @@ -1 +1 @@ -__version__ = "0.1.6" \ No newline at end of file +__version__ = "0.1.7" \ No newline at end of file diff --git a/vireoSNP/vireo.py b/vireoSNP/vireo.py index 8a58feb..d46bd59 100644 --- a/vireoSNP/vireo.py +++ b/vireoSNP/vireo.py @@ -132,8 +132,8 @@ def main(): n_donor = options.n_donor if options.donor_file is not None: print("[vireo] Loading donor VCF file ...") - donor_vcf = load_VCF(options.donor_file, sparse=False, - biallelic_only=True) + donor_vcf = load_VCF(options.donor_file, biallelic_only=True, + sparse=False, format_list=[options.geno_tag]) if (options.geno_tag not in donor_vcf['GenoINFO']): print("[vireo] No " + options.geno_tag + " tag in donor genotype; " "please try another tag for genotype, e.g., GT")