diff --git a/repo_utils/sub_tests/stratify.sh b/repo_utils/sub_tests/stratify.sh index 812f0a0c..b7f3257a 100644 --- a/repo_utils/sub_tests/stratify.sh +++ b/repo_utils/sub_tests/stratify.sh @@ -1,7 +1,7 @@ # ------------------------------------------------------------ # stratify # ------------------------------------------------------------ -run test_stratify $truv stratify -w \ +run test_stratify $truv stratify \ $INDIR/beds/include.bed \ $INDIR/variants/input1.vcf.gz \ -o $OD/stratify.txt diff --git a/truvari/stratify.py b/truvari/stratify.py index 60c81b5c..7535554a 100644 --- a/truvari/stratify.py +++ b/truvari/stratify.py @@ -1,5 +1,5 @@ """ -Count variants per-region in vcf +Count variants which start and end within each bed region """ import os import logging @@ -30,8 +30,8 @@ def parse_args(args): help="Output bed-like file") parser.add_argument("--header", action="store_true", help="Input regions have header to preserve in output") - parser.add_argument("-w", "--within", action="store_true", - help="Only count variants contained completely within region boundaries") + parser.add_argument("-v", "--complement", action="store_false", + help="Only count variants not within region boundaries") parser.add_argument("--debug", action="store_true", help="Verbose logging") args = parser.parse_args(args) @@ -62,7 +62,7 @@ def count_entries(vcf, chroms, regions, within): return counts -def benchdir_count_entries(benchdir, regions, within=False, threads=4): +def benchdir_count_entries(benchdir, regions, within=True, threads=4): """ Count the number of variants per bed region in Truvari bench directory by state @@ -95,12 +95,12 @@ def stratify_main(cmdargs): regions = pd.read_csv(args.regions, sep='\t', header=read_header) r_list = regions.to_numpy().tolist() # the methods expect lists if os.path.isdir(args.in_vcf): - counts = benchdir_count_entries(args.in_vcf, r_list, args.within) + counts = benchdir_count_entries(args.in_vcf, r_list, args.complement) else: chroms = np.array([_[0] for _ in r_list]) intvs = np.array([[_[1], _[2]] for _ in r_list]) counts = count_entries(pysam.VariantFile( - args.in_vcf), chroms, intvs, args.within) + args.in_vcf), chroms, intvs, args.complement) counts = pd.Series(counts, name="count").to_frame() counts.index = regions.index regions = regions.join(counts)