Skip to content

Commit

Permalink
Separate output (#43)
Browse files Browse the repository at this point in the history
* Updated README and test files with new col names

* Fixed bug with combining reads for sex chr STRs

* Updated test files and test for combining reads

* Updated README

* Added code to remove amelogenin sequences in annotate

* format command can take in single STRaitRazor file

* Updated cli descriptions

* Updated README

* Initial commit

* No longer remove SNPs with missing data

* updated annot script

* updated tests to accommodate not removing missing data

* Updated tests and added test for separating output files

* mkdir change

Co-authored-by: Rebecca Mitchell <[email protected]>
  • Loading branch information
rnmitchell and Rebecca Mitchell authored Aug 20, 2021
1 parent 4eaab0a commit 991761c
Show file tree
Hide file tree
Showing 10 changed files with 499 additions and 307 deletions.
35 changes: 31 additions & 4 deletions lusSTR/annot.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,20 @@ def sort_table(table):
return sorted_table


def indiv_files(table, input_dir, ext):
    '''
    Write one tab-separated output file per sample.

    The rows of `table` are grouped by the value of their `SampleID` column and
    each group is written to `Separated_lusstr_Files/<input_dir>/<SampleID><ext>`.

    :param table: pandas DataFrame containing a `SampleID` column
    :param input_dir: name of the subdirectory (under `Separated_lusstr_Files`)
        in which the per-sample files are placed
    :param ext: suffix, including the extension, appended to each sample name
    '''
    output_dir = f'Separated_lusstr_Files/{input_dir}'
    # makedirs (unlike mkdir) also creates any missing parent directories, so
    # this works when `Separated_lusstr_Files` has not been created yet or when
    # `input_dir` itself contains a path separator; an existing directory is
    # silently reused.
    os.makedirs(output_dir, exist_ok=True)
    for samp in table['SampleID'].unique():
        new_df = table[table['SampleID'] == samp]
        new_df.to_csv(f'{output_dir}/{samp}{ext}', sep='\t', index=False)


def main(args):
if args.separate and os.path.exists('Separated_lusstr_Files') is False:
os.mkdir('Separated_lusstr_Files')
output_name = os.path.splitext(args.out)[0]
input_name = os.path.splitext(args.input)[0]
autosomal_final_table, autosomal_flank_table, columns = format_table(
Expand All @@ -159,22 +172,36 @@ def main(args):
if args.combine:
if not sex_final_table.empty:
sex_final_table = combine_reads(sex_final_table, columns)
sex_final_table.to_csv(f'{output_name}_sexloci.txt', sep='\t', index=False)
if args.separate:
indiv_files(sex_final_table, input_name, '_sexloci.txt')
else:
sex_final_table.to_csv(f'{output_name}_sexloci.txt', sep='\t', index=False)
else:
if args.separate:
indiv_files(sex_final_table, input_name, '_sexloci_no_combined_reads.txt')
sex_final_table.to_csv(
f'{output_name}_sexloci_no_combined_reads.txt', index=False
)
else:
sex_final_table.to_csv(f'{output_name}_sexloci.txt', sep='\t', index=False)
if args.separate:
indiv_files(sex_final_table, input_name, '_sexloci.txt')
else:
sex_final_table.to_csv(f'{output_name}_sexloci.txt', sep='\t', index=False)
if not args.uas:
autosomal_flank_table.to_csv(f'{output_name}_flanks_anno.txt', sep='\t', index=False)
if args.combine:
if not autosomal_final_table.empty:
autosomal_final_table = combine_reads(autosomal_final_table, columns)
autosomal_final_table.to_csv(args.out, sep='\t', index=False)
if args.separate:
indiv_files(autosomal_final_table, input_name, '.txt')
else:
autosomal_final_table.to_csv(args.out, sep='\t', index=False)
else:
autosomal_final_table.to_csv(
f'{output_name}_no_combined_reads.txt', sep='\t', index=False
)
else:
autosomal_final_table.to_csv(args.out, sep='\t', index=False)
if args.separate:
indiv_files(autosomal_final_table, input_name, '.txt')
else:
autosomal_final_table.to_csv(args.out, sep='\t', index=False)
12 changes: 11 additions & 1 deletion lusSTR/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,9 @@ def annot_subparser(subparsers):
cli = subparsers.add_parser('annotate')
cli.add_argument(
'-o', '--out', metavar='FILE',
help='file to which output will be written; default is terminal (stdout)'
help='file to which output will be written; default is terminal (stdout). If the '
'--separate flag is used, this will be the name of the directory which the individual '
'files are written to.'
)
cli.add_argument(
'input', help='sample(s) in CSV format; first four columns must be Locus, NumReads, '
Expand All @@ -67,6 +69,10 @@ def annot_subparser(subparsers):
help='Use if including the X and Y STR markers. Separate reports for these markers '
'will be created.'
)
cli.add_argument(
'--separate', action='store_true',
help='This flag will result in the creation of individual output files per sample.'
)


def snps_subparser(subparsers):
Expand All @@ -93,6 +99,10 @@ def snps_subparser(subparsers):
'--uas', action='store_true',
help='Use if sequences have been run through the ForenSeq UAS.'
)
cli.add_argument(
'--separate', action='store_true',
help='This flag will result in the creation of individual output files per sample.'
)


mains = {
Expand Down
2 changes: 2 additions & 0 deletions lusSTR/format.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@ def uas_load(inpath, sexloci=False):
sex_strs = pd.DataFrame() if sexloci is True else None
files = glob.glob(os.path.join(inpath, '*.xlsx'))
for filename in sorted(files):
if 'Sample Details' not in filename:
continue
autodata, sexdata = uas_format(filename, sexloci)
auto_strs = auto_strs.append(autodata)
if sexloci is True:
Expand Down
36 changes: 26 additions & 10 deletions lusSTR/snps.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,8 +62,7 @@ def uas_format(infile, snp_type_arg):
complemented to be reported on the forward strand; and checks that the called allele is one of
two expected alleles for the SNP (and flags any SNP call which is unexpected).
'''
data = uas_load(infile, snp_type_arg)
data_filt = data.loc[data['Reads'] != 0].reset_index(drop=True)
data_filt = uas_load(infile, snp_type_arg).reset_index(drop=True)
data_df = []
for j, row in data_filt.iterrows():
snpid = data_filt.iloc[j, 0]
Expand All @@ -74,13 +73,16 @@ def uas_format(infile, snp_type_arg):
forward_strand_allele = complement_base(uas_allele)
else:
forward_strand_allele = uas_allele
if forward_strand_allele in metadata['Alleles']:
if data_filt.loc[j, 'Typed Allele?'] == 'No':
flag = 'Contains untyped allele'
elif forward_strand_allele in metadata['Alleles']:
flag = ''
else:
flag = 'Allele call does not match expected allele!'
row_tmp = [
data_filt.iloc[j, 3], data_filt.iloc[j, 4], data_filt.iloc[j, 5], snpid,
data_filt.iloc[j, 1], forward_strand_allele, uas_allele, snp_type_dict[type], flag
data_filt.loc[j, 'SampleID'], data_filt.loc[j, 'Project'],
data_filt.loc[j, 'Analysis'], snpid, data_filt.loc[j, 'Reads'], forward_strand_allele,
uas_allele, snp_type_dict[type], flag
]
data_df.append(row_tmp)
data_final = pd.DataFrame(data_df, columns=[
Expand Down Expand Up @@ -133,10 +135,10 @@ def parse_snp_table_from_sheet(infile, sheet, snp_type_arg):
file = openpyxl.load_workbook(infile)
file_sheet = file[sheet]
table = pd.DataFrame(file_sheet.values)
offset = table[table.iloc[:, 0] == "Coverage Information"].index.tolist()[0]
offset = table[table.iloc[:, 0] == 'Coverage Information'].index.tolist()[0]
data = table.iloc[offset + 2:]
data.columns = table.iloc[offset + 1]
data = data[['Locus', 'Reads', 'Allele Name']]
data = data[['Locus', 'Reads', 'Allele Name', 'Typed Allele?']]
final_df = pd.DataFrame()
if snp_type_arg == 'all':
final_df = data
Expand Down Expand Up @@ -332,12 +334,26 @@ def snp_call_exception(seq, expected_size, metadata, base):
return base, flag


def indiv_files(table, input_dir, ext):
    '''
    Split a table into one tab-delimited file per sample.

    For every distinct value in the `SampleID` column of `table`, the matching
    rows are written to `Separated_lusstr_Files/<input_dir>/<SampleID><ext>`.
    The target directory (and any missing parents) is created if needed.

    :param table: pandas DataFrame containing a `SampleID` column
    :param input_dir: subdirectory name under `Separated_lusstr_Files`
    :param ext: suffix, including the extension, appended to each sample name
    '''
    target = f'Separated_lusstr_Files/{input_dir}'
    os.makedirs(target, exist_ok=True)
    sample_ids = table['SampleID']
    for sample in sample_ids.unique():
        # Boolean mask selects only the rows belonging to the current sample.
        per_sample = table.loc[sample_ids == sample]
        per_sample.to_csv(f'{target}/{sample}{ext}', index=False, sep='\t')


def main(args):
    '''
    Format the SNP input and write the results to disk.

    Reads `args.input`, `args.type`, `args.out`, `args.uas` and `args.separate`.
    The primary result table is written either as a single tab-separated file
    at `args.out` or, when `args.separate` is set, as one file per sample via
    `indiv_files` (using `args.out` without its extension as the directory
    name). For non-UAS input, the full output table is additionally written to
    `<args.out without extension>_full_output.txt`.

    :param args: parsed command-line arguments
    '''
    output_name = os.path.splitext(args.out)[0]
    if args.uas:
        results = uas_format(args.input, args.type)
        # Write exactly once: per-sample files OR the single combined file.
        if args.separate:
            indiv_files(results, output_name, '.txt')
        else:
            results.to_csv(args.out, index=False, sep='\t')
    else:
        results, results_combined = strait_razor_format(args.input, args.type)
        if args.separate:
            indiv_files(results_combined, output_name, '.txt')
        else:
            results_combined.to_csv(args.out, index=False, sep='\t')
        # The full output table is always written as a single file.
        results.to_csv(f'{output_name}_full_output.txt', index=False, sep='\t')
Binary file not shown.
Loading

0 comments on commit 991761c

Please sign in to comment.