Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Separate output #43

Merged
merged 16 commits into from
Aug 20, 2021
35 changes: 31 additions & 4 deletions lusSTR/annot.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,20 @@ def sort_table(table):
return sorted_table


def indiv_files(table, input_dir, ext):
    """Write one output file per sample into a dedicated subdirectory.

    Parameters
    ----------
    table : pandas.DataFrame
        Combined results table; must contain a 'SampleID' column.
    input_dir : str
        Name of the subdirectory (created under 'Separated_lusstr_Files/')
        that will hold the per-sample files.
    ext : str
        Suffix (including extension) appended to each sample ID to form
        the output filename.
    """
    output_dir = f'Separated_lusstr_Files/{input_dir}'
    # makedirs with exist_ok=True replaces the mkdir/try-except idiom and
    # additionally tolerates a missing parent directory.
    os.makedirs(output_dir, exist_ok=True)
    for samp in table['SampleID'].unique():
        new_df = table[table['SampleID'] == samp]
        new_df.to_csv(f'{output_dir}/{samp}{ext}', sep='\t', index=False)


def main(args):
if args.separate and os.path.exists('Separated_lusstr_Files') is False:
os.mkdir('Separated_lusstr_Files')
output_name = os.path.splitext(args.out)[0]
input_name = os.path.splitext(args.input)[0]
autosomal_final_table, autosomal_flank_table, columns = format_table(
Expand All @@ -159,22 +172,36 @@ def main(args):
if args.combine:
if not sex_final_table.empty:
sex_final_table = combine_reads(sex_final_table, columns)
sex_final_table.to_csv(f'{output_name}_sexloci.txt', sep='\t', index=False)
if args.separate:
indiv_files(sex_final_table, input_name, '_sexloci.txt')
else:
sex_final_table.to_csv(f'{output_name}_sexloci.txt', sep='\t', index=False)
else:
if args.separate:
indiv_files(sex_final_table, input_name, '_sexloci_no_combined_reads.txt')
sex_final_table.to_csv(
f'{output_name}_sexloci_no_combined_reads.txt', index=False
)
else:
sex_final_table.to_csv(f'{output_name}_sexloci.txt', sep='\t', index=False)
if args.separate:
indiv_files(sex_final_table, input_name, '_sexloci.txt')
else:
sex_final_table.to_csv(f'{output_name}_sexloci.txt', sep='\t', index=False)
if not args.uas:
autosomal_flank_table.to_csv(f'{output_name}_flanks_anno.txt', sep='\t', index=False)
if args.combine:
if not autosomal_final_table.empty:
autosomal_final_table = combine_reads(autosomal_final_table, columns)
autosomal_final_table.to_csv(args.out, sep='\t', index=False)
if args.separate:
indiv_files(autosomal_final_table, input_name, '.txt')
else:
autosomal_final_table.to_csv(args.out, sep='\t', index=False)
else:
autosomal_final_table.to_csv(
f'{output_name}_no_combined_reads.txt', sep='\t', index=False
)
else:
autosomal_final_table.to_csv(args.out, sep='\t', index=False)
if args.separate:
indiv_files(autosomal_final_table, input_name, '.txt')
else:
autosomal_final_table.to_csv(args.out, sep='\t', index=False)
12 changes: 11 additions & 1 deletion lusSTR/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,9 @@ def annot_subparser(subparsers):
cli = subparsers.add_parser('annotate')
cli.add_argument(
'-o', '--out', metavar='FILE',
help='file to which output will be written; default is terminal (stdout)'
help='file to which output will be written; default is terminal (stdout). If the '
'--separate flag is used, this will be the name of the directory which the individual '
'files are written to.'
)
cli.add_argument(
'input', help='sample(s) in CSV format; first four columns must be Locus, NumReads, '
Expand All @@ -67,6 +69,10 @@ def annot_subparser(subparsers):
help='Use if including the X and Y STR markers. Separate reports for these markers '
'will be created.'
)
cli.add_argument(
'--separate', action='store_true',
help='This flag will result in the creation of individual output files per sample.'
)


def snps_subparser(subparsers):
Expand All @@ -93,6 +99,10 @@ def snps_subparser(subparsers):
'--uas', action='store_true',
help='Use if sequences have been run through the ForenSeq UAS.'
)
cli.add_argument(
'--separate', action='store_true',
help='This flag will result in the creation of individual output files per sample.'
)


mains = {
Expand Down
2 changes: 2 additions & 0 deletions lusSTR/format.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@ def uas_load(inpath, sexloci=False):
sex_strs = pd.DataFrame() if sexloci is True else None
files = glob.glob(os.path.join(inpath, '*.xlsx'))
for filename in sorted(files):
if 'Sample Details' not in filename:
continue
autodata, sexdata = uas_format(filename, sexloci)
auto_strs = auto_strs.append(autodata)
if sexloci is True:
Expand Down
40 changes: 30 additions & 10 deletions lusSTR/snps.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,8 +62,7 @@ def uas_format(infile, snp_type_arg):
complemented to be reported on the forward strand; and checks that the called allele is one of
two expected alleles for the SNP (and flags any SNP call which is unexpected).
'''
data = uas_load(infile, snp_type_arg)
data_filt = data.loc[data['Reads'] != 0].reset_index(drop=True)
data_filt = uas_load(infile, snp_type_arg).reset_index(drop=True)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok, this is where you're retaining alleles with 0 reads.

data_df = []
for j, row in data_filt.iterrows():
snpid = data_filt.iloc[j, 0]
Expand All @@ -74,13 +73,16 @@ def uas_format(infile, snp_type_arg):
forward_strand_allele = complement_base(uas_allele)
else:
forward_strand_allele = uas_allele
if forward_strand_allele in metadata['Alleles']:
if data_filt.loc[j, 'Typed Allele?'] == 'No':
flag = 'Contains untyped allele'
Comment on lines +76 to +77
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does the Typed Allele? column refer to whether there were any reads for that allele?

In any case, space and punctuation in column names can be problematic. If you have just added the column in this PR, I'd recommend using IsTyped instead, and boolean values (True/False) rather than "Yes"/"No" strings.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Or AlleleIsTyped or something.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The Typed Allele? column is from the Sample Details Report... it doesn't necessarily indicate an allele with 0 reads, but an allele with reads below the various thresholds, so can have a low number of reads as well as 0 (i.e. is the allele considered to be a real allele). The Yes/No is read directly from the Sample Details Report, so I'd prefer to leave that as is, but I can change the column name.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The Typed Allele? column is from the Sample Details Report

I see. Maybe worth just leaving it in then...

elif forward_strand_allele in metadata['Alleles']:
flag = ''
else:
flag = 'Allele call does not match expected allele!'
row_tmp = [
data_filt.iloc[j, 3], data_filt.iloc[j, 4], data_filt.iloc[j, 5], snpid,
data_filt.iloc[j, 1], forward_strand_allele, uas_allele, snp_type_dict[type], flag
data_filt.loc[j, 'SampleID'], data_filt.loc[j, 'Project'],
data_filt.loc[j, 'Analysis'], snpid, data_filt.loc[j, 'Reads'], forward_strand_allele,
uas_allele, snp_type_dict[type], flag
]
data_df.append(row_tmp)
data_final = pd.DataFrame(data_df, columns=[
Expand Down Expand Up @@ -133,10 +135,10 @@ def parse_snp_table_from_sheet(infile, sheet, snp_type_arg):
file = openpyxl.load_workbook(infile)
file_sheet = file[sheet]
table = pd.DataFrame(file_sheet.values)
offset = table[table.iloc[:, 0] == "Coverage Information"].index.tolist()[0]
offset = table[table.iloc[:, 0] == 'Coverage Information'].index.tolist()[0]
data = table.iloc[offset + 2:]
data.columns = table.iloc[offset + 1]
data = data[['Locus', 'Reads', 'Allele Name']]
data = data[['Locus', 'Reads', 'Allele Name', 'Typed Allele?']]
final_df = pd.DataFrame()
if snp_type_arg == 'all':
final_df = data
Expand Down Expand Up @@ -332,12 +334,30 @@ def snp_call_exception(seq, expected_size, metadata, base):
return base, flag


def indiv_files(table, input_dir, ext):
    """Write one output file per sample into a dedicated subdirectory.

    Parameters
    ----------
    table : pandas.DataFrame
        Combined SNP results table; must contain a 'SampleID' column.
    input_dir : str
        Name of the subdirectory (created under 'Separated_lusstr_Files/')
        that will hold the per-sample files.
    ext : str
        Suffix (including extension) appended to each sample ID to form
        the output filename.
    """
    output_dir = f'Separated_lusstr_Files/{input_dir}'
    # Removed stray debug print of output_dir; makedirs with exist_ok=True
    # replaces the mkdir/try-except idiom (as accepted in review) and also
    # tolerates a missing parent directory.
    os.makedirs(output_dir, exist_ok=True)
    for samp in table['SampleID'].unique():
        new_df = table[table['SampleID'] == samp]
        new_df.to_csv(f'{output_dir}/{samp}{ext}', sep='\t', index=False)


def main(args):
    """Entry point for the SNP formatting workflow.

    Formats SNP data from either the ForenSeq UAS (``args.uas``) or STRait
    Razor output, then writes results to ``args.out`` — or, when
    ``args.separate`` is set, to per-sample files in a subdirectory named
    after the output file's base name.
    """
    # Base name (extension stripped) is used both for the --separate
    # subdirectory and for the STRait Razor full-output filename.
    output_name = os.path.splitext(args.out)[0]
    if args.uas:
        results = uas_format(args.input, args.type)
        if args.separate:
            indiv_files(results, output_name, '.txt')
        else:
            results.to_csv(args.out, index=False, sep='\t')
    else:
        results, results_combined = strait_razor_format(args.input, args.type)
        # NOTE: removed a redundant re-computation of output_name here; it is
        # already assigned above before the branch.
        if args.separate:
            indiv_files(results_combined, output_name, '.txt')
        else:
            results_combined.to_csv(args.out, index=False, sep='\t')
        results.to_csv(f'{output_name}_full_output.txt', index=False, sep='\t')
Binary file not shown.
Loading