Skip to content

Commit

Permalink
Fix chunk size (#10)
Browse files (browse the repository at this point in the history)
* Fix chunking of fastq files

* ignore vscode
  • Loading branch information
ndalchau authored Feb 17, 2023
1 parent 7a317bb commit 7ceb8ac
Show file tree
Hide file tree
Showing 6 changed files with 130 additions and 14 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -56,5 +56,7 @@ docs/_build/
# PyBuilder
target/

.vscode

*.sublime-*
.DS_Store
2 changes: 1 addition & 1 deletion abstar/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from Bio import BiopythonWarning
warnings.simplefilter('ignore', BiopythonWarning)

from .core.abstar import run, run_standalone, main, parse_arguments, validate_args
from .core.abstar import run, run_standalone, main, create_parser, validate_args
from .preprocess import fastqc, adapter_trim, quality_trim

from .version import __version__
26 changes: 14 additions & 12 deletions abstar/core/abstar.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@
import tempfile
import time
import traceback
from typing import Optional
import warnings
import shutil

Expand Down Expand Up @@ -81,7 +82,7 @@
#####################################################################


def parse_arguments(print_help=False):
def create_parser() -> ArgumentParser:
parser = ArgumentParser(prog='abstar', description="VDJ assignment and antibody sequence annotation. Scalable from a single sequence to billions of sequences.")
parser.add_argument('-p', '--project', dest='project_dir', default=None,
help="The data directory, where files will be downloaded (or have previously \
Expand Down Expand Up @@ -192,11 +193,8 @@ def parse_arguments(print_help=False):
Really only useful if you're using an old version of MongoDB.")
parser.add_argument('--quiet', dest='verbose', default=True, action='store_false',
help='If set, suppresses logging and printing progress to screen')
if print_help:
parser.print_help()
else:
args = parser.parse_args()
return args

return parser


class Args(object):
Expand Down Expand Up @@ -242,7 +240,7 @@ def validate_args(args):
if not any([args.project_dir,
args.sequences,
all([any([args.input, args.use_test_data]), args.output, args.temp])]):
parse_arguments(print_help=True)
create_parser().print_help()
sys.exit(1)
# alter output type if abstar is being run interactively
# if args.sequences:
Expand Down Expand Up @@ -597,10 +595,10 @@ def split_file(f, fmt, temp_dir, args):
# unless the input file is an exact multiple of args.chunksize,
# need to write the last few sequences to a split file.
if seq_counter:
file_counter += 1
out_file = os.path.join(temp_dir, '{}_{}'.format(out_prefix, file_counter))
open(out_file, 'w').write('\n' + '\n'.join(sequences))
open(out_file, 'w').write('\n'.join(sequences))
subfiles.append(out_file)
file_counter += 1
logger.info('SEQUENCES: {}'.format(total_seq_counter))
logger.info('JOBS: {}'.format(file_counter))
return subfiles, total_seq_counter
Expand Down Expand Up @@ -1201,9 +1199,13 @@ def main(args):
log_job_stats(seq_count, processed_seq_counts, start_time, vdj_end_time)
return output_files


if __name__ == '__main__':
def run_main(arg_list: Optional[list[str]] = None):
    """Parse command-line arguments, validate them, and run ``main``.

    Args:
        arg_list: Argument strings to parse instead of the process's own
            command line. ``None`` (the default) is handed to
            ``ArgumentParser.parse_args`` unchanged, which then falls back
            to ``sys.argv[1:]``.

    Returns:
        Whatever ``main(args)`` returns.

    Note:
        ``validate_args`` prints the parser help and calls ``sys.exit(1)``
        when the required input/output options are missing, so this
        function may terminate the process instead of returning.
    """
    # NOTE: this filter is process-wide; warnings stay suppressed after return.
    warnings.filterwarnings("ignore")
    # create_parser() is the single source of the CLI definition; parsing an
    # explicit list makes this entry point callable from tests.
    args = create_parser().parse_args(args=arg_list)
    validate_args(args)
    result = main(args)
    # Trailing blank lines separate the run's console output from the shell prompt.
    sys.stdout.write('\n\n')
    return result

if __name__ == '__main__':
run_main()
100 changes: 100 additions & 0 deletions abstar/test_data/test.fastq
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
@SRR3289700.1 1/1
CCACTNTGAAGATCCAGCGCACACAGCAGGAGGACTCCGCCGTGTATCTCTGTGCCAGCAGCTTAGGGCCAAGGCTAGCGGGGACAAATACGCAGTATTTTGGCAGTGGAACCCAGCTCTCTGTCT
+
AAAAA#EEEEEEEEEEEEEEEEEEEEEEEEEEEAEEEEEEEEEEE/EEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAEEEEEEEEEEEEEE/AAEEEEEEEEEEEEE
@SRR3289700.2 2/1
CTCTACTCTGAAGATCCAGCCTGCAGTTGCTGCTTGAGATCCTGTTGGTGACGGGTGGATCTGCAGTGTGCTTGGGCTGAGGCTCAGTGGCGGGGCATGCAGAGCCAGAGATCTGATGCTTTACCACGCTGTGCTGGCCTTGGGGGCTGGT
+
AAAAAEEEEEEEEAEEEEEAAE/<EEEEEEAAEEEEEEEEEEEEEEEEEEEE/EEEEEEEAEAEAEEEEEEEAEEAEEEEEEEEEEE<EEEEEEEAEEEAEAEEEEEAEEEEEEEEEAEEEEEAEAEEEEEA<EEEEAE<EA/AEE<AEEE
@SRR3289700.3 3/1
CGCTCAGGCTGGAGTCAGCCATTTTTGCCACCCAGTAAGAGAAAACATGGAATCTATGGGCTGAAAACGCTGTCCCCAGTGTTCTGGTGATTCTTGCCACCTGGGCGAGATTGCTTTTAATTAACCTCTAGGTGCTGACTTTATCAGCCTC
+
AAAAAEEEEAEEEEEE/EEEE/EEEEEEEAEEEEEEEEEEEE6EEEEEEEE6EEE6EEEEEAEEEE/EEEEEEEEEEEEEEAEEEEEEEEEEEEEEEE<EEEEEAEEEEE/AEEE<EEEEEEE/EEEEEEEEEEEEE<EAEEEEE<EE/EE
@SRR3289700.4 4/1
CCACTCTGAAGTTCCAGCGCACACAGCAGGAGGACTCGGCCGTGTATCTCTGTGCCAGCAGCTTAGAAGAGGGCCGTGGCACTGAAGCTTTCTTTGGACAAGGCACCAGACCTCACAGTTG
+
AAAAAEEEEEEEEEEEEEEEAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAEEEEEEEEEEEEEEEEEEEEEEEEEE/EEEEEEEEEEEEEE<EEEEEEEEEAEEEEEE
@SRR3289700.5 5/1
CCACTCTGACGATTCAGCCCTCAGAACCCAGGGACTCAGCTGTGTACTTCTGTGCCAGTACCCCCGGAGGGGGTTCTTCGGATTTTGGCCCAGGCACCCGGCTGACAGTGC
+
AAAAAEEEAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAEEEEEEEEEEEEAEEEEEE<EEEEEEEEEEAEEEEE<EAEEEEA<EEEEE<EEEAEEEEE<EEEE
@SRR3289700.6 6/1
TCGCTCAGGCTGGAGTCGGCCAGCACCAACCAGACATCTATGTACCTCTGTGCCAGCAACCCGGGGGGTTCCTATGGCTACACCTTCGGGCCGGGCACCAGACTCACAGTTG
+
AAAAAEEEEEEEEEEEEEEEEAEE/EEEEEEEEAEEEEE/EEEEEAEEEEEEEEEEEEEEEAEEEEEEEEEEEEEEEEEEE<EEEEEEEEEEEEEEEEEEE/EEE/EEEEAE
@SRR3289700.7 7/1
CCACTCTGAAGATCCAGCCTGCAGAGCTTGGGGACTCGGCCGTGTATCTCTGTGCCAGCAGCTTAGCGGGGGTGGATGAGCAGTTCTTCGGGCCAGGGACACGGCTCACCGTGCTGATCGGAAGAGACACGTCTGAACTCCAGTCACGTG
+
AAAAAEAEEEEEEAEEEEEEEEEEEEEEEEEAEEEEEEEEEEEEEEEEEAEEEEAEEEEEEEEEEEEEEEEEEAEEEEAAEEEEEEEE<EEEEEE<AEEEE<AE<EEAEEEEEAEEEEEEEEAEEAE<EEEEAEEE/EEE<EEE6A6AEA
@SRR3289700.8 8/1
CCACTCTGAAGTTCCAGCGCACACTGGCATAGAACGTGCAAAACCAACACAAGCTTTACAAATCAATGCAGCTCTTACTATATGGGACAGACTCACACTTCATCTTTAGGAATTGCCAGTCAGAGCCCAGTGTACTGACCCCATGACCCCT
+
AAAAAEEEEEEEEEEEEEEEEEEEEAEEEEEEEEEEAEEEEAEE<E/EEEEAEEAEEEEEEEEEEEEEEAEEEAEEE/<EAEEEAEA/EE/EEEAEA</EA/EEEEAEEEE/EEA<<EEE6EAEE/6EEEEAEEE6AE6<EAE6/EEAE<A
@SRR3289700.9 9/1
CACCTTGGAGATCCAGCCTGCAAAGCTTGAGGACTCGGCCGTGTATCTCTGTGCCAGCAGCTTACTCCGGGACAGAAGAGAGACCCAGTACTTCGGGCCAGGCACGCGGCTCCTGGTGC
+
AAAAAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAEEEAEEEEEEEEEEEEEEE
@SRR3289700.10 10/1
CACTCAGGCTGGTGTCGGCTCCTGGCACCCCGAAGAAGATCCTTTTCCTTTCTGAGTTGAATGAGGTTTAATCACCCTCATTACCTGGTTTACAGTGCCTCTGGGGTGCTTCAAAGCTCTCCTGGCACCAGCAGCGTGGGGAGCCCTCCCT
+
AAAAAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAEEAEE/AEAEEEEEEEEEEEEEEEEEEE/EEEEEEEEEEEEEEEEEEEEEEEEEA/EEEEEEEEE<EEEEEEAE<<A<A6
@SRR3289700.11 11/1
CTCTACTCTGAAGATCCAGCGCACAGGGTGGGGTTGGGAGGTTGGAGAGGGCAGTACCTGGTGTGGGCACGGACAGGGTCGGGGCGGCTCCGGGAATGACCGTGGTGGACAGAGGCCCAGGCACCCGGCTGACCGTAC
+
AAAAAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEE
@SRR3289700.12 12/1
CACTCTGACGATCCAGCGCACAGGCACGAATGAGTTCATGATGAAAGTGAAGTGCCAGGGCATGAGAGTGAAGTGCCGGGCAGGCAGTTCCCTCTGAGCAGCTGTGCCACTACCACTTGACACACACTACATGCTCCTGCAGCAGCCCTGT
+
AAAAAEE6EEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAEEAEEEEE/EEEEEEEEEE/AEEEEAEE/6/EEEEEEEEAEEAEEAEEEEEAEEEEEAAE
@SRR3289700.13 13/1
CCACTCTGACGATTCAGCGCACACAGCAGGAGGACTCGGCCGTGTATCTCTGTGCCAGCAGCTTAGTTACCGGGCTGGCCAAAAACATTCAGTACTTCGGGCCAGGGACACGGCTCACCGTGC
+
AAAAAEEEEEEAAEEEEEAE/EEEEEEEEEEE66AEEAEEEEAEEEEEEAE/E/AEEAAE/EE/EEA/EE<EEEEAEEEEAE/EEEEAEAE/EEEEEE/EE<EE/EAEEAEEEE6E/A<<6EE
@SRR3289700.14 14/1
CCACTCTGACGATTCAGCGCACAGAGCAGCGGGACTCAGCCATGTATCGCTGTGCTAGCAGCCCACCTAGCGGGGGGGGCCCATGAGCAGTTCTTCGGGCCAGGGACCCGGCTCTCAGTGC
+
AAAAAEAAAEEEEEEEEEAEEAEEEEAEEEEEEEEEEEEAEEEAEEEAEEEEEEEEEEEEEAEE<E/E/EEEEEEEEEAEEA<EEEEAAEEEEAE/EEE6/EEEEE/E//EEE<EE<EEAE
@SRR3289700.15 15/1
CACTCTGACGATCCAGCGCACAGACTCTCCAAATCATCCTCTCAACTGGAGGAACGTCACCTTCCCTGCAGCCAACGGCACCTTCCTTGATTTTATTACCAGATCCATGGTCCAGCCTGGTGTCTGGTTACTATTTTCCACCTGCTGTAA
+
AAAAAEEEEEEEEE/EAAEEEEEEEEE/EEEEEEAEE//EEEAEEAEE/EE/AEEEEEEAEEEE6/EEAAEEE/AE<EEEE/AAE<AAEE/EAAEEE//EEEEEE6EEEAEEEEEEE6AE/EEAEE</AA</EE<AE6A6A<EE/EEE<A
@SRR3289700.16 16/1
CGCTCAGGCTGGAGTTGGCTGCTCCCTCCCAGACATCTGTGTACTTCTGTGCCAGCCTCGAACGGAGGGGGCAAGAGACCCAGTACTTCGGGCCAGGCACGCGGCTCCTGGTGC
+
/AAAAEEEEEEE/EEEEEEEEAEEEEEEEAEEAEEEEEEEEEEEEEEAEEEEEEEEEEEE/6EEEEEEEEEEEEEEEEEEE<EEEEEEEEEEEEEEEEAEEAEEEEEEEEEEAE
@SRR3289700.17 17/1
CCACTCGAAGTTCCAGCGCACAGAGCAGGGGGACTCGGCCATGTATCTCTGTGCCAGCAGCCACGGGACGGGCGCTATTCCACCCCTCCACTTCGGGCCGGGCACCAGACTCACAGTTG
+
AAAAAEEEEEEEEEEEEEEAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAEEE
@SRR3289700.18 18/1
CTCTACTCTGAAGATCCAGCGCACAGACTCTCCAAATCATCCTCTCAACTGGAGGAACGTCACCTTCCCTGCAGCCCACGGCACCTTCCTTGATTTTATTACCAGATCCATGGTCCAGCCTGGTGTCTGGTTACTATTTTCCACCTGCTGT
+
AAAAAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEE/EEEEAEEEEEEEEEEEAEEEEEEEEEEEE<EEEEEEEEEEEEE/AEEEEEEEE
@SRR3289700.19 19/1
GCTCAGGCTGCTGTCGGCTGCTCCCTCCCAGACATCTGTGTACTTCTGTGCCAGCAGTGTTTTTAGCGGGGGGGCCGAGACCCAGTACTTCGGGCCAGGGACACGGCTCACCGTGC
+
A/AAAEEEEEEEEEEEEAAEEEEEEEEEEEEEEE/EEEEEEEEEEEEEEEEEEEEE/EEEEEA/EEE/EEEEE6AEE/AAEEEEEEEEEEEEEAEEEEAEEEEEAEEEEAEEEEEE
@SRR3289700.20 20/1
CGCTCAGGCTGGAGTCAGCTGCTCCCTCCCAGACATCTGTGTACTTCTGTGCCAGCAGATTTAACATCATCCTTAGGGGCGTCCTGACTTTCGGGGCCGGCAGCAGGCTGACCGTGCTGCTCGGAAGAGCACACGTCTGAACTCCAGTCAC
+
AA/AAAEAEA/EAEEEEEEE//EEEEE<E</EA/EEEEEEA/E/EEEEEE////AEEAE/EE/EAA/E///////E//A/<E/EEA/EEE/<<E/EEAA</<EAA<AAE/EA6/EAAE</6AAA/EE/EEEE//AEAAA//E/<AEEEE/<
@SRR3289700.21 21/1
CCACTCTGAAGATCCAGCGCACAGACTCTCCAAATCATCCTCTCAACTGGAGGAACGTCACCTTCCCTGCAGCCCACGGCACCTTCCTTGATTTTATTACCAGATCCATGGTCCAGCCTGGTGTCTGGTTACTATTTTCCACCTGCTGTA
+
AAAAAEEEEEEEEEEEEEEEAEEEEEAE<EAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEE<EEEEEEEEEE/EEEAEAEEEEEEEEEEAEEAEEEEEAEEEEE//</E<EEEEEEAEEEEEE/EEAEAE<<E6EEAAAAEE
@SRR3289700.22 22/1
CACTCTGACGATCCAGCGCACAGACTCTCCAAATCATCCTCTCAACTGGAGGAACGTCACCTTCCCTGCAGCCCACGGCACCTTCCTTGATTTTATTACCAGATCCATGGTCCAGCCTGGTGTCTGGTTACTATTTTCCACCTGCTGTAA
+
AAAAAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEE<EEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAEEEEEEEEEEAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEE/EAEEAEEEEE
@SRR3289700.23 23/1
CCACTCTGAAGATCCAGCGCACAGAGCCCCCTCCCCAGACCGGGAAGAACAGCCCCCTTTGACCACCACCCTCGGGGCTGGGAAATGAGGAGGGATCTGCTGCCTTCTCCAGGCGCTCCTGCAAATGCAACAAAATGCAACGTGGTTGGGC
+
AAAAAEEEEEEEEEE6EEEEEEEEEEAEEEE/<EEE/A/EEAEE/AE66AA/EEEEE///E/<E/EE/EEE/E/EEE//AE///E///AE//EE/A//E</<///E///E/EE////<AAE//EAAA///E/EEA//<//A//E///<E//
@SRR3289700.24 24/1
CCACTCTGACGATTCAGCGCACAGACTCTCCAAATCATCCTCTCAACTGGAGGAACGTCACCTTCCCTGCAGCCCACGGCACCTTCCTTGATTTTATTACCAGATCCATGGTCCAGCCTGGTGTCTGGTTACTATTTTCCACCTGCTGTA
+
AAAAAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAEEEEE/EEAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEA<AAEEEEEEE
@SRR3289700.25 25/1
CTCTACTCTGAAGATCCAGCGCACAGCGGGCAGAAGCTCGAGGAAGCCCCAGCTAGAGACAGCACTTTCTTGTTTTCAGTCACCTTTGCGCTGCAGGTGGTATTAAATAGTGCTTCTTCGGGGACATGGACACGGCTCACCGTGATTATCG
+
AAAA6EEEAEEAEAEE//6AEE//EAEAEEE/EEE//EE6EA/E/EEEAE/AE/EEE/EEEEEEEE/EE<EE///EEEE/<AA/6EAE</E/EEEAEE//EE//EAEA///EEEE/AEA/E<E/E///A/6/E/AA6AAA//6E/6/</E/
12 changes: 12 additions & 0 deletions abstar/tests/test_integration.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,9 @@
# filename: test_integration.py

import os
import pytest
import sys
import tempfile

from Bio import SeqIO

Expand Down Expand Up @@ -33,3 +35,13 @@ def test_api_integration_bcr_hiv_bnab_lcs():
test_seqs = [Sequence(s) for s in SeqIO.parse(f, 'fasta')]
seqs = abstar.run(*test_seqs)
assert len(seqs) == 207


@pytest.mark.parametrize("num_cores", [1, 4])
def test_chunks(num_cores):
    """Check that every input chunk of a small FASTQ file yields one JSON file.

    Runs abstar on the bundled ``test.fastq`` with ``--chunksize 6`` and
    compares the number of files in the temp ``input`` directory with the
    number of files in the temp ``json`` directory.
    """
    # Resolve the data file relative to this test module (abstar/tests/) so the
    # test does not depend on the directory pytest is launched from.
    package_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    fastq_path = os.path.join(package_dir, "test_data", "test.fastq")

    # TemporaryDirectory removes both directories even when the run or the
    # directory listing raises, so repeated test runs do not leak temp files.
    with tempfile.TemporaryDirectory() as temp_dir, \
            tempfile.TemporaryDirectory() as out_dir:
        arg_list = [
            "-i", fastq_path,
            "-o", out_dir,
            "-t", temp_dir,
            "-r", "tcr",
            "--num-cores", str(num_cores),
            "--chunksize", "6",
        ]
        abstar.run_main(arg_list)
        # Capture the listings before the directories are cleaned up.
        input_chunks = os.listdir(os.path.join(temp_dir, "input"))
        json_outputs = os.listdir(os.path.join(temp_dir, "json"))

    assert len(input_chunks) == len(json_outputs)
2 changes: 1 addition & 1 deletion bin/abstar
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,6 @@ import abstar


if __name__ == '__main__':
    # Build the CLI parser and parse the process's command line.
    args = abstar.create_parser().parse_args()
    # validate_args prints help and exits when required options are missing.
    abstar.validate_args(args)
    output_dir = abstar.run_standalone(args)

0 comments on commit 7ceb8ac

Please sign in to comment.