Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[MRG] add --from-file option to sourmash sketch commands #1362

Merged
merged 4 commits into from
Mar 4, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion doc/sourmash-sketch.md
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,11 @@ will be left empty, and `sourmash sig describe` will output `** no name **`.

### Input contents and output signatures

By default, `sourmash sketch` will produce signatures for each input *file*. If the file contains multiple FASTA/FASTQ records, these records will be merged into the output signature.
By default, `sourmash sketch` will produce signatures for each input
*file*. If the file contains multiple FASTA/FASTQ records, these
records will be merged into the output signature. You can provide a
*list of FASTA files* in a text file to `sourmash sketch` by passing
the text file path in via `--from-file`.

If you specify `--singleton`, `sourmash sketch` will produce signatures for each *record*.

Expand Down
2 changes: 1 addition & 1 deletion src/sourmash/cli/compare.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ def subparser(subparsers):
)
subparser.add_argument(
'--from-file',
help='a file containing a list of signatures file to compare'
help='a text file containing a list of files to load signatures from'
)
subparser.add_argument(
'-f', '--force', action='store_true',
Expand Down
2 changes: 1 addition & 1 deletion src/sourmash/cli/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ def subparser(subparsers):
)
subparser.add_argument(
'--from-file',
help='a file containing a list of signatures file to load'
help='a text file containing a list of files to load signatures from'
)
subparser.add_argument(
'-q', '--quiet', action='store_true',
Expand Down
2 changes: 1 addition & 1 deletion src/sourmash/cli/lca/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ def subparser(subparsers):
)
subparser.add_argument(
'--from-file',
help='a file containing a list of signatures file to load'
help='a text file containing a list of files to load signatures from'
)
subparser.add_argument(
'--scaled', metavar='S', default=10000, type=float
Expand Down
6 changes: 5 additions & 1 deletion src/sourmash/cli/sketch/dna.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,13 +22,17 @@ def subparser(subparsers):
)

subparser.add_argument(
'filenames', nargs='+', help='file(s) of sequences'
'filenames', nargs='*', help='file(s) of sequences'
)
file_args = subparser.add_argument_group('File handling options')
file_args.add_argument(
'-f', '--force', action='store_true',
help='recompute signatures even if the file exists'
)
subparser.add_argument(
'--from-file',
help='a text file containing a list of sequence files to load'
)
file_args.add_argument(
'-o', '--output',
help='output computed signatures to this file'
Expand Down
6 changes: 5 additions & 1 deletion src/sourmash/cli/sketch/protein.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ def subparser(subparsers):
)

subparser.add_argument(
'filenames', nargs='+', help='file(s) of sequences'
'filenames', nargs='*', help='file(s) of sequences'
)
file_args = subparser.add_argument_group('File handling options')
file_args.add_argument(
Expand All @@ -33,6 +33,10 @@ def subparser(subparsers):
'-o', '--output',
help='output computed signatures to this file'
)
subparser.add_argument(
'--from-file',
help='a text file containing a list of sequence files to load'
)
file_args.add_argument(
'--merge', '--name', type=str, default='', metavar="FILE",
help='merge all input files into one signature file with the '
Expand Down
6 changes: 5 additions & 1 deletion src/sourmash/cli/sketch/translate.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ def subparser(subparsers):
)

subparser.add_argument(
'filenames', nargs='+', help='file(s) of sequences'
'filenames', nargs='*', help='file(s) of sequences'
)
file_args = subparser.add_argument_group('File handling options')
file_args.add_argument(
Expand All @@ -33,6 +33,10 @@ def subparser(subparsers):
'-o', '--output',
help='output computed signatures to this file'
)
subparser.add_argument(
'--from-file',
help='a text file containing a list of sequence files to load'
)
file_args.add_argument(
'--merge', '--name', type=str, default='', metavar="FILE",
help='merge all input files into one signature file with the '
Expand Down
11 changes: 11 additions & 0 deletions src/sourmash/command_sketch.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,14 @@ def __call__(self):
return sigs


def _add_from_file_to_filenames(args):
    """Append the paths listed in the --from-file text file to args.filenames.

    Reads ``args.from_file``. When it is set, the list is loaded with
    ``load_file_list_of_signatures`` and ``args.filenames`` is extended in
    place; when it is unset or empty, ``args.filenames`` is left untouched.
    """
    # Imported at call time, inside the function body.
    from .sourmash_args import load_file_list_of_signatures

    if not args.from_file:
        return

    args.filenames.extend(load_file_list_of_signatures(args.from_file))


def _execute_sketch(args, signatures_factory):
"Once configured, run 'sketch' the same way underneath."
set_quiet(args.quiet)
Expand Down Expand Up @@ -200,6 +208,7 @@ def dna(args):
error(f"Error creating signatures: {str(e)}")
sys.exit(-1)

_add_from_file_to_filenames(args)
_execute_sketch(args, signatures_factory)


Expand Down Expand Up @@ -229,6 +238,7 @@ def protein(args):
error(f"Error creating signatures: {str(e)}")
sys.exit(-1)

_add_from_file_to_filenames(args)
_execute_sketch(args, signatures_factory)


Expand Down Expand Up @@ -258,4 +268,5 @@ def translate(args):
error(f"Error creating signatures: {str(e)}")
sys.exit(-1)

_add_from_file_to_filenames(args)
_execute_sketch(args, signatures_factory)
75 changes: 75 additions & 0 deletions tests/test_sourmash_sketch.py
Original file line number Diff line number Diff line change
Expand Up @@ -240,6 +240,26 @@ def test_do_sourmash_sketchdna():
assert str(sig).endswith('short.fa')


def test_do_sourmash_sketchdna_from_file():
    """'sketch dna --from-file' writes short.fa.sig for the listed input."""
    with utils.TempDirectory() as location:
        seqfile = utils.get_test_data('short.fa')

        # A text file naming the single sequence file to sketch.
        list_path = os.path.join(location, "filelist.txt")
        with open(list_path, 'wt') as listfp:
            print(seqfile, file=listfp)

        cmd = ['sketch', 'dna', '--from-file', list_path]
        status, out, err = utils.runscript('sourmash', cmd,
                                           in_directory=location)

        # No positional filenames were given; the output is expected
        # in the working directory as short.fa.sig.
        expected_sig = os.path.join(location, 'short.fa.sig')
        assert os.path.exists(expected_sig)

        loaded = signature.load_signatures(expected_sig)
        first = next(loaded)
        assert str(first).endswith('short.fa')


@utils.in_tempdir
def test_do_sourmash_sketchdna_noinput(c):
data = ""
Expand Down Expand Up @@ -534,6 +554,31 @@ def test_do_sketch_translate_multik_with_protein():
assert 10 in ksizes


def test_do_sketch_translate_multik_with_protein_from_file():
with utils.TempDirectory() as location:
testdata1 = utils.get_test_data('short.fa')

file_list = os.path.join(location, "filelist.txt")
with open(file_list, 'wt') as fp:
print(testdata1, file=fp)

status, out, err = utils.runscript('sourmash',
['sketch', 'translate',
'-p', 'k=7,k=10,num=500',
'--from-file', file_list],
in_directory=location)
outfile = os.path.join(location, 'short.fa.sig')
assert os.path.exists(outfile)

with open(outfile, 'rt') as fp:
sigdata = fp.read()
siglist = list(signature.load_signatures(sigdata))
Comment on lines +574 to +575
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there any reasoning behind using this syntax rather than `load_file_as_signatures`?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hah! No, not really. I just wrote these tests before `load_file_as_signatures` was available, and since this is internal test code, I'm not as worried about whether it uses the public API.

assert len(siglist) == 2
ksizes = set([ x.minhash.ksize for x in siglist ])
assert 7 in ksizes
assert 10 in ksizes


def test_do_sketch_translate_multik_with_dayhoff():
with utils.TempDirectory() as location:
testdata1 = utils.get_test_data('short.fa')
Expand Down Expand Up @@ -642,6 +687,36 @@ def test_do_sketch_protein_multik_input():
assert True in moltype


def test_do_sketch_protein_multik_input_from_file():
    """'sketch protein --from-file' with two ksizes yields two protein sigs."""
    with utils.TempDirectory() as location:
        seqfile = utils.get_test_data('ecoli.faa')

        # A text file naming the single protein FASTA file to sketch.
        list_path = os.path.join(location, "filelist.txt")
        with open(list_path, 'wt') as listfp:
            print(seqfile, file=listfp)

        cmd = ['sketch', 'protein',
               '-p', 'k=7,k=10,num=500',
               '--from-file', list_path]
        status, out, err = utils.runscript('sourmash', cmd,
                                           in_directory=location)

        sig_path = os.path.join(location, 'ecoli.faa.sig')
        assert os.path.exists(sig_path)

        # Load the signatures from the file's text content.
        with open(sig_path, 'rt') as sigfp:
            raw = sigfp.read()
        sigs = list(signature.load_signatures(raw))
        assert len(sigs) == 2

        # One signature per requested ksize.
        seen_ksizes = {s.minhash.ksize for s in sigs}
        assert 7 in seen_ksizes
        assert 10 in seen_ksizes

        # Every signature must report the 'protein' moltype.
        is_protein = {s.minhash.moltype == 'protein' for s in sigs}
        assert len(is_protein) == 1
        assert True in is_protein


def test_do_sourmash_sketchdna_multik_outfile():
with utils.TempDirectory() as location:
testdata1 = utils.get_test_data('short.fa')
Expand Down