Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[MRG] refactor MinHash.downsample #1458

Merged
Merged 4 commits on Apr 15, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 25 additions & 18 deletions src/sourmash/minhash.py
Original file line number Diff line number Diff line change
Expand Up @@ -454,32 +454,39 @@ def downsample(self, *, num=None, scaled=None):
"""Copy this object and downsample new object to either `num` or
`scaled`.
"""
# first, evaluate provided parameters --

# at least one must be specified!
if num is None and scaled is None:
raise ValueError('must specify either num or scaled to downsample')
elif num is not None:
if self.num:
if self.num < num:
raise ValueError("new sample num is higher than current sample num")
else:
max_hash=0
else:
raise ValueError("scaled != 0 - cannot downsample a scaled MinHash this way")

# both cannot be specified
if num is not None and scaled is not None:
raise ValueError('cannot specify both num and scaled')

if num is not None:
# cannot downsample a scaled MinHash with num:
if self.scaled:
raise ValueError("cannot downsample a scaled MinHash using num")
# cannot upsample
if self.num < num:
raise ValueError("new sample num is higher than current sample num")

# acceptable num value? make sure to set max_hash to 0.
max_hash = 0

elif scaled is not None:
# cannot downsample a num MinHash with scaled
if self.num:
raise ValueError("num != 0 - cannot downsample a standard MinHash")
old_scaled = self.scaled
if old_scaled > scaled:
raise ValueError(
"new scaled {} is lower than current sample scaled {}".format(
scaled, old_scaled
)
)
raise ValueError("cannot downsample a num MinHash using scaled")
if self.scaled > scaled:
raise ValueError(f"new scaled {scaled} is lower than current sample scaled {self.scaled}")

# acceptable scaled value? reconfigure max_hash, keep num 0.
max_hash = _get_max_hash_for_scaled(scaled)
num = 0
###

# create new object:
# end checks! create new object:
a = MinHash(
num, self.ksize, self.is_protein, self.dayhoff, self.hp,
self.track_abundance, self.seed, max_hash
Expand Down
2 changes: 1 addition & 1 deletion tests/test__minhash.py
Original file line number Diff line number Diff line change
Expand Up @@ -324,7 +324,7 @@ def test_no_downsample_scaled_if_n(track_abundance):
with pytest.raises(ValueError) as excinfo:
mh.downsample(scaled=100000000)

assert 'cannot downsample a standard MinHash' in str(excinfo.value)
assert 'cannot downsample a num MinHash using scaled' in str(excinfo.value)


def test_scaled_num_both(track_abundance):
Expand Down
4 changes: 2 additions & 2 deletions tests/test_jaccard.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ def test_jaccard_1(track_abundance):
E2.add_hash(i)

# here the union is [1, 2, 3, 4, 5]
# and the intesection is [1, 2, 3, 4] => 4/5.
# and the intersection is [1, 2, 3, 4] => 4/5.

assert round(E1.jaccard(E2), 2) == round(4 / 5.0, 2)
assert round(E2.jaccard(E1), 2) == round(4 / 5.0, 2)
Expand Down Expand Up @@ -284,4 +284,4 @@ def test_downsample_scaled_with_num():
with pytest.raises(ValueError) as exc:
mh = mh1.downsample(num=500)

assert 'cannot downsample a scaled MinHash this way' in str(exc.value)
assert 'cannot downsample a scaled MinHash using num' in str(exc.value)