Skip to content

Commit

Permalink
Improve the error when passing an empty sentence to the tokenizer (#40)
Browse files Browse the repository at this point in the history
* Add clearer error when an empty sentence is passed

* Update changelog

* Add test case
  • Loading branch information
tomaarsen authored Oct 31, 2023
1 parent eede2a4 commit 38bee88
Show file tree
Hide file tree
Showing 3 changed files with 14 additions and 1 deletion.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,10 @@ Types of changes

## [Unreleased]

### Changed

- Changed the error raised when an empty sentence is provided to the tokenizer to a clearer `ValueError`.

### Fixed

- No longer override `language` metadata from the dataset if the language was also set manually via `SpanMarkerModelCardData`.
Expand Down
5 changes: 4 additions & 1 deletion span_marker/tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -206,7 +206,10 @@ def __call__(
all_labels = []
all_num_words = []
for sample_idx, input_ids in enumerate(batch_encoding["input_ids"]):
num_words = int(np.nanmax(np.array(batch_encoding.word_ids(sample_idx), dtype=float))) + 1
max_word_ids = np.nanmax(np.array(batch_encoding.word_ids(sample_idx), dtype=float))
if np.isnan(max_word_ids):
raise ValueError("The `SpanMarkerTokenizer` detected an empty sentence, please remove it.")
num_words = int(max_word_ids) + 1
if self.tokenizer.pad_token_id in input_ids:
num_tokens = list(input_ids).index(self.tokenizer.pad_token_id)
else:
Expand Down
6 changes: 6 additions & 0 deletions tests/test_modeling.py
Original file line number Diff line number Diff line change
Expand Up @@ -202,6 +202,12 @@ def test_predict_where_first_sentence_is_word(finetuned_conll_span_marker_model:
assert isinstance(outputs[0], list)


def test_predict_empty_error(finetuned_conll_span_marker_model: SpanMarkerModel) -> None:
    """Check that predicting on a batch containing an empty sentence raises ``ValueError``.

    The batch mixes two non-empty sentences with a trailing empty string; the
    test expects the error message that names the `SpanMarkerTokenizer`.
    """
    # pytest applies `match` as a regular-expression search against the error text.
    expected_message = "The `SpanMarkerTokenizer` detected an empty sentence, please remove it."
    sentences = ["One Two", "Three Four Five", ""]

    span_marker_model = finetuned_conll_span_marker_model.try_cuda()
    with pytest.raises(ValueError, match=expected_message):
        span_marker_model.predict(sentences)


def test_incorrect_predict_inputs(finetuned_conll_span_marker_model: SpanMarkerModel):
model = finetuned_conll_span_marker_model.try_cuda()
with pytest.raises(ValueError, match="could not recognize your input"):
Expand Down

0 comments on commit 38bee88

Please sign in to comment.