Skip to content

Commit

Permalink
chore: add test case for encode with is_start=False (#74)
Browse files Browse the repository at this point in the history
* chore: add test case for encode with is_start=False

* fix: split is_start=False to a different testcase
  • Loading branch information
tomeras91 authored Dec 28, 2023
1 parent 296bda5 commit 77c0a39
Showing 1 changed file with 10 additions and 0 deletions.
10 changes: 10 additions & 0 deletions tests/test_jurassic_tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,16 @@ def test_tokenizer_encode_set(tokenizer: JurassicTokenizer, resources_path: Path
), f"Not equal at doc {i}"


def test_tokenizer_encode_set_when_is_start_false(tokenizer: JurassicTokenizer, resources_path: Path):
    """Check ``encode(..., is_start=False)`` against stored reference token ids.

    Every line of ``200_tokenized_C4_val_docs.jsonl`` (located under
    ``resources_path``) is parsed as a JSON object. The object's ``doc_text``
    is encoded with ``is_start=False`` and the result must equal the object's
    ``token_ids_start_false`` entry; on mismatch the assertion message reports
    the zero-based line index of the offending document.
    """
    tokenized_docs_path = resources_path / "200_tokenized_C4_val_docs.jsonl"
    # Pin the encoding: without it, decoding falls back to the platform locale,
    # which can mis-decode non-ASCII document text on non-UTF-8 systems.
    with tokenized_docs_path.open("r", encoding="utf-8") as tokenized_docs_file:
        # Iterate the handle directly so lines are streamed rather than
        # materialized all at once by readlines().
        for i, tokenized_doc_line in enumerate(tokenized_docs_file):
            tokenized_doc = json.loads(tokenized_doc_line)
            expected_token_ids = tokenized_doc["token_ids_start_false"]
            actual_token_ids = tokenizer.encode(tokenized_doc["doc_text"], is_start=False)
            assert expected_token_ids == actual_token_ids, f"Not equal at doc {i}"


@pytest.mark.parametrize(
ids=[
"when_single_int__should_return_single_str",
Expand Down

0 comments on commit 77c0a39

Please sign in to comment.