From 18794f2e9d3cabeb5012203742faafe770c7b32a Mon Sep 17 00:00:00 2001
From: Union Palenshus
Date: Thu, 22 Jun 2023 17:17:44 -0700
Subject: [PATCH] Fixing tokenizers to correctly handle linux line endings (\n)

---
 src/Base/CasedTokenizer.cs              | 13 +------------
 src/Base/TokenizerBase.cs               | 11 ++++++++---
 src/Base/UncasedTokenizer.cs            |  8 ++------
 tests/BERTTokenizers.Tests.csproj       |  2 +-
 tests/BertBaseTokenizerUncasedShould.cs | 15 ++++++++++++++-
 5 files changed, 26 insertions(+), 23 deletions(-)

diff --git a/src/Base/CasedTokenizer.cs b/src/Base/CasedTokenizer.cs
index 80b3a42..3c613e7 100644
--- a/src/Base/CasedTokenizer.cs
+++ b/src/Base/CasedTokenizer.cs
@@ -1,18 +1,7 @@
-using BERTTokenizers.Extensions;
-using System;
-using System.Collections.Generic;
-using System.Linq;
-
-namespace BERTTokenizers.Base
+namespace BERTTokenizers.Base
 {
     public abstract class CasedTokenizer : TokenizerBase
     {
         protected CasedTokenizer(string vocabularyFilePath) : base(vocabularyFilePath) { }
-
-        protected override IEnumerable<string> TokenizeSentence(string text)
-        {
-            return text.Split(new string[] { " ", " ", "\r\n" }, StringSplitOptions.None)
-                .SelectMany(o => o.SplitAndKeep(".,;:\\/?!#$%()=+-*\"'–_`<>&^@{}[]|~'".ToArray()));
-        }
     }
 }
diff --git a/src/Base/TokenizerBase.cs b/src/Base/TokenizerBase.cs
index 5a21d8f..54f5b10 100644
--- a/src/Base/TokenizerBase.cs
+++ b/src/Base/TokenizerBase.cs
@@ -1,4 +1,5 @@
-using BERTTokenizers.Helpers;
+using BERTTokenizers.Extensions;
+using BERTTokenizers.Helpers;
 using System;
 using System.Collections.Generic;
 using System.Linq;
@@ -87,6 +88,12 @@ public List<string> Untokenize(List<string> tokens)
                 => (tokenindex.Token, tokenindex.VocabularyIndex, segmentindex)).ToList();
         }
 
+        protected virtual IEnumerable<string> TokenizeSentence(string text)
+        {
+            return text.Split(new char[0], StringSplitOptions.RemoveEmptyEntries)
+                .SelectMany(o => o.SplitAndKeep(".,;:\\/?!#$%()=+-*\"'–_`<>&^@{}[]|~'".ToArray()));
+        }
+
         private IEnumerable<long> SegmentIndex(List<(string token, int index)> tokens)
         {
             var segmentIndex = 0;
@@ -152,7 +159,5 @@ private IEnumerable<long> SegmentIndex(List<(string token, int index)> tokens)
 
             return tokens;
         }
-
-        protected abstract IEnumerable<string> TokenizeSentence(string text);
     }
 }
diff --git a/src/Base/UncasedTokenizer.cs b/src/Base/UncasedTokenizer.cs
index 1a384ac..2fb0b4b 100644
--- a/src/Base/UncasedTokenizer.cs
+++ b/src/Base/UncasedTokenizer.cs
@@ -1,6 +1,4 @@
-using BERTTokenizers.Extensions;
-using System;
-using System.Collections.Generic;
+using System.Collections.Generic;
 using System.Linq;
 
 namespace BERTTokenizers.Base
@@ -13,9 +11,7 @@ protected UncasedTokenizer(string vocabularyFilePath) : base(vocabularyFilePath)
 
         protected override IEnumerable<string> TokenizeSentence(string text)
         {
-            return text.Split(new string[] { " ", " ", "\r\n" }, StringSplitOptions.None)
-                .SelectMany(o => o.SplitAndKeep(".,;:\\/?!#$%()=+-*\"'–_`<>&^@{}[]|~'".ToArray()))
-                .Select(o => o.ToLower());
+            return base.TokenizeSentence(text).Select(o => o.ToLower());
         }
     }
 }
diff --git a/tests/BERTTokenizers.Tests.csproj b/tests/BERTTokenizers.Tests.csproj
index 916aa4e..c21ab5d 100644
--- a/tests/BERTTokenizers.Tests.csproj
+++ b/tests/BERTTokenizers.Tests.csproj
@@ -1,7 +1,7 @@
-    <TargetFramework>net6.0</TargetFramework>
+    <TargetFramework>net7.0</TargetFramework>
diff --git a/tests/BertBaseTokenizerUncasedShould.cs b/tests/BertBaseTokenizerUncasedShould.cs
index 9439680..48939e2 100644
--- a/tests/BertBaseTokenizerUncasedShould.cs
+++ b/tests/BertBaseTokenizerUncasedShould.cs
@@ -25,7 +25,20 @@ public void Tokenize_sentence()
             Assert.Equal(("love", 2293, 0), tokens[2]);
             Assert.Equal(("you", 2017, 0), tokens[3]);
             Assert.Equal(("[SEP]", 102, 0), tokens[4]);
+        }
+
+        [Fact]
+        public void Tokenize_text_with_linux_line_endings()
+        {
+            var sentence = "Linux\nline\nendings";
+            var tokens = _tokenizer.Tokenize(sentence);
+            Assert.Equal(5, tokens.Count);
+            Assert.Equal(("[CLS]", 101, 0), tokens[0]);
+            Assert.Equal(("linux", 11603, 0), tokens[1]);
+            Assert.Equal(("line", 2240, 0), tokens[2]);
+            Assert.Equal(("endings", 21306, 0), tokens[3]);
+            Assert.Equal(("[SEP]", 102, 0), tokens[4]);
         }
 
         [Fact]
@@ -61,7 +74,7 @@ public void Encode_sentence()
         }
 
         [Fact]
-        public void Unokenize_sentence()
+        public void Untokenize_sentence()
         {
             var tokens = new List<string>(){ "she", "##s" };
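
Reviewer note, not part of the patch: the fix relies on a documented overload of string.Split. When the separator array is empty, .NET splits on any Unicode whitespace character, so "\n", "\r\n", tabs, and runs of spaces are all handled uniformly; StringSplitOptions.RemoveEmptyEntries then discards the empty token that "\r\n" would otherwise leave between '\r' and '\n'. A minimal standalone sketch of that behavior (the class and variable names here are hypothetical, for illustration only):

    using System;

    // Hypothetical demo class, not part of the BERTTokenizers patch.
    class WhitespaceSplitDemo
    {
        static void Main()
        {
            // An empty char array as the separator means "split on any
            // whitespace", covering both Linux and Windows line endings.
            var text = "Linux\nline\nendings and\r\nWindows\r\nones";
            var parts = text.Split(new char[0], StringSplitOptions.RemoveEmptyEntries);

            // Prints: Linux|line|endings|and|Windows|ones
            Console.WriteLine(string.Join("|", parts));
        }
    }

Because this splitting is shared by both tokenizers, hoisting it into a virtual TokenizerBase.TokenizeSentence lets UncasedTokenizer reduce to a one-line override that lowercases the base result, and lets CasedTokenizer drop its override entirely.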