From 4ed1b82841d6b553fef7e6863b99061703cacb7b Mon Sep 17 00:00:00 2001 From: Keren Fuentes Date: Tue, 13 Oct 2020 16:49:45 -0700 Subject: [PATCH 1/5] fix for issue --- .../Text/WordTokenizing.cs | 9 ++++--- test/Microsoft.ML.Tests/OnnxConversionTest.cs | 27 ++++++++++--------- 2 files changed, 20 insertions(+), 16 deletions(-) diff --git a/src/Microsoft.ML.Transforms/Text/WordTokenizing.cs b/src/Microsoft.ML.Transforms/Text/WordTokenizing.cs index 1eac17ccaa..53fae2c6e1 100644 --- a/src/Microsoft.ML.Transforms/Text/WordTokenizing.cs +++ b/src/Microsoft.ML.Transforms/Text/WordTokenizing.cs @@ -404,6 +404,7 @@ public void SaveAsOnnx(OnnxContext ctx) string opType; while (columns.MoveNext()) { + opType = "Tokenizer"; var column = columns.Current; var intermediateVar = ctx.AddIntermediateVariable(_type, "TokenizerOutput", true); @@ -415,10 +416,10 @@ public void SaveAsOnnx(OnnxContext ctx) string[] separators = column.SeparatorsArray.Select(c => c.ToString()).ToArray(); tokenizerNode.AddAttribute("separators", separators); - opType = "Squeeze"; - var squeezeOutput = ctx.AddIntermediateVariable(_type, column.Name); - var squeezeNode = ctx.CreateNode(opType, intermediateVar, squeezeOutput, ctx.GetNodeName(opType), ""); - squeezeNode.AddAttribute("axes", new long[] { 1 }); + opType = "Reshape"; + var shape = ctx.AddInitializer(new long[] { 1, -1 }, new long[] { 2 }, "Shape"); + var reshapeOutput = ctx.AddIntermediateVariable(new VectorDataViewType(TextDataViewType.Instance, 1), column.Name); + var reshapeNode = ctx.CreateNode(opType, new[] { intermediateVar, shape }, new[] { reshapeOutput }, ctx.GetNodeName(opType), ""); } } } diff --git a/test/Microsoft.ML.Tests/OnnxConversionTest.cs b/test/Microsoft.ML.Tests/OnnxConversionTest.cs index 69dbbe57e5..8bf75f405d 100644 --- a/test/Microsoft.ML.Tests/OnnxConversionTest.cs +++ b/test/Microsoft.ML.Tests/OnnxConversionTest.cs @@ -1310,22 +1310,25 @@ public void NgramOnnxConversionTest( IEstimator[] pipelines = { mlContext.Transforms.Text.TokenizeIntoWords("Tokens", "Text", new[] { ' ' }) - .Append(mlContext.Transforms.Conversion.MapValueToKey("Tokens")) - .Append(mlContext.Transforms.Text.ProduceNgrams("NGrams", "Tokens", - ngramLength: ngramLength, - useAllLengths: useAllLength, - weighting: weighting)), + .Append(mlContext.Transforms.Conversion.MapValueToKey("Tokens")) + .Append(mlContext.Transforms.Text.ProduceNgrams("NGrams", "Tokens", + ngramLength: ngramLength, + useAllLengths: useAllLength, + weighting: weighting)), mlContext.Transforms.Text.TokenizeIntoCharactersAsKeys("Tokens", "Text") .Append(mlContext.Transforms.Text.ProduceNgrams("NGrams", "Tokens", - ngramLength: ngramLength, - useAllLengths: useAllLength, - weighting: weighting)), + ngramLength: ngramLength, + useAllLengths: useAllLength, + weighting: weighting)), mlContext.Transforms.Text.ProduceWordBags("Tokens", "Text", - ngramLength: ngramLength, - useAllLengths: useAllLength, - weighting: weighting) + ngramLength: ngramLength, + useAllLengths: useAllLength, + weighting: weighting), + + mlContext.Transforms.Text.TokenizeIntoWords("Tokens0", "Text") + .Append(mlContext.Transforms.Text.ProduceWordBags("Tokens", "Tokens0")) }; for (int i = 0; i < pipelines.Length; i++) @@ -1346,7 +1349,7 @@ public void NgramOnnxConversionTest( var onnxEstimator = mlContext.Transforms.ApplyOnnxModel(onnxFilePath, gpuDeviceId: _gpuDeviceId, fallbackToCpu: _fallbackToCpu); var onnxTransformer = onnxEstimator.Fit(dataView); var onnxResult = onnxTransformer.Transform(dataView); - var columnName = i == pipelines.Length - 1 ? "Tokens" : "NGrams"; + var columnName = i >= pipelines.Length - 2 ? "Tokens" : "NGrams"; CompareResults(columnName, columnName, transformedData, onnxResult, 3); VBuffer> mlNetSlots = default; From 40c9b39d57be1c4ad5595c5177fa1c696740fe94 Mon Sep 17 00:00:00 2001 From: Keren Fuentes Date: Tue, 13 Oct 2020 16:58:07 -0700 Subject: [PATCH 2/5] fix documentation --- src/Microsoft.ML.Transforms/Text/TextCatalog.cs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Microsoft.ML.Transforms/Text/TextCatalog.cs b/src/Microsoft.ML.Transforms/Text/TextCatalog.cs index 7b4b554b7c..9ae8888791 100644 --- a/src/Microsoft.ML.Transforms/Text/TextCatalog.cs +++ b/src/Microsoft.ML.Transforms/Text/TextCatalog.cs @@ -334,7 +334,7 @@ public static CustomStopWordsRemovingEstimator RemoveStopWords(this TransformsCa => new CustomStopWordsRemovingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), outputColumnName, inputColumnName, stopwords); /// - /// Create a , which maps the column specified in + /// Create a , which maps the column specified in /// to a vector of n-gram counts in a new column named . /// /// @@ -363,7 +363,7 @@ public static WordBagEstimator ProduceWordBags(this TransformsCatalog.TextTransf outputColumnName, inputColumnName, ngramLength, skipLength, useAllLengths, maximumNgramsCount, weighting); /// - /// Create a , which maps the multiple columns specified in + /// Create a , which maps the multiple columns specified in /// to a vector of n-gram counts in a new column named . /// /// From 8571d4925b4f4b0794501ef6301aac5805e9b437 Mon Sep 17 00:00:00 2001 From: Keren Fuentes Date: Wed, 14 Oct 2020 09:24:12 -0700 Subject: [PATCH 3/5] aligning test --- .../Text/WordTokenizing.cs | 1 - test/Microsoft.ML.Tests/OnnxConversionTest.cs | 19 +++++++++---------- 2 files changed, 9 insertions(+), 11 deletions(-) diff --git a/src/Microsoft.ML.Transforms/Text/WordTokenizing.cs b/src/Microsoft.ML.Transforms/Text/WordTokenizing.cs index 53fae2c6e1..3af7a1e471 100644 --- a/src/Microsoft.ML.Transforms/Text/WordTokenizing.cs +++ b/src/Microsoft.ML.Transforms/Text/WordTokenizing.cs @@ -404,7 +404,6 @@ public void SaveAsOnnx(OnnxContext ctx) string opType; while (columns.MoveNext()) { - opType = "Tokenizer"; var column = columns.Current; var intermediateVar = ctx.AddIntermediateVariable(_type, "TokenizerOutput", true); diff --git a/test/Microsoft.ML.Tests/OnnxConversionTest.cs b/test/Microsoft.ML.Tests/OnnxConversionTest.cs index 8bf75f405d..136aa78132 100644 --- a/test/Microsoft.ML.Tests/OnnxConversionTest.cs +++ b/test/Microsoft.ML.Tests/OnnxConversionTest.cs @@ -1312,20 +1312,19 @@ public void NgramOnnxConversionTest( mlContext.Transforms.Text.TokenizeIntoWords("Tokens", "Text", new[] { ' ' }) .Append(mlContext.Transforms.Conversion.MapValueToKey("Tokens")) .Append(mlContext.Transforms.Text.ProduceNgrams("NGrams", "Tokens", - ngramLength: ngramLength, - useAllLengths: useAllLength, - weighting: weighting)), + ngramLength: ngramLength, + useAllLengths: useAllLength, + weighting: weighting)), mlContext.Transforms.Text.TokenizeIntoCharactersAsKeys("Tokens", "Text") .Append(mlContext.Transforms.Text.ProduceNgrams("NGrams", "Tokens", - ngramLength: ngramLength, - useAllLengths: useAllLength, - weighting: weighting)), - + ngramLength: ngramLength, + useAllLengths: useAllLength, + weighting: weighting)), mlContext.Transforms.Text.ProduceWordBags("Tokens", "Text", - ngramLength: ngramLength, - useAllLengths: useAllLength, - weighting: weighting), + ngramLength: ngramLength, + useAllLengths: useAllLength, + weighting: weighting), mlContext.Transforms.Text.TokenizeIntoWords("Tokens0", "Text") .Append(mlContext.Transforms.Text.ProduceWordBags("Tokens", "Tokens0")) From b4ebafddb2b52a06ff496690f86ea6057df2a9ea Mon Sep 17 00:00:00 2001 From: Keren Fuentes Date: Wed, 14 Oct 2020 09:25:37 -0700 Subject: [PATCH 4/5] adding back line --- test/Microsoft.ML.Tests/OnnxConversionTest.cs | 1 + 1 file changed, 1 insertion(+) diff --git a/test/Microsoft.ML.Tests/OnnxConversionTest.cs b/test/Microsoft.ML.Tests/OnnxConversionTest.cs index 136aa78132..68961a0899 100644 --- a/test/Microsoft.ML.Tests/OnnxConversionTest.cs +++ b/test/Microsoft.ML.Tests/OnnxConversionTest.cs @@ -1321,6 +1321,7 @@ public void NgramOnnxConversionTest( ngramLength: ngramLength, useAllLengths: useAllLength, weighting: weighting)), + mlContext.Transforms.Text.ProduceWordBags("Tokens", "Text", ngramLength: ngramLength, useAllLengths: useAllLength, From 4497de8679c6ff6521e6439f2949aee3613cc174 Mon Sep 17 00:00:00 2001 From: Keren Fuentes Date: Wed, 14 Oct 2020 19:36:31 -0700 Subject: [PATCH 5/5] aligning fix --- test/Microsoft.ML.Tests/OnnxConversionTest.cs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/test/Microsoft.ML.Tests/OnnxConversionTest.cs b/test/Microsoft.ML.Tests/OnnxConversionTest.cs index 68961a0899..9f97f21ad4 100644 --- a/test/Microsoft.ML.Tests/OnnxConversionTest.cs +++ b/test/Microsoft.ML.Tests/OnnxConversionTest.cs @@ -1310,11 +1310,11 @@ public void NgramOnnxConversionTest( IEstimator[] pipelines = { mlContext.Transforms.Text.TokenizeIntoWords("Tokens", "Text", new[] { ' ' }) - .Append(mlContext.Transforms.Conversion.MapValueToKey("Tokens")) - .Append(mlContext.Transforms.Text.ProduceNgrams("NGrams", "Tokens", - ngramLength: ngramLength, - useAllLengths: useAllLength, - weighting: weighting)), + .Append(mlContext.Transforms.Conversion.MapValueToKey("Tokens")) + .Append(mlContext.Transforms.Text.ProduceNgrams("NGrams", "Tokens", + ngramLength: ngramLength, + useAllLengths: useAllLength, + weighting: weighting)), mlContext.Transforms.Text.TokenizeIntoCharactersAsKeys("Tokens", "Text") .Append(mlContext.Transforms.Text.ProduceNgrams("NGrams", "Tokens",