diff --git a/test/Microsoft.ML.Functional.Tests/DataTransformation.cs b/test/Microsoft.ML.Functional.Tests/DataTransformation.cs
new file mode 100644
index 0000000000..5c2727a7d0
--- /dev/null
+++ b/test/Microsoft.ML.Functional.Tests/DataTransformation.cs
@@ -0,0 +1,198 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System;
+using Microsoft.ML.Functional.Tests.Datasets;
+using Microsoft.ML.RunTests;
+using Microsoft.ML.TestFramework;
+using Microsoft.ML.Trainers;
+using Microsoft.ML.Transforms;
+using Microsoft.ML.Transforms.Text;
+using Xunit;
+using Xunit.Abstractions;
+
+namespace Microsoft.ML.Functional.Tests
+{
+ public class DataTransformation : BaseTestClass
+ {
+ public DataTransformation(ITestOutputHelper output) : base(output) // Forwards the xUnit output helper to the BaseTestClass constructor.
+ {
+ }
+
+ /// <summary>
+ /// Extensibility: Add a new column that is a function of other columns.
+ /// </summary>
+ [Fact]
+ void ExtensibilityAddAColumnAsAFunctionOfMultipleColumns()
+ {
+ // Concurrency must be 1 to ensure that the mapping is done sequentially.
+ var mlContext = new MLContext(seed: 1, conc: 1);
+
+ // Load the Iris dataset.
+ var data = mlContext.Data.LoadFromTextFile(
+ GetDataPath(TestDatasets.iris.trainFilename),
+ hasHeader: TestDatasets.iris.fileHasHeader,
+ separatorChar: TestDatasets.iris.fileSeparator);
+
+ // Subsample it down to the first 10 rows.
+ int numSamples = 10;
+ data = mlContext.Data.TakeRows(data, numSamples);
+
+ // Create a stand-alone function that computes the cosine of the angle between the petal and sepal (width, length) vectors.
+ float angiospermCosine(float petalWidth, float petalLength, float sepalWidth, float sepalLength)
+ {
+ var petalMagnitude = Math.Sqrt(petalWidth * petalWidth + petalLength * petalLength);
+ var sepalMagnitude = Math.Sqrt(sepalWidth * sepalWidth + sepalLength * sepalLength);
+ return (float)((petalWidth * sepalWidth + petalLength * sepalLength) / (petalMagnitude * sepalMagnitude));
+ }
+
+ // Create a mapping that copies the Iris columns and stores the cosine in Float1. Lengths and widths are passed in swapped order relative to the parameter names; the formula is symmetric under that swap, so the value is unaffected.
+ Action generateGroupId = (input, output) =>
+ {
+ output.Label = input.Label;
+ output.Float1 = angiospermCosine(input.PetalLength, input.PetalWidth, input.SepalLength, input.SepalWidth);
+ output.PetalLength = input.PetalLength;
+ output.PetalWidth = input.PetalWidth;
+ output.SepalLength = input.SepalLength;
+ output.SepalWidth = input.SepalWidth;
+ };
+
+ // Create a pipeline to execute the custom function.
+ var pipeline = mlContext.Transforms.CustomMapping(generateGroupId, null);
+
+ // Transform the data.
+ var transformedData = pipeline.Fit(data).Transform(data);
+
+ // Verify that Float1 equals the cosine recomputed from each output row (the local is named cosineDistance, but it holds the cosine itself).
+ var transformedRows = mlContext.Data.CreateEnumerable(transformedData, reuseRowObject: true);
+ foreach (var row in transformedRows)
+ {
+ var cosineDistance = angiospermCosine(row.PetalLength, row.PetalWidth, row.SepalLength, row.SepalWidth);
+ Assert.Equal(cosineDistance, row.Float1);
+ }
+ }
+
+ /// <summary>
+ /// Extensibility: Add multiple new columns.
+ /// </summary>
+ [Fact]
+ void ExtensibilityAddingTwoColumns()
+ {
+ // Concurrency must be 1 to ensure that the mapping is done sequentially.
+ var mlContext = new MLContext(seed: 1, conc: 1);
+
+ // Load the Iris dataset.
+ var data = mlContext.Data.LoadFromTextFile(
+ GetDataPath(TestDatasets.iris.trainFilename),
+ hasHeader: TestDatasets.iris.fileHasHeader,
+ separatorChar: TestDatasets.iris.fileSeparator);
+
+ // Subsample it down to the first 10 rows.
+ int numSamples = 10;
+ data = mlContext.Data.TakeRows(data, numSamples);
+
+ // Create a mapping that copies the Iris columns and adds two columns, Float1 and Float2, derived from the row values via GetRandomNumber.
+ Action generateGroupId = (input, output) =>
+ {
+ output.Label = input.Label;
+ output.Float1 = GetRandomNumber(1 + input.Label + input.PetalLength + input.PetalWidth + input.SepalLength + input.SepalWidth);
+ output.Float2 = GetRandomNumber(2 + input.Label + input.PetalLength + input.PetalWidth + input.SepalLength + input.SepalWidth);
+ output.PetalLength = input.PetalLength;
+ output.PetalWidth = input.PetalWidth;
+ output.SepalLength = input.SepalLength;
+ output.SepalWidth = input.SepalWidth;
+ };
+
+ // Create a pipeline to execute the custom function.
+ var pipeline = mlContext.Transforms.CustomMapping(generateGroupId, null);
+
+ // Transform the data.
+ var transformedData = pipeline.Fit(data).Transform(data);
+
+ // Verify that both added columns equal the values recomputed from each output row.
+ var transformedRows = mlContext.Data.CreateEnumerable(transformedData, reuseRowObject: true);
+ foreach (var row in transformedRows)
+ {
+ var randomNumber1 = GetRandomNumber(1 + row.Label + row.PetalLength + row.PetalWidth + row.SepalLength + row.SepalWidth);
+ var randomNumber2 = GetRandomNumber(2 + row.Label + row.PetalLength + row.PetalWidth + row.SepalLength + row.SepalWidth);
+ Assert.Equal(randomNumber1, row.Float1);
+ Assert.Equal(randomNumber2, row.Float2);
+ }
+ }
+
+ /// <summary>
+ /// Extensibility: Featurize text using custom word-grams, char-grams, and normalization.
+ /// </summary>
+ [Fact]
+ void ExtensibilityModifyTextFeaturization()
+ {
+ // Concurrency must be 1 to ensure that the mapping is done sequentially.
+ var mlContext = new MLContext(seed: 1, conc: 1);
+
+ var data = mlContext.Data.LoadFromTextFile(GetDataPath(TestDatasets.Sentiment.trainFilename),
+ hasHeader: TestDatasets.Sentiment.fileHasHeader,
+ separatorChar: TestDatasets.Sentiment.fileSeparator);
+
+ // Create a training pipeline: featurize the text, cache the result, then train a binary classifier.
+ // TODO #2802: Update FeaturizeText to allow specifications of word-grams and char-grams.
+ var pipeline = mlContext.Transforms.Text.FeaturizeText("Features", new string[] { "SentimentText" },
+ new TextFeaturizingEstimator.Options
+ {
+ UseCharExtractor = true,
+ UseWordExtractor = true,
+ VectorNormalizer = TextFeaturizingEstimator.TextNormKind.L1
+ })
+ .AppendCacheCheckpoint(mlContext)
+ .Append(mlContext.BinaryClassification.Trainers.StochasticDualCoordinateAscent(
+ new SdcaBinaryTrainer.Options { NumThreads = 1 }));
+
+ // Train the model.
+ var model = pipeline.Fit(data);
+
+ // Evaluate the model on the same data it was trained on.
+ var scoredData = model.Transform(data);
+ var metrics = mlContext.BinaryClassification.Evaluate(scoredData);
+
+ // Check that the metrics returned are valid.
+ Common.AssertMetrics(metrics);
+ }
+
+ /// <summary>
+ /// Extensibility: Apply a normalizer to columns in the dataset.
+ /// </summary>
+ [Fact]
+ void ExtensibilityNormalizeColumns()
+ {
+ // Concurrency must be 1 to ensure that the mapping is done sequentially.
+ var mlContext = new MLContext(seed: 1, conc: 1);
+
+ // Load the Iris dataset.
+ var data = mlContext.Data.LoadFromTextFile(
+ GetDataPath(TestDatasets.iris.trainFilename),
+ hasHeader: TestDatasets.iris.fileHasHeader,
+ separatorChar: TestDatasets.iris.fileSeparator);
+
+ // Compose the transformation: concatenate the Iris feature columns into "Features", then min-max normalize that column.
+ var pipeline = mlContext.Transforms.Concatenate("Features", Iris.Features)
+ .Append(mlContext.Transforms.Normalize("Features", mode: NormalizingEstimator.NormalizerMode.MinMax));
+
+ // Transform the data.
+ var transformedData = pipeline.Fit(data).Transform(data);
+
+ // Validate that every normalized value lies between -1 and 1 (inclusive).
+ var dataEnumerator = mlContext.Data.CreateEnumerable(transformedData, true);
+ foreach (var row in dataEnumerator)
+ // Verify per-slot normalization: check each element of the Features vector individually.
+ for (int i = 0; i < row.Features.Length; i++)
+ Assert.InRange(row.Features[i], -1, 1);
+ }
+
+ private float GetRandomNumber(float number) // Seeds a fresh Random from the input, so equal inputs yield equal outputs.
+ {
+ var seed = (int)(10 * number); // Scale by 10 before truncating to an int seed.
+ var rng = new Random(seed);
+ return (float)rng.NextDouble();
+ }
+ }
+}
\ No newline at end of file
diff --git a/test/Microsoft.ML.Functional.Tests/Datasets/FeatureColumn.cs b/test/Microsoft.ML.Functional.Tests/Datasets/FeatureColumn.cs
new file mode 100644
index 0000000000..090ad23646
--- /dev/null
+++ b/test/Microsoft.ML.Functional.Tests/Datasets/FeatureColumn.cs
@@ -0,0 +1,14 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+namespace Microsoft.ML.Functional.Tests.Datasets
+{
+ /// <summary>
+ /// A class to hold a feature column.
+ /// </summary>
+ internal sealed class FeatureColumn
+ {
+ public float[] Features { get; set; }
+ }
+}
diff --git a/test/Microsoft.ML.Functional.Tests/Datasets/Iris.cs b/test/Microsoft.ML.Functional.Tests/Datasets/Iris.cs
index 9f402235ec..55868e5448 100644
--- a/test/Microsoft.ML.Functional.Tests/Datasets/Iris.cs
+++ b/test/Microsoft.ML.Functional.Tests/Datasets/Iris.cs
@@ -75,4 +75,31 @@ internal sealed class IrisWithGroup
public float PetalLength { get; set; }
public float PetalWidth { get; set; }
}
+
+ /// <summary>
+ /// A class for the Iris dataset with an extra float column.
+ /// </summary>
+ internal sealed class IrisWithOneExtraColumn
+ {
+ public float Label { get; set; }
+ public float SepalLength { get; set; }
+ public float SepalWidth { get; set; }
+ public float PetalLength { get; set; }
+ public float PetalWidth { get; set; }
+ public float Float1 { get; set; }
+ }
+
+ /// <summary>
+ /// A class for the Iris dataset with two extra float columns.
+ /// </summary>
+ internal sealed class IrisWithTwoExtraColumns
+ {
+ public float Label { get; set; }
+ public float SepalLength { get; set; }
+ public float SepalWidth { get; set; }
+ public float PetalLength { get; set; }
+ public float PetalWidth { get; set; }
+ public float Float1 { get; set; }
+ public float Float2 { get; set; }
+ }
}
diff --git a/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/Extensibility.cs b/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/Extensibility.cs
deleted file mode 100644
index 47e1fc4034..0000000000
--- a/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/Extensibility.cs
+++ /dev/null
@@ -1,59 +0,0 @@
-// Licensed to the .NET Foundation under one or more agreements.
-// The .NET Foundation licenses this file to you under the MIT license.
-// See the LICENSE file in the project root for more information.
-
-using System;
-using System.Linq;
-using Microsoft.ML.Data;
-using Microsoft.ML.RunTests;
-using Microsoft.ML.Trainers;
-using Microsoft.ML.Transforms;
-using Xunit;
-
-namespace Microsoft.ML.Tests.Scenarios.Api
-{
- public partial class ApiScenariosTests
- {
- ///
- /// Extensibility: We can't possibly write every conceivable transform and should not try.
- /// It should somehow be possible for a user to inject custom code to, say, transform data.
- /// This might have a much steeper learning curve than the other usages (which merely involve
- /// usage of already established components), but should still be possible.
- ///
- [Fact]
- void Extensibility()
- {
- var dataPath = GetDataPath(TestDatasets.irisData.trainFilename);
-
- var ml = new MLContext();
- var data = ml.Data.CreateTextLoader(TestDatasets.irisData.GetLoaderColumns(), separatorChar: ',')
- .Load(dataPath);
-
- Action action = (i, j) =>
- {
- j.Label = i.Label;
- j.PetalLength = i.SepalLength > 3 ? i.PetalLength : i.SepalLength;
- j.PetalWidth = i.PetalWidth;
- j.SepalLength = i.SepalLength;
- j.SepalWidth = i.SepalWidth;
- };
- var pipeline = new ColumnConcatenatingEstimator (ml, "Features", "SepalLength", "SepalWidth", "PetalLength", "PetalWidth")
- .Append(new CustomMappingEstimator(ml, action, null), TransformerScope.TrainTest)
- .Append(new ValueToKeyMappingEstimator(ml, "Label"), TransformerScope.TrainTest)
- .Append(ml.MulticlassClassification.Trainers.StochasticDualCoordinateAscent(
- new SdcaMultiClassTrainer.Options { MaxIterations = 100, Shuffle = true, NumThreads = 1 }))
- .Append(new KeyToValueMappingEstimator(ml, "PredictedLabel"));
-
- var model = pipeline.Fit(data).GetModelFor(TransformerScope.Scoring);
- var engine = model.CreatePredictionEngine(ml);
-
- var testLoader = ml.Data.LoadFromTextFile(dataPath, TestDatasets.irisData.GetLoaderColumns(), separatorChar: ',');
- var testData = ml.Data.CreateEnumerable(testLoader, false);
- foreach (var input in testData.Take(20))
- {
- var prediction = engine.Predict(input);
- Assert.True(prediction.PredictedLabel == input.Label);
- }
- }
- }
-}