diff --git a/test/Microsoft.ML.Functional.Tests/DataTransformation.cs b/test/Microsoft.ML.Functional.Tests/DataTransformation.cs new file mode 100644 index 0000000000..5c2727a7d0 --- /dev/null +++ b/test/Microsoft.ML.Functional.Tests/DataTransformation.cs @@ -0,0 +1,198 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System; +using Microsoft.ML.Functional.Tests.Datasets; +using Microsoft.ML.RunTests; +using Microsoft.ML.TestFramework; +using Microsoft.ML.Trainers; +using Microsoft.ML.Transforms; +using Microsoft.ML.Transforms.Text; +using Xunit; +using Xunit.Abstractions; + +namespace Microsoft.ML.Functional.Tests +{ + public class DataTransformation : BaseTestClass + { + public DataTransformation(ITestOutputHelper output) : base(output) + { + } + + /// + /// Extensibility: Add a new column that is a function of other columns. + /// + [Fact] + void ExtensibilityAddAColumnAsAFunctionOfMultipleColumns() + { + // Concurrency must be 1 to ensure that the mapping is done sequentially. + var mlContext = new MLContext(seed: 1, conc: 1); + + // Load the Iris dataset + var data = mlContext.Data.LoadFromTextFile( + GetDataPath(TestDatasets.iris.trainFilename), + hasHeader: TestDatasets.iris.fileHasHeader, + separatorChar: TestDatasets.iris.fileSeparator); + + // Subsample it down to the first 10 rows. + int numSamples = 10; + data = mlContext.Data.TakeRows(data, numSamples); + + // Create a stand-alone function to compute the cosine of the angle between the petal and sepal vectors. 
+ float angiospermCosine(float petalLength, float petalWidth, float sepalLength, float sepalWidth) + { + var petalMagnitude = Math.Sqrt(petalWidth * petalWidth + petalLength * petalLength); + var sepalMagnitude = Math.Sqrt(sepalWidth * sepalWidth + sepalLength * sepalLength); + return (float)((petalWidth * sepalWidth + petalLength * sepalLength) / (petalMagnitude * sepalMagnitude)); + } + + // Create a function that generates a column. + Action generateGroupId = (input, output) => + { + output.Label = input.Label; + output.Float1 = angiospermCosine(input.PetalLength, input.PetalWidth, input.SepalLength, input.SepalWidth); + output.PetalLength = input.PetalLength; + output.PetalWidth = input.PetalWidth; + output.SepalLength = input.SepalLength; + output.SepalWidth = input.SepalWidth; + }; + + // Create a pipeline to execute the custom function. + var pipeline = mlContext.Transforms.CustomMapping(generateGroupId, null); + + // Transform the data. + var transformedData = pipeline.Fit(data).Transform(data); + + // Verify that the column has the correct data. + var transformedRows = mlContext.Data.CreateEnumerable(transformedData, reuseRowObject: true); + foreach (var row in transformedRows) + { + var cosineDistance = angiospermCosine(row.PetalLength, row.PetalWidth, row.SepalLength, row.SepalWidth); + Assert.Equal(cosineDistance, row.Float1); + } + } + + /// + /// Extensibility: Add multiple new columns. + /// + [Fact] + void ExtensibilityAddingTwoColumns() + { + // Concurrency must be 1 to ensure that the mapping is done sequentially. + var mlContext = new MLContext(seed: 1, conc: 1); + + // Load the Iris dataset + var data = mlContext.Data.LoadFromTextFile( + GetDataPath(TestDatasets.iris.trainFilename), + hasHeader: TestDatasets.iris.fileHasHeader, + separatorChar: TestDatasets.iris.fileSeparator); + + // Subsample it down to the first 10 rows. 
+ int numSamples = 10; + data = mlContext.Data.TakeRows(data, numSamples); + + // Create a function that generates a column. + Action generateGroupId = (input, output) => + { + output.Label = input.Label; + output.Float1 = GetRandomNumber(1 + input.Label + input.PetalLength + input.PetalWidth + input.SepalLength + input.SepalWidth); + output.Float2 = GetRandomNumber(2 + input.Label + input.PetalLength + input.PetalWidth + input.SepalLength + input.SepalWidth); + output.PetalLength = input.PetalLength; + output.PetalWidth = input.PetalWidth; + output.SepalLength = input.SepalLength; + output.SepalWidth = input.SepalWidth; + }; + + // Create a pipeline to execute the custom function. + var pipeline = mlContext.Transforms.CustomMapping(generateGroupId, null); + + // Transform the data. + var transformedData = pipeline.Fit(data).Transform(data); + + // Verify that the column has the correct data. + var transformedRows = mlContext.Data.CreateEnumerable(transformedData, reuseRowObject: true); + foreach (var row in transformedRows) + { + var randomNumber1 = GetRandomNumber(1 + row.Label + row.PetalLength + row.PetalWidth + row.SepalLength + row.SepalWidth); + var randomNumber2 = GetRandomNumber(2 + row.Label + row.PetalLength + row.PetalWidth + row.SepalLength + row.SepalWidth); + Assert.Equal(randomNumber1, row.Float1); + Assert.Equal(randomNumber2, row.Float2); + } + } + + /// + /// Extensibility: Featurize text using custom word-grams, char-grams, and normalization. + /// + [Fact] + void ExtensibilityModifyTextFeaturization() + { + // Concurrency must be 1 to ensure that the mapping is done sequentially. + var mlContext = new MLContext(seed: 1, conc: 1); + + var data = mlContext.Data.LoadFromTextFile(GetDataPath(TestDatasets.Sentiment.trainFilename), + hasHeader: TestDatasets.Sentiment.fileHasHeader, + separatorChar: TestDatasets.Sentiment.fileSeparator); + + // Create a training pipeline. 
+ // TODO #2802: Update FeaturizeText to allow specifications of word-grams and char-grams. + var pipeline = mlContext.Transforms.Text.FeaturizeText("Features", new string[] { "SentimentText" }, + new TextFeaturizingEstimator.Options + { + UseCharExtractor = true, + UseWordExtractor = true, + VectorNormalizer = TextFeaturizingEstimator.TextNormKind.L1 + }) + .AppendCacheCheckpoint(mlContext) + .Append(mlContext.BinaryClassification.Trainers.StochasticDualCoordinateAscent( + new SdcaBinaryTrainer.Options { NumThreads = 1 })); + + // Train the model. + var model = pipeline.Fit(data); + + // Evaluate the model. + var scoredData = model.Transform(data); + var metrics = mlContext.BinaryClassification.Evaluate(scoredData); + + // Check that the metrics returned are valid. + Common.AssertMetrics(metrics); + } + + /// + /// Extensibility: Apply a normalizer to columns in the dataset. + /// + [Fact] + void ExtensibilityNormalizeColumns() + { + // Concurrency must be 1 to ensure that the mapping is done sequentially. + var mlContext = new MLContext(seed: 1, conc: 1); + + // Load the Iris dataset. + var data = mlContext.Data.LoadFromTextFile( + GetDataPath(TestDatasets.iris.trainFilename), + hasHeader: TestDatasets.iris.fileHasHeader, + separatorChar: TestDatasets.iris.fileSeparator); + + // Compose the transformation. + var pipeline = mlContext.Transforms.Concatenate("Features", Iris.Features) + .Append(mlContext.Transforms.Normalize("Features", mode: NormalizingEstimator.NormalizerMode.MinMax)); + + // Transform the data. + var transformedData = pipeline.Fit(data).Transform(data); + + // Validate that the data was normalized to between -1 and 1. + var dataEnumerator = mlContext.Data.CreateEnumerable(transformedData, true); + foreach (var row in dataEnumerator) + // Verify per-slot normalization. 
+ for (int i = 0; i < row.Features.Length; i++) + Assert.InRange(row.Features[i], -1, 1); + } + + private float GetRandomNumber(float number) + { + var seed = (int)(10 * number); + var rng = new Random(seed); + return (float)rng.NextDouble(); + } + } +} \ No newline at end of file diff --git a/test/Microsoft.ML.Functional.Tests/Datasets/FeatureColumn.cs b/test/Microsoft.ML.Functional.Tests/Datasets/FeatureColumn.cs new file mode 100644 index 0000000000..090ad23646 --- /dev/null +++ b/test/Microsoft.ML.Functional.Tests/Datasets/FeatureColumn.cs @@ -0,0 +1,14 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +namespace Microsoft.ML.Functional.Tests.Datasets +{ + /// + /// A class to hold a feature column. + /// + internal sealed class FeatureColumn + { + public float[] Features { get; set; } + } +} diff --git a/test/Microsoft.ML.Functional.Tests/Datasets/Iris.cs b/test/Microsoft.ML.Functional.Tests/Datasets/Iris.cs index 9f402235ec..55868e5448 100644 --- a/test/Microsoft.ML.Functional.Tests/Datasets/Iris.cs +++ b/test/Microsoft.ML.Functional.Tests/Datasets/Iris.cs @@ -75,4 +75,31 @@ internal sealed class IrisWithGroup public float PetalLength { get; set; } public float PetalWidth { get; set; } } + + /// + /// A class for the Iris dataset with an extra float column. + /// + internal sealed class IrisWithOneExtraColumn + { + public float Label { get; set; } + public float SepalLength { get; set; } + public float SepalWidth { get; set; } + public float PetalLength { get; set; } + public float PetalWidth { get; set; } + public float Float1 { get; set; } + } + + /// + /// A class for the Iris dataset with two extra float columns. 
+ /// + internal sealed class IrisWithTwoExtraColumns + { + public float Label { get; set; } + public float SepalLength { get; set; } + public float SepalWidth { get; set; } + public float PetalLength { get; set; } + public float PetalWidth { get; set; } + public float Float1 { get; set; } + public float Float2 { get; set; } + } } diff --git a/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/Extensibility.cs b/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/Extensibility.cs deleted file mode 100644 index 47e1fc4034..0000000000 --- a/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/Extensibility.cs +++ /dev/null @@ -1,59 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. -// See the LICENSE file in the project root for more information. - -using System; -using System.Linq; -using Microsoft.ML.Data; -using Microsoft.ML.RunTests; -using Microsoft.ML.Trainers; -using Microsoft.ML.Transforms; -using Xunit; - -namespace Microsoft.ML.Tests.Scenarios.Api -{ - public partial class ApiScenariosTests - { - /// - /// Extensibility: We can't possibly write every conceivable transform and should not try. - /// It should somehow be possible for a user to inject custom code to, say, transform data. - /// This might have a much steeper learning curve than the other usages (which merely involve - /// usage of already established components), but should still be possible. - /// - [Fact] - void Extensibility() - { - var dataPath = GetDataPath(TestDatasets.irisData.trainFilename); - - var ml = new MLContext(); - var data = ml.Data.CreateTextLoader(TestDatasets.irisData.GetLoaderColumns(), separatorChar: ',') - .Load(dataPath); - - Action action = (i, j) => - { - j.Label = i.Label; - j.PetalLength = i.SepalLength > 3 ? 
i.PetalLength : i.SepalLength; - j.PetalWidth = i.PetalWidth; - j.SepalLength = i.SepalLength; - j.SepalWidth = i.SepalWidth; - }; - var pipeline = new ColumnConcatenatingEstimator (ml, "Features", "SepalLength", "SepalWidth", "PetalLength", "PetalWidth") - .Append(new CustomMappingEstimator(ml, action, null), TransformerScope.TrainTest) - .Append(new ValueToKeyMappingEstimator(ml, "Label"), TransformerScope.TrainTest) - .Append(ml.MulticlassClassification.Trainers.StochasticDualCoordinateAscent( - new SdcaMultiClassTrainer.Options { MaxIterations = 100, Shuffle = true, NumThreads = 1 })) - .Append(new KeyToValueMappingEstimator(ml, "PredictedLabel")); - - var model = pipeline.Fit(data).GetModelFor(TransformerScope.Scoring); - var engine = model.CreatePredictionEngine(ml); - - var testLoader = ml.Data.LoadFromTextFile(dataPath, TestDatasets.irisData.GetLoaderColumns(), separatorChar: ','); - var testData = ml.Data.CreateEnumerable(testLoader, false); - foreach (var input in testData.Take(20)) - { - var prediction = engine.Predict(input); - Assert.True(prediction.PredictedLabel == input.Label); - } - } - } -}