Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add V1 Scenario tests for data transformation #2803

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
198 changes: 198 additions & 0 deletions test/Microsoft.ML.Functional.Tests/DataTransformation.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,198 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System;
using Microsoft.ML.Functional.Tests.Datasets;
using Microsoft.ML.RunTests;
using Microsoft.ML.TestFramework;
using Microsoft.ML.Trainers;
using Microsoft.ML.Transforms;
using Microsoft.ML.Transforms.Text;
using Xunit;
using Xunit.Abstractions;

namespace Microsoft.ML.Functional.Tests
{
public class DataTransformation : BaseTestClass
{
/// <summary>
/// Creates the test class, handing the xUnit output helper to the base test class.
/// </summary>
/// <param name="output">The xUnit output helper forwarded to the base constructor.</param>
public DataTransformation(ITestOutputHelper output)
    : base(output)
{
}

/// <summary>
/// Extensibility: Add a new column that is a function of other columns.
/// </summary>
/// <remarks>
/// Each of the first 10 Iris rows is mapped through a custom action that stores the cosine
/// similarity between the petal and sepal (width, length) vectors in <c>Float1</c>. The test
/// then recomputes the value from the transformed rows and checks that it matches.
/// </remarks>
[Fact]
void ExtensibilityAddAColumnAsAFunctionOfMultipleColumns()
{
    // Concurrency must be 1 to assure that the mapping is done sequentially.
    var mlContext = new MLContext(seed: 1, conc: 1);

    // Load the Iris dataset.
    var data = mlContext.Data.LoadFromTextFile<Iris>(
        GetDataPath(TestDatasets.iris.trainFilename),
        hasHeader: TestDatasets.iris.fileHasHeader,
        separatorChar: TestDatasets.iris.fileSeparator);

    // Subsample it down to the first 10 rows.
    int numSamples = 10;
    data = mlContext.Data.TakeRows(data, numSamples);

    // Stand-alone function computing the cosine similarity between the petal vector
    // (petalWidth, petalLength) and the sepal vector (sepalWidth, sepalLength):
    // the dot product divided by the product of the two magnitudes.
    float angiospermCosine(float petalWidth, float petalLength, float sepalWidth, float sepalLength)
    {
        var petalMagnitude = Math.Sqrt(petalWidth * petalWidth + petalLength * petalLength);
        var sepalMagnitude = Math.Sqrt(sepalWidth * sepalWidth + sepalLength * sepalLength);
        return (float)((petalWidth * sepalWidth + petalLength * sepalLength) / (petalMagnitude * sepalMagnitude));
    }

    // Create a function that copies the input columns and generates the extra column.
    Action<Iris, IrisWithOneExtraColumn> addCosineColumn = (input, output) =>
    {
        output.Label = input.Label;
        // Arguments are passed in the order the parameters are declared (width before length).
        output.Float1 = angiospermCosine(input.PetalWidth, input.PetalLength, input.SepalWidth, input.SepalLength);
        output.PetalLength = input.PetalLength;
        output.PetalWidth = input.PetalWidth;
        output.SepalLength = input.SepalLength;
        output.SepalWidth = input.SepalWidth;
    };

    // Create a pipeline to execute the custom function.
    var pipeline = mlContext.Transforms.CustomMapping(addCosineColumn, null);

    // Transform the data.
    var transformedData = pipeline.Fit(data).Transform(data);

    // Verify that the column has the correct data by recomputing it from the copied columns.
    var transformedRows = mlContext.Data.CreateEnumerable<IrisWithOneExtraColumn>(transformedData, reuseRowObject: true);
    foreach (var row in transformedRows)
    {
        var expectedCosine = angiospermCosine(row.PetalWidth, row.PetalLength, row.SepalWidth, row.SepalLength);
        Assert.Equal(expectedCosine, row.Float1);
    }
}

/// <summary>
/// Extensibility: Add multiple new columns.
/// </summary>
/// <remarks>
/// A custom mapping fills <c>Float1</c> and <c>Float2</c> with pseudo-random values seeded from
/// the row's own columns, so the expected values can be regenerated from the transformed rows.
/// </remarks>
[Fact]
void ExtensibilityAddingTwoColumns()
{
    // Concurrency is pinned to 1 so that the mapping is applied sequentially.
    var mlContext = new MLContext(seed: 1, conc: 1);

    // Read the Iris training file.
    var irisPath = GetDataPath(TestDatasets.iris.trainFilename);
    var allRows = mlContext.Data.LoadFromTextFile<Iris>(
        irisPath,
        hasHeader: TestDatasets.iris.fileHasHeader,
        separatorChar: TestDatasets.iris.fileSeparator);

    // Keep only the first 10 rows.
    const int rowCount = 10;
    var data = mlContext.Data.TakeRows(allRows, rowCount);

    // Mapping that copies the Iris columns and derives the two extra columns from them.
    Action<Iris, IrisWithTwoExtraColumns> addTwoColumns = (source, destination) =>
    {
        destination.Label = source.Label;
        destination.Float1 = GetRandomNumber(1 + source.Label + source.PetalLength + source.PetalWidth + source.SepalLength + source.SepalWidth);
        destination.Float2 = GetRandomNumber(2 + source.Label + source.PetalLength + source.PetalWidth + source.SepalLength + source.SepalWidth);
        destination.PetalLength = source.PetalLength;
        destination.PetalWidth = source.PetalWidth;
        destination.SepalLength = source.SepalLength;
        destination.SepalWidth = source.SepalWidth;
    };

    // Wrap the mapping in an estimator, fit it, and apply it to the data.
    var pipeline = mlContext.Transforms.CustomMapping(addTwoColumns, null);
    var transformer = pipeline.Fit(data);
    var transformedData = transformer.Transform(data);

    // Regenerate both values from each transformed row and compare them with the stored ones.
    var mappedRows = mlContext.Data.CreateEnumerable<IrisWithTwoExtraColumns>(transformedData, reuseRowObject: true);
    foreach (var mapped in mappedRows)
    {
        var expectedFloat1 = GetRandomNumber(1 + mapped.Label + mapped.PetalLength + mapped.PetalWidth + mapped.SepalLength + mapped.SepalWidth);
        var expectedFloat2 = GetRandomNumber(2 + mapped.Label + mapped.PetalLength + mapped.PetalWidth + mapped.SepalLength + mapped.SepalWidth);
        Assert.Equal(expectedFloat1, mapped.Float1);
        Assert.Equal(expectedFloat2, mapped.Float2);
    }
}

/// <summary>
/// Extensibility: Featurize text using custom word-grams, char-grams, and normalization.
/// </summary>
/// <remarks>
/// Featurizes the sentiment text with both char and word extractors enabled and L1 vector
/// normalization, trains a single-threaded SDCA binary classifier on the result, and checks
/// the evaluation metrics.
/// </remarks>
[Fact]
void ExtensibilityModifyTextFeaturization()
{
    // Concurrency is pinned to 1 so that processing is sequential.
    var mlContext = new MLContext(seed: 1, conc: 1);

    // Read the sentiment training file.
    var sentimentPath = GetDataPath(TestDatasets.Sentiment.trainFilename);
    var data = mlContext.Data.LoadFromTextFile<TweetSentiment>(
        sentimentPath,
        hasHeader: TestDatasets.Sentiment.fileHasHeader,
        separatorChar: TestDatasets.Sentiment.fileSeparator);

    // Settings for the text featurizer and the trainer.
    // TODO #2802: Update FeaturizeText to allow specifications of word-grams and char-grams.
    var featurizerOptions = new TextFeaturizingEstimator.Options
    {
        UseCharExtractor = true,
        UseWordExtractor = true,
        VectorNormalizer = TextFeaturizingEstimator.TextNormKind.L1
    };
    var trainerOptions = new SdcaBinaryTrainer.Options { NumThreads = 1 };

    // Assemble the training pipeline: featurize, cache, then train.
    var featurizer = mlContext.Transforms.Text.FeaturizeText("Features", new[] { "SentimentText" }, featurizerOptions);
    var cachedFeaturizer = featurizer.AppendCacheCheckpoint(mlContext);
    var trainer = mlContext.BinaryClassification.Trainers.StochasticDualCoordinateAscent(trainerOptions);
    var pipeline = cachedFeaturizer.Append(trainer);

    // Fit the pipeline to the data.
    var model = pipeline.Fit(data);

    // Score the training data and compute binary classification metrics.
    var scoredData = model.Transform(data);
    var metrics = mlContext.BinaryClassification.Evaluate(scoredData);

    // The metrics must pass the shared validity checks.
    Common.AssertMetrics(metrics);
}

/// <summary>
/// Extensibility: Apply a normalizer to columns in the dataset.
/// </summary>
Copy link
Contributor

@artidoro artidoro Mar 4, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you change summary tag to match the test? #Resolved

[Fact]
void ExtensibilityNormalizeColumns()
{
    // Concurrency is pinned to 1 so that processing is sequential.
    var mlContext = new MLContext(seed: 1, conc: 1);

    // Read the Iris training file.
    var irisPath = GetDataPath(TestDatasets.iris.trainFilename);
    var data = mlContext.Data.LoadFromTextFile<Iris>(
        irisPath,
        hasHeader: TestDatasets.iris.fileHasHeader,
        separatorChar: TestDatasets.iris.fileSeparator);

    // Build the pipeline: gather the Iris feature columns into "Features", then min-max normalize it.
    var concatenate = mlContext.Transforms.Concatenate("Features", Iris.Features);
    var normalize = mlContext.Transforms.Normalize("Features", mode: NormalizingEstimator.NormalizerMode.MinMax);
    var pipeline = concatenate.Append(normalize);

    // Fit the pipeline and apply it to the same data.
    var fitted = pipeline.Fit(data);
    var transformedData = fitted.Transform(data);

    // Every slot of every normalized feature vector must lie within [-1, 1].
    var normalizedRows = mlContext.Data.CreateEnumerable<FeatureColumn>(transformedData, reuseRowObject: true);
    foreach (var normalizedRow in normalizedRows)
    {
        foreach (var slotValue in normalizedRow.Features)
        {
            Assert.InRange(slotValue, -1, 1);
        }
    }
}

/// <summary>
/// Returns a repeatable pseudo-random float for the given number.
/// </summary>
/// <param name="number">Value from which the seed is derived (ten times the value, truncated to an int).</param>
/// <returns>The first <see cref="Random.NextDouble"/> draw of a generator created with that seed, cast to float.</returns>
private float GetRandomNumber(float number)
    => (float)new Random((int)(10 * number)).NextDouble();
}
}
14 changes: 14 additions & 0 deletions test/Microsoft.ML.Functional.Tests/Datasets/FeatureColumn.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

namespace Microsoft.ML.Functional.Tests.Datasets
{
/// <summary>
/// A class to hold a feature column.
/// </summary>
internal sealed class FeatureColumn
{
/// <summary>
/// The feature values, held as an array of floats.
/// </summary>
public float[] Features { get; set; }
}
Copy link
Member

@sfilipi sfilipi Mar 2, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

maybe move it to the other file, the Iris. #ByDesign

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is designed to be generic for tests against any feature column, not just the Iris dataset, so I'll keep it in its own file for now, similar to `FeatureContributionOutput`.


In reply to: 261830507 [](ancestors = 261830507)

}
27 changes: 27 additions & 0 deletions test/Microsoft.ML.Functional.Tests/Datasets/Iris.cs
Original file line number Diff line number Diff line change
Expand Up @@ -75,4 +75,31 @@ internal sealed class IrisWithGroup
public float PetalLength { get; set; }
public float PetalWidth { get; set; }
}

/// <summary>
/// A class for the Iris dataset with an extra float column.
/// </summary>
internal sealed class IrisWithOneExtraColumn
{
// Label and the four sepal/petal measurement columns.
public float Label { get; set; }
public float SepalLength { get; set; }
public float SepalWidth { get; set; }
public float PetalLength { get; set; }
public float PetalWidth { get; set; }
// The extra float column.
public float Float1 { get; set; }
}

/// <summary>
/// A class for the Iris dataset with two extra float columns.
/// </summary>
internal sealed class IrisWithTwoExtraColumns
{
// Label and the four sepal/petal measurement columns.
public float Label { get; set; }
public float SepalLength { get; set; }
public float SepalWidth { get; set; }
public float PetalLength { get; set; }
public float PetalWidth { get; set; }
// The two extra float columns.
public float Float1 { get; set; }
public float Float2 { get; set; }
}
}
59 changes: 0 additions & 59 deletions test/Microsoft.ML.Tests/Scenarios/Api/Estimators/Extensibility.cs

This file was deleted.