Skip to content

Commit

Permalink
Add an example of random PCA using in-memory data structure (#2780)
Browse files Browse the repository at this point in the history
  • Loading branch information
wschin authored Feb 28, 2019
1 parent fbf282d commit 6e9023f
Show file tree
Hide file tree
Showing 6 changed files with 283 additions and 2 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
using System;
using System.Collections.Generic;
using System.Linq;
using Microsoft.ML.Data;

namespace Microsoft.ML.Samples.Dynamic.Trainers.AnomalyDetection
{
public static class RandomizedPcaSample
{
    /// <summary>
    /// Trains a randomized-PCA anomaly detector on a small in-memory data set, applies it to
    /// that same data, and prints for every example whether it was predicted as an inlier or
    /// an outlier together with its score.
    /// </summary>
    public static void Example()
    {
        // Create a new context for ML.NET operations. It can be used for exception tracking and logging,
        // as a catalog of available operations and as the source of randomness.
        // Setting the seed to a fixed number in this example to make outputs deterministic.
        var mlContext = new MLContext(seed: 0);

        // Training data. The last example is far away from the others.
        var samples = new List<DataPoint>()
        {
            new DataPoint() { Features = new float[] { 1, 0, 0 } },
            new DataPoint() { Features = new float[] { 0, 2, 1 } },
            new DataPoint() { Features = new float[] { 1, 2, 3 } },
            new DataPoint() { Features = new float[] { 0, 1, 0 } },
            new DataPoint() { Features = new float[] { 0, 2, 1 } },
            new DataPoint() { Features = new float[] { -100, 50, -100 } }
        };

        // Convert the List<DataPoint> to IDataView, a consumable format to ML.NET functions.
        var data = mlContext.Data.LoadFromEnumerable(samples);

        // Create an anomaly detector. Its underlying algorithm is randomized PCA.
        var pipeline = mlContext.AnomalyDetection.Trainers.RandomizedPca(featureColumnName: nameof(DataPoint.Features), rank: 1, center: false);

        // Train the anomaly detector.
        var model = pipeline.Fit(data);

        // Apply the trained model on the training data.
        var transformed = model.Transform(data);

        // Read ML.NET predictions into IEnumerable<Result>.
        var results = mlContext.Data.CreateEnumerable<Result>(transformed, reuseRowObject: false).ToList();

        // Let's go through all predictions.
        for (int i = 0; i < samples.Count; ++i)
        {
            // The i-th example's prediction result.
            var result = results[i];

            // The i-th example's feature vector in text format.
            var featuresInText = string.Join(',', samples[i].Features);

            // PredictedLabel is true when the i-th sample is predicted as an inlier and false for an outlier.
            var verdict = result.PredictedLabel ? "inlier" : "outlier";

            Console.WriteLine("The {0}-th example with features [{1}] is an {2} with a score of being inlier {3}",
                i, featuresInText, verdict, result.Score);
        }

        // Lines printed out should be
        // The 0-th example with features [1,0,0] is an inlier with a score of being inlier 0.7453707
        // The 1-th example with features [0,2,1] is an inlier with a score of being inlier 0.9999999
        // The 2-th example with features [1,2,3] is an inlier with a score of being inlier 0.8450122
        // The 3-th example with features [0,1,0] is an inlier with a score of being inlier 0.9428905
        // The 4-th example with features [0,2,1] is an inlier with a score of being inlier 0.9999999
        // The 5-th example with features [-100,50,-100] is an outlier with a score of being inlier 0
    }

    // Example with 3 feature values. A training data set is a collection of such examples.
    private class DataPoint
    {
        // The feature vector; the attribute declares it as a vector of 3 values.
        [VectorType(3)]
        public float[] Features { get; set; }
    }

    // Class used to capture prediction of DataPoint.
    private class Result
    {
        // Outlier gets false while inlier has true.
        public bool PredictedLabel { get; set; }

        // Outlier gets smaller score.
        public float Score { get; set; }
    }
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
using System;
using System.Collections.Generic;
using System.Linq;
using Microsoft.ML.Data;

namespace Microsoft.ML.Samples.Dynamic.Trainers.AnomalyDetection
{
public static class RandomizedPcaSampleWithOptions
{
    /// <summary>
    /// Trains a randomized-PCA anomaly detector configured through an options object on a small
    /// in-memory data set, applies it to that same data, and prints for every example whether it
    /// was predicted as an inlier or an outlier together with its score.
    /// </summary>
    public static void Example()
    {
        // Create a new context for ML.NET operations. It can be used for exception tracking and logging,
        // as a catalog of available operations and as the source of randomness.
        // Setting the seed to a fixed number in this example to make outputs deterministic.
        var mlContext = new MLContext(seed: 0);

        // Training data. The last example is far away from the others.
        var samples = new List<DataPoint>()
        {
            new DataPoint() { Features = new float[] { 1, 0, 0 } },
            new DataPoint() { Features = new float[] { 0, 2, 1 } },
            new DataPoint() { Features = new float[] { 1, 2, 3 } },
            new DataPoint() { Features = new float[] { 0, 1, 0 } },
            new DataPoint() { Features = new float[] { 0, 2, 1 } },
            new DataPoint() { Features = new float[] { -100, 50, -100 } }
        };

        // Convert the List<DataPoint> to IDataView, a consumable format to ML.NET functions.
        var data = mlContext.Data.LoadFromEnumerable(samples);

        // Advanced settings of the trainer: the feature column to read, the rank, and the trainer's own seed.
        var options = new ML.Trainers.RandomizedPcaTrainer.Options()
        {
            FeatureColumnName = nameof(DataPoint.Features),
            Rank = 1,
            Seed = 10,
        };

        // Create an anomaly detector. Its underlying algorithm is randomized PCA.
        var pipeline = mlContext.AnomalyDetection.Trainers.RandomizedPca(options);

        // Train the anomaly detector.
        var model = pipeline.Fit(data);

        // Apply the trained model on the training data.
        var transformed = model.Transform(data);

        // Read ML.NET predictions into IEnumerable<Result>.
        var results = mlContext.Data.CreateEnumerable<Result>(transformed, reuseRowObject: false).ToList();

        // Let's go through all predictions.
        for (int i = 0; i < samples.Count; ++i)
        {
            // The i-th example's prediction result.
            var result = results[i];

            // The i-th example's feature vector in text format.
            var featuresInText = string.Join(',', samples[i].Features);

            // PredictedLabel is true when the i-th sample is predicted as an inlier and false for an outlier.
            var verdict = result.PredictedLabel ? "inlier" : "outlier";

            Console.WriteLine("The {0}-th example with features [{1}] is an {2} with a score of being inlier {3}",
                i, featuresInText, verdict, result.Score);
        }

        // Lines printed out should be
        // The 0-th example with features [1,0,0] is an inlier with a score of being inlier 0.7453707
        // The 1-th example with features [0,2,1] is an inlier with a score of being inlier 0.9999999
        // The 2-th example with features [1,2,3] is an inlier with a score of being inlier 0.8450122
        // The 3-th example with features [0,1,0] is an inlier with a score of being inlier 0.9428905
        // The 4-th example with features [0,2,1] is an inlier with a score of being inlier 0.9999999
        // The 5-th example with features [-100,50,-100] is an outlier with a score of being inlier 0
    }

    // Example with 3 feature values. A training data set is a collection of such examples.
    private class DataPoint
    {
        // The feature vector; the attribute declares it as a vector of 3 values.
        [VectorType(3)]
        public float[] Features { get; set; }
    }

    // Class used to capture prediction of DataPoint.
    private class Result
    {
        // Outlier gets false while inlier has true.
        public bool PredictedLabel { get; set; }

        // Outlier gets smaller score.
        public float Score { get; set; }
    }
}
}
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
using System;
using System.Collections.Generic;
using Microsoft.ML.Data;
using static Microsoft.ML.SamplesUtils.DatasetUtils;

namespace Microsoft.ML.Samples.Dynamic.Trainers.Recommendation
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
using System;
using System.Collections.Generic;
using Microsoft.ML.Data;
using Microsoft.ML.Trainers;
using static Microsoft.ML.SamplesUtils.DatasetUtils;

Expand Down
12 changes: 12 additions & 0 deletions src/Microsoft.ML.PCA/PCACatalog.cs
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,12 @@ public static PrincipalComponentAnalysisEstimator ProjectToPrincipalComponents(t
/// <param name="oversampling">Oversampling parameter for randomized PCA training.</param>
/// <param name="center">If enabled, data is centered to be zero mean.</param>
/// <param name="seed">The seed for random number generation.</param>
/// <example>
/// <format type="text/markdown">
/// <![CDATA[
/// [!code-csharp[RPCA](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/AnomalyDetection/RandomizedPcaSample.cs)]
/// ]]></format>
/// </example>
public static RandomizedPcaTrainer RandomizedPca(this AnomalyDetectionCatalog.AnomalyDetectionTrainers catalog,
string featureColumnName = DefaultColumnNames.Features,
string exampleWeightColumnName = null,
Expand All @@ -65,6 +71,12 @@ public static RandomizedPcaTrainer RandomizedPca(this AnomalyDetectionCatalog.An
/// </summary>
/// <param name="catalog">The anomaly detection catalog trainer object.</param>
/// <param name="options">Advanced options to the algorithm.</param>
/// <example>
/// <format type="text/markdown">
/// <![CDATA[
/// [!code-csharp[RPCA](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/AnomalyDetection/RandomizedPcaSampleWithOptions.cs)]
/// ]]></format>
/// </example>
public static RandomizedPcaTrainer RandomizedPca(this AnomalyDetectionCatalog.AnomalyDetectionTrainers catalog, Options options)
{
Contracts.CheckValue(catalog, nameof(catalog));
Expand Down
92 changes: 92 additions & 0 deletions test/Microsoft.ML.Tests/AnomalyDetectionTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
// See the LICENSE file in the project root for more information.

using System;
using System.Collections.Generic;
using System.Linq;
using Microsoft.Data.DataView;
using Microsoft.ML.Data;
using Microsoft.ML.RunTests;
Expand Down Expand Up @@ -48,6 +50,96 @@ public void NoAnomalyTest()
Assert.Throws<ArgumentOutOfRangeException>(() => ML.AnomalyDetection.Evaluate(transformedData));
}

[Fact]
public static void RandomizedPcaInMemory()
{
    // The context serves as the catalog of ML.NET operations and as the source of randomness.
    // A fixed seed is used so that the outputs checked by this test are deterministic.
    MLContext mlContext = new MLContext(seed: 0);

    // First detector: randomized PCA configured through the catalog's named arguments.
    Trainers.RandomizedPcaTrainer argumentsTrainer = mlContext.AnomalyDetection.Trainers.RandomizedPca(
        featureColumnName: nameof(DataPoint.Features),
        rank: 1,
        center: false);

    // Train the first detector and verify its predictions.
    ExecutePipelineWithGivenRandomizedPcaTrainer(mlContext, argumentsTrainer);

    // Second detector: the same feature column, rank and centering, expressed through an options object.
    Trainers.RandomizedPcaTrainer.Options trainerOptions = new Trainers.RandomizedPcaTrainer.Options()
    {
        FeatureColumnName = nameof(DataPoint.Features),
        Rank = 1,
        Center = false
    };
    Trainers.RandomizedPcaTrainer optionsTrainer = mlContext.AnomalyDetection.Trainers.RandomizedPca(trainerOptions);

    // Train the second detector and verify its predictions.
    ExecutePipelineWithGivenRandomizedPcaTrainer(mlContext, optionsTrainer);
}

/// <summary>
/// Example with 3 feature values used in <see cref="ExecutePipelineWithGivenRandomizedPcaTrainer"/>.
/// A training data set is a collection of such examples.
/// </summary>
private class DataPoint
{
/// <summary>
/// The feature vector of this example; the attribute declares it as a vector of 3 values.
/// </summary>
[VectorType(3)]
public float[] Features { get; set; }
}

/// <summary>
/// Class used to capture prediction of <see cref="DataPoint"/> in <see cref="ExecutePipelineWithGivenRandomizedPcaTrainer"/>.
/// </summary>
private class Result
{
/// <summary>
/// The predicted label: an outlier gets false while an inlier gets true.
/// </summary>
public bool PredictedLabel { get; set; }
/// <summary>
/// The prediction score: an outlier gets a smaller score than an inlier.
/// </summary>
public float Score { get; set; }
}

/// <summary>
/// Helper function used to execute trainers defined in <see cref="RandomizedPcaInMemory"/>.
/// It fits <paramref name="trainer"/> on a small in-memory data set whose last example is an
/// outlier, applies the trained model to the same data, and asserts the predicted labels and scores.
/// </summary>
/// <param name="mlContext">The context used to load the data and to read the predictions back.</param>
/// <param name="trainer">The randomized PCA trainer to fit and verify.</param>
private static void ExecutePipelineWithGivenRandomizedPcaTrainer(MLContext mlContext, Trainers.RandomizedPcaTrainer trainer)
{
    // Inliers are expected to score in [threshold, 1] and the outlier in [0, threshold].
    const double inlierScoreThreshold = 0.3;

    // All examples but the last one are inliers; the last one is the outlier.
    var samples = new List<DataPoint>()
    {
        new DataPoint() { Features = new float[3] { 1, 0, 0 } },
        new DataPoint() { Features = new float[3] { 0, 2, 1 } },
        new DataPoint() { Features = new float[3] { 1, 2, 3 } },
        new DataPoint() { Features = new float[3] { 0, 1, 0 } },
        new DataPoint() { Features = new float[3] { 0, 2, 1 } },
        new DataPoint() { Features = new float[3] { -100, 50, -100 } }
    };

    // Convert the List<DataPoint> to IDataView, a consumable format to ML.NET functions.
    var data = mlContext.Data.LoadFromEnumerable(samples);

    // Train the anomaly detector.
    var model = trainer.Fit(data);

    // Apply the trained model on the training data.
    var transformed = model.Transform(data);

    // Read ML.NET predictions into IEnumerable<Result>.
    var results = mlContext.Data.CreateEnumerable<Result>(transformed, reuseRowObject: false).ToList();

    // One prediction per input example is required; checking it here turns a mismatch into
    // a clear assertion failure instead of an index-out-of-range exception below.
    Assert.Equal(samples.Count, results.Count);

    // Index of the outlier, i.e. the last example.
    int outlierIndex = samples.Count - 1;

    // All examples before the outlier are inliers.
    for (int i = 0; i < outlierIndex; ++i)
    {
        // Inlier should be predicted as true.
        Assert.True(results[i].PredictedLabel);
        // Higher score means closer to inlier.
        Assert.InRange(results[i].Score, inlierScoreThreshold, 1);
    }

    // Last example is outlier. Note that outlier should be predicted as false.
    Assert.False(results[outlierIndex].PredictedLabel);
    Assert.InRange(results[outlierIndex].Score, 0, inlierScoreThreshold);
}

private IDataView DetectAnomalyInMnistOneClass(string trainPath, string testPath)
{
var loader = ML.Data.CreateTextLoader(new[]
Expand Down

0 comments on commit 6e9023f

Please sign in to comment.