Skip to content

Commit

Permalink
multicolumn mapping for some estimators
Browse files Browse the repository at this point in the history
  • Loading branch information
artidoro committed Mar 22, 2019
1 parent 49403ab commit 0896590
Show file tree
Hide file tree
Showing 7 changed files with 245 additions and 31 deletions.
73 changes: 73 additions & 0 deletions src/Microsoft.ML.Data/Transforms/ConversionsExtensionsCatalog.cs
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,20 @@ public static TypeConvertingEstimator ConvertType(this TransformsCatalog.Convers
DataKind outputKind = ConvertDefaults.DefaultOutputKind)
=> new TypeConvertingEstimator(CatalogUtils.GetEnvironment(catalog), new[] { new TypeConvertingEstimator.ColumnOptions(outputColumnName, outputKind, inputColumnName) });

/// <summary>
/// Changes the column type of each of the given input columns.
/// </summary>
/// <param name="catalog">The conversion transform's catalog.</param>
/// <param name="columns">The input/output column name pairs to convert. One set of column options is built per pair.</param>
/// <param name="outputKind">The expected kind of the output columns. The same kind is applied to every pair.</param>
public static TypeConvertingEstimator ConvertType(this TransformsCatalog.ConversionTransforms catalog,
    InputOutputColumnPair[] columns,
    DataKind outputKind = ConvertDefaults.DefaultOutputKind)
{
    // Every pair shares the single requested output kind.
    var perColumn =
        (from pair in columns
         select new TypeConvertingEstimator.ColumnOptions(pair.OutputColumnName, outputKind, pair.InputColumnName))
        .ToArray();

    return new TypeConvertingEstimator(CatalogUtils.GetEnvironment(catalog), perColumn);
}

/// <summary>
/// Changes column type of the input column.
/// </summary>
Expand All @@ -89,6 +103,20 @@ internal static TypeConvertingEstimator ConvertType(this TransformsCatalog.Conve
public static KeyToValueMappingEstimator MapKeyToValue(this TransformsCatalog.ConversionTransforms catalog, string outputColumnName, string inputColumnName = null)
{
    // Single-column overload: both names are handed to the estimator untouched,
    // together with the environment obtained from the catalog.
    var env = CatalogUtils.GetEnvironment(catalog);
    return new KeyToValueMappingEstimator(env, outputColumnName, inputColumnName);
}

/// <summary>
/// Converts key-typed columns back to their original values.
/// </summary>
/// <param name="catalog">The conversion transform's catalog.</param>
/// <param name="columns">The input/output column name pairs on which to apply the transformation.</param>
/// <example>
/// <format type="text/markdown">
/// <![CDATA[
/// [!code-csharp[KeyToValueMappingEstimator](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/ValueMappingStringToKeyType.cs)]
/// ]]></format>
/// </example>
public static KeyToValueMappingEstimator MapKeyToValue(this TransformsCatalog.ConversionTransforms catalog, InputOutputColumnPair[] columns)
{
    var env = CatalogUtils.GetEnvironment(catalog);

    // The estimator takes (output, input) name tuples, so each pair is unpacked in that order.
    var namePairs = columns.Select(pair => (pair.OutputColumnName, pair.InputColumnName)).ToArray();

    return new KeyToValueMappingEstimator(env, namePairs);
}

/// <summary>
/// Convert the key types (name of the column specified in the first item of the tuple) back to their original values
/// (named as specified in the second item of the tuple).
Expand Down Expand Up @@ -127,6 +155,21 @@ public static KeyToVectorMappingEstimator MapKeyToVector(this TransformsCatalog.
string outputColumnName, string inputColumnName = null, bool outputCountVector = KeyToVectorMappingEstimator.Defaults.OutputCountVector)
=> new KeyToVectorMappingEstimator(CatalogUtils.GetEnvironment(catalog), outputColumnName, inputColumnName, outputCountVector);

/// <summary>
/// Maps columns of key types or key values into columns of floating point vectors.
/// </summary>
/// <param name="catalog">The conversion transform's catalog.</param>
/// <param name="columns">The input/output column name pairs on which to apply the transformation.</param>
/// <param name="outputCountVector">Whether to combine multiple indicator vectors into a single vector of counts instead of concatenating them.
/// This is only relevant when the input column is a vector of keys. The same setting is applied to every pair.</param>
public static KeyToVectorMappingEstimator MapKeyToVector(this TransformsCatalog.ConversionTransforms catalog,
    InputOutputColumnPair[] columns, bool outputCountVector = KeyToVectorMappingEstimator.Defaults.OutputCountVector)
{
    // Builds the per-column options for one name pair, sharing the single outputCountVector setting.
    KeyToVectorMappingEstimator.ColumnOptions ToOptions(InputOutputColumnPair pair)
        => new KeyToVectorMappingEstimator.ColumnOptions(pair.OutputColumnName, pair.InputColumnName, outputCountVector);

    var perColumn = columns.Select(pair => ToOptions(pair)).ToArray();
    return new KeyToVectorMappingEstimator(CatalogUtils.GetEnvironment(catalog), perColumn);
}

/// <summary>
/// Converts value types into <see cref="KeyType"/>.
/// </summary>
Expand Down Expand Up @@ -157,6 +200,36 @@ public static ValueToKeyMappingEstimator MapValueToKey(this TransformsCatalog.Co
=> new ValueToKeyMappingEstimator(CatalogUtils.GetEnvironment(catalog),
new[] { new ValueToKeyMappingEstimator.ColumnOptions(outputColumnName, inputColumnName, maximumNumberOfKeys, keyOrdinality, addKeyValueAnnotationsAsText) }, keyData);

/// <summary>
/// Converts value types into <see cref="KeyType"/> for several columns at once.
/// </summary>
/// <param name="catalog">The conversion transform's catalog.</param>
/// <param name="columns">The input/output column name pairs on which to apply the transformation.</param>
/// <param name="maximumNumberOfKeys">Maximum number of keys to keep per column when auto-training.</param>
/// <param name="keyOrdinality">How items should be ordered when vectorized. If <see cref="ValueToKeyMappingEstimator.KeyOrdinality.ByOccurrence"/> is chosen, they will be in the order encountered.
/// If <see cref="ValueToKeyMappingEstimator.KeyOrdinality.ByValue"/>, items are sorted according to their default comparison, for example, text sorting will be case sensitive (for example, 'A' then 'Z' then 'a').</param>
/// <param name="addKeyValueAnnotationsAsText">Whether key value annotations should be text, regardless of the actual input type.</param>
/// <param name="keyData">The data view containing the terms. If specified, this should be a single column data
/// view, and the key-values will be taken from that column. If unspecified, the key-values will be determined
/// from the input data upon fitting.</param>
/// <example>
/// <format type="text/markdown">
/// <![CDATA[
/// [!code-csharp[ValueToKey](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/KeyToValueValueToKey.cs)]
/// ]]>
/// </format>
/// </example>
public static ValueToKeyMappingEstimator MapValueToKey(this TransformsCatalog.ConversionTransforms catalog,
    InputOutputColumnPair[] columns,
    int maximumNumberOfKeys = ValueToKeyMappingEstimator.Defaults.MaximumNumberOfKeys,
    ValueToKeyMappingEstimator.KeyOrdinality keyOrdinality = ValueToKeyMappingEstimator.Defaults.Ordinality,
    bool addKeyValueAnnotationsAsText = ValueToKeyMappingEstimator.Defaults.AddKeyValueAnnotationsAsText,
    IDataView keyData = null)
{
    // All pairs receive the same key-count limit, ordering and annotation settings.
    var perColumn = Enumerable.ToArray(
        Enumerable.Select(
            columns,
            pair => new ValueToKeyMappingEstimator.ColumnOptions(
                pair.OutputColumnName,
                pair.InputColumnName,
                maximumNumberOfKeys,
                keyOrdinality,
                addKeyValueAnnotationsAsText)));

    return new ValueToKeyMappingEstimator(CatalogUtils.GetEnvironment(catalog), perColumn, keyData);
}

/// <summary>
/// Converts value types into <see cref="KeyType"/>, optionally loading the keys to use from <paramref name="keyData"/>.
/// </summary>
Expand Down
26 changes: 26 additions & 0 deletions src/Microsoft.ML.Data/Transforms/ExtensionsCatalog.cs
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,32 @@

namespace Microsoft.ML
{
/// <summary>
/// Specifies input and output column names for a transformation.
/// </summary>
public sealed class InputOutputColumnPair
{
    /// <summary>
    /// Name of the column to transform. Never <see langword="null"/>: when no input name is supplied to the
    /// constructor, this holds the same value as <see cref="OutputColumnName"/>.
    /// </summary>
    public readonly string InputColumnName;
    /// <summary>
    /// Name of the column resulting from the transformation of <see cref="InputColumnName"/>.
    /// </summary>
    public readonly string OutputColumnName;

    /// <summary>
    /// Specifies input and output column names for a transformation.
    /// </summary>
    /// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>. Must not be <see langword="null"/> or empty.</param>
    /// <param name="inputColumnName">Name of the column to transform. If set to <see langword="null"/>, the value of the <paramref name="outputColumnName"/> will be used as source.</param>
    /// <exception cref="System.ArgumentException"><paramref name="outputColumnName"/> is <see langword="null"/> or empty.</exception>
    public InputOutputColumnPair(string outputColumnName, string inputColumnName = null)
    {
        if (string.IsNullOrEmpty(outputColumnName))
        {
            throw new System.ArgumentException("The output column name must be a non-empty string.", nameof(outputColumnName));
        }

        OutputColumnName = outputColumnName;
        // Resolve the documented fallback here so every consumer of the pair sees a usable input name
        // instead of having to re-implement the "null means same as output" rule.
        InputColumnName = inputColumnName ?? outputColumnName;
    }
}

/// <summary>
/// Specifies input and output column names for a transformation.
/// </summary>
Expand Down
54 changes: 54 additions & 0 deletions src/Microsoft.ML.Transforms/CategoricalCatalog.cs
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System.Linq;
using Microsoft.ML.Data;
using Microsoft.ML.Transforms;

Expand Down Expand Up @@ -40,6 +41,34 @@ public static OneHotEncodingEstimator OneHotEncoding(this TransformsCatalog.Cate
=> new OneHotEncodingEstimator(CatalogUtils.GetEnvironment(catalog),
new[] { new OneHotEncodingEstimator.ColumnOptions(outputColumnName, inputColumnName, outputKind, maximumNumberOfKeys, keyOrdinality) }, keyData);

/// <summary>
/// Converts several text columns into one-hot encoded vectors.
/// </summary>
/// <param name="catalog">The transform catalog.</param>
/// <param name="columns">The input/output column name pairs on which to apply the transformation.</param>
/// <param name="outputKind">Output kind: Bag (multi-set vector), Ind (indicator vector), Key (index), or Binary encoded indicator vector.</param>
/// <param name="maximumNumberOfKeys">Maximum number of terms to keep per column when auto-training.</param>
/// <param name="keyOrdinality">How items should be ordered when vectorized. If <see cref="ValueToKeyMappingEstimator.KeyOrdinality.ByOccurrence"/> is chosen, they will be in the order encountered.
/// If <see cref="ValueToKeyMappingEstimator.KeyOrdinality.ByValue"/>, items are sorted according to their default comparison, for example, text sorting will be case sensitive (for example, 'A' then 'Z' then 'a').</param>
/// <param name="keyData">Specifies an ordering for the encoding. If specified, this should be a single column data view,
/// and the key-values will be taken from that column. If unspecified, the ordering will be determined from the input data upon fitting.</param>
/// <example>
/// <format type="text/markdown">
/// <![CDATA[
/// [!code-csharp[OneHotEncoding](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Categorical/OneHotEncoding.cs)]
/// ]]></format>
/// </example>
public static OneHotEncodingEstimator OneHotEncoding(this TransformsCatalog.CategoricalTransforms catalog,
    InputOutputColumnPair[] columns,
    OneHotEncodingEstimator.OutputKind outputKind = OneHotEncodingEstimator.Defaults.OutKind,
    int maximumNumberOfKeys = ValueToKeyMappingEstimator.Defaults.MaximumNumberOfKeys,
    ValueToKeyMappingEstimator.KeyOrdinality keyOrdinality = ValueToKeyMappingEstimator.Defaults.Ordinality,
    IDataView keyData = null)
{
    // The encoding settings are shared; only the column names differ from pair to pair.
    var perColumn =
        (from pair in columns
         select new OneHotEncodingEstimator.ColumnOptions(
             pair.OutputColumnName, pair.InputColumnName, outputKind, maximumNumberOfKeys, keyOrdinality))
        .ToArray();

    return new OneHotEncodingEstimator(CatalogUtils.GetEnvironment(catalog), perColumn, keyData);
}

/// <summary>
/// Convert several text column into one-hot encoded vectors.
/// </summary>
Expand Down Expand Up @@ -88,6 +117,31 @@ public static OneHotHashEncodingEstimator OneHotHashEncoding(this TransformsCata
=> new OneHotHashEncodingEstimator(CatalogUtils.GetEnvironment(catalog),
new[] { new OneHotHashEncodingEstimator.ColumnOptions(outputColumnName, inputColumnName, outputKind, numberOfBits, seed, useOrderedHashing, maximumNumberOfInverts) });

/// <summary>
/// Converts several text columns into hash-based one-hot encoded vector columns.
/// </summary>
/// <param name="catalog">The transform catalog.</param>
/// <param name="columns">The input/output column name pairs on which to apply the transformation.</param>
/// <param name="outputKind">The conversion mode.</param>
/// <param name="numberOfBits">Number of bits to hash into. Must be between 1 and 30, inclusive.</param>
/// <param name="seed">Hashing seed.</param>
/// <param name="useOrderedHashing">Whether the position of each term should be included in the hash.</param>
/// <param name="maximumNumberOfInverts">During hashing we construct mappings between original values and the produced hash values.
/// Text representation of original values are stored in the slot names of the metadata for the new column. Hashing, as such, can map many initial values to one.
/// <paramref name="maximumNumberOfInverts"/> specifies the upper bound of the number of distinct input values mapping to a hash that should be retained.
/// <c>0</c> does not retain any input values. <c>-1</c> retains all input values mapping to each hash.</param>
public static OneHotHashEncodingEstimator OneHotHashEncoding(this TransformsCatalog.CategoricalTransforms catalog,
    InputOutputColumnPair[] columns,
    OneHotEncodingEstimator.OutputKind outputKind = OneHotEncodingEstimator.OutputKind.Indicator,
    int numberOfBits = OneHotHashEncodingEstimator.Defaults.NumberOfBits,
    uint seed = OneHotHashEncodingEstimator.Defaults.Seed,
    bool useOrderedHashing = OneHotHashEncodingEstimator.Defaults.UseOrderedHashing,
    int maximumNumberOfInverts = OneHotHashEncodingEstimator.Defaults.MaximumNumberOfInverts)
{
    // Project each name pair onto options that all carry the same hashing settings...
    var projected = columns.Select(pair => new OneHotHashEncodingEstimator.ColumnOptions(
        pair.OutputColumnName,
        pair.InputColumnName,
        outputKind,
        numberOfBits,
        seed,
        useOrderedHashing,
        maximumNumberOfInverts));

    // ...and materialise them, since the estimator is given an array.
    var perColumn = projected.ToArray();

    return new OneHotHashEncodingEstimator(CatalogUtils.GetEnvironment(catalog), perColumn);
}

/// <summary>
/// Convert several text column into hash-based one-hot encoded vectors.
/// </summary>
Expand Down
52 changes: 41 additions & 11 deletions src/Microsoft.ML.Transforms/ExtensionsCatalog.cs
Original file line number Diff line number Diff line change
Expand Up @@ -2,24 +2,14 @@
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System.Linq;
using Microsoft.ML.Data;
using Microsoft.ML.Transforms;

namespace Microsoft.ML
{
public static class ExtensionsCatalog
{
/// <summary>
/// Creates, for each input column, a new boolean output column with the same number of slots as the input column.
/// The value in the output column is true if the value in the input column is missing.
/// </summary>
/// <param name="catalog">The transform extensions' catalog.</param>
/// <param name="columns">The names of the input columns of the transformation and the corresponding names for the output columns.</param>
[BestFriend]
internal static MissingValueIndicatorEstimator IndicateMissingValues(this TransformsCatalog catalog,
    params ColumnOptions[] columns)
{
    var env = CatalogUtils.GetEnvironment(catalog);

    // The estimator is given the column options in their value-tuple form.
    var namePairs = ColumnOptions.ConvertToValueTuples(columns);

    return new MissingValueIndicatorEstimator(env, namePairs);
}

/// <summary>
/// Creates a new output column, or replaces the source with a new column
/// (depending on whether the <paramref name="inputColumnName"/> is given a value, or left to null)
Expand All @@ -41,6 +31,21 @@ public static MissingValueIndicatorEstimator IndicateMissingValues(this Transfor
string inputColumnName = null)
=> new MissingValueIndicatorEstimator(CatalogUtils.GetEnvironment(catalog), outputColumnName, inputColumnName);

/// <summary>
/// Creates, for each input column, a new boolean output column with the same number of slots as the input column.
/// The value in the output column is true if the value in the input column is missing.
/// </summary>
/// <param name="catalog">The transform extensions' catalog.</param>
/// <param name="columns">The input/output column name pairs on which to apply the transformation.</param>
/// <example>
/// <format type="text/markdown">
/// <![CDATA[
/// [!code-csharp[IndicateMissingValues](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/IndicateMissingValues.cs)]
/// ]]></format>
/// </example>
public static MissingValueIndicatorEstimator IndicateMissingValues(this TransformsCatalog catalog, InputOutputColumnPair[] columns)
{
    var env = CatalogUtils.GetEnvironment(catalog);

    // The estimator takes (output, input) name tuples, so each pair is unpacked in that order.
    var namePairs = columns.Select(pair => (pair.OutputColumnName, pair.InputColumnName)).ToArray();

    return new MissingValueIndicatorEstimator(env, namePairs);
}

/// <summary>
/// Creates a new output column, or replaces the source with a new column
/// (depending on whether the <paramref name="outputColumnName"/> is given a value, or left to null)
Expand Down Expand Up @@ -69,6 +74,31 @@ public static MissingValueReplacingEstimator ReplaceMissingValues(this Transform
bool imputeBySlot = MissingValueReplacingEstimator.Defaults.ImputeBySlot)
=> new MissingValueReplacingEstimator(CatalogUtils.GetEnvironment(catalog), new[] { new MissingValueReplacingEstimator.ColumnOptions(outputColumnName, inputColumnName, replacementMode, imputeBySlot) });

/// <summary>
/// Creates, for each input column, a new output column that is identical to the input column for everything but the missing values.
/// The missing values are replaced as selected by <paramref name="replacementMode"/>.
/// </summary>
/// <param name="catalog">The transform extensions' catalog.</param>
/// <param name="columns">The input/output column name pairs on which to apply the transformation.</param>
/// <param name="replacementMode">The type of replacement to use as specified in <see cref="MissingValueReplacingEstimator.ReplacementMode"/>.</param>
/// <param name="imputeBySlot">If true, per-slot imputation of replacement is performed.
/// Otherwise, replacement value is imputed for the entire vector column. This setting is ignored for scalars and variable vectors,
/// where imputation is always for the entire column.</param>
/// <example>
/// <format type="text/markdown">
/// <![CDATA[
/// [!code-csharp[ReplaceMissingValues](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/ReplaceMissingValues.cs)]
/// ]]></format>
/// </example>
public static MissingValueReplacingEstimator ReplaceMissingValues(this TransformsCatalog catalog,
    InputOutputColumnPair[] columns,
    MissingValueReplacingEstimator.ReplacementMode replacementMode = MissingValueReplacingEstimator.Defaults.Mode,
    bool imputeBySlot = MissingValueReplacingEstimator.Defaults.ImputeBySlot)
{
    // Builds the per-column options for one name pair; mode and slot handling are shared by all pairs.
    MissingValueReplacingEstimator.ColumnOptions ToOptions(InputOutputColumnPair pair)
        => new MissingValueReplacingEstimator.ColumnOptions(pair.OutputColumnName, pair.InputColumnName, replacementMode, imputeBySlot);

    var perColumn = columns.Select(pair => ToOptions(pair)).ToArray();
    return new MissingValueReplacingEstimator(CatalogUtils.GetEnvironment(catalog), perColumn);
}

/// <summary>
/// Creates a new output column, identical to the input column for everything but the missing values.
/// The missing values of the input column, in this new column are replaced with <see cref="MissingValueReplacingEstimator.ReplacementMode.DefaultValue"/>.
Expand Down
Loading

0 comments on commit 0896590

Please sign in to comment.