Skip to content

Commit

Permalink
Remove unused param
Browse files Browse the repository at this point in the history
Docs
maxRows
More unit tests
Fixed ArrowStringDataFrameColumn construction in the unit test
  • Loading branch information
Prashanth Govindarajan committed Mar 19, 2021
1 parent 1ce802b commit 1203495
Show file tree
Hide file tree
Showing 7 changed files with 87 additions and 64 deletions.
29 changes: 8 additions & 21 deletions src/Microsoft.Data.Analysis/DataFrame.IDataView.cs
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ public partial class DataFrame : IDataView
bool IDataView.CanShuffle => false;

private DataViewSchema _schema;
internal DataViewSchema DataViewSchema
private DataViewSchema DataViewSchema
{
get
{
Expand Down Expand Up @@ -70,29 +70,22 @@ private sealed class RowCursor : DataViewRowCursor
private bool _disposed;
private long _position;
private readonly DataFrame _dataFrame;
private readonly List<Delegate> _getters;
private Dictionary<int, int> _columnIndexToGetterIndex;
private readonly Delegate[] _getters;

public RowCursor(DataFrame dataFrame, bool[] activeColumns)
{
Debug.Assert(dataFrame != null);
Debug.Assert(activeColumns != null);

_columnIndexToGetterIndex = new Dictionary<int, int>();
_position = -1;
_dataFrame = dataFrame;
_getters = new List<Delegate>();
for (int i = 0; i < Schema.Count; i++)
_getters = new Delegate[Schema.Count];
for (int i = 0; i < _getters.Length; i++)
{
if (!activeColumns[i])
{
continue;
}

Delegate getter = CreateGetterDelegate(i);
_getters.Add(getter);
Debug.Assert(getter != null);
_columnIndexToGetterIndex[i] = _getters.Count - 1;
_getters[i] = CreateGetterDelegate(i);
Debug.Assert(_getters[i] != null);
}
}

Expand All @@ -103,15 +96,11 @@ public RowCursor(DataFrame dataFrame, bool[] activeColumns)
protected override void Dispose(bool disposing)
{
if (_disposed)
{
return;
}

if (disposing)
{
_position = -1;
}

_disposed = true;
base.Dispose(disposing);
}
Expand All @@ -127,7 +116,7 @@ public override ValueGetter<TValue> GetGetter<TValue>(DataViewSchema.Column colu
if (!IsColumnActive(column))
throw new ArgumentOutOfRangeException(nameof(column));

return (ValueGetter<TValue>)_getters[_columnIndexToGetterIndex[column.Index]];
return (ValueGetter<TValue>)_getters[column.Index];
}

public override ValueGetter<DataViewRowId> GetIdGetter()
Expand All @@ -137,15 +126,13 @@ public override ValueGetter<DataViewRowId> GetIdGetter()

public override bool IsColumnActive(DataViewSchema.Column column)
{
return _getters[_columnIndexToGetterIndex[column.Index]] != null;
return _getters[column.Index] != null;
}

public override bool MoveNext()
{
if (_disposed)
{
return false;
}
_position++;
return _position < _dataFrame.Rows.Count;
}
Expand Down
5 changes: 2 additions & 3 deletions src/Microsoft.Data.Analysis/DataFrameColumn.cs
Original file line number Diff line number Diff line change
Expand Up @@ -251,15 +251,14 @@ public virtual DataFrameColumn Sort(bool ascending = true)
/// Appends a value to this <see cref="DataFrameColumn"/> using <paramref name="cursor"/>
/// </summary>
/// <param name="cursor">The row cursor which has the current position</param>
/// <param name="schemaColumn">The <see cref="DataViewSchema.Column"/> in <see cref="DataViewSchema"/></param>
/// <param name="ValueGetter">The cached ValueGetter for this column.</param>
protected internal virtual void AddValueUsingCursor(DataViewRowCursor cursor, DataViewSchema.Column schemaColumn, Delegate ValueGetter) => throw new NotImplementedException();
protected internal virtual void AddValueUsingCursor(DataViewRowCursor cursor, Delegate ValueGetter) => throw new NotImplementedException();

/// <summary>
/// Returns the ValueGetter for <paramref name="schemaColumn"/> from <paramref name="cursor"/> as a delegate to be cached.
/// </summary>
/// <param name="cursor">The row cursor which has the current position</param>
/// <param name="schemaColumn">The <see cref="DataViewSchema.Column"/> in <see cref="DataViewSchema"/></param>
/// <param name="schemaColumn">The <see cref="DataViewSchema.Column"/> to return the ValueGetter for.</param>
protected internal virtual Delegate GetValueGetterUsingCursor(DataViewRowCursor cursor, DataViewSchema.Column schemaColumn) => throw new NotImplementedException();

/// <summary>
Expand Down
72 changes: 45 additions & 27 deletions src/Microsoft.Data.Analysis/IDataView.Extension.cs
Original file line number Diff line number Diff line change
Expand Up @@ -13,113 +13,131 @@ public static class IDataViewExtensions
{
private const int defaultMaxRows = 100;

/// <summary>
/// Returns a <see cref="Microsoft.Data.Analysis.DataFrame"/> from this <paramref name="dataView"/>.
/// </summary>
/// <param name="dataView">The current <see cref="IDataView"/>.</param>
/// <param name="maxRows">The max number of rows in the <see cref="Microsoft.Data.Analysis.DataFrame"/>. Defaults to 100. Use -1 to construct a DataFrame using all the rows in <paramref name="dataView"/>.</param>
/// <returns>A <see cref="Microsoft.Data.Analysis.DataFrame"/> with up to <paramref name="maxRows"/> rows.</returns>
public static DataFrame ToDataFrame(this IDataView dataView, long maxRows = defaultMaxRows)
{
return ToDataFrame(dataView, maxRows, null);
}

/// <summary>
/// Returns a <see cref="Microsoft.Data.Analysis.DataFrame"/> with the first 100 rows of this <paramref name="dataView"/>.
/// </summary>
/// <param name="dataView">The current <see cref="IDataView"/>.</param>
/// <param name="selectColumns">The columns selected for the resultant DataFrame</param>
/// <returns>A <see cref="Microsoft.Data.Analysis.DataFrame"/> with the selected columns and 100 rows.</returns>
public static DataFrame ToDataFrame(this IDataView dataView, params string[] selectColumns)
{
return ToDataFrame(dataView, defaultMaxRows, selectColumns);
}

/// <summary>
/// Returns a <see cref="Microsoft.Data.Analysis.DataFrame"/> with the first <paramref name="maxRows"/> rows of this <paramref name="dataView"/>.
/// </summary>
/// <param name="dataView">The current <see cref="IDataView"/>.</param>
/// <param name="maxRows">The max number of rows in the <see cref="Microsoft.Data.Analysis.DataFrame"/>. Use -1 to construct a DataFrame using all the rows in <paramref name="dataView"/>.</param>
/// <param name="selectColumns">The columns selected for the resultant DataFrame</param>
/// <returns>A <see cref="Microsoft.Data.Analysis.DataFrame"/> with the selected columns and <paramref name="maxRows"/> rows.</returns>
public static DataFrame ToDataFrame(this IDataView dataView, long maxRows, params string[] selectColumns)
{
DataViewSchema schema = dataView.Schema;
List<DataFrameColumn> columns = new List<DataFrameColumn>(schema.Count);
List<DataFrameColumn> dataFrameColumns = new List<DataFrameColumn>(schema.Count);
maxRows = maxRows == -1 ? long.MaxValue : maxRows;

HashSet<string> selectColumnsSet = null;
if (selectColumns != null && selectColumns.Length > 0)
{
selectColumnsSet = new HashSet<string>(selectColumns);
}

List<DataViewSchema.Column> activeColumns = new List<DataViewSchema.Column>();
foreach (DataViewSchema.Column column in schema)
List<DataViewSchema.Column> activeDataViewColumns = new List<DataViewSchema.Column>();
foreach (DataViewSchema.Column dataViewColumn in schema)
{
if (column.IsHidden || (selectColumnsSet != null && !selectColumnsSet.Contains(column.Name)))
if (dataViewColumn.IsHidden || (selectColumnsSet != null && !selectColumnsSet.Contains(dataViewColumn.Name)))
{
continue;
}

activeColumns.Add(column);
DataViewType type = column.Type;
activeDataViewColumns.Add(dataViewColumn);
DataViewType type = dataViewColumn.Type;
if (type == BooleanDataViewType.Instance)
{
columns.Add(new BooleanDataFrameColumn(column.Name));
dataFrameColumns.Add(new BooleanDataFrameColumn(dataViewColumn.Name));
}
else if (type == NumberDataViewType.Byte)
{
columns.Add(new ByteDataFrameColumn(column.Name));
dataFrameColumns.Add(new ByteDataFrameColumn(dataViewColumn.Name));
}
else if (type == NumberDataViewType.Double)
{
columns.Add(new DoubleDataFrameColumn(column.Name));
dataFrameColumns.Add(new DoubleDataFrameColumn(dataViewColumn.Name));
}
else if (type == NumberDataViewType.Single)
{
columns.Add(new SingleDataFrameColumn(column.Name));
dataFrameColumns.Add(new SingleDataFrameColumn(dataViewColumn.Name));
}
else if (type == NumberDataViewType.Int32)
{
columns.Add(new Int32DataFrameColumn(column.Name));
dataFrameColumns.Add(new Int32DataFrameColumn(dataViewColumn.Name));
}
else if (type == NumberDataViewType.Int64)
{
columns.Add(new Int64DataFrameColumn(column.Name));
dataFrameColumns.Add(new Int64DataFrameColumn(dataViewColumn.Name));
}
else if (type == NumberDataViewType.SByte)
{
columns.Add(new SByteDataFrameColumn(column.Name));
dataFrameColumns.Add(new SByteDataFrameColumn(dataViewColumn.Name));
}
else if (type == NumberDataViewType.Int16)
{
columns.Add(new Int16DataFrameColumn(column.Name));
dataFrameColumns.Add(new Int16DataFrameColumn(dataViewColumn.Name));
}
else if (type == NumberDataViewType.UInt32)
{
columns.Add(new UInt32DataFrameColumn(column.Name));
dataFrameColumns.Add(new UInt32DataFrameColumn(dataViewColumn.Name));
}
else if (type == NumberDataViewType.UInt64)
{
columns.Add(new UInt64DataFrameColumn(column.Name));
dataFrameColumns.Add(new UInt64DataFrameColumn(dataViewColumn.Name));
}
else if (type == NumberDataViewType.UInt16)
{
columns.Add(new UInt16DataFrameColumn(column.Name));
dataFrameColumns.Add(new UInt16DataFrameColumn(dataViewColumn.Name));
}
else if (type == TextDataViewType.Instance)
{
columns.Add(new StringDataFrameColumn(column.Name));
dataFrameColumns.Add(new StringDataFrameColumn(dataViewColumn.Name));
}
else
{
throw new NotSupportedException(String.Format(Microsoft.Data.Strings.NotSupportedColumnType, type.RawType.Name));
}
}

using (DataViewRowCursor cursor = dataView.GetRowCursor(activeColumns))
using (DataViewRowCursor cursor = dataView.GetRowCursor(activeDataViewColumns))
{
Delegate[] activeColumnDelegates = new Delegate[activeColumns.Count];
Delegate[] activeColumnDelegates = new Delegate[activeDataViewColumns.Count];
int columnIndex = 0;
foreach (DataViewSchema.Column column in activeColumns)
foreach (DataViewSchema.Column activeDataViewColumn in activeDataViewColumns)
{
Delegate valueGetter = columns[columnIndex].GetValueGetterUsingCursor(cursor, column);
Delegate valueGetter = dataFrameColumns[columnIndex].GetValueGetterUsingCursor(cursor, activeDataViewColumn);
activeColumnDelegates[columnIndex] = valueGetter;
columnIndex++;
}
while (cursor.MoveNext() && cursor.Position < maxRows)
{
columnIndex = 0;
foreach (DataViewSchema.Column column in activeColumns)
for (int i = 0; i < activeColumnDelegates.Length; i++)
{
columns[columnIndex].AddValueUsingCursor(cursor, column, activeColumnDelegates[columnIndex]);
columnIndex++;
dataFrameColumns[i].AddValueUsingCursor(cursor, activeColumnDelegates[i]);
}
}
}

return new DataFrame(columns);
return new DataFrame(dataFrameColumns);
}
}

Expand Down
2 changes: 1 addition & 1 deletion src/Microsoft.Data.Analysis/PrimitiveDataFrameColumn.cs
Original file line number Diff line number Diff line change
Expand Up @@ -776,7 +776,7 @@ private static ValueGetter<ushort> CreateCharValueGetterDelegate(DataViewRowCurs
private static ValueGetter<double> CreateDecimalValueGetterDelegate(DataViewRowCursor cursor, PrimitiveDataFrameColumn<decimal> column) =>
(ref double value) => value = (double?)column[cursor.Position] ?? double.NaN;

protected internal override void AddValueUsingCursor(DataViewRowCursor cursor, DataViewSchema.Column column, Delegate getter)
protected internal override void AddValueUsingCursor(DataViewRowCursor cursor, Delegate getter)
{
long row = cursor.Position;
T value = default;
Expand Down
3 changes: 2 additions & 1 deletion src/Microsoft.Data.Analysis/StringDataFrameColumn.cs
Original file line number Diff line number Diff line change
Expand Up @@ -468,7 +468,7 @@ protected internal override Delegate GetDataViewGetter(DataViewRowCursor cursor)
private ValueGetter<ReadOnlyMemory<char>> CreateValueGetterDelegate(DataViewRowCursor cursor) =>
(ref ReadOnlyMemory<char> value) => value = this[cursor.Position].AsMemory();

protected internal override void AddValueUsingCursor(DataViewRowCursor cursor, DataViewSchema.Column schemaColumn, Delegate getter)
protected internal override void AddValueUsingCursor(DataViewRowCursor cursor, Delegate getter)
{
long row = cursor.Position;
ReadOnlyMemory<char> value = default;
Expand All @@ -489,6 +489,7 @@ protected internal override void AddValueUsingCursor(DataViewRowCursor cursor, D
throw new IndexOutOfRangeException(nameof(row));
}
}

protected internal override Delegate GetValueGetterUsingCursor(DataViewRowCursor cursor, DataViewSchema.Column schemaColumn)
{
return cursor.GetGetter<ReadOnlyMemory<char>>(schemaColumn);
Expand Down
30 changes: 23 additions & 7 deletions test/Microsoft.Data.Analysis.Tests/DataFrameIDataViewTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -252,25 +252,41 @@ public void TestDataFrameFromIDataView_SelectColumns()
Assert.True(df.Columns["Double"].ElementwiseEquals(newDf.Columns["Double"]).All());
}

[Fact]
public void TestDataFrameFromIDataView_SelectRows()
[Theory]
[InlineData(10, 5)]
[InlineData(110, 100)]
[InlineData(110, -1)]
public void TestDataFrameFromIDataView_SelectRows(int dataFrameSize, int rowSize)
{
DataFrame df = DataFrameTests.MakeDataFrameWithAllColumnTypes(10, withNulls: false);
DataFrame df = DataFrameTests.MakeDataFrameWithAllColumnTypes(dataFrameSize, withNulls: false);
df.Columns.Remove("Char"); // Because chars are returned as uint16 by DataViewSchema, so end up comparing CharDataFrameColumn to UInt16DataFrameColumn and fail asserts
df.Columns.Remove("Decimal"); // Because decimal is returned as double by DataViewSchema, so end up comparing DecimalDataFrameColumn to DoubleDataFrameColumn and fail asserts
IDataView dfAsIDataView = df;
DataFrame newDf = dfAsIDataView.ToDataFrame(5);
Assert.Equal(5, newDf.Rows.Count);
DataFrame newDf;
if (rowSize == 100)
{
// Test default
newDf = dfAsIDataView.ToDataFrame();
}
else
{
newDf = dfAsIDataView.ToDataFrame(rowSize);
}
if (rowSize == -1)
{
rowSize = dataFrameSize;
}
Assert.Equal(rowSize, newDf.Rows.Count);
Assert.Equal(df.Columns.Count, newDf.Columns.Count);
for (int i = 0; i < newDf.Columns.Count; i++)
{
Assert.Equal(5, newDf.Columns[i].Length);
Assert.Equal(rowSize, newDf.Columns[i].Length);
Assert.Equal(df.Columns[i].Name, newDf.Columns[i].Name);
}
Assert.Equal(dfAsIDataView.Schema.Count, newDf.Columns.Count);
for (int c = 0; c < df.Columns.Count; c++)
{
for (int r = 0; r < 5; r++)
for (int r = 0; r < rowSize; r++)
{
Assert.Equal(df.Columns[c][r], newDf.Columns[c][r]);
}
Expand Down
10 changes: 6 additions & 4 deletions test/Microsoft.Data.Analysis.Tests/DataFrameTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -62,10 +62,12 @@ public static ArrowStringDataFrameColumn CreateArrowStringColumn(int length, boo

// write the current length to (index + 1)
int offsetIndex = (i + 1) * 4;
offsetMemory[offsetIndex++] = (byte)(3 * validStringsIndex);
offsetMemory[offsetIndex++] = 0;
offsetMemory[offsetIndex++] = 0;
offsetMemory[offsetIndex++] = 0;
int offsetValue = 3 * validStringsIndex;
byte[] offsetValueBytes = BitConverter.GetBytes(offsetValue);
offsetMemory[offsetIndex++] = offsetValueBytes[0];
offsetMemory[offsetIndex++] = offsetValueBytes[1];
offsetMemory[offsetIndex++] = offsetValueBytes[2];
offsetMemory[offsetIndex++] = offsetValueBytes[3];
}

int nullCount = withNulls ? 1 : 0;
Expand Down

0 comments on commit 1203495

Please sign in to comment.