Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/Microsoft.Data.Analysis/DataFrame.IDataView.cs
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
namespace Microsoft.Data.Analysis
{
public partial class DataFrame : IDataView
{
{
// TODO: support shuffling
bool IDataView.CanShuffle => false;

Expand Down
7 changes: 7 additions & 0 deletions src/Microsoft.Data.Analysis/DataFrameColumn.cs
Original file line number Diff line number Diff line change
Expand Up @@ -247,6 +247,13 @@ public virtual DataFrameColumn Sort(bool ascending = true)
/// </param>
protected internal virtual void AddDataViewColumn(DataViewSchema.Builder builder)
{
    // The base implementation provides no behavior of its own and always throws.
    throw new NotImplementedException();
}

/// <summary>
/// Adds the value found at the current position of <paramref name="cursor"/> to this <see cref="DataFrameColumn"/>.
/// This base implementation does nothing but throw.
/// </summary>
/// <param name="cursor">The row cursor, positioned on the row whose value should be added</param>
/// <param name="schemaColumn">The <see cref="DataViewSchema.Column"/> of the <see cref="DataViewSchema"/> to read the value from</param>
/// <exception cref="NotImplementedException">Always thrown by this base implementation.</exception>
protected internal virtual void AddValueUsingCursor(DataViewRowCursor cursor, DataViewSchema.Column schemaColumn)
{
    throw new NotImplementedException();
}

/// <summary>
/// Clamps values beyond the specified thresholds
/// </summary>
Expand Down
115 changes: 115 additions & 0 deletions src/Microsoft.Data.Analysis/IDataView.Extension.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System;
using System.Collections.Generic;
using Microsoft.ML;
using Microsoft.ML.Data;

namespace Microsoft.Data.Analysis
Comment thread
pgovind marked this conversation as resolved.
Outdated
{
/// <summary>
/// Extension methods that copy the contents of an <see cref="IDataView"/> into a new <see cref="DataFrame"/>.
/// </summary>
public static class IDataViewExtensions
{
    /// <summary>
    /// Copies all non-hidden columns of <paramref name="dataView"/> into a new <see cref="DataFrame"/>.
    /// </summary>
    /// <param name="dataView">The data view to read from.</param>
    /// <param name="maxRows">Maximum number of rows to copy. A negative value (the default) copies every row.</param>
    /// <returns>A new <see cref="DataFrame"/> holding the copied values.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="dataView"/> is null.</exception>
    /// <exception cref="NotSupportedException">A copied column has a type with no matching <see cref="DataFrameColumn"/>.</exception>
    public static DataFrame ToDataFrame(this IDataView dataView, long maxRows = -1)
    {
        return ToDataFrame(dataView, maxRows, null);
    }

    /// <summary>
    /// Copies every row of the named columns of <paramref name="dataView"/> into a new <see cref="DataFrame"/>.
    /// </summary>
    /// <param name="dataView">The data view to read from.</param>
    /// <param name="selectColumns">Names of the columns to copy. Null or empty copies all non-hidden columns.</param>
    /// <returns>A new <see cref="DataFrame"/> holding the copied values.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="dataView"/> is null.</exception>
    /// <exception cref="NotSupportedException">A copied column has a type with no matching <see cref="DataFrameColumn"/>.</exception>
    public static DataFrame ToDataFrame(this IDataView dataView, params string[] selectColumns)
    {
        return ToDataFrame(dataView, -1, selectColumns);
    }

    /// <summary>
    /// Copies at most <paramref name="maxRows"/> rows of the selected columns of <paramref name="dataView"/>
    /// into a new <see cref="DataFrame"/>.
    /// </summary>
    /// <param name="dataView">The data view to read from.</param>
    /// <param name="maxRows">Maximum number of rows to copy. A negative value copies every row.</param>
    /// <param name="selectColumns">
    /// Names of the columns to copy. Null or empty copies all non-hidden columns.
    /// Names that match no non-hidden schema column are ignored.
    /// </param>
    /// <returns>A new <see cref="DataFrame"/> holding the copied values.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="dataView"/> is null.</exception>
    /// <exception cref="NotSupportedException">A copied column has a type with no matching <see cref="DataFrameColumn"/>.</exception>
    public static DataFrame ToDataFrame(this IDataView dataView, long maxRows, params string[] selectColumns)
    {
        if (dataView == null)
        {
            throw new ArgumentNullException(nameof(dataView));
        }

        DataViewSchema schema = dataView.Schema;
        List<DataFrameColumn> columns = new List<DataFrameColumn>(schema.Count);

        HashSet<string> selectColumnsSet = null;
        if (selectColumns != null && selectColumns.Length > 0)
        {
            selectColumnsSet = new HashSet<string>(selectColumns);
        }

        // Computed once: neither the row limit nor the reported row count depends on the column.
        long rowLimit = maxRows >= 0 ? maxRows : long.MaxValue;
        // When the view cannot report its row count, start empty and let the columns grow row by row.
        long initialLength = Math.Min(rowLimit, dataView.GetRowCount() ?? 0);

        List<DataViewSchema.Column> activeColumns = new List<DataViewSchema.Column>();
        foreach (DataViewSchema.Column column in schema)
        {
            if (column.IsHidden || (selectColumnsSet != null && !selectColumnsSet.Contains(column.Name)))
            {
                continue;
            }

            // activeColumns[i] and columns[i] always describe the same column.
            columns.Add(CreateColumn(column, initialLength));
            activeColumns.Add(column);
        }

        // The cursor is disposable; release it as soon as the copy is done.
        using (DataViewRowCursor cursor = dataView.GetRowCursor(activeColumns))
        {
            long rowsCopied = 0;
            // Stop at the row limit: without this check the rows past maxRows would be appended as well.
            while (rowsCopied < rowLimit && cursor.MoveNext())
            {
                for (int i = 0; i < activeColumns.Count; i++)
                {
                    columns[i].AddValueUsingCursor(cursor, activeColumns[i]);
                }

                rowsCopied++;
            }
        }

        // Build the frame only after the columns have reached their final length.
        return new DataFrame(columns);
    }

    /// <summary>
    /// Creates the <see cref="DataFrameColumn"/> that corresponds to the type of <paramref name="column"/>.
    /// </summary>
    /// <param name="column">The schema column to mirror.</param>
    /// <param name="length">The initial length of the created column.</param>
    /// <exception cref="NotSupportedException">The type of <paramref name="column"/> has no matching <see cref="DataFrameColumn"/>.</exception>
    private static DataFrameColumn CreateColumn(DataViewSchema.Column column, long length)
    {
        DataViewType type = column.Type;
        if (type == BooleanDataViewType.Instance)
        {
            return new BooleanDataFrameColumn(column.Name, length);
        }
        if (type == NumberDataViewType.Byte)
        {
            return new ByteDataFrameColumn(column.Name, length);
        }
        if (type == NumberDataViewType.Double)
        {
            return new DoubleDataFrameColumn(column.Name, length);
        }
        if (type == NumberDataViewType.Single)
        {
            return new SingleDataFrameColumn(column.Name, length);
        }
        if (type == NumberDataViewType.Int32)
        {
            return new Int32DataFrameColumn(column.Name, length);
        }
        if (type == NumberDataViewType.Int64)
        {
            return new Int64DataFrameColumn(column.Name, length);
        }
        if (type == NumberDataViewType.SByte)
        {
            return new SByteDataFrameColumn(column.Name, length);
        }
        if (type == NumberDataViewType.Int16)
        {
            return new Int16DataFrameColumn(column.Name, length);
        }
        if (type == NumberDataViewType.UInt32)
        {
            return new UInt32DataFrameColumn(column.Name, length);
        }
        if (type == NumberDataViewType.UInt64)
        {
            return new UInt64DataFrameColumn(column.Name, length);
        }
        if (type == NumberDataViewType.UInt16)
        {
            return new UInt16DataFrameColumn(column.Name, length);
        }
        if (type == TextDataViewType.Instance)
        {
            return new StringDataFrameColumn(column.Name, length);
        }

        throw new NotSupportedException($"Column '{column.Name}' has type '{type}', which cannot be converted to a DataFrameColumn.");
    }
}

}
25 changes: 25 additions & 0 deletions src/Microsoft.Data.Analysis/PrimitiveDataFrameColumn.cs
Original file line number Diff line number Diff line change
Expand Up @@ -775,5 +775,30 @@ private static ValueGetter<ushort> CreateCharValueGetterDelegate(DataViewRowCurs

// Builds a getter that exposes the decimal cell at the cursor's current position as a double,
// reporting a null cell as double.NaN.
private static ValueGetter<double> CreateDecimalValueGetterDelegate(DataViewRowCursor cursor, PrimitiveDataFrameColumn<decimal> column)
{
    return (ref double value) =>
    {
        decimal? cell = column[cursor.Position];
        value = cell.HasValue ? (double)cell.Value : double.NaN;
    };
}

// Getter reused across calls so it is not looked up again for every row.
private ValueGetter<T> getter = null;
// The cursor and schema column index the cached getter was created for.
private DataViewRowCursor getterCursor = null;
private int getterColumnIndex = -1;

/// <summary>
/// Reads the value of <paramref name="column"/> at the current position of <paramref name="cursor"/>
/// and stores it in this column at that same row index: an existing row is overwritten, and a row
/// exactly one past the end is appended.
/// </summary>
/// <param name="cursor">The row cursor which has the current position</param>
/// <param name="column">The <see cref="DataViewSchema.Column"/> to read the value from</param>
/// <exception cref="ArgumentNullException"><paramref name="cursor"/> is null.</exception>
/// <exception cref="IndexOutOfRangeException">The cursor position is more than one row past the end of this column.</exception>
protected internal override void AddValueUsingCursor(DataViewRowCursor cursor, DataViewSchema.Column column)
{
    if (cursor == null)
    {
        throw new ArgumentNullException(nameof(cursor));
    }

    // A getter is bound to the cursor and column it came from, so the cached one may only be
    // reused for that exact pair; otherwise a second cursor would silently read through the first.
    if (getter == null || !ReferenceEquals(getterCursor, cursor) || getterColumnIndex != column.Index)
    {
        getter = cursor.GetGetter<T>(column);
        getterCursor = cursor;
        getterColumnIndex = column.Index;
    }

    long row = cursor.Position;
    T value = default;
    getter(ref value);

    if (Length > row)
    {
        this[row] = value;
    }
    else if (Length == row)
    {
        Append(value);
    }
    else
    {
        throw new IndexOutOfRangeException($"Cursor position {row} is beyond the end of the column (length {Length}).");
    }
}
}
}
25 changes: 25 additions & 0 deletions src/Microsoft.Data.Analysis/StringDataFrameColumn.cs
Original file line number Diff line number Diff line change
Expand Up @@ -467,5 +467,30 @@ protected internal override Delegate GetDataViewGetter(DataViewRowCursor cursor)

// Builds a getter that exposes the string at the cursor's current position as ReadOnlyMemory<char>.
private ValueGetter<ReadOnlyMemory<char>> CreateValueGetterDelegate(DataViewRowCursor cursor)
{
    return (ref ReadOnlyMemory<char> value) =>
    {
        string text = this[cursor.Position];
        value = text.AsMemory();
    };
}

// Getter reused across calls so it is not looked up again for every row.
private ValueGetter<ReadOnlyMemory<char>> getter = null;
// The cursor and schema column index the cached getter was created for.
private DataViewRowCursor getterCursor = null;
private int getterColumnIndex = -1;

/// <summary>
/// Reads the text of <paramref name="schemaColumn"/> at the current position of <paramref name="cursor"/>
/// and stores it, converted to a string, in this column at that same row index: an existing row is
/// overwritten, and a row exactly one past the end is appended.
/// </summary>
/// <param name="cursor">The row cursor which has the current position</param>
/// <param name="schemaColumn">The <see cref="DataViewSchema.Column"/> to read the value from</param>
/// <exception cref="ArgumentNullException"><paramref name="cursor"/> is null.</exception>
/// <exception cref="IndexOutOfRangeException">The cursor position is more than one row past the end of this column.</exception>
protected internal override void AddValueUsingCursor(DataViewRowCursor cursor, DataViewSchema.Column schemaColumn)
{
    if (cursor == null)
    {
        throw new ArgumentNullException(nameof(cursor));
    }

    // A getter is bound to the cursor and column it came from, so the cached one may only be
    // reused for that exact pair; otherwise a second cursor would silently read through the first.
    if (getter == null || !ReferenceEquals(getterCursor, cursor) || getterColumnIndex != schemaColumn.Index)
    {
        getter = cursor.GetGetter<ReadOnlyMemory<char>>(schemaColumn);
        getterCursor = cursor;
        getterColumnIndex = schemaColumn.Index;
    }

    long row = cursor.Position;
    ReadOnlyMemory<char> value = default;
    getter(ref value);

    if (Length > row)
    {
        this[row] = value.ToString();
    }
    else if (Length == row)
    {
        Append(value.ToString());
    }
    else
    {
        throw new IndexOutOfRangeException($"Cursor position {row} is beyond the end of the column (length {Length}).");
    }
}
}
}
62 changes: 58 additions & 4 deletions test/Microsoft.Data.Analysis.Tests/DataFrameTests.IDataView.cs
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,12 @@

namespace Microsoft.Data.Analysis.Tests
{
public partial class DataFrameTests
public partial class DataFrameIDataViewTests
Comment thread
pgovind marked this conversation as resolved.
{
[Fact]
public void TestIDataView()
{
IDataView dataView = MakeDataFrameWithAllColumnTypes(10, withNulls: false);
IDataView dataView = DataFrameTests.MakeDataFrameWithAllColumnTypes(10, withNulls: false);

DataDebuggerPreview preview = dataView.Preview();
Assert.Equal(10, preview.RowView.Length);
Expand Down Expand Up @@ -85,7 +85,7 @@ public void TestIDataView()
[Fact]
public void TestIDataViewSchemaInvalidate()
{
DataFrame df = MakeDataFrameWithAllMutableColumnTypes(10, withNulls: false);
DataFrame df = DataFrameTests.MakeDataFrameWithAllMutableColumnTypes(10, withNulls: false);

IDataView dataView = df;

Expand Down Expand Up @@ -113,7 +113,7 @@ public void TestIDataViewSchemaInvalidate()
public void TestIDataViewWithNulls()
{
int length = 10;
IDataView dataView = MakeDataFrameWithAllColumnTypes(length, withNulls: true);
IDataView dataView = DataFrameTests.MakeDataFrameWithAllColumnTypes(length, withNulls: true);

DataDebuggerPreview preview = dataView.Preview();
Assert.Equal(length, preview.RowView.Length);
Expand Down Expand Up @@ -224,5 +224,59 @@ public void TestIDataViewWithNulls()
Assert.Equal("", preview.ColumnView[14].Values[5].ToString()); // null row
Assert.Equal("foo", preview.ColumnView[14].Values[6].ToString());
}

[Fact]
public void TestDataFrameFromIDataView()
{
    DataFrame source = DataFrameTests.MakeDataFrameWithAllColumnTypes(10, withNulls: false);
    // IDataView hands chars back as uint16, so the round trip would compare a CharDataFrameColumn
    // against a UInt16DataFrameColumn and fail the asserts below.
    source.Columns.Remove("Char");
    IDataView view = source;

    DataFrame roundTripped = view.ToDataFrame();

    Assert.Equal(view.GetRowCount(), roundTripped.Rows.Count);
    Assert.Equal(view.Schema.Count, roundTripped.Columns.Count);
    for (int columnIndex = 0; columnIndex < source.Columns.Count; columnIndex++)
    {
        DataFrameColumn expected = source.Columns[columnIndex];
        DataFrameColumn actual = roundTripped.Columns[columnIndex];
        Assert.True(expected.ElementwiseEquals(actual).All());
    }
}

[Fact]
public void TestDataFrameFromIDataView_SelectColumns()
{
    DataFrame source = DataFrameTests.MakeDataFrameWithAllColumnTypes(10, withNulls: false);
    IDataView view = source;

    // Only the two named columns should be copied.
    DataFrame selected = view.ToDataFrame("Int", "Double");

    Assert.Equal(view.GetRowCount(), selected.Rows.Count);
    Assert.Equal(2, selected.Columns.Count);

    DataFrameColumn expectedInt = source.Columns["Int"];
    Assert.True(expectedInt.ElementwiseEquals(selected.Columns["Int"]).All());

    DataFrameColumn expectedDouble = source.Columns["Double"];
    Assert.True(expectedDouble.ElementwiseEquals(selected.Columns["Double"]).All());
}

[Fact]
public void TestDataFrameFromIDataView_SelectRows()
{
    DataFrame source = DataFrameTests.MakeDataFrameWithAllColumnTypes(10, withNulls: false);
    // IDataView hands chars back as uint16, so the round trip would compare a CharDataFrameColumn
    // against a UInt16DataFrameColumn and fail the asserts below.
    source.Columns.Remove("Char");
    IDataView view = source;

    // Ask for only the first five rows.
    DataFrame truncated = view.ToDataFrame(5);

    Assert.Equal(5, truncated.Rows.Count);
    Assert.Equal(view.Schema.Count, truncated.Columns.Count);
    for (int columnIndex = 0; columnIndex < source.Columns.Count; columnIndex++)
    {
        DataFrameColumn expected = source.Columns[columnIndex];
        DataFrameColumn actual = truncated.Columns[columnIndex];
        Assert.True(expected.ElementwiseEquals(actual).All());
    }
}

[Fact]
public void TestDataFrameFromIDataView_SelectColumnsAndRows()
{
    DataFrame source = DataFrameTests.MakeDataFrameWithAllColumnTypes(10, withNulls: false);
    IDataView view = source;

    // Ask for the first five rows of the two named columns.
    DataFrame subset = view.ToDataFrame(5, "Int", "Double");

    Assert.Equal(5, subset.Rows.Count);
    Assert.Equal(2, subset.Columns.Count);

    DataFrameColumn expectedInt = source.Columns["Int"];
    Assert.True(expectedInt.ElementwiseEquals(subset.Columns["Int"]).All());

    DataFrameColumn expectedDouble = source.Columns["Double"];
    Assert.True(expectedDouble.ElementwiseEquals(subset.Columns["Double"]).All());
}
}
}