Skip to content

Commit 4088620

Browse files
Addresses #3976 about using PFI with a model loaded from disk (#4262)
* Address the issue of using PFI with a model loaded from disk. * Provided working tests and samples for using PFI with a model loaded from disk for the cases of Ranking, Regression, and Multiclass prediction transformers. No tests or samples provided for Binary classification, for reasons that will be addressed in a future issue. * Also modified LbfgsTests so that it uses the appropriate casts now that the PredictionTransformers have been updated.
1 parent 718a238 commit 4088620

File tree

7 files changed

+756
-49
lines changed

7 files changed

+756
-49
lines changed
Lines changed: 138 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,138 @@
1+
using System;
2+
using System.Collections.Generic;
3+
using System.Linq;
4+
using Microsoft.ML;
5+
using Microsoft.ML.Calibrators;
6+
using Microsoft.ML.Data;
7+
using Microsoft.ML.Trainers;
8+
9+
namespace Samples.Dynamic.Trainers.MulticlassClassification
{
    /// <summary>
    /// Sample showing how to run Permutation Feature Importance (PFI) on a
    /// multiclass model that was saved to disk and loaded back, rather than on
    /// the in-memory transformer returned by Fit.
    /// </summary>
    public static class PermutationFeatureImportanceLoadFromDisk
    {
        public static void Example()
        {
            // Create a new context for ML.NET operations. It can be used for
            // exception tracking and logging, as a catalog of available
            // operations and as the source of randomness. Fixing the seed
            // makes the sample output reproducible.
            var mlContext = new MLContext(seed: 1);

            // Create sample data.
            var samples = GenerateData();

            // Load the sample data as an IDataView.
            var data = mlContext.Data.LoadFromEnumerable(samples);

            // Define a training pipeline that concatenates features into a
            // vector, normalizes them, and then trains a linear model.
            var featureColumns =
                new string[] { nameof(Data.Feature1), nameof(Data.Feature2) };

            var pipeline = mlContext.Transforms
                .Concatenate("Features", featureColumns)
                .Append(mlContext.Transforms.Conversion.MapValueToKey("Label"))
                .Append(mlContext.Transforms.NormalizeMinMax("Features"))
                .Append(mlContext.MulticlassClassification.Trainers
                    .SdcaMaximumEntropy());

            // Fit the pipeline to the data and save the model to disk.
            var model0 = pipeline.Fit(data);
            var modelPath = "./model0.zip";
            mlContext.Model.Save(model0, data.Schema, modelPath);

            // Load the model back from disk.
            var model = mlContext.Model.Load(modelPath, out var schema);

            // Transform the dataset.
            var transformedData = model.Transform(data);

            // Extract the predictor. The original sample chained two `as`
            // casts with no null check, so a shape mismatch in the loaded
            // model would surface later as a confusing NullReferenceException
            // inside PermutationFeatureImportance; fail fast with a clear
            // message instead.
            var chain = model as TransformerChain<ITransformer>;
            var linearPredictor = chain?.LastTransformer as
                MulticlassPredictionTransformer<MaximumEntropyModelParameters>;
            if (linearPredictor == null)
            {
                throw new InvalidOperationException(
                    "The loaded model does not end in the expected " +
                    "multiclass prediction transformer.");
            }

            // Compute the permutation metrics for the linear model using the
            // normalized data.
            var permutationMetrics = mlContext.MulticlassClassification
                .PermutationFeatureImportance(linearPredictor, transformedData,
                permutationCount: 30);

            // Now let's look at which features are most important to the model
            // overall. Get the feature indices sorted by their impact on
            // microaccuracy.
            var sortedIndices = permutationMetrics
                .Select((metrics, index) => new { index, metrics.MicroAccuracy })
                .OrderByDescending(feature => Math.Abs(feature.MicroAccuracy.Mean))
                .Select(feature => feature.index);

            Console.WriteLine("Feature\tChange in MicroAccuracy\t95% Confidence in "
                + "the Mean Change in MicroAccuracy");

            var microAccuracy = permutationMetrics.Select(x => x.MicroAccuracy)
                .ToArray();

            foreach (int i in sortedIndices)
            {
                Console.WriteLine("{0}\t{1:G4}\t{2:G4}",
                    featureColumns[i],
                    microAccuracy[i].Mean,
                    1.96 * microAccuracy[i].StandardError);
            }

            // Expected output:
            //Feature Change in MicroAccuracy 95% Confidence in the Mean Change in MicroAccuracy
            //Feature2 -0.1396 0.0008036
            //Feature1 -0.05421 0.0006154
        }

        // Simple three-class data point. Label is derived in GenerateData from
        // a noisy linear combination of the two features.
        private class Data
        {
            public float Label { get; set; }

            public float Feature1 { get; set; }

            public float Feature2 { get; set; }
        }

        /// <summary>
        /// Generate an enumerable of Data objects, creating the label as a simple
        /// linear combination of the features, bucketed into three classes.
        /// </summary>
        /// <param name="nExamples">The number of examples.</param>
        /// <param name="bias">The bias, or offset, in the calculation of the
        /// label.</param>
        /// <param name="weight1">The weight to multiply the first feature with to
        /// compute the label.</param>
        /// <param name="weight2">The weight to multiply the second feature with to
        /// compute the label.</param>
        /// <param name="seed">The seed for generating feature values and label
        /// noise.</param>
        /// <returns>An enumerable of Data objects.</returns>
        private static IEnumerable<Data> GenerateData(int nExamples = 10000,
            double bias = 0, double weight1 = 1, double weight2 = 2, int seed = 1)
        {
            var rng = new Random(seed);
            // Upper bound of the noiseless label; used to split the label range
            // into three equal-width class buckets.
            var max = bias + 4.5 * weight1 + 4.5 * weight2 + 0.5;
            for (int i = 0; i < nExamples; i++)
            {
                var data = new Data
                {
                    Feature1 = (float)(rng.Next(10) * (rng.NextDouble() - 0.5)),
                    Feature2 = (float)(rng.Next(10) * (rng.NextDouble() - 0.5)),
                };

                // Create a noisy label.
                var value = (float)
                    (bias + weight1 * data.Feature1 + weight2 * data.Feature2 +
                    rng.NextDouble() - 0.5);

                if (value < max / 3)
                    data.Label = 0;
                else if (value < 2 * max / 3)
                    data.Label = 1;
                else
                    data.Label = 2;
                yield return data;
            }
        }
    }
}
Lines changed: 144 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,144 @@
1+
using System;
2+
using System.Collections.Generic;
3+
using System.Linq;
4+
using Microsoft.ML;
5+
using Microsoft.ML.Data;
6+
using Microsoft.ML.Trainers.FastTree;
7+
8+
namespace Samples.Dynamic.Trainers.Ranking
{
    /// <summary>
    /// Sample showing how to run Permutation Feature Importance (PFI) on a
    /// ranking model that was saved to disk and loaded back, rather than on
    /// the in-memory transformer returned by Fit.
    /// </summary>
    public static class PermutationFeatureImportanceLoadFromDisk
    {
        public static void Example()
        {
            // Create a new context for ML.NET operations. It can be used for
            // exception tracking and logging, as a catalog of available
            // operations and as the source of randomness. Fixing the seed
            // makes the sample output reproducible.
            var mlContext = new MLContext(seed: 1);

            // Create sample data.
            var samples = GenerateData();

            // Load the sample data as an IDataView.
            var data = mlContext.Data.LoadFromEnumerable(samples);

            // Define a training pipeline that concatenates features into a
            // vector, normalizes them, and then trains a FastTree ranker.
            var featureColumns = new string[] { nameof(Data.Feature1), nameof(
                Data.Feature2) };
            var pipeline = mlContext.Transforms.Concatenate("Features",
                featureColumns)
                .Append(mlContext.Transforms.Conversion.MapValueToKey("Label"))
                .Append(mlContext.Transforms.Conversion.MapValueToKey(
                    "GroupId"))
                .Append(mlContext.Transforms.NormalizeMinMax("Features"))
                .Append(mlContext.Ranking.Trainers.FastTree());

            // Train the model and save to disk.
            var model0 = pipeline.Fit(data);
            var modelPath = "./model0.zip";
            mlContext.Model.Save(model0, data.Schema, modelPath);

            // Load the model back from disk.
            var model = mlContext.Model.Load(modelPath, out var schema);

            // Transform the data.
            var transformedData = model.Transform(data);

            // Extract the predictor. The original sample chained two `as`
            // casts with no null check, so a shape mismatch in the loaded
            // model would surface later as a confusing NullReferenceException
            // inside PermutationFeatureImportance; fail fast with a clear
            // message instead.
            var chain = model as TransformerChain<ITransformer>;
            var linearPredictor = chain?.LastTransformer as
                RankingPredictionTransformer<FastTreeRankingModelParameters>;
            if (linearPredictor == null)
            {
                throw new InvalidOperationException(
                    "The loaded model does not end in the expected " +
                    "ranking prediction transformer.");
            }

            // Compute the permutation metrics for the model using the
            // normalized data.
            var permutationMetrics = mlContext.Ranking.PermutationFeatureImportance(
                linearPredictor, transformedData, permutationCount: 30);

            // Now let's look at which features are most important to the model
            // overall. Get the feature indices sorted by their impact on NDCG@1.
            var sortedIndices = permutationMetrics.Select((metrics, index) => new {
                index,
                metrics.NormalizedDiscountedCumulativeGains
            })
                .OrderByDescending(feature => Math.Abs(
                    feature.NormalizedDiscountedCumulativeGains[0].Mean))

                .Select(feature => feature.index);

            // Note: the original concatenated "the" + "Mean", printing
            // "theMean"; the expected-output comment below shows the intended
            // text, so the missing space is restored here.
            Console.WriteLine("Feature\tChange in NDCG@1\t95% Confidence in the " +
                "Mean Change in NDCG@1");
            var ndcg = permutationMetrics.Select(
                x => x.NormalizedDiscountedCumulativeGains).ToArray();
            foreach (int i in sortedIndices)
            {
                Console.WriteLine("{0}\t{1:G4}\t{2:G4}",
                    featureColumns[i],
                    ndcg[i][0].Mean,
                    1.96 * ndcg[i][0].StandardError);
            }

            // Expected output:
            // Feature Change in NDCG@1 95% Confidence in the Mean Change in NDCG@1
            // Feature2 -0.2432 0.001762
            // Feature1 -0.05235 0.001116
        }

        // Ranking data point: Label is derived from a noisy linear combination
        // of the features, and GroupId buckets consecutive rows into query
        // groups of size groupSize.
        private class Data
        {
            public float Label { get; set; }

            public int GroupId { get; set; }

            public float Feature1 { get; set; }

            public float Feature2 { get; set; }
        }

        /// <summary>
        /// Generate an enumerable of Data objects, creating the label as a simple
        /// linear combination of the features, bucketed into three relevance
        /// grades, with consecutive rows assigned to query groups.
        /// </summary>
        ///
        /// <param name="nExamples">The number of examples.</param>
        ///
        /// <param name="bias">The bias, or offset, in the calculation of the label.
        /// </param>
        ///
        /// <param name="weight1">The weight to multiply the first feature with to
        /// compute the label.</param>
        ///
        /// <param name="weight2">The weight to multiply the second feature with to
        /// compute the label.</param>
        ///
        /// <param name="seed">The seed for generating feature values and label
        /// noise.</param>
        ///
        /// <param name="groupSize">The number of consecutive examples that share
        /// one GroupId (query group).</param>
        ///
        /// <returns>An enumerable of Data objects.</returns>
        private static IEnumerable<Data> GenerateData(int nExamples = 10000,
            double bias = 0, double weight1 = 1, double weight2 = 2, int seed = 1,
            int groupSize = 5)
        {
            var rng = new Random(seed);
            // Upper bound of the noiseless label; used to split the label range
            // into three equal-width relevance buckets.
            var max = bias + 4.5 * weight1 + 4.5 * weight2 + 0.5;
            for (int i = 0; i < nExamples; i++)
            {
                var data = new Data
                {
                    GroupId = i / groupSize,
                    Feature1 = (float)(rng.Next(10) * (rng.NextDouble() - 0.5)),
                    Feature2 = (float)(rng.Next(10) * (rng.NextDouble() - 0.5)),
                };

                // Create a noisy label.
                var value = (float)(bias + weight1 * data.Feature1 + weight2 *
                    data.Feature2 + rng.NextDouble() - 0.5);
                if (value < max / 3)
                    data.Label = 0;
                else if (value < 2 * max / 3)
                    data.Label = 1;
                else
                    data.Label = 2;
                yield return data;
            }
        }
    }
}

0 commit comments

Comments
 (0)