This repository was archived by the owner on May 3, 2025. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathSTANchap7.py
More file actions
119 lines (93 loc) · 4.05 KB
/
STANchap7.py
File metadata and controls
119 lines (93 loc) · 4.05 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
# -*- coding: utf-8 -*-
"""
src:
- https://www.kaggle.com/c/overfitting
- http://timsalimans.com/winning-the-dont-overfit-competition/ (dead link)
- https://web.archive.org/web/20190718145349/http://timsalimans.com/winning-the-dont-overfit-competition/
In order to achieve this we have created a simulated data set with 200 variables and 20000 cases.
An ‘equation’ based on this data was created in order to generate a Target to be predicted.
Given the all 20000 cases, the problem is very easy to solve - but you only get given the Target value of 250 cases - the task is to build a model that gives the best predictions on the remaining 19750 cases.
"""
import numpy as np, pandas as pd, arviz as az, prince, matplotlib.pyplot as plt, seaborn as sns
from cmdstanpy import CmdStanModel
#%% load data
data = pd.read_csv("data/overfitting.csv", index_col = "case_id")
data.columns
data.info()
feature_names = data.columns.str.startswith("var_")
predictors = data[data.columns[feature_names]]
labels = data["Target_Practice"]
ix_training = data.train == 1
training_data = predictors[ix_training]
training_labels = labels[ix_training]
ix_testing = data.train == 0
testing_data = predictors[ix_testing]
testing_labels = labels[ix_testing]
sns.displot(training_data.values.flatten(), bins = "sqrt", kde = True)
pca = prince.PCA(n_components = 2, as_array = False).fit(training_data)
pca.plot_row_coordinates(training_data, color_labels = training_labels)
pca.column_correlations(training_data).plot.scatter(x = 0, y = 1) # weird column name
#%% Roshan Sharma model
mdl_data = { # problem with JSON dump => cast to python native type
"N": ix_training.sum().tolist(),
"N2": ix_testing.sum().tolist(),
"K": feature_names.sum().tolist(),
"y": training_labels.values.tolist(),
"X": training_data.values.tolist(),
"new_X": testing_data.values.tolist(),
}
modelfile = "OverfittingRoshanSharma.stan"
with open(modelfile, "w") as file: file.write("""
data {
int N; // the number of training observations
int N2; // the number of test observations
int K; // the number of features
array[N] int y; // the response
matrix[N,K] X; // the model matrix
matrix[N2,K] new_X; // the matrix for the predicted values
}
parameters { // regression parameters
real alpha;
vector[K] bbeta; // `beta` is built-in distrib fx
}
transformed parameters {
vector[N] linpred = alpha + X * bbeta;
}
model {
alpha ~ cauchy(0, 10); // prior for the intercept following Gelman 2008
bbeta ~ student_t(1, 0, 0.03);
y ~ bernoulli_logit(linpred);
}
generated quantities { // y values predicted by the model
vector[N2] y_pred = alpha + new_X * bbeta;
}
""")
var_name_array = ["alpha"] + [f"bbeta[{i+1}]" for i in range(mdl_data["K"])]
var_name_combi = ["alpha", "bbeta"]
sm = CmdStanModel(stan_file = modelfile)
# maximum likelihood estimation
optim = sm.optimize(data = mdl_data).optimized_params_pd
optim[optim.columns[~optim.columns.str.startswith("lp")]]
plt.plot(optim[var_name_array[1:]].values[0])
# variational inference
vb = sm.variational(data = mdl_data)
vb.variational_sample.columns = vb.variational_params_dict.keys()
vb_name = vb.variational_params_pd.columns[~vb.variational_params_pd.columns.str.startswith(("lp", "log_"))]
vb.variational_params_pd[var_name_array]
vb.variational_sample[var_name_array]
# Markov chain Monte Carlo
fit = sm.sample(
data = mdl_data, show_progress = True, chains = 4,
iter_sampling = 50000, iter_warmup = 10000, thin = 5
)
fit.draws().shape # iterations, chains, parameters
fit.summary().loc[var_name_array] # pandas DataFrame
print(fit.diagnose())
# posterior = {k: fit_modif.stan_variable(k) for k in var_name_combi}
az_trace = az.from_cmdstanpy(fit)
az.summary(az_trace).loc[var_name_combi] # pandas DataFrame
az.plot_trace(az_trace, var_names = ["alpha"])
az.plot_forest(az_trace, var_names = ["bbeta"])
sample_pred = fit.stan_variable("y_pred")
# Tim Salimans model: DOES NOT WORK yet
# need to figure out how to marginalize all discrete params