Skip to content

ensure reproducibility of negative bionomial on toy dataset with R #386

@pavanramkumar

Description

@pavanramkumar

Dataset: https://stats.idre.ucla.edu/r/dae/negative-binomial-regression/

R code:

require(foreign)
require(ggplot2)
require(MASS)

dat <- read.dta("https://stats.idre.ucla.edu/stat/stata/dae/nb_data.dta")
dat <- within(dat, {
    prog <- factor(prog, levels = 1:3, labels = c("General", "Academic", "Vocational"))
    id <- factor(id)
})

ggplot(dat, aes(daysabs, fill = prog)) + geom_histogram(binwidth = 1) + facet_grid(prog ~ 
    ., margins = TRUE, scales = "free")

with(dat, tapply(daysabs, prog, function(x) {
    sprintf("M (SD) = %1.2f (%1.2f)", mean(x), sd(x))
}))

summary(m1 <- glm.nb(daysabs ~ math + prog, data = dat))

Output:

glm.nb(formula = daysabs ~ math + prog, data = dat, init.theta = 1.032713156, 
    link = log)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.1547  -1.0192  -0.3694   0.2285   2.5273  

Coefficients:
                Estimate Std. Error z value Pr(>|z|)
(Intercept)     2.615265   0.197460  13.245  < 2e-16
math           -0.005993   0.002505  -2.392   0.0167
progAcademic   -0.440760   0.182610  -2.414   0.0158
progVocational -1.278651   0.200720  -6.370 1.89e-10

Python code:

##########################################################
#
# GLM with negative binomial distribution
#
# This is an example of GLM with negative binomial distribution.
# In this example, we will be using an example from R community below
# https://stats.idre.ucla.edu/r/dae/negative-binomial-regression/
#
# Here, we would like to predict the days absense of high school juniors
# at two schools from there type of program they are enrolled, and their math
# score.


##########################################################
# Import relevance libraries

import pandas as pd
from pyglmnet import GLM

import matplotlib.pyplot as plt

##########################################################
# Read and preprocess data

df = pd.read_stata("https://stats.idre.ucla.edu/stat/stata/dae/nb_data.dta")
print(df.head())

# histogram of type of program they are enrolled
df.hist(column='daysabs', by=['prog'])
plt.show()

# print mean and standard deviation for each program enrolled
print(df.groupby('prog').agg({'daysabs': ['mean', 'std']}))

##########################################################
# Feature

X = df.drop('daysabs', axis=1)
y = df['daysabs'].values

# design matrix
program_df = pd.get_dummies(df.prog)
Xdsgn = pd.concat((df['math'], program_df.drop(1.0, axis=1)), axis=1).values

##########################################################
# Predict using GLM

# theta = 0.968
theta = 1.033
# theta = 5.
distr = 'neg-binomial'
# distr = 'softplus'

glm_neg_bino = GLM(distr=distr,
                   alpha=0.0, reg_lambda=0.0, max_iter=1000, solver='cdfast',
                   score_metric='pseudo_R2', theta=theta, learning_rate=10)
glm_neg_bino.fit(Xdsgn, y)
yhat = glm_neg_bino.predict(Xdsgn)
print(glm_neg_bino.beta0_, glm_neg_bino.beta_)
print(glm_neg_bino.score(Xdsgn, yhat))

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions