Generalized Additive Models¶

statsmodels require Python3.10.* (that is an older version)

In [1]:
from scipy import interpolate
import scipy as sp
import sklearn

import statsmodels as sm
from statsmodels.gam.api import GLMGam, BSplines
from statsmodels.gam.generalized_additive_model import LogitGam
from statsmodels.gam.tests.test_penalized import df_autos
from patsy import dmatrices

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
In [2]:
data=pd.read_csv('data/heart_deasease.csv')
data.head(2)
X=data.copy()
X.drop(['chd','row.names','famhist', 'obesity'],axis=1,inplace=True)
y=data.chd
In [ ]:
data_heart=pd.read_csv('data/heart_deasease.csv')
data_heart.drop(['row.names','famhist', 'obesity'],axis=1,inplace=True)

data_heart.columns

#'sbp', 'tobacco', 'ldl', 'adiposity', 'typea', 'alcohol', 'age'

x_spline = data_heart[['ldl','age', 'adiposity','typea']]
bs = BSplines(x_spline, df=[5,5,5,5], degree=[3, 3,3,3])
gam_bs = GLMGam.from_formula('chd ~ ldl + age + adiposity + typea', data=data_heart,
                                 smoother=bs)#, alpha=alpha)
res_bs = gam_bs.fit()
#alpha = np.array((10000000.0, 10000000.0, 215.44346900318845, 46415.88833612782))
gam_bs_poiss = GLMGam.from_formula('chd ~ ldl + age + adiposity + typea', data=data_heart,
                                 smoother=bs, #alpha=alpha,
                                 family=sm.genmod.families.family.Poisson())
#statsmodels.gam.generalized_additive_model.LogitGam
res_bs_poiss = gam_bs_poiss.fit()

#print(res_bs_poiss.summary())
#gam_bs.select_penweight()[0]
#gam_bs.select_penweight_kfold()[0]
In [4]:
#res_bs_poiss.__dict__['_results'].__dict__['model'].__dict__.keys()
In [5]:
X
Out[5]:
sbp tobacco ldl adiposity typea alcohol age
0 160 12.00 5.73 23.11 49 97.20 52
1 144 0.01 4.41 28.61 55 2.06 63
2 118 0.08 3.48 32.28 52 3.81 46
3 170 7.50 6.41 38.03 51 24.26 58
4 134 13.60 3.50 27.78 60 57.34 49
... ... ... ... ... ... ... ...
457 214 0.40 5.98 31.72 64 0.00 58
458 182 4.20 4.41 32.10 52 18.72 52
459 108 3.00 1.59 15.23 40 26.64 55
460 118 5.40 11.61 30.79 64 23.97 40
461 132 0.00 4.82 33.41 62 0.00 46

462 rows × 7 columns

In [32]:
alpha = np.array((1.0, 1.0, 1.0))#, 215.4446900318845, 46415.88833612782))
#gam_bs = LogitGam.from_formula('chd ~ ldl + age + adiposity + typea', data=data_heart)#, smoother=bs,alpha=alpha)
y,X=dmatrices('chd ~ ldl + age + adiposity', data=data_heart,return_type='dataframe')
x_spline2 = data_heart[['ldl','age','adiposity']]
bs2 = BSplines(x_spline2, df=[8,8,8], degree=[3, 3,3])

log_m=LogitGam(y, smoother=bs2,alpha=alpha)
#res_lm=log_m.fit()
fit=log_m.fit_regularized()
Optimization terminated successfully    (Exit mode 0)
            Current function value: 0.5426514025933599
            Iterations: 173
            Function evaluations: 173
            Gradient evaluations: 173
In [ ]:
#fig, ax_lst = plt.subplots(2, 3)  # 3 rows, 1 column; numbered row-wise
for i in range(3):
    res_bs.plot_partial(i, cpr=True)
In [10]:
alp=gam_bs_poiss.select_penweight_kfold(alphas=None, cv_iterator=None, cost=None, k_folds=5, k_grid=4)
alp
Out[10]:
((46415.88833612782, 10000000.0, 215.44346900318845, 46415.88833612782),
 <statsmodels.gam.gam_cross_validation.gam_cross_validation.MultivariateGAMCVPath at 0x19ec662f910>)