Linear models for classification¶

In [1]:
from scipy import interpolate
import scipy as sp
import sklearn

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

import warnings
warnings.simplefilter("ignore")

Datasets¶

  • Heart disease

  • Vowel

    Both datasets are from ESL II: https://hastie.su.domains/ElemStatLearn/

In [2]:
data_heart=pd.read_csv('data/heart_deasease.csv')
X_heart=data_heart.copy()
X_heart.drop(['chd','row.names','famhist', 'obesity'],axis=1,inplace=True)
y_heart=data_heart.chd


data_vowel=pd.read_csv('data/vowel.csv')
X_vowel=data_vowel.copy()
X_vowel.drop(['row.names','y'],axis=1,inplace=True)
y_vowel=data_vowel.y

Logistic regression¶

In [3]:
from sklearn.linear_model import LogisticRegression

One-dimensional regression¶

In [15]:
lr=LogisticRegression(penalty=None)  # unpenalized fit; the string 'none' is deprecated/removed in recent scikit-learn
columns=['sbp']
X=X_heart[columns]
y=y_heart
X_predict=X_heart[columns]
lr.fit(X,y)

plt.scatter(X_heart[columns[0]],lr.predict_proba(X_predict)[:,1])
plt.title('P(chd | {})'.format(columns[0]))
plt.ylabel('Predicted Probability')
plt.xlabel(columns[0])

lr.score(X_predict, y)
Out[15]:
0.6666666666666666
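
The scattered probabilities above lie on a logistic (sigmoid) curve of the linear score. As a sanity check, a minimal sketch (my addition, assuming the lr just fit on 'sbp') reconstructs that curve directly from lr.intercept_ and lr.coef_:

In [ ]:
# Sketch: evaluate the fitted sigmoid by hand on a grid of sbp values
grid = np.linspace(X['sbp'].min(), X['sbp'].max(), 200)
score = lr.intercept_[0] + lr.coef_[0, 0] * grid   # linear score b0 + b1*sbp
plt.plot(grid, 1 / (1 + np.exp(-score)))           # logistic link
plt.xlabel('sbp')
plt.ylabel('P(chd | sbp)');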

Multivariate Logistic Regression¶

In [16]:
X=X_heart
y=y_heart
X_predict=X_heart
lr.fit(X,y)

lr.predict_proba(X_predict)
lr.score(X_predict, y)
Out[16]:
0.7164502164502164
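
Note that 0.716 is accuracy on the training data. For an out-of-sample estimate, a short sketch (my addition, not part of the original run) could use cross-validation instead:

In [ ]:
# Sketch: 5-fold cross-validated accuracy instead of training accuracy
from sklearn.model_selection import cross_val_score
cross_val_score(LogisticRegression(penalty=None, max_iter=1000), X, y, cv=5).mean()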

Partial Dependence Display¶

In [6]:
from sklearn.inspection import PartialDependenceDisplay
features = [0, 2, (0, 2)]
PartialDependenceDisplay.from_estimator(lr, X, features)
Out[6]:
<sklearn.inspection._plot.partial_dependence.PartialDependenceDisplay at 0x2539e29d3f0>
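
Since X is a DataFrame, the same display can be requested by column name rather than position, which reads better; per the coefficient listing below, indices 0 and 2 correspond to 'sbp' and 'ldl'. A name-based variant of the call above:

In [ ]:
# Sketch: same partial dependence display, using column names
PartialDependenceDisplay.from_estimator(lr, X, ['sbp', 'ldl', ('sbp', 'ldl')]);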

Penalized Logistic Regression¶

In [7]:
lr_full=LogisticRegression(penalty='l2')  # l2 is the default penalty (C=1.0)
lr_full.fit(X,y)
coef=pd.Series(lr_full.coef_[0],index=lr_full.feature_names_in_)
coef
Out[7]:
sbp          0.005275
tobacco      0.073312
ldl          0.190721
adiposity   -0.011366
typea        0.038391
alcohol      0.001693
age          0.055438
dtype: float64
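
The l2 penalty shrinks these coefficients toward zero as C (the inverse regularization strength) decreases. A small sketch (my addition) makes the shrinkage visible by refitting at a few values of C:

In [ ]:
# Sketch: coefficient shrinkage as the l2 penalty grows (C shrinks)
paths = {C: LogisticRegression(penalty='l2', C=C, max_iter=1000).fit(X, y).coef_[0]
         for C in [100, 1, 0.01]}
pd.DataFrame(paths, index=lr_full.feature_names_in_)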

Vowel Data¶

  • more categories, more fun
In [19]:
from sklearn import preprocessing
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Standardize the vowel features, then fit an l2-penalized logistic regression
scaler = preprocessing.StandardScaler().fit(X_vowel)
X_scaled = scaler.transform(X_vowel)
lr_vowel=LogisticRegression(penalty='l2')
lr_vowel.fit(X_scaled, y_vowel)

# Equivalent as a single estimator: scaling and classifier in one pipeline
pipe = make_pipeline(StandardScaler(), LogisticRegression())
pipe.fit(X_vowel, y_vowel);
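
Both routes fit the same model (default l2 penalty, C=1), but the pipeline accepts raw inputs directly. A quick sanity check (my addition):

In [ ]:
# Sketch: the manual and pipelined fits should score identically on the training data
lr_vowel.score(X_scaled, y_vowel), pipe.score(X_vowel, y_vowel)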
In [9]:
from mlxtend.plotting import plot_decision_regions

# Hold features 2..9 at their column means (~0 after scaling) so the
# decision regions can be drawn over the first two features only
fill=dict(pd.Series(X_scaled.mean(axis=0)[2:],index=range(2,10)))

plot_decision_regions(X_scaled, y_vowel.values, 
                      clf=lr_vowel, filler_feature_values=fill,legend=2)
Out[9]:
<AxesSubplot:>
In [10]:
X_scaled.mean(axis=0)[2:]   # sanity check: standardized columns have mean ~0
Out[10]:
array([ 0.00000000e+00,  5.38289951e-17, -2.69144976e-17,  2.69144976e-17,
       -1.34572488e-17, -8.07434927e-17,  0.00000000e+00, -2.69144976e-17])

Nearest Neighbours¶

In [17]:
from sklearn.neighbors import KNeighborsClassifier
from IPython.display import display, HTML

clf = KNeighborsClassifier(n_neighbors=15)

clf.fit(X_scaled, y_vowel)                       # fit on the same scaled features used in the plot
plot_decision_regions(X_scaled, y_vowel.values, 
                      clf=clf, filler_feature_values=fill,legend=2)

plt.title('Train data kNN with K={}'.format(clf.n_neighbors))
Out[17]:
Text(0.5, 1.0, 'Train data kNN with K=15')
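
K=15 is an arbitrary choice; a common alternative (sketched here, not part of the original notebook) is to pick K by cross-validation:

In [ ]:
# Sketch: 5-fold CV accuracy over a grid of odd K values
from sklearn.model_selection import cross_val_score
ks = list(range(1, 30, 2))
cv_acc = [cross_val_score(KNeighborsClassifier(n_neighbors=k),
                          X_scaled, y_vowel, cv=5).mean() for k in ks]
plt.plot(ks, cv_acc)
plt.xlabel('K')
plt.ylabel('CV accuracy');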
In [18]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

lda=LinearDiscriminantAnalysis()
lda.fit(X_scaled, y_vowel)                       # fit on the same scaled features used in the plot
plot_decision_regions(X_scaled, y_vowel.values, 
                      clf=lda, filler_feature_values=fill,legend=2)
plt.title('Linear Discriminant Analysis');

Scikit-learn has only splines; GAMs are available in statsmodels¶

  • spline interpolation example
In [13]:
def f(x):
    x_points = [ 0, 1, 2, 3, 4, 5]
    y_points = [12,14,22,39,27,15]

    # cubic B-spline through the points; the returned BSpline is callable
    spline = interpolate.make_interp_spline(x_points, y_points, k=3)
    return spline(x)

print(f(1.25))
14.718750000000004
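
To see the interpolant rather than a single value, a short usage sketch (my addition) plots f over the knot range against the points it passes through:

In [ ]:
# Sketch: the cubic spline from f() versus its knot points
xs = np.linspace(0, 5, 200)
plt.plot(xs, f(xs), label='cubic spline')
plt.scatter([0, 1, 2, 3, 4, 5], [12, 14, 22, 39, 27, 15], label='knots')
plt.legend();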