Decision trees¶

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn

from sklearn.model_selection import cross_val_score,train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, log_loss

from sklearn.linear_model import LinearRegression,LogisticRegression,Ridge
from sklearn.tree import DecisionTreeClassifier,export_text, DecisionTreeRegressor

from sklearn.neighbors import KNeighborsClassifier,KNeighborsRegressor

from mlxtend.plotting import plot_decision_regions

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif, f_regression

#from pyearth import Earth

from matplotlib.backends.backend_pdf import PdfPages
import os
fig_dir='figures'
fig_file = os.path.join(fig_dir, 'tree_figures.pdf')

if not os.path.exists(fig_dir):
    os.makedirs(fig_dir)
try:
    os.remove(fig_file)
except OSError:
    pass

import warnings
warnings.simplefilter("ignore")

Classification, SPAM data¶

https://hastie.su.domains/ElemStatLearn/

In [2]:
pp = PdfPages(fig_file)

colNames=['make', 'address', 'all', '3d', 'our', 'over', 'remove', 'internet', 'order', 'mail',
          'recieve', 'will', 'people', 'report', 'addresses', 'free', 'business', 'email', 'you',
          'credit', 'your', 'font', '000', 'money', 'hp', 'hpl', 'george', '650', 'lab', 'labs',
          'telnet', '857', 'data', '415', '85', 'technology', '1999', 'parts', 'pm', 'direct',
          'cs', 'meeting', 'original', 'project', 're', 'edu', 'table', 'confrence',
          ';', '(', '[', '!', '$', '#', 'CAPAVE', 'CAPMAX', 'CAPTOT', 'SPAM']

spam= pd.read_csv('data/spam.csv',sep=' ',decimal='.',names=colNames)#,na_values='.')
test_id=pd.read_csv('data/spam.traintest.csv',sep=' ',decimal='.',names=['test'],dtype=bool)
spam.shape, test_id.shape
Out[2]:
((4601, 58), (4601, 1))
In [3]:
spam_train=spam[~test_id.test]
spam_test=spam[test_id.test]
spam_train.shape, spam_test.shape
Out[3]:
((3065, 58), (1536, 58))
In [4]:
spam_train.columns[:-1]
Out[4]:
Index(['make', 'address', 'all', '3d', 'our', 'over', 'remove', 'internet',
       'order', 'mail', 'recieve', 'will', 'people', 'report', 'addresses',
       'free', 'business', 'email', 'you', 'credit', 'your', 'font', '000',
       'money', 'hp', 'hpl', 'george', '650', 'lab', 'labs', 'telnet', '857',
       'data', '415', '85', 'technology', '1999', 'parts', 'pm', 'direct',
       'cs', 'meeting', 'original', 'project', 're', 'edu', 'table',
       'confrence', ';', '(', '[', '!', '$', '#', 'CAPAVE', 'CAPMAX',
       'CAPTOT'],
      dtype='object')
In [5]:
max_depth=3
tree_clf=DecisionTreeClassifier(criterion='entropy',max_depth=max_depth)  # criterion: {"gini", "entropy", "log_loss"}, default="gini"; class_weight='balanced' is another option

tree_clf.fit(spam_train.iloc[:, :-1], spam_train.iloc[:, -1])   

fig, ax_lst = plt.subplots(1, 1,figsize=(20,15))
sklearn.tree.plot_tree(tree_clf, 
                   feature_names=spam_train.columns[:-1],
                   class_names=[str(tree_clf.classes_[i]) for i in range(2)],
                   filled=True)
plt.title(f"Spam data, max_depth={max_depth}")
plt.savefig(pp, format='pdf')
plt.show()
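A quick sanity check, sketched with the objects defined above (not an executed cell of this notebook): accuracy_score, imported in the first cell, gives the shallow tree's accuracy on both halves of the split.

# Sketch: accuracy of the depth-3 tree on the training and test portions
train_acc = accuracy_score(spam_train.iloc[:, -1], tree_clf.predict(spam_train.iloc[:, :-1]))
test_acc = accuracy_score(spam_test.iloc[:, -1], tree_clf.predict(spam_test.iloc[:, :-1]))
print(f"max_depth={max_depth}: train accuracy {train_acc:.3f}, test accuracy {test_acc:.3f}")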
In [6]:
print(export_text(tree_clf,feature_names=spam_train.columns[:-1]))
|--- $ <= 0.06
|   |--- remove <= 0.06
|   |   |--- ! <= 0.19
|   |   |   |--- class: 0
|   |   |--- ! >  0.19
|   |   |   |--- class: 0
|   |--- remove >  0.06
|   |   |--- george <= 0.15
|   |   |   |--- class: 1
|   |   |--- george >  0.15
|   |   |   |--- class: 0
|--- $ >  0.06
|   |--- hp <= 0.41
|   |   |--- CAPAVE <= 2.91
|   |   |   |--- class: 1
|   |   |--- CAPAVE >  2.91
|   |   |   |--- class: 1
|   |--- hp >  0.41
|   |   |--- remove <= 0.08
|   |   |   |--- class: 0
|   |   |--- remove >  0.08
|   |   |   |--- class: 1

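A compact companion to the text dump is the tree's feature_importances_ attribute, which shows how much each variable contributes to the splits; a sketch reusing tree_clf and the column names above.

# Sketch: nonzero impurity-based importances of the depth-3 tree, largest first
imp = pd.Series(tree_clf.feature_importances_, index=spam_train.columns[:-1])
print(imp[imp > 0].sort_values(ascending=False))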
In [7]:
path = tree_clf.cost_complexity_pruning_path(spam_train.iloc[:, :-1], spam_train.iloc[:, -1])
ccp_alphas, impurities = path.ccp_alphas, path.impurities
fig, ax = plt.subplots()
ax.plot(ccp_alphas[:-1], impurities[:-1], marker='o', drawstyle="steps-post")
ax.set_xlabel("effective alpha")
ax.set_ylabel("total impurity of leaves")
ax.set_title("Total Impurity vs effective alpha for training set SPAM")
plt.savefig(pp, format='pdf')
In [8]:
n_trees=8
print(f"alphas = [{ccp_alphas[(-n_trees):]}]")
clfs = []
for ccp_alpha in ccp_alphas[(-n_trees):]:
    clf = DecisionTreeClassifier(ccp_alpha=ccp_alpha)
    clf.fit(spam_train.iloc[:, :-1], spam_train.iloc[:, -1])
    clfs.append(clf)
print("Number of nodes in the last tree is: {} with ccp_alpha: {}".format(
      clfs[-1].tree_.node_count, ccp_alphas[-1]))
alphas = [[0.         0.0071613  0.01196417 0.01226356 0.03851347 0.08355956
 0.12041694 0.25867989]]
Number of nodes in the last tree is: 1 with ccp_alpha: 0.25867988674499065
In [9]:
train_scores = [clf.score(spam_train.iloc[:, :-1], spam_train.iloc[:, -1]) for clf in clfs]
test_scores = [clf.score(spam_test.iloc[:, :-1], spam_test.iloc[:, -1]) for clf in clfs]

fig, ax = plt.subplots()
ax.set_xlabel("alpha")
ax.set_ylabel("accuracy")
ax.set_title("Score vs alpha for training and testing sets (SPAM)")
ax.plot(ccp_alphas[(-n_trees):], train_scores, marker='o', label="train",
        drawstyle="steps-post")
ax.plot(ccp_alphas[(-n_trees):], test_scores, marker='o', label="test",
        drawstyle="steps-post")
ax.legend()
ax.set_ylim(0,1)
plt.savefig(pp, format='pdf')

plt.show()
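The alpha chosen in the next cell is read off the test curve above; an alternative that leaves the test set untouched is to cross-validate over the same candidate alphas on the training data. A sketch using GridSearchCV, which is imported but otherwise unused here:

# Sketch: pick ccp_alpha by 5-fold cross-validation on the training set only
grid = GridSearchCV(DecisionTreeClassifier(),
                    param_grid={'ccp_alpha': ccp_alphas[(-n_trees):]},
                    cv=5)
grid.fit(spam_train.iloc[:, :-1], spam_train.iloc[:, -1])
print(grid.best_params_, grid.best_score_)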
In [10]:
clf_best = DecisionTreeClassifier(ccp_alpha=0.01196417)
clf_best.fit(spam_train.iloc[:, :-1], spam_train.iloc[:, -1])
fig, ax_lst = plt.subplots(1, 1,figsize=(20,15))
sklearn.tree.plot_tree(clf_best, 
                   feature_names=spam_train.columns[:-1],
                   class_names=[str(clf_best.classes_[i]) for i in range(2)],
                   filled=True)
plt.title(f"Spam data, selected tree")

plt.savefig(pp, format='pdf')

plt.show()
In [11]:
#pred=tree_clf.predict(spam_test.iloc[:, :-1])
pred=clf_best.predict(spam_test.iloc[:, :-1])

conf=pd.DataFrame({'pred':pred,'real':spam_test.iloc[:, -1]}).groupby(['pred','real'])['pred'].count().unstack()
print(f"Test accuracy: {(np.matrix(conf.values).trace()/conf.sum().sum())[0,0]:5.2}")
conf
#
Test accuracy:  0.89
Out[11]:
real     0    1
pred
0      890  117
1       51  478
In [12]:
(np.matrix(conf.values).trace()/conf.sum().sum())[0,0]
Out[12]:
0.890625
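The same numbers are available directly from sklearn.metrics; note that confusion_matrix puts the true labels on the rows, i.e. the transpose of the pandas table above. A sketch (confusion_matrix is not imported in the first cell):

# Sketch: built-in confusion matrix and accuracy for the pruned tree
from sklearn.metrics import confusion_matrix
print(confusion_matrix(spam_test.iloc[:, -1], pred))
print(f"Test accuracy: {accuracy_score(spam_test.iloc[:, -1], pred):5.2}")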

Regression: California Housing dataset¶

https://www.tu-chemnitz.de/mathematik/numa/lehre/ds-2019/Exercises/ps14/housing.csv

In [13]:
housing= pd.read_csv('data/housing.csv',sep=',',decimal='.')

housing_target='median_house_value'


X, y = (housing.drop(columns=[housing_target]), housing[housing_target])
X_train0, X_test0, y_train, y_test = train_test_split(X, y, random_state=4,test_size=0.2)
X_train0.shape, X_test0.shape
Out[13]:
((16512, 9), (4128, 9))
In [14]:
X.columns
Out[14]:
Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'ocean_proximity'],
      dtype='object')
In [15]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import Normalizer,StandardScaler
from sklearn.preprocessing import OneHotEncoder

ct_housing= ColumnTransformer(
    transformers=[
        ('onehot', OneHotEncoder(drop='first'), ['ocean_proximity']),
#        ('s',StandardScaler(), [0,1,3,4,5,6,7])
    ],
    verbose_feature_names_out=False,
    remainder='passthrough'
    )
X_train = pd.DataFrame(ct_housing.fit_transform(X_train0),columns=ct_housing.get_feature_names_out(input_features=X_train0.columns))
X_test = pd.DataFrame(ct_housing.transform(X_test0),columns=ct_housing.get_feature_names_out(input_features=X_train0.columns))
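The fit_transform/transform pair above can also be bundled with the estimator into a Pipeline, so the encoder is refit automatically whenever the model is; a sketch (Pipeline is not imported above):

# Sketch: the same preprocessing plus a depth-3 tree as one Pipeline
from sklearn.pipeline import Pipeline
pipe = Pipeline([('prep', ct_housing), ('tree', DecisionTreeRegressor(max_depth=3))])
pipe.fit(X_train0, y_train)
print(f"Test R^2: {pipe.score(X_test0, y_test):.3f}")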
In [16]:
X_train0.shape, y_train.shape
Out[16]:
((16512, 9), (16512,))
In [17]:
max_depth=3
tree_reg=DecisionTreeRegressor(max_depth=max_depth)
tree_reg.fit(X_train, y_train)   

fig, ax_lst = plt.subplots(1, 1,figsize=(20,15))
sklearn.tree.plot_tree(tree_reg, 
                   feature_names=X_train.columns,
                   filled=True)
ax.set_title(f"Housing data, max_depth={max_depth}")

plt.savefig(pp, format='pdf')
plt.show()
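For reference, the fit of this depth-3 regression tree can be read off directly with score; a sketch, not an executed cell:

# Sketch: R^2 of the depth-3 tree on the training and test portions
print(f"train R^2 {tree_reg.score(X_train, y_train):.3f}, test R^2 {tree_reg.score(X_test, y_test):.3f}")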
In [18]:
path = tree_reg.cost_complexity_pruning_path(X_train, y_train)
ccp_alphas, impurities = path.ccp_alphas, path.impurities
fig, ax = plt.subplots()
ax.plot(ccp_alphas[:-1], impurities[:-1], marker='o', drawstyle="steps-post")
ax.set_xlabel("effective alpha")
ax.set_ylabel("total impurity of leaves")
ax.set_title("Total Impurity vs effective alpha for training set")
plt.savefig(pp, format='pdf')
In [19]:
n_trees=8
print(f"alphas = [{ccp_alphas[(-n_trees):]}]")
clfs = []
for ccp_alpha in ccp_alphas[(-n_trees):]:
    clf = DecisionTreeRegressor(ccp_alpha=ccp_alpha)
    clf.fit(X_train, y_train)
    clfs.append(clf)
print("Number of nodes in the last tree is: {} with ccp_alpha: {:_}".format(
      clfs[-1].tree_.node_count, ccp_alphas[-1]))
alphas = [[0.00000000e+00 6.22752264e+07 1.63123174e+08 1.85336107e+08
 5.82095255e+08 7.01359039e+08 1.78920002e+09 4.11494199e+09]]
Number of nodes in the last tree is: 1 with ccp_alpha: 4_114_941_985.1079044
In [20]:
train_scores = [clf.score(X_train, y_train) for clf in clfs]
test_scores = [clf.score(X_test, y_test) for clf in clfs]

fig, ax = plt.subplots()
ax.set_xlabel("alpha")
#ax.set_ylabel("accuracy")
ax.set_title("$R^2$ score vs alpha for training and testing sets")
ax.plot(ccp_alphas[(-n_trees):], train_scores, marker='o', label="train",
        drawstyle="steps-post")
ax.plot(ccp_alphas[(-n_trees):], test_scores, marker='o', label="test",
        drawstyle="steps-post")
ax.set_ylabel("$R^2$")
ax.legend()
plt.savefig(pp, format='pdf')
plt.show()
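As with the spam tree, the alpha could instead be chosen by cross-validation on the training data alone; a sketch with cross_val_score, imported in the first cell:

# Sketch: mean 5-fold CV R^2 for each candidate alpha
for a in ccp_alphas[(-n_trees):]:
    cv_r2 = cross_val_score(DecisionTreeRegressor(ccp_alpha=a), X_train, y_train, cv=5).mean()
    print(f"ccp_alpha={a:_.0f}: mean CV R^2 = {cv_r2:.3f}")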

Selected Trees for specific alpha¶

In [21]:
clf_best_s = DecisionTreeRegressor(ccp_alpha=1.78920002e+09)
clf_best_s.fit(X_train, y_train)
fig, ax_lst = plt.subplots(1, 1,figsize=(20,15))
sklearn.tree.plot_tree(clf_best_s, 
                   feature_names=X_train.columns,
                   filled=True)
plt.title(f"Housing data, selected small tree")
plt.savefig(pp, format='pdf')
plt.show()
In [22]:
clf_best = DecisionTreeRegressor(ccp_alpha=6.22752264e+07)
clf_best.fit(X_train, y_train)
fig, ax_lst = plt.subplots(1, 1,figsize=(20,15))
sklearn.tree.plot_tree(clf_best, 
                   feature_names=X_train.columns,
                   filled=True)
plt.title(f"Housing data, selected larger tree")
plt.savefig(pp, format='pdf')
plt.show()

Mean Square Error Comparison¶

In [23]:
from sklearn.metrics import mean_squared_error

print(f"Predict constant:             {mean_squared_error([np.mean(y_test)]*len(y_test), y_test):_} ")
print(f"{len(clf_best_s.tree_.value):5}  node tree test error:   {mean_squared_error(clf_best_s.predict(X_test), y_test):_} ")
print(f"{len(clf_best.tree_.value):5}  node tree test error:   {mean_squared_error(clf_best.predict(X_test), y_test):_} ")

#mean_squared_error(y_true, y_pred)
Predict constant:             13_008_246_639.623358 
    5  node tree test error:   7_330_562_846.985989 
   33  node tree test error:   4_819_060_610.023689 
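Since these errors are in squared dollars, taking square roots puts them back on the scale of median_house_value; a sketch reusing the two fitted trees:

# Sketch: test-set root mean squared error, in the units of the target
for name, model in [("5-node tree", clf_best_s), ("33-node tree", clf_best)]:
    rmse = mean_squared_error(y_test, model.predict(X_test)) ** 0.5
    print(f"{name}: test RMSE = {rmse:_.0f}")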
In [24]:
pp.close() #stop saving