import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, log_loss
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, export_text
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from mlxtend.plotting import plot_decision_regions
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif, f_regression
from matplotlib.backends.backend_pdf import PdfPages
import os
fig_dir = 'figures'
fig_file = os.path.join(fig_dir, 'tree_figures.pdf')
if not os.path.exists(fig_dir):
    os.makedirs(fig_dir)
try:
    os.remove(fig_file)
except OSError:
    pass
import warnings
warnings.simplefilter("ignore")
pp = PdfPages(fig_file)
colNames = ['make','address','all','3d','our','over','remove','internet','order','mail',
            'recieve','will','people','report','addresses','free','business','email','you','credit',
            'your','font','000','money','hp','hpl','george','650','lab','labs',
            'telnet','857','data','415','85','technology','1999','parts','pm','direct',
            'cs','meeting','original','project','re','edu','table','confrence',';','(',
            '[','!','$','#','CAPAVE','CAPMAX','CAPTOT','SPAM']
spam = pd.read_csv('data/spam.csv', sep=' ', decimal='.', names=colNames)
test_id = pd.read_csv('data/spam.traintest.csv', sep=' ', decimal='.', names=['test'], dtype=bool)
spam.shape, test_id.shape
((4601, 58), (4601, 1))
spam_train=spam[~test_id.test]
spam_test=spam[test_id.test]
spam_train.shape, spam_test.shape
((3065, 58), (1536, 58))
spam_train.columns[:-1]
Index(['make', 'address', 'all', '3d', 'our', 'over', 'remove', 'internet', 'order', 'mail', 'recieve', 'will', 'people', 'report', 'addresses', 'free', 'business', 'email', 'you', 'credit', 'your', 'font', '000', 'money', 'hp', 'hpl', 'george', '650', 'lab', 'labs', 'telnet', '857', 'data', '415', '85', 'technology', '1999', 'parts', 'pm', 'direct', 'cs', 'meeting', 'original', 'project', 're', 'edu', 'table', 'confrence', ';', '(', '[', '!', '$', '#', 'CAPAVE', 'CAPMAX', 'CAPTOT'], dtype='object')
max_depth=3
tree_clf = DecisionTreeClassifier(criterion='entropy', max_depth=max_depth)
# criterion is one of {'gini', 'entropy', 'log_loss'} (default 'gini');
# class_weight='balanced' would reweight classes by inverse frequency.
tree_clf.fit(spam_train.iloc[:, :-1], spam_train.iloc[:, -1])
fig, ax_lst = plt.subplots(1, 1,figsize=(20,15))
sklearn.tree.plot_tree(tree_clf,
                       feature_names=spam_train.columns[:-1],
                       class_names=[str(c) for c in tree_clf.classes_],
                       filled=True)
plt.title(f"Spam data, max_depth={max_depth}")
plt.savefig(pp, format='pdf')
plt.show()
print(export_text(tree_clf, feature_names=list(spam_train.columns[:-1])))
|--- $ <= 0.06
|   |--- remove <= 0.06
|   |   |--- ! <= 0.19
|   |   |   |--- class: 0
|   |   |--- ! > 0.19
|   |   |   |--- class: 0
|   |--- remove > 0.06
|   |   |--- george <= 0.15
|   |   |   |--- class: 1
|   |   |--- george > 0.15
|   |   |   |--- class: 0
|--- $ > 0.06
|   |--- hp <= 0.41
|   |   |--- CAPAVE <= 2.91
|   |   |   |--- class: 1
|   |   |--- CAPAVE > 2.91
|   |   |   |--- class: 1
|   |--- hp > 0.41
|   |   |--- remove <= 0.08
|   |   |   |--- class: 0
|   |   |--- remove > 0.08
|   |   |   |--- class: 1
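# Note on the output above: several sibling leaves predict the same class
# (e.g. both branches of "! <= 0.19" end in class 0). Splits are chosen to
# reduce entropy, not to change the majority class, so such leaves still
# differ in class purity even though they share a label.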
path = tree_clf.cost_complexity_pruning_path(spam_train.iloc[:, :-1], spam_train.iloc[:, -1])
ccp_alphas, impurities = path.ccp_alphas, path.impurities
fig, ax = plt.subplots()
ax.plot(ccp_alphas[:-1], impurities[:-1], marker='o', drawstyle="steps-post")
ax.set_xlabel("effective alpha")
ax.set_ylabel("total impurity of leaves")
ax.set_title("Total Impurity vs effective alpha for training set SPAM")
plt.savefig(pp, format='pdf')
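# For reference (scikit-learn's minimal cost-complexity pruning): the
# effective alpha of an internal node t is
#     alpha_eff(t) = (R(t) - R(T_t)) / (|T_t| - 1),
# where R(t) is the node's impurity, R(T_t) the total leaf impurity of the
# subtree rooted at t, and |T_t| its number of leaves. The subtree with the
# smallest effective alpha is pruned first, which is why total leaf impurity
# rises monotonically along the path plotted above.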
n_trees=8
print(f"alphas = [{ccp_alphas[(-n_trees):]}]")
clfs = []
for ccp_alpha in ccp_alphas[(-n_trees):]:
    clf = DecisionTreeClassifier(ccp_alpha=ccp_alpha)
    clf.fit(spam_train.iloc[:, :-1], spam_train.iloc[:, -1])
    clfs.append(clf)
print("Number of nodes in the last tree is: {} with ccp_alpha: {}".format(
    clfs[-1].tree_.node_count, ccp_alphas[-1]))
alphas = [0. 0.0071613 0.01196417 0.01226356 0.03851347 0.08355956 0.12041694 0.25867989]
Number of nodes in the last tree is: 1 with ccp_alpha: 0.25867988674499065
train_scores = [clf.score(spam_train.iloc[:, :-1], spam_train.iloc[:, -1]) for clf in clfs]
test_scores = [clf.score(spam_test.iloc[:, :-1], spam_test.iloc[:, -1]) for clf in clfs]
fig, ax = plt.subplots()
ax.set_xlabel("alpha")
ax.set_ylabel("accuracy")
ax.set_title("Score vs alpha for training and testing sets (SPAM)")
ax.plot(ccp_alphas[(-n_trees):], train_scores, marker='o', label="train",
        drawstyle="steps-post")
ax.plot(ccp_alphas[(-n_trees):], test_scores, marker='o', label="test",
        drawstyle="steps-post")
ax.legend()
ax.set_ylim(0,1)
plt.savefig(pp, format='pdf')
plt.show()
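# Alternative (a sketch, not the method used below): instead of reading the
# alpha off the test curve, one could select it by cross-validation on the
# training set alone, using the GridSearchCV imported above.
cv_tree = GridSearchCV(DecisionTreeClassifier(random_state=0),
                       param_grid={'ccp_alpha': list(ccp_alphas[(-n_trees):])},
                       cv=5, scoring='accuracy')
cv_tree.fit(spam_train.iloc[:, :-1], spam_train.iloc[:, -1])
print(cv_tree.best_params_, cv_tree.best_score_)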
clf_best = DecisionTreeClassifier(ccp_alpha=0.01196417)
clf_best.fit(spam_train.iloc[:, :-1], spam_train.iloc[:, -1])
fig, ax_lst = plt.subplots(1, 1,figsize=(20,15))
sklearn.tree.plot_tree(clf_best,
                       feature_names=spam_train.columns[:-1],
                       class_names=[str(c) for c in clf_best.classes_],
                       filled=True)
plt.title("Spam data, selected tree")
plt.savefig(pp, format='pdf')
plt.show()
pred = clf_best.predict(spam_test.iloc[:, :-1])
conf = (pd.DataFrame({'pred': pred, 'real': spam_test.iloc[:, -1]})
        .groupby(['pred', 'real'])['pred'].count().unstack())
print(f"Test accuracy: {np.trace(conf.values)/conf.values.sum():5.2}")
conf
conf
Test accuracy: 0.89
real    0    1
pred
0     890  117
1      51  478
np.trace(conf.values)/conf.values.sum()
0.890625
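# Cross-check (sketch): the same table via sklearn.metrics. Note that
# confusion_matrix puts true labels in rows and predictions in columns,
# i.e. the transpose of the pandas table above.
from sklearn.metrics import confusion_matrix
print(confusion_matrix(spam_test.iloc[:, -1], pred))
print(f"accuracy_score: {accuracy_score(spam_test.iloc[:, -1], pred):.4f}")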
# Data source: https://www.tu-chemnitz.de/mathematik/numa/lehre/ds-2019/Exercises/ps14/housing.csv
housing = pd.read_csv('data/housing.csv', sep=',', decimal='.')
housing_target='median_house_value'
X, y = (housing.drop(columns=[housing_target]), housing[housing_target])
X_train0, X_test0, y_train, y_test = train_test_split(X, y, random_state=4,test_size=0.2)
X_train0.shape, X_test0.shape
((16512, 9), (4128, 9))
X.columns
Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms', 'total_bedrooms', 'population', 'households', 'median_income', 'ocean_proximity'], dtype='object')
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import Normalizer,StandardScaler
from sklearn.preprocessing import OneHotEncoder
ct_housing = ColumnTransformer(
    transformers=[
        ('ohe', OneHotEncoder(drop='first'), ['ocean_proximity']),
        # ('s', StandardScaler(), [0,1,3,4,5,6,7])
    ],
    verbose_feature_names_out=False,
    remainder='passthrough'
)
X_train = pd.DataFrame(ct_housing.fit_transform(X_train0),
                       columns=ct_housing.get_feature_names_out(input_features=X_train0.columns))
X_test = pd.DataFrame(ct_housing.transform(X_test0),
                      columns=ct_housing.get_feature_names_out(input_features=X_train0.columns))
X_train0.shape, y_train.shape
((16512, 9), (16512,))
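# Sanity check (sketch): list the columns the ColumnTransformer produced.
# With drop='first', one ocean_proximity category becomes the implicit
# all-zeros baseline and gets no dummy column of its own.
print(ct_housing.get_feature_names_out(input_features=X_train0.columns))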
max_depth=3
tree_reg=DecisionTreeRegressor(max_depth=max_depth)
tree_reg.fit(X_train, y_train)
fig, ax_lst = plt.subplots(1, 1,figsize=(20,15))
sklearn.tree.plot_tree(tree_reg,
                       feature_names=X_train.columns,
                       filled=True)
plt.title(f"Housing data, max_depth={max_depth}")
plt.savefig(pp, format='pdf')
plt.show()
path = tree_reg.cost_complexity_pruning_path(X_train, y_train)
ccp_alphas, impurities = path.ccp_alphas, path.impurities
fig, ax = plt.subplots()
ax.plot(ccp_alphas[:-1], impurities[:-1], marker='o', drawstyle="steps-post")
ax.set_xlabel("effective alpha")
ax.set_ylabel("total impurity of leaves")
ax.set_title("Total Impurity vs effective alpha for training set")
plt.savefig(pp, format='pdf')
n_trees=8
print(f"alphas = [{ccp_alphas[(-n_trees):]}]")
clfs = []
for ccp_alpha in ccp_alphas[(-n_trees):]:
    clf = DecisionTreeRegressor(ccp_alpha=ccp_alpha)
    clf.fit(X_train, y_train)
    clfs.append(clf)
print("Number of nodes in the last tree is: {} with ccp_alpha: {:_}".format(
    clfs[-1].tree_.node_count, ccp_alphas[-1]))
alphas = [0.00000000e+00 6.22752264e+07 1.63123174e+08 1.85336107e+08 5.82095255e+08 7.01359039e+08 1.78920002e+09 4.11494199e+09]
Number of nodes in the last tree is: 1 with ccp_alpha: 4_114_941_985.1079044
train_scores = [clf.score(X_train, y_train) for clf in clfs]
test_scores = [clf.score(X_test, y_test) for clf in clfs]
fig, ax = plt.subplots()
ax.set_xlabel("alpha")
ax.set_ylabel("$R^2$")
ax.set_title("$R^2$ score vs alpha for training and testing sets")
ax.plot(ccp_alphas[(-n_trees):], train_scores, marker='o', label="train",
        drawstyle="steps-post")
ax.plot(ccp_alphas[(-n_trees):], test_scores, marker='o', label="test",
        drawstyle="steps-post")
ax.legend()
plt.savefig(pp, format='pdf')
plt.show()
clf_best_s = DecisionTreeRegressor(ccp_alpha=1.78920002e+09)
clf_best_s.fit(X_train, y_train)
fig, ax_lst = plt.subplots(1, 1,figsize=(20,15))
sklearn.tree.plot_tree(clf_best_s,
                       feature_names=X_train.columns,
                       filled=True)
plt.title("Housing data, selected small tree")
plt.savefig(pp, format='pdf')
plt.show()
clf_best = DecisionTreeRegressor(ccp_alpha=6.22752264e+07)
clf_best.fit(X_train, y_train)
fig, ax_lst = plt.subplots(1, 1,figsize=(20,15))
sklearn.tree.plot_tree(clf_best,
                       feature_names=X_train.columns,
                       filled=True)
plt.title("Housing data, selected larger tree")
plt.savefig(pp, format='pdf')
plt.show()
from sklearn.metrics import mean_squared_error
print(f"Predict constant: {mean_squared_error([np.mean(y_test)]*len(y_test), y_test):_} ")
print(f"{len(clf_best_s.tree_.value):5} node tree test error: {mean_squared_error(clf_best_s.predict(X_test), y_test):_} ")
print(f"{len(clf_best.tree_.value):5} node tree test error: {mean_squared_error(clf_best.predict(X_test), y_test):_} ")
#mean_squared_error(y_true, y_pred)
Predict constant: 13_008_246_639.623358
    5 node tree test error: 7_330_562_846.985989
   33 node tree test error: 4_819_060_610.023689
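# Sketch: relate the test MSEs above to the R^2 curve plotted earlier via
# R^2 = 1 - MSE/MSE_const; the constant predictor's MSE is exactly the
# variance term in the R^2 denominator, so the two views must agree.
mse_const = mean_squared_error(y_test, [np.mean(y_test)]*len(y_test))
for model in (clf_best_s, clf_best):
    mse = mean_squared_error(y_test, model.predict(X_test))
    print(f"R^2 = {1 - mse/mse_const:.3f}  (model.score: {model.score(X_test, y_test):.3f})")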
pp.close()  # close the multi-page PDF file