목차
1. 라이브러리
# Standard operational package imports
import numpy as np
import pandas as pd
# Important imports for modeling and evaluation
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree
import sklearn.metrics as metrics
# Visualization package imports
import matplotlib.pyplot as plt
import seaborn as sns
2. 데이터
비행기 탑승 후 만족도
3. EDA
1) dtypes
# Inspect column dtypes to spot object columns that will need encoding
df_original.dtypes
2) unique()
# Distinct values of 'Class' (used below to build the ordinal mapping)
df_original['Class'].unique()
3) value_counts()
# Class balance of the target, including missing values.
# Fix: Python's boolean literal is `False` (capitalized) — `false` raises NameError.
df_original['satisfaction'].value_counts(dropna=False)
4) isnull()
# Count missing values per column
df_original.isnull().sum()
5) shape
# (rows, columns) of the dataset
df_original.shape
4. Encoding (object to numeric)
1) map()
# Ordinal-encode cabin class: a larger number means a higher class.
class_levels = {"Eco": 1, "Eco Plus": 2, "Business": 3}
df_subset['Class'] = df_subset['Class'].map(class_levels)
df_subset의 class 값은 Business, Eco Plus, Eco가 있다. 이를 numeric한 숫자로 바꾸고 싶다. 이를 위해 map 함수를 쓴다. 딕셔너리에 원하는 값을 할당한다. 그러면 map함수를 이용해 차례로 값이 매핑된다.
레이블 값도 numeric하게 바꾼다.
# Binary-encode the label: 1 = satisfied, 0 = dissatisfied.
# Fix: the original mapped BOTH labels to 1, which would make the target constant
# (the confusion matrix later in this file shows two classes, so 0/1 is intended).
df_subset['satisfaction'] = df_subset['satisfaction'].map({"satisfied": 1, "dissatisfied": 0})
2) get_dummies()
나머지 object들은 더미로 처리한다.
# One-hot encode the remaining object columns; drop_first avoids redundant dummies
df_subset= pd.get_dummies(df_subset, drop_first=True)
5. data split
# Features: everything except the label column.
X = df_subset.copy()
# Fix: pass the columns to drop as a list; the original used a set literal plus a
# redundant `axis=1` (axis is ignored when `columns=` is given).
X = X.drop(columns=['satisfaction'])
y = df_subset['satisfaction']
# Hold out 25% of rows for evaluation; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)
가능하면 df_subset을 copy하여 사용한다.
6. 모델링
# Baseline decision tree with default hyperparameters; fixed seed for reproducibility
decision_tree= DecisionTreeClassifier(random_state=0)
decision_tree.fit(X_train, y_train)
# Predictions on the held-out test set
dt_pred= decision_tree.predict(X_test)
7. 평가
# Print the four headline metrics on the test split, in the same order:
# accuracy, precision, recall, f1.
for score_fn in (metrics.accuracy_score, metrics.precision_score,
                 metrics.recall_score, metrics.f1_score):
    print(score_fn(y_test, dt_pred))
0.9339861608797726
0.9392641838976844
0.9402707275803722
0.9397671862228361
너무 평가가 좋다. 과적합일 수도 있다.
- confusion matrix
# Confusion matrix with rows/columns ordered by the model's class labels
cm= metrics.confusion_matrix(y_test, dt_pred, labels=decision_tree.classes_)
cm
array([[13564, 1078],
[ 1059, 16671]])
# Render the confusion matrix as a labeled plot
disp= metrics.ConfusionMatrixDisplay(confusion_matrix= cm, display_labels = decision_tree.classes_)
disp.plot()
8. 결정트리 시각화
# Visualize the top levels of the baseline tree.
# Fix: `lt` is undefined — the figure call belongs to matplotlib's pyplot (plt),
# which is imported at the top of the file.
plt.figure(figsize=(20, 12))
plot_tree(decision_tree, max_depth=3, fontsize=10, feature_names=X.columns)
9. feature importance
분류에 가장 큰 영향을 미치는 피처를 알고 싶다면 feature_importance_ 를 이용한다.
1) columns
# Feature names, in the same order as feature_importances_ below
X.columns
2) feature_importance_
중요도 나열
# Impurity-based feature importances of the fitted baseline tree
importances= decision_tree.feature_importances_
순서대로 나열
# Rank features from most to least important
forest_importances = pd.Series(importances, index=X.columns).sort_values(ascending=False)
시각화
# Bar chart of features ranked by importance
fig, ax= plt.subplots()
forest_importances.plot.bar(ax=ax)
10. 하이퍼파라미터 튜닝
# Hyperparameter grid: tree depth (1-20 plus a few large values) and minimum leaf size.
tree_para = {
    'max_depth': list(range(1, 21)) + [30, 40, 50],
    'min_samples_leaf': [2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20, 50],
}
# Track several metrics during CV; refit the final model on the best mean F1.
scoring = {'accuracy', 'precision', 'recall', 'f1'}
tuned_decision_tree = DecisionTreeClassifier(random_state=0)
clf = GridSearchCV(tuned_decision_tree, tree_para, scoring=scoring, cv=5, refit="f1")
clf.fit(X_train, y_train)
1) 가장 좋은 하이퍼파라미터 조합은?
# Best hyperparameter combination found by the grid search.
# Fix: the fitted-attribute name ends with an underscore — `best_estimator`
# (no underscore) raises AttributeError on GridSearchCV.
clf.best_estimator_
max_depth=18, min_samples_leaf=2, min_samples_split=2
2) 가장 높은 평균 validation 점수는?
# Best mean cross-validated score for the refit metric (f1)
clf.best_score_
0.9454 --->f1인 경우
3) 표로 만들자
# Empty results table. NOTE(review): make_results() below builds and returns its
# own table, so this global looks unused afterwards — confirm before relying on it.
results = pd.DataFrame(columns=['Model', 'F1', 'Recall', 'Precision', 'Accuracy'])
def make_results(model_name, model_object):
    """Return a one-row DataFrame with the CV metrics of the best (by mean F1) candidate.

    Parameters
    ----------
    model_name : str
        Label placed in the 'Model' column.
    model_object : fitted GridSearchCV (or anything exposing `cv_results_`)
        Source of the per-candidate mean test scores.

    Returns
    -------
    pd.DataFrame with columns ['Model', 'F1', 'Recall', 'Precision', 'Accuracy'].
    """
    cv_results = pd.DataFrame(model_object.cv_results_)
    # Row of the candidate with the highest mean cross-validated F1.
    best = cv_results.loc[cv_results['mean_test_f1'].idxmax()]
    # Fix: DataFrame.append() is deprecated and removed in pandas 2.0 —
    # build the one-row table directly instead. Bracket indexing is also
    # safer than attribute access on the Series.
    return pd.DataFrame({'Model': [model_name],
                         'F1': [best['mean_test_f1']],
                         'Recall': [best['mean_test_recall']],
                         'Precision': [best['mean_test_precision']],
                         'Accuracy': [best['mean_test_accuracy']]})
# Build the summary row for the tuned model
result_table = make_results("Tuned Decision Tree", clf)
result_table
4) tree plot
# Visualize the top levels of the tuned tree
plt.figure(figsize=(20,12))
plot_tree(clf.best_estimator_, max_depth=3, fontsize=10, feature_names=X.columns)
5) feature_importances_
# Feature importances of the tuned estimator, ranked descending
importances = clf.best_estimator_.feature_importances_
forest_importances = pd.Series(importances, index=X.columns).sort_values(ascending=False)
# Bar chart of the ranked importances
fig, ax = plt.subplots()
forest_importances.plot.bar(ax=ax);