Certificate/data analytics-Google

[XGBoost] Python

Olivia-BlackCherry 2023. 8. 8. 15:59

1. 라이브러리

import numpy as np
import pandas as pd
# `plt` is the conventional alias for the pyplot interface; the top-level
# `matplotlib` package does not provide plotting helpers such as `show()`.
import matplotlib.pyplot as plt
import pickle

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn import metrics

from xgboost import XGBClassifier
from xgboost import plot_importance

 

 

2. 데이터

 

 

3. feature engineering

# One-hot encode the listed columns (the target `satisfaction` included):
# each is replaced by one indicator column per distinct value.
airline_data_dummies = pd.get_dummies(
    data=airline_data,
    columns=[
        'satisfaction',
        'Customer Type',
        'Type of Travel',
        'Class',
    ],
)

 

 

4. data split

# Features: every column except the two indicators derived from `satisfaction`,
# which would otherwise leak the target into the predictors.
X = airline_data_dummies.drop(
    columns=['satisfaction_satisfied', 'satisfaction_dissatisfied'],
)
# Target: the indicator column marking satisfied customers.
y = airline_data_dummies['satisfaction_satisfied']

# Hold out 25% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

 

 

5. xgboost 모델 만들기

# Base classifier handed to the grid search: logistic objective for a binary
# target, with a fixed seed so repeated runs give the same model.
xgb = XGBClassifier(
    random_state=0,
    objective='binary:logistic',
)

 

 

6. GridSearchCV 교차검증으로 최적의 하이퍼파라미터 구하기

# Hyperparameter grid explored by the search
# (2 * 2 * 3 * 3 * 1 * 1 = 36 candidate combinations).
cv_params = {
    'max_depth': [4, 6],
    'min_child_weight': [3, 5],
    'learning_rate': [0.1, 0.2, 0.3],
    'n_estimators': [5, 10, 15],
    'subsample': [0.7],
    'colsample_bytree': [0.7],
}

# Metrics evaluated for every candidate. GridSearchCV documents `scoring` as a
# str, callable, list, tuple or dict; a set is not one of those types and newer
# scikit-learn releases reject it during parameter validation, so use a list.
scoring = ['accuracy', 'precision', 'recall', 'f1']

# 5-fold cross-validation over the grid. Because several metrics are scored,
# `refit` names the one ('f1') used to pick the best combination and refit the
# final estimator.
xgb_cv = GridSearchCV(
    xgb,
    cv_params,
    scoring=scoring,
    cv=5,
    refit='f1',
)

 

 

7. 학습시키기

# Run the cross-validated grid search on the training split only; the test
# split stays untouched until evaluation.
xgb_cv = xgb_cv.fit(X=X_train, y=y_train)

 

 

8. 최적의 파라미터

# Hyperparameter combination chosen by the search. Written as a bare
# expression, presumably so a notebook cell displays the resulting dict.
xgb_cv.best_params_

{'colsample_bytree': 0.7, 'learning_rate': 0.3, 'max_depth': 6, 'min_child_weight': 3, 'n_estimators': 15, 'subsample': 0.7}

 

 

 

9. 피클

# Persist the fitted search object to disk. The context manager guarantees the
# file is flushed and closed, even if serialization raises.
with open('xgb_cv.sav', 'wb') as model_file:
    pickle.dump(xgb_cv, model_file)

 

 

10. 평가

1) 예측값

# Predict labels for the held-out test features with the fitted search object.
y_pred = xgb_cv.predict(X=X_test)

 

 

2) 점수

# Compute the four evaluation metrics on the held-out test split first ...
ac_score = metrics.accuracy_score(y_true=y_test, y_pred=y_pred)
pc_score = metrics.precision_score(y_true=y_test, y_pred=y_pred)
rc_score = metrics.recall_score(y_true=y_test, y_pred=y_pred)
f1_score = metrics.f1_score(y_true=y_test, y_pred=y_pred)

# ... then report them together, in the same order as before.
print('accuracy score:', ac_score)
print('precision score:', pc_score)
print('recall score:', rc_score)
print('f1 score:', f1_score)
accuracy score: 0.9340314136125655
precision score: 0.9465036952814099
recall score: 0.9327170868347339
f1 score: 0.9395598194130925

 

 

3) 혼동행렬

# Tabulate true versus predicted labels on the test split.
cm = metrics.confusion_matrix(y_true=y_test, y_pred=y_pred)

# Draw the matrix, labelling the axes with the classes known to the fitted search.
disp = metrics.ConfusionMatrixDisplay(
    display_labels=xgb_cv.classes_,
    confusion_matrix=cm,
)
disp.plot()

 

 

 

11. 피처 중요도

# Plot feature importances of the best estimator found by the grid search.
plot_importance(booster=xgb_cv.best_estimator_)

XGBoost