본문으로 바로가기
import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets, ensemble
# from sklearn.inspection import permutation_importance 
# if you want to use this func., upgrade the scikit-learn to 0.23.1 (https://j.mp/334YxRi)
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

 

Gradient Boosting Regression

- 원본 명세서 출력

# Show the description bundled with the diabetes dataset used in this script.
# Wrapping it in print() renders the text with real line breaks.
print(datasets.load_diabetes().DESCR)

# Load the diabetes regression dataset bundled with scikit-learn.
diabetes = datasets.load_diabetes()

# Separate the feature matrix (X) from the regression target (y).
X, y = diabetes.data, diabetes.target

# Per the original author's note: the features would normally have to be
# reshaped into a 2-D array first, but this dataset already comes that way.

# Hold out 10% of the samples as a test set; the seed keeps the split fixed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=13
)

# Model hyper-parameters.
# 'loss' is deliberately not set: the old 'ls' spelling was removed in
# scikit-learn 1.2, while squared error is the default loss in every release,
# so omitting the key trains the same model regardless of the installed version.
params = {
    'n_estimators': 2000,
    'max_depth': 4,
    'min_samples_split': 5,
    'learning_rate': 0.01,
}
reg = ensemble.GradientBoostingRegressor(**params)

# Fit the model on the training split.
reg.fit(X_train, y_train)

# Predict on the held-out split and report the mean squared error.
mse = mean_squared_error(y_test, reg.predict(X_test))
print("The mean squared error (MSE) on test set: {:.4f}".format(mse))

 


시각화 (Visualization)

1. Deviance graph

- Regression(회귀)만 가능

# Test-set deviance after every boosting stage (one slot per estimator).
test_score = np.zeros((params['n_estimators'],), dtype=np.float64)

# staged_predict yields the ensemble's prediction after each boosting stage.
# The model is fitted with squared-error loss, so the per-stage deviance is the
# mean squared error. The former `reg.loss_` helper is not used because it was
# deprecated in scikit-learn 1.1 and removed in 1.3.
for i, y_pred in enumerate(reg.staged_predict(X_test)):
    test_score[i] = mean_squared_error(y_test, y_pred)

# 1-based boosting iteration numbers shared by both curves.
boosting_iterations = np.arange(params['n_estimators']) + 1

fig = plt.figure(figsize=(6, 6))
plt.subplot(1, 1, 1)
plt.title('Deviance')
plt.plot(boosting_iterations, reg.train_score_, 'b-',
         label='Training Set Deviance')
plt.plot(boosting_iterations, test_score, 'r-',
         label='Test Set Deviance')
plt.legend(loc='upper right')
plt.xlabel('Boosting Iterations')
plt.ylabel('Deviance')
fig.tight_layout()
plt.show()

 

2. Feature Importance

- MDI 방식(feature_importances_)은 Tree 기반의 모델만 가능 (Permutation Importance는 모델 종류와 무관하게 사용 가능)

- Permutation Feature Importance : 특정 feature의 값을 무작위로 섞어(permute) 그 feature의 정보를 쓸 수 없게 만들었을 때, 성능이 얼마나 떨어지는지를 측정하여 그 feature의 중요도를 파악하는 방법 (모델을 재학습시킬 필요가 없다)

# if you want to use this func., upgrade the scikit-learn to 0.23.1 (https://j.mp/334YxRi)
from sklearn.inspection import permutation_importance

# Impurity-based (MDI) importances taken from the fitted model.
feature_importance = reg.feature_importances_
feature_names = np.array(diabetes.feature_names)

# Ascending order, so the most important feature is drawn as the top bar.
sorted_idx = np.argsort(feature_importance)
pos = np.arange(sorted_idx.shape[0]) + .5

fig = plt.figure(figsize=(12, 6))

# Left panel: raw MDI importances (not rescaled), labelled in sorted order.
ax_mdi = fig.add_subplot(1, 2, 1)
ax_mdi.barh(pos, feature_importance[sorted_idx], align='center')
ax_mdi.set_yticks(pos)
ax_mdi.set_yticklabels(feature_names[sorted_idx])
ax_mdi.set_title('Feature Importance (MDI)')

# Right panel: permutation importance measured on the held-out test split,
# one box per feature over the 10 shuffles, ordered by mean importance.
result = permutation_importance(
    reg, X_test, y_test, n_repeats=10, random_state=42, n_jobs=2
)
sorted_idx = np.argsort(result.importances_mean)
ax_perm = fig.add_subplot(1, 2, 2)
ax_perm.boxplot(
    result.importances[sorted_idx].T,
    vert=False,
    labels=feature_names[sorted_idx],
)
ax_perm.set_title("Permutation Importance (test set)")

fig.tight_layout()
plt.show()