import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import datasets, model_selection, linear_model
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings("ignore")
# 1. Feature engineering & feature selection
# Load the Boston housing features and target from Excel; the first column
# of each sheet is the row index.
df_data = pd.read_excel('boston_house_data.xlsx', index_col=0)
df_data.columns = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']
df_target = pd.read_excel('boston_house_target.xlsx', index_col=0)
df_target.columns = ['Price']
# Binarize the target: 1 if the price is above the mean, else 0.
# Vectorized comparison + astype replaces the slower row-wise
# .apply(lambda ...) and produces the identical 0/1 column.
mean_price = df_target['Price'].mean()
df_target['Price'] = (df_target['Price'] > mean_price).astype(int)
# 2. Train-test split
# DataFrames can be passed to train_test_split directly (no need to
# convert to numpy arrays first). 70/30 split, fixed seed for reproducibility.
x_train, x_test, y_train, y_test = model_selection.train_test_split(
    df_data, df_target, test_size=0.3, random_state=0
)
# 3. Build a pipeline of feature transformers (StandardScaler & OneHotEncoder)
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
# Column groups: continuous features get standardized, discrete ones one-hot encoded.
numeric_features = ['CRIM', 'ZN', 'INDUS', 'NOX', 'RM', 'AGE', 'DIS', 'TAX', 'PTRATIO', 'B', 'LSTAT']
categorical_features = ['CHAS', 'RAD']

# Zero-mean / unit-variance scaling (RobustScaler is an alternative if
# outliers are a concern).
numeric_transformer = StandardScaler()
# categories='auto' just silences a deprecation warning.
# handle_unknown='ignore' controls whether transform raises on a category
# not seen during fit; with 'ignore', such a category is encoded as an
# all-zeros row in that feature's one-hot columns instead of an error.
categorical_transformer = OneHotEncoder(categories='auto', handle_unknown='ignore')

# ColumnTransformer takes a list of (name, transformer, columns) triples
# and applies each transformer to its own subset of columns.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])
# 4. Pipeline usage - preprocessing only (fit & transform)
# A Pipeline holding only the preprocessing step. Fit on the training split
# so scaling statistics and one-hot categories come from training data only,
# then apply the same learned transform to the test split.
preprocessor_pipe = Pipeline(steps=[('preprocessor', preprocessor)])
x_train_transformed = preprocessor_pipe.fit_transform(x_train)
x_test_transformed = preprocessor_pipe.transform(x_test)
'''
If any of the columns listed in categorical_features above contains text
(strings) rather than numbers, the result of .transform() may be a
scipy csr_matrix instead of an np.array. In that case, call .todense()
right after .transform(), e.g.:
    preprocessor_pipe.transform(x_train).todense()
'''
# 5. Apply preprocessed data to the model
from sklearn.ensemble import GradientBoostingClassifier
# Gradient-boosted trees on the preprocessed feature matrix.
model = GradientBoostingClassifier(n_estimators=200, random_state=0)
# Fit on the *transformed* matrices, not the raw DataFrames.
# y_train is an (n, 1) DataFrame; .values.ravel() passes it as the 1-d
# array sklearn expects, avoiding the DataConversionWarning that the
# blanket warnings filter at the top of the file was silently hiding.
model.fit(x_train_transformed, y_train.values.ravel())
# score() returns mean accuracy on the held-out test split.
accuracy = model.score(x_test_transformed, y_test)
print("model score:", round(accuracy, 4))
# --- Blog navigation footer (scrape artifact, not code) ---
# '멋쟁이 사자처럼 AI SCHOOL 5기 > Today I Learned' 카테고리의 다른 글
# [5주차 총정리] 불균형(Imbalanced) 데이터 처리 (SMOTE, oversampling) (0) | 2022.04.14 |
# ---|---|
# [5주차 총정리] 결측값 시각화 (Missingno) (0) | 2022.04.14 |
# [5주차 총정리] Model Stacking을 위한 vecstack (0) | 2022.04.13 |
# [5주차 총정리] 훈련모델 및 Scaler 저장 (joblib, pickle) (0) | 2022.04.13 |
# [5주차 총정리] 비지도 학습(Unsupervised) 모델 시각화 (K-Means, PCA) (0) | 2022.04.13 |