본문으로 바로가기
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn import datasets, model_selection, linear_model
from sklearn.metrics import mean_squared_error

import warnings
warnings.filterwarnings("ignore")

 

1. Feature engineering & Feature selection

# Load the feature table; the first spreadsheet column is used as the index.
df_data = pd.read_excel('boston_house_data.xlsx', index_col=0)
df_data.columns = [
    'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
    'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT',
]

# Load the target table and name its single column.
df_target = pd.read_excel('boston_house_target.xlsx', index_col=0)
df_target.columns = ['Price']

# Binarize the target for classification: 1 when the price is strictly above
# the mean price, otherwise 0. The vectorized comparison replaces a per-row
# Python lambda and yields the same int64 labels.
mean_price = df_target['Price'].mean()
df_target['Price'] = (df_target['Price'] > mean_price).astype('int64')

 

 

2. Train-Test split

# DataFrames can be passed to train_test_split directly.
# 30% of the rows are held out for testing; random_state pins the shuffle so
# the split is reproducible.
x_train, x_test, y_train, y_test = model_selection.train_test_split(
    df_data,
    df_target,
    test_size=0.3,
    random_state=0,
)

 

 

3. Make Pipeline for feature-transformer (StandardScaler & OneHotEncoder)

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Columns treated as categorical are one-hot encoded.
categorical_features = ['CHAS', 'RAD']
categorical_transformer = OneHotEncoder(
    # categories='auto' is spelled out only to silence a warning message.
    categories='auto',
    # handle_unknown='ignore': a category seen at transform time that was not
    # present during fit does not raise an error; the one-hot columns for that
    # feature are all zeros instead.
    handle_unknown='ignore',
)

# The remaining columns are standardized (RobustScaler is an alternative).
numeric_features = [
    'CRIM', 'ZN', 'INDUS', 'NOX', 'RM', 'AGE',
    'DIS', 'TAX', 'PTRATIO', 'B', 'LSTAT',
]
numeric_transformer = StandardScaler()

# Each entry is (name, transformer, columns the transformer is applied to).
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ]
)

 

 

4. Pipeline usage - Preprocessing-only (fit & transform)

# Preprocessing-only pipeline: a single step wrapping the column preprocessor.
preprocessor_pipe = Pipeline(steps=[('preprocessor', preprocessor)])

# Fit on the training split only and transform it in the same pass.
# fit_transform avoids transforming the training data a second time, which a
# separate fit() followed by transform() would do.
x_train_transformed = preprocessor_pipe.fit_transform(x_train)

# The test split is transformed with the parameters learned from the training
# split; the pipeline is never re-fitted on test data.
x_test_transformed = preprocessor_pipe.transform(x_test)

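'''
If any of the columns listed in categorical_features above contains text (strings)
rather than numbers, the result of .transform() may be a csr_matrix (sparse matrix)
instead of an np.array.
In that case, call .todense() right after .transform().
(.toarray() returns a plain np.ndarray rather than an np.matrix, which recent
scikit-learn versions require.)

ex) preprocessor_pipe.transform(x_train).todense()
'''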

 

 

5. Apply Preprocessed Data to Model

from sklearn.ensemble import GradientBoostingClassifier
# Gradient-boosted tree classifier; random_state fixes the result across runs.
model = GradientBoostingClassifier(n_estimators=200, random_state=0)

# Train on the preprocessed features (x_train_transformed, not raw x_train).
# The labels come out of the split as a single-column DataFrame; np.ravel
# flattens them to the 1-D shape the estimator expects, instead of relying on
# scikit-learn's implicit column-vector conversion (and its warning).
model.fit(x_train_transformed, np.ravel(y_train))

# score() reports mean accuracy on the preprocessed test split.
accuracy = model.score(x_test_transformed, np.ravel(y_test))
print("model score:", round(accuracy, 4))