상세 컨텐츠

본문 제목

Model Validation

인공지능/머신러닝

by 2^7 2022. 6. 6. 17:25

본문

1. Model Capacity

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 

# Load the electricity dataset (CSV hosted on GitHub) into a DataFrame.
url = 'https://raw.githubusercontent.com/rusita-ai/pyData/master/Electric.csv'

Elec = pd.read_csv(url)

# Column dtypes / non-null counts, then a preview of the first rows.
Elec.info()

Elec.head()

# Scatter plot (surface_area vs. electricity).
# The columns are passed by keyword together with `data`: recent seaborn
# releases no longer accept x and y as positional arguments.
sns.scatterplot(x = 'surface_area', y = 'electricity', data = Elec)
plt.show()

1-1. 1차 모델 시각화

# First-order (straight-line) regression fit drawn over the scatter.
sns.regplot(
    data = Elec,
    x = 'surface_area',
    y = 'electricity',
    scatter_kws = dict(edgecolor = 'white'),
    line_kws = dict(color = 'red'),
)
plt.xlim(505, 820)
plt.show()

1-2. 2차 모델 시각화

# Second-order polynomial regression fit drawn over the scatter.
sns.regplot(
    data = Elec,
    x = 'surface_area',
    y = 'electricity',
    order = 2,
    scatter_kws = dict(edgecolor = 'white'),
    line_kws = dict(color = 'blue'),
)
plt.xlim(505, 820)
plt.show()

1-3. 5차 모델 시각화

# Fifth-order polynomial regression fit drawn over the scatter.
sns.regplot(
    data = Elec,
    x = 'surface_area',
    y = 'electricity',
    order = 5,
    scatter_kws = dict(edgecolor = 'white'),
    line_kws = dict(color = 'green'),
)
plt.xlim(505, 820)
plt.show()

1-4. 9차 모델 시각화

# Ninth-order polynomial regression fit; the y-axis is clamped as well so
# the view stays on the 50-450 range.
sns.regplot(
    data = Elec,
    x = 'surface_area',
    y = 'electricity',
    order = 9,
    scatter_kws = dict(edgecolor = 'white'),
    line_kws = dict(color = 'orange'),
)
plt.xlim(505, 820)
plt.ylim(50, 450)
plt.show()

1-5. 4개 모델 비교 시각화

# Overlay the 1st-, 2nd-, 5th- and 9th-order fits on a single set of axes.
fit_styles = [(1, 'red'), (2, 'blue'), (5, 'green'), (9, 'orange')]
for degree, line_color in fit_styles:
    # Only the last (9th-order) layer restyles the scatter points in gray;
    # the earlier layers keep seaborn's default scatter styling.
    point_style = {'color':'gray', 'edgecolor':'white'} if degree == 9 else None
    sns.regplot(
        data = Elec,
        x = 'surface_area',
        y = 'electricity',
        order = degree,
        line_kws = {'color': line_color},
        scatter_kws = point_style,
    )

plt.xlim(505, 820)
plt.ylim(50, 450)
plt.xticks(rotation = 35)
plt.yticks(rotation = 90)
plt.show()


2. Training Error

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Reload the electricity dataset so this section can run on its own.
url = 'https://raw.githubusercontent.com/rusita-ai/pyData/master/Electric.csv'

Elec = pd.read_csv(url)

# Summarize columns, dtypes and non-null counts.
Elec.info()

2-1. 1차 모델 Training Error

# The whole dataset is used for training in this section (no hold-out).
# Double brackets keep X_train two-dimensional (a single column);
# y_train is one-dimensional.
X_train = Elec[['surface_area']]
y_train = Elec['electricity']

X_train.shape, y_train.shape

((768, 1), (768,))

from sklearn.linear_model import LinearRegression

# 1st-order model: linear regression of electricity on surface_area.
Model_1 = LinearRegression()
Model_1.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

# Inspect the fitted model (learned slope and intercept)
print(Model_1.coef_)
print(Model_1.intercept_)

[-0.75387157]

729.4538243006992

# Generate y_hat (predictions) for the training data
y_hat_1 = Model_1.predict(X_train)

# One prediction per training row.
len(y_hat_1)

768

# Compute MSE (Mean Squared Error): the mean of the squared residuals
# between the training targets and the 1st-order predictions.

TR_Err_1 = np.mean((y_train - y_hat_1) ** 2)
TR_Err_1

5763.983779426347


2-2. 5차 모델 Training Error

# Expand X into polynomial terms (degree 5)

from sklearn.preprocessing import PolynomialFeatures

# include_bias = False leaves out the constant column; the intercept is
# handled by the regression model itself.
poly = PolynomialFeatures(degree = 5, include_bias = False)
PX_5 = poly.fit_transform(X_train)
PX_5

# Compare shapes before and after the expansion.
X_train.shape, PX_5.shape

((768, 1), (768, 5))

# Build the 5th-order model: linear regression on the expanded features
from sklearn.linear_model import LinearRegression

Model_5 = LinearRegression()
Model_5.fit(PX_5, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

# Inspect the fitted model (learned parameters)
# Print floats in fixed-point form with 10 decimals rather than scientific
# notation. This is a global NumPy setting, so it also affects later prints.
np.set_printoptions(suppress = True, precision = 10)

print(Model_5.coef_)
print(Model_5.intercept_)

[-0.0003155148 -0.1029296835 0.0003787616 -0.0000005032 0.0000000002]

2906.221625380881

# Generate y_hat (predictions) for the training data.
# `poly` was already fitted when PX_5 was built, so only transform() is
# needed here; re-fitting the transformer at prediction time is redundant
# and is the wrong habit once the data being predicted is not the training set.
PX_5_pred = poly.transform(X_train)

y_hat_5 = Model_5.predict(PX_5_pred)

y_hat_5.shape

(768,)

# Compute MSE (Mean Squared Error) of the 5th-order model on the training data

TR_Err_5 = np.mean((y_train - y_hat_5) ** 2)
TR_Err_5

4177.726328606075


2-3. 9차 모델 Training Error

from sklearn.preprocessing import PolynomialFeatures

# Expand X into polynomial terms (degree 9, no constant column).
# Note that this rebinds `poly`, replacing the degree-5 transformer.
poly = PolynomialFeatures(degree = 9, include_bias = False)
PX_9 = poly.fit_transform(X_train)
X_train.shape, PX_9.shape

((768, 1), (768, 9))

from sklearn.linear_model import LinearRegression

# 9th-order model: linear regression on the degree-9 expanded features.
Model_9 = LinearRegression()
Model_9.fit(PX_9, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

# Inspect the learned parameters of the 9th-order model.
# NOTE(review): np.set_printoptions(suppress = True, precision = 10) from the
# earlier cell is still in effect, so very small coefficients would display
# as 0. - the all-zero printout is presumably a display artifact; confirm by
# printing in scientific notation.
print(Model_9.coef_)
print(Model_9.intercept_)

[ 0. 0. 0. 0. 0. 0. -0. 0. -0.]

-440.08258373871365

# Generate y_hat (predictions) for the training data.
# `poly` (degree 9) was already fitted when PX_9 was built, so use
# transform() instead of re-fitting the transformer at prediction time.
PX_9_pred = poly.transform(X_train)

y_hat_9 = Model_9.predict(PX_9_pred)

y_hat_9.shape

(768,)

# Compute MSE (Mean Squared Error) of the 9th-order model on the training data
TR_Err_9 = np.mean((y_train - y_hat_9) ** 2)
TR_Err_9

4086.7199908150374


2-4. 3개 모델 Training Error 비교

# Print the training MSE of the 1st-, 5th- and 9th-order models side by side.
print('1차 모델 : ', TR_Err_1)
print('5차 모델 : ', TR_Err_5)
print('9차 모델 : ', TR_Err_9)

1차 모델 : 5763.983779426347

5차 모델 : 4177.726328606075

9차 모델 : 4086.7199908150374


2-5. 잔차(Residual) 시각화

# Residual plots for the 1st-, 5th- and 9th-order models, one figure each,
# all drawn on the same y-range so they can be compared directly.
for degree in (1, 5, 9):
    sns.residplot(
        data = Elec,
        x = 'surface_area',
        y = 'electricity',
        order = degree,
        scatter_kws = {'edgecolor':'white'},
    )
    plt.ylim(-300, 300)
    plt.show()


3. Testing Error

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 

# Reload the electricity dataset so this section can run on its own.
url = 'https://raw.githubusercontent.com/rusita-ai/pyData/master/Electric.csv'

Elec = pd.read_csv(url)

# (rows, columns) of the full dataset.
Elec.shape

(768, 9)

(1) DataFrame Split

from sklearn.model_selection import train_test_split

# Split the whole DataFrame: 20% of the rows are held out as test data.
# random_state is fixed so the same split is produced on every run.
TR_Elec, TE_Elec = train_test_split(Elec, test_size = 0.2, 
                                    random_state = 2045)
TR_Elec.shape, TE_Elec.shape

((614, 9), (154, 9))

# Preview of the 80% training DataFrame (TR_Elec)

TR_Elec.head()

# Preview of the 20% test DataFrame (TE_Elec)

TE_Elec.head()

(2) Array Split

from sklearn.model_selection import train_test_split

# Split features and target together so rows stay aligned; the same
# test_size and random_state as the DataFrame split above are used.
X_train, X_test, y_train, y_test = train_test_split(Elec[['surface_area']], 
                                                    Elec['electricity'], 
                                                    test_size = 0.2, 
                                                    random_state = 2045)
                                                    
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((614, 1), (614,), (154, 1), (154,))

X_train.head()  # first rows of the 80% training features (X_train)

 

y_train.head()  # first rows of the 80% training target (y_train)

(3) Distribution Visualization

# Compare the distribution of the training and test splits.
# The columns are passed by keyword together with `data`: recent seaborn
# releases no longer accept x and y as positional arguments.
sns.scatterplot(x = 'surface_area', y = 'electricity', data = TR_Elec)
plt.show()

sns.scatterplot(x = 'surface_area', y = 'electricity', data = TE_Elec)
plt.show()


3-1. 1차 모델 Testing Error

from sklearn.linear_model import LinearRegression

# 1st-order model, fitted on the training split only.
Model_1 = LinearRegression()
Model_1.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

# Predict on the held-out test features.
y_hat_1 = Model_1.predict(X_test)

y_hat_1.shape

(154,)

from sklearn.metrics import mean_squared_error

# Test MSE of the 1st-order model.
TE_Err_1 = mean_squared_error(y_test, y_hat_1)
TE_Err_1

6044.176547629271


3-2. 5차 모델 Testing Error

from sklearn.preprocessing import PolynomialFeatures

# Fit the degree-5 expansion on the training features only.
poly = PolynomialFeatures(degree = 5, include_bias = False)
PX_5_TR = poly.fit_transform(X_train)
from sklearn.linear_model import LinearRegression

# 5th-order model: linear regression on the expanded training features.
Model_5 = LinearRegression()
Model_5.fit(PX_5_TR, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

# Expand the test features with the transformer that was fitted on X_train.
# Use transform(), not fit_transform(): a preprocessing step must not be
# re-fitted on held-out data.
PX_5_TE = poly.transform(X_test)

y_hat_5 = Model_5.predict(PX_5_TE)
# Compute MSE (Mean Squared Error) on the test data

from sklearn.metrics import mean_squared_error

TE_Err_5 = mean_squared_error(y_test, y_hat_5)
TE_Err_5

4330.604566409499


3-3. 9차 모델 Testing Error

from sklearn.preprocessing import PolynomialFeatures

# Fit the degree-9 expansion on the training features only
# (this rebinds `poly`, replacing the degree-5 transformer).
poly = PolynomialFeatures(degree = 9, include_bias = False)
PX_9_TR = poly.fit_transform(X_train)
from sklearn.linear_model import LinearRegression

# 9th-order model: linear regression on the expanded training features.
Model_9 = LinearRegression()
Model_9.fit(PX_9_TR, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

# Expand the test features with the degree-9 transformer fitted on X_train.
# Use transform(), not fit_transform(): a preprocessing step must not be
# re-fitted on held-out data.
PX_9_TE = poly.transform(X_test)

y_hat_9 = Model_9.predict(PX_9_TE)
from sklearn.metrics import mean_squared_error

# Test MSE of the 9th-order model.
TE_Err_9 = mean_squared_error(y_test, y_hat_9)
TE_Err_9

4238.689067137633


3-4. 3개 모델 Testing Error 비교

# Print the test MSE of the 1st-, 5th- and 9th-order models side by side.
print('1차 모델 : ', TE_Err_1)
print('5차 모델 : ', TE_Err_5)
print('9차 모델 : ', TE_Err_9)

1차 모델 : 6044.176547629271

5차 모델 : 4330.604566409499

9차 모델 : 4238.689067137633


4. Validation Approach

import pandas as pd

# Reload the electricity dataset so this section can run on its own.
url = 'https://raw.githubusercontent.com/rusita-ai/pyData/master/Electric.csv'

Elec = pd.read_csv(url)

Elec.info()
from sklearn.model_selection import train_test_split

# First split: carve the test set off the full data. test_size is given as
# an integer row count here (20% of the rows, truncated), not a fraction.
X_remain, X_test, y_remain, y_test = train_test_split(Elec[['surface_area']], 
                                                      Elec['electricity'], 
                                                      test_size = int(len(Elec) * 0.2),  # 20% of all rows as test data (153)
                                                      random_state = 2045)

print(X_remain.shape, y_remain.shape)
print(X_test.shape, y_test.shape)

(615, 1) (615,)

(153, 1) (153,)

# Second split: carve the validation set off the remaining rows. The count
# is again 20% of the FULL dataset (len(Elec)), so validation and test sets
# end up the same size and the rest becomes training data.
X_train, X_valid, y_train, y_valid = train_test_split(X_remain, y_remain, 
                                                      test_size = int(len(Elec) * 0.2), 
                                                      random_state = 2045)

print(X_train.shape, y_train.shape)
print(X_valid.shape, y_valid.shape)
print(X_test.shape, y_test.shape)

(462, 1) (462,)

(153, 1) (153,)

(153, 1) (153,)


4-1. 5차 모델 Validation Error

from sklearn.preprocessing import PolynomialFeatures

# Fit the degree-5 expansion on the training features only.
poly = PolynomialFeatures(degree = 5, include_bias = False)
PX_5_TR = poly.fit_transform(X_train)
from sklearn.linear_model import LinearRegression

# 5th-order model: linear regression on the expanded training features.
Model_5 = LinearRegression()
Model_5.fit(PX_5_TR, y_train)
# Generate y_hat (predictions) on the validation data and compute the MSE.
# The validation features go through transform(), not fit_transform():
# the transformer fitted on X_train must not be re-fitted on held-out data.

PX_5_VD = poly.transform(X_valid)

y_hat_5 = Model_5.predict(PX_5_VD)
from sklearn.metrics import mean_squared_error

MSE_5 = mean_squared_error(y_valid, y_hat_5)
MSE_5

4136.431259340616


4-2. 9차 모델 Validation Error

from sklearn.preprocessing import PolynomialFeatures

# Fit the degree-9 expansion on the training features only
# (this rebinds `poly`, replacing the degree-5 transformer).
poly = PolynomialFeatures(degree = 9, include_bias = False)
PX_9_TR = poly.fit_transform(X_train)
# 9th-order model: linear regression on the expanded training features.
Model_9 = LinearRegression()
Model_9.fit(PX_9_TR, y_train)
# The validation features go through transform(), not fit_transform():
# the transformer fitted on X_train must not be re-fitted on held-out data.
PX9_valid = poly.transform(X_valid)

y_hat_9 = Model_9.predict(PX9_valid)
# Validation MSE of the 9th-order model.
MSE_9 = mean_squared_error(y_valid, y_hat_9)
MSE_9

3955.9733124913614


4-3. 2개 모델 Validation Error 비교

# Print the validation MSE of the 5th- and 9th-order models side by side.
print('5차 모델 MSE_5  : ', MSE_5)
print('9차 모델 MSE_9  : ', MSE_9)

5차 모델 MSE_5 : 4136.431259340616

9차 모델 MSE_9 : 3955.9733124913614


4-4. 최종 9차 모델을 Test_Data에 적용

# Final check of the selected 9th-order model on the untouched test data.
# `poly` is the degree-9 transformer fitted on X_train; the test features
# only go through transform(), so nothing is re-fitted on held-out data.
PX9_TE = poly.transform(X_test)

mean_squared_error(y_test, Model_9.predict(PX9_TE))

4220.885732107714

728x90

관련글 더보기