1. Model Capacity
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the building energy-efficiency dataset (surface area vs. electricity use).
url = 'https://raw.githubusercontent.com/rusita-ai/pyData/master/Electric.csv'
Elec = pd.read_csv(url)
Elec.info()
Elec.head()

# Scatter plot (surface_area vs. electricity).
# Fix: pass x/y as keywords — seaborn >= 0.12 made them keyword-only, so the
# old positional call sns.scatterplot(Elec['surface_area'], Elec['electricity'])
# would bind the first argument to `data` and fail.
sns.scatterplot(x='surface_area', y='electricity', data=Elec)
plt.show()
1-1. 1차 모델 시각화
# Degree-1 (linear) fit: red regression line over white-edged scatter points.
sns.regplot(
    x='surface_area',
    y='electricity',
    data=Elec,
    line_kws={'color': 'red'},
    scatter_kws={'edgecolor': 'white'},
)
plt.xlim(505, 820)
plt.show()
1-2. 2차 모델 시각화
# Degree-2 (quadratic) fit: blue curve over the same scatter.
sns.regplot(
    x='surface_area',
    y='electricity',
    data=Elec,
    order=2,
    line_kws={'color': 'blue'},
    scatter_kws={'edgecolor': 'white'},
)
plt.xlim(505, 820)
plt.show()
1-3. 5차 모델 시각화
# Degree-5 fit: green curve — higher capacity tracks the data more closely.
sns.regplot(
    x='surface_area',
    y='electricity',
    data=Elec,
    order=5,
    line_kws={'color': 'green'},
    scatter_kws={'edgecolor': 'white'},
)
plt.xlim(505, 820)
plt.show()
1-4. 9차 모델 시각화
# Degree-9 fit: orange curve; clamp the y-axis because the high-degree
# polynomial swings far outside the data range near the plot edges.
sns.regplot(
    x='surface_area',
    y='electricity',
    data=Elec,
    order=9,
    line_kws={'color': 'orange'},
    scatter_kws={'edgecolor': 'white'},
)
plt.xlim(505, 820)
plt.ylim(50, 450)
plt.show()
1-5. 4개 모델 비교 시각화
# Overlay all four fits (degrees 1, 2, 5, 9) on one scatter plot.
# Only the last call restyles the scatter points (gray, white edges).
fits = [(1, 'red'), (2, 'blue'), (5, 'green'), (9, 'orange')]
for degree, color in fits:
    extra = {}
    if degree == 9:
        extra['scatter_kws'] = {'color': 'gray', 'edgecolor': 'white'}
    sns.regplot(x='surface_area', y='electricity', data=Elec,
                order=degree, line_kws={'color': color}, **extra)
plt.xlim(505, 820)
plt.ylim(50, 450)
plt.xticks(rotation=35)
plt.yticks(rotation=90)
plt.show()
2. Training Error
# Re-import and reload the dataset for the Training Error section
# (each notebook section is written to run independently).
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
url = 'https://raw.githubusercontent.com/rusita-ai/pyData/master/Electric.csv'
Elec = pd.read_csv(url)
Elec.info()
2-1. 1차 모델 Training Error
# Train on the FULL dataset (no split yet): X must be 2-D (DataFrame), y 1-D (Series).
X_train = Elec[['surface_area']]
y_train = Elec['electricity']
X_train.shape, y_train.shape
((768, 1), (768,))
from sklearn.linear_model import LinearRegression
Model_1 = LinearRegression()
Model_1.fit(X_train, y_train)
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)
# Inspect the fitted model (slope and intercept)
print(Model_1.coef_)
print(Model_1.intercept_)
[-0.75387157]
729.4538243006992
# Generate y_hat (predictions) on the training data
y_hat_1 = Model_1.predict(X_train)
len(y_hat_1)
768
# Compute MSE (Mean Squared Error) by hand on the training data
TR_Err_1 = np.mean((y_train - y_hat_1) ** 2)
TR_Err_1
5763.983779426347
2-2. 5차 모델 Training Error
# Expand X into polynomial features (degree 5, no bias column —
# LinearRegression supplies its own intercept)
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(degree = 5, include_bias = False)
PX_5 = poly.fit_transform(X_train)
PX_5
X_train.shape, PX_5.shape
((768, 1), (768, 5))
# Build the degree-5 model
from sklearn.linear_model import LinearRegression
Model_5 = LinearRegression()
Model_5.fit(PX_5, y_train)
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)
# Inspect the fitted model (coefficients and intercept)
np.set_printoptions(suppress = True, precision = 10)
print(Model_5.coef_)
print(Model_5.intercept_)
[-0.0003155148 -0.1029296835 0.0003787616 -0.0000005032 0.0000000002]
2906.221625380881
# Generate y_hat (predictions)
# NOTE(review): poly.transform(X_train) would suffice here — re-fitting is
# redundant (same input, so PX_5 above is already this exact matrix).
PX_5_pred = poly.fit_transform(X_train)
y_hat_5 = Model_5.predict(PX_5_pred)
y_hat_5.shape
(768,)
# Compute MSE (Mean Squared Error)
TR_Err_5 = np.mean((y_train - y_hat_5) ** 2)
TR_Err_5
4177.726328606075
2-3. 9차 모델 Training Error
# Degree-9 model: same procedure with higher-capacity features.
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(degree = 9, include_bias = False)
PX_9 = poly.fit_transform(X_train)
X_train.shape, PX_9.shape
((768, 1), (768, 9))
from sklearn.linear_model import LinearRegression
Model_9 = LinearRegression()
Model_9.fit(PX_9, y_train)
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)
# Coefficients print as ~0 at precision 10 — presumably because the
# degree-9 feature columns have enormous magnitudes, making the fitted
# weights tiny (NOTE(review): numerically ill-conditioned; verify).
print(Model_9.coef_)
print(Model_9.intercept_)
[ 0. 0. 0. 0. 0. 0. -0. 0. -0.]
-440.08258373871365
# NOTE(review): poly.transform would suffice — re-fitting on the same
# X_train is redundant.
PX_9_pred = poly.fit_transform(X_train)
y_hat_9 = Model_9.predict(PX_9_pred)
y_hat_9.shape
(768,)
TR_Err_9 = np.mean((y_train - y_hat_9) ** 2)
TR_Err_9
4086.7199908150374
2-4. 3개 모델 Training Error 비교
# Training error shrinks monotonically as model capacity (degree) grows.
for degree, train_err in ((1, TR_Err_1), (5, TR_Err_5), (9, TR_Err_9)):
    print(f'{degree}차 모델 : ', train_err)
1차 모델 : 5763.983779426347
5차 모델 : 4177.726328606075
9차 모델 : 4086.7199908150374
2-5.잔차(Residual) 시각화
# Residuals of the degree-1 fit — visible structure left in the residuals
# indicates the linear model underfits.
sns.residplot(
    data=Elec,
    x='surface_area',
    y='electricity',
    order=1,
    scatter_kws={'edgecolor': 'white'},
)
plt.ylim(-300, 300)
plt.show()
# Residuals of the degree-5 fit.
sns.residplot(
    data=Elec,
    x='surface_area',
    y='electricity',
    order=5,
    scatter_kws={'edgecolor': 'white'},
)
plt.ylim(-300, 300)
plt.show()
# Residuals of the degree-9 fit.
sns.residplot(
    data=Elec,
    x='surface_area',
    y='electricity',
    order=9,
    scatter_kws={'edgecolor': 'white'},
)
plt.ylim(-300, 300)
plt.show()
3. Testing Error
# Re-import and reload the dataset for the Testing Error section.
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
url = 'https://raw.githubusercontent.com/rusita-ai/pyData/master/Electric.csv'
Elec = pd.read_csv(url)
Elec.shape
(768, 9)
(1) DataFrame Split
# Split the whole DataFrame row-wise: 80% train / 20% test.
from sklearn.model_selection import train_test_split
TR_Elec, TE_Elec = train_test_split(Elec, test_size = 0.2,
random_state = 2045)
TR_Elec.shape, TE_Elec.shape
((614, 9), (154, 9))
# 80% training split (TR_Elec DataFrame)
TR_Elec.head()
# 20% test split (TE_Elec DataFrame)
TE_Elec.head()
(2) Array Split
# Split feature/target arrays directly. Same random_state as the DataFrame
# split above, so presumably the same rows land in each split — verify.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(Elec[['surface_area']],
Elec['electricity'],
test_size = 0.2,
random_state = 2045)
X_train.shape, y_train.shape, X_test.shape, y_test.shape
((614, 1), (614,), (154, 1), (154,))
X_train.head() # 80% X_train split
y_train.head() # 80% y_train split
(3) Distribution Visualization
# Visualize the distribution of each split.
# Fix: pass x/y as keywords — seaborn >= 0.12 made them keyword-only, so the
# old positional calls would bind the first argument to `data` and fail.
sns.scatterplot(x='surface_area', y='electricity', data=TR_Elec)
plt.show()
sns.scatterplot(x='surface_area', y='electricity', data=TE_Elec)
plt.show()
3-1.1차 모델 Testing Error
# Refit the degree-1 model on the 614-row training split only.
from sklearn.linear_model import LinearRegression
Model_1 = LinearRegression()
Model_1.fit(X_train, y_train)
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)
# Predict on the unseen test split.
y_hat_1 = Model_1.predict(X_test)
y_hat_1.shape
(154,)
# Test-set MSE via sklearn's metric (equivalent to the manual mean of squares).
from sklearn.metrics import mean_squared_error
TE_Err_1 = mean_squared_error(y_test, y_hat_1)
TE_Err_1
6044.176547629271
3-2.5차 모델 Testing Error
# Degree-5 testing error: fit the feature transformer and the model on the
# training split only, then evaluate on the held-out test split.
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(degree = 5, include_bias = False)
PX_5_TR = poly.fit_transform(X_train)
from sklearn.linear_model import LinearRegression
Model_5 = LinearRegression()
Model_5.fit(PX_5_TR, y_train)
# Fix: was poly.fit_transform(X_test) — never call fit on test data;
# apply the transformer fitted on the training split with transform().
PX_5_TE = poly.transform(X_test)
y_hat_5 = Model_5.predict(PX_5_TE)
# Compute MSE (Mean Squared Error) on the test data
from sklearn.metrics import mean_squared_error
TE_Err_5 = mean_squared_error(y_test, y_hat_5)
TE_Err_5
4330.604566409499
3-3. 9차 모델 Testing Error
# Degree-9 testing error: same procedure with degree-9 polynomial features.
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(degree = 9, include_bias = False)
PX_9_TR = poly.fit_transform(X_train)
from sklearn.linear_model import LinearRegression
Model_9 = LinearRegression()
Model_9.fit(PX_9_TR, y_train)
# Fix: transform (not fit_transform) the test data with the transformer
# that was fitted on the training split.
PX_9_TE = poly.transform(X_test)
y_hat_9 = Model_9.predict(PX_9_TE)
from sklearn.metrics import mean_squared_error
TE_Err_9 = mean_squared_error(y_test, y_hat_9)
TE_Err_9
4238.689067137633
3-4. 3개 모델 Testing Error 비교
# Test-set errors for the three models (unlike training error, these can
# rise again when capacity is too high).
for degree, test_err in ((1, TE_Err_1), (5, TE_Err_5), (9, TE_Err_9)):
    print(f'{degree}차 모델 : ', test_err)
1차 모델 : 6044.176547629271
5차 모델 : 4330.604566409499
9차 모델 : 4238.689067137633
4. Validation Approach
# Reload the dataset for the Validation Approach section.
import pandas as pd
url = 'https://raw.githubusercontent.com/rusita-ai/pyData/master/Electric.csv'
Elec = pd.read_csv(url)
Elec.info()
# First split: hold out an integer number of rows as the final test set.
from sklearn.model_selection import train_test_split
X_remain, X_test, y_remain, y_test = train_test_split(Elec[['surface_area']],
Elec['electricity'],
test_size = int(len(Elec) * 0.2), # 20% of all rows (153) for the test set
random_state = 2045)
print(X_remain.shape, y_remain.shape)
print(X_test.shape, y_test.shape)
(615, 1) (615,)
(153, 1) (153,)
# Second split: carve a validation set out of the remaining 615 rows.
# test_size is computed from the FULL dataset length (153 rows), so the
# final training set keeps 615 - 153 = 462 rows.
X_train, X_valid, y_train, y_valid = train_test_split(X_remain, y_remain,
test_size = int(len(Elec) * 0.2),
random_state = 2045)
print(X_train.shape, y_train.shape)
print(X_valid.shape, y_valid.shape)
print(X_test.shape, y_test.shape)
(462, 1) (462,)
(153, 1) (153,)
(153, 1) (153,)
4-1. 5차 모델 Validation Error
# Degree-5 validation error: fit transformer + model on the training split.
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(degree = 5, include_bias = False)
PX_5_TR = poly.fit_transform(X_train)
from sklearn.linear_model import LinearRegression
Model_5 = LinearRegression()
Model_5.fit(PX_5_TR, y_train)
# Generate y_hat (predictions) on the validation data and compute MSE.
# Fix: was poly.fit_transform(X_valid) — the transformer must only be fit
# on training data; apply it to validation data with transform().
PX_5_VD = poly.transform(X_valid)
y_hat_5 = Model_5.predict(PX_5_VD)
from sklearn.metrics import mean_squared_error
MSE_5 = mean_squared_error(y_valid, y_hat_5)
MSE_5
4136.431259340616
4-2. 9차 모델 Validation Error
# Degree-9 validation error (LinearRegression and mean_squared_error are
# already imported in the degree-5 section above).
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(degree = 9, include_bias = False)
PX_9_TR = poly.fit_transform(X_train)
Model_9 = LinearRegression()
Model_9.fit(PX_9_TR, y_train)
# Fix: transform (not fit_transform) the validation data with the
# training-fitted transformer.
PX9_valid = poly.transform(X_valid)
y_hat_9 = Model_9.predict(PX9_valid)
MSE_9 = mean_squared_error(y_valid, y_hat_9)
MSE_9
3955.9733124913614
4-3. 2개 모델 Validation Error 비교
# The degree-9 model wins on validation error, so it is selected as final.
for degree, mse in ((5, MSE_5), (9, MSE_9)):
    print(f'{degree}차 모델 MSE_{degree} : ', mse)
5차 모델 MSE_5 : 4136.431259340616
9차 모델 MSE_9 : 3955.9733124913614
4-4.최종 9차 모델을 Test_Data에 적용
# Apply the selected degree-9 model to the held-out test set. `poly` is
# still the degree-9 transformer fitted in section 4-2.
# Fix: transform (never re-fit) the transformer on test data.
PX9_TE = poly.transform(X_test)
mean_squared_error(y_test, Model_9.predict(PX9_TE))
4220.885732107714
회귀분석(Regression Analysis) 4 (0) | 2022.06.07 |
---|---|
회귀분석(Regression Analysis) 3 (2) | 2022.06.07 |
회귀분석(Regression Analysis) 2 (0) | 2022.06.07 |
회귀분석(Regression Analysis) 1 (1) | 2022.06.07 |
경사 하강법(Gradient Descent) (0) | 2022.06.06 |