# Imports
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
# Fixed: was "StandardScalerReference" — the markdown heading fused into the
# import line; StandardScaler is the class actually instantiated below.
from sklearn.preprocessing import LabelEncoder, StandardScaler
# Data
df = pd.read_csv('../../../delete/insurance.csv')df| age | sex | bmi | children | smoker | region | charges | |
|---|---|---|---|---|---|---|---|
| 0 | 19 | female | 27.900 | 0 | yes | southwest | 16884.92400 |
| 1 | 18 | male | 33.770 | 1 | no | southeast | 1725.55230 |
| 2 | 28 | male | 33.000 | 3 | no | southeast | 4449.46200 |
| 3 | 33 | male | 22.705 | 0 | no | northwest | 21984.47061 |
| 4 | 32 | male | 28.880 | 0 | no | northwest | 3866.85520 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 1333 | 50 | male | 30.970 | 3 | no | northwest | 10600.54830 |
| 1334 | 18 | female | 31.920 | 0 | no | northeast | 2205.98080 |
| 1335 | 18 | female | 36.850 | 0 | no | southeast | 1629.83350 |
| 1336 | 21 | female | 25.800 | 0 | no | southwest | 2007.94500 |
| 1337 | 61 | female | 29.070 | 0 | yes | northwest | 29141.36030 |
1338 rows × 7 columns
EDA
- Insurance distribution
# Distribution of charges: histogram with a marginal box plot for outliers.
fig = px.histogram(df, x="charges",
                   nbins=80,
                   marginal="box",
                   title="Distribution of Charges",
                   width=800, height=600)
fig.show()
# Age vs insurance: hovering also shows sex and region.
# Age vs charges; color encodes smoker status, marker size encodes BMI,
# hover shows sex and region for each point.
fig = px.scatter(df, x="age", y="charges", color="smoker",
                 size="bmi", hover_data=["sex", "region"],
                 title="Age vs Charges (Colored by Smoker, Size by BMI)",
                 width=800, height=600)
fig.show()
# BMI vs insurance
# BMI vs charges, colored by smoker status; hover shows age and sex.
fig = px.scatter(df, x="bmi", y="charges", color="smoker",
                 hover_data=["age", "sex"],
                 title="BMI vs Charges by Smoker",
                 width=800, height=600)
fig.show()
# Average insurance by region
# Average charge per region. px.bar on the raw frame would stack every
# row's charge (i.e. plot the regional SUM), contradicting the title —
# aggregate to the mean first so the chart shows what it claims.
region_avg = df.groupby("region", as_index=False)["charges"].mean()
fig = px.bar(region_avg, x="region", y="charges", color="region",
             title="Average Charges by Region",
             hover_data=["region"],
             text_auto=".2s",
             width=800, height=600)
fig.show()
# Train via PyTorch
# Label-encode the categorical columns in place (nominal -> integer codes).
# NOTE(review): a fresh LabelEncoder per column is fine here since the
# encoders are not reused for inverse transforms later.
categorical_cols = ["sex", "smoker", "region"]
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
# Transformed data (nominal -> numeric)
df| age | sex | bmi | children | smoker | region | charges | |
|---|---|---|---|---|---|---|---|
| 0 | 19 | 0 | 27.900 | 0 | 1 | 3 | 16884.92400 |
| 1 | 18 | 1 | 33.770 | 1 | 0 | 2 | 1725.55230 |
| 2 | 28 | 1 | 33.000 | 3 | 0 | 2 | 4449.46200 |
| 3 | 33 | 1 | 22.705 | 0 | 0 | 1 | 21984.47061 |
| 4 | 32 | 1 | 28.880 | 0 | 0 | 1 | 3866.85520 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 1333 | 50 | 1 | 30.970 | 3 | 0 | 1 | 10600.54830 |
| 1334 | 18 | 0 | 31.920 | 0 | 0 | 0 | 2205.98080 |
| 1335 | 18 | 0 | 36.850 | 0 | 0 | 2 | 1629.83350 |
| 1336 | 21 | 0 | 25.800 | 0 | 0 | 3 | 2007.94500 |
| 1337 | 61 | 0 | 29.070 | 0 | 1 | 1 | 29141.36030 |
1338 rows × 7 columns
# Split into feature matrix and target vector.
X = df.drop("charges", axis=1).values
y = df["charges"].values

# Standardize features. NOTE(review): the target is left unscaled, which is
# why the MSE losses below are in the tens of millions.
scaler = StandardScaler()
X = scaler.fit_transform(X)

X = torch.tensor(X, dtype=torch.float32)
y = torch.tensor(y, dtype=torch.float32).view(-1, 1)

# Fixed seed keeps the train/test split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Model definition
class InsuranceModel(nn.Module):
    """Four-layer MLP regressor for insurance charges.

    Args:
        in_features: number of input features. Defaults to 6, matching
            the dataset's columns after dropping "charges" (age, sex,
            bmi, children, smoker, region).
    """

    def __init__(self, in_features: int = 6):
        super().__init__()
        self.fc1 = nn.Linear(in_features, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 32)
        self.fc4 = nn.Linear(32, 1)
        self.relu = nn.ReLU()

    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.relu(self.fc3(x))
        # No activation on the output layer: regression targets are unbounded.
        return self.fc4(x)


model = InsuranceModel()
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training: full-batch gradient descent (the whole train set fits in memory).
epochs = 1000
for epoch in range(epochs):
    model.train()
    optimizer.zero_grad()
    outputs = model(X_train)
    loss = criterion(outputs, y_train)
    loss.backward()
    optimizer.step()
    # Log every 10 epochs. Evaluate the held-out set under no_grad so the
    # validation forward pass does not build an autograd graph.
    if (epoch+1) % 10 == 0:
        with torch.no_grad():
            val_loss = criterion(model(X_test), y_test).item()
        print(f"Epoch [{epoch+1}/{epochs}] Train Loss: {loss.item():.4f}, Test Loss: {val_loss:.4f}")
Epoch [20/1000] Train Loss: 27516590.0000, Test Loss: 25459822.0000
Epoch [30/1000] Train Loss: 27380854.0000, Test Loss: 25330672.0000
Epoch [40/1000] Train Loss: 27245306.0000, Test Loss: 25200350.0000
Epoch [50/1000] Train Loss: 27110060.0000, Test Loss: 25069516.0000
Epoch [60/1000] Train Loss: 26974262.0000, Test Loss: 24936696.0000
Epoch [70/1000] Train Loss: 26836122.0000, Test Loss: 24801114.0000
Epoch [80/1000] Train Loss: 26695746.0000, Test Loss: 24663712.0000
Epoch [90/1000] Train Loss: 26554166.0000, Test Loss: 24524616.0000
Epoch [100/1000] Train Loss: 26411714.0000, Test Loss: 24384058.0000
Epoch [110/1000] Train Loss: 26267990.0000, Test Loss: 24242138.0000
Epoch [120/1000] Train Loss: 26123302.0000, Test Loss: 24099886.0000
Epoch [130/1000] Train Loss: 25978182.0000, Test Loss: 23958424.0000
Epoch [140/1000] Train Loss: 25833168.0000, Test Loss: 23818406.0000
Epoch [150/1000] Train Loss: 25688288.0000, Test Loss: 23678178.0000
Epoch [160/1000] Train Loss: 25544178.0000, Test Loss: 23538076.0000
Epoch [170/1000] Train Loss: 25401174.0000, Test Loss: 23399844.0000
Epoch [180/1000] Train Loss: 25259298.0000, Test Loss: 23263426.0000
Epoch [190/1000] Train Loss: 25118376.0000, Test Loss: 23127560.0000
Epoch [200/1000] Train Loss: 24978278.0000, Test Loss: 22993640.0000
Epoch [210/1000] Train Loss: 24839798.0000, Test Loss: 22858564.0000
Epoch [220/1000] Train Loss: 24703232.0000, Test Loss: 22722206.0000
Epoch [230/1000] Train Loss: 24569258.0000, Test Loss: 22586444.0000
Epoch [240/1000] Train Loss: 24438282.0000, Test Loss: 22454390.0000
Epoch [250/1000] Train Loss: 24309892.0000, Test Loss: 22324236.0000
Epoch [260/1000] Train Loss: 24184562.0000, Test Loss: 22196420.0000
Epoch [270/1000] Train Loss: 24062134.0000, Test Loss: 22072466.0000
Epoch [280/1000] Train Loss: 23942460.0000, Test Loss: 21953276.0000
Epoch [290/1000] Train Loss: 23825772.0000, Test Loss: 21837140.0000
Epoch [300/1000] Train Loss: 23712180.0000, Test Loss: 21723576.0000
Epoch [310/1000] Train Loss: 23602278.0000, Test Loss: 21612028.0000
Epoch [320/1000] Train Loss: 23495790.0000, Test Loss: 21504478.0000
Epoch [330/1000] Train Loss: 23393010.0000, Test Loss: 21401390.0000
Epoch [340/1000] Train Loss: 23293262.0000, Test Loss: 21302508.0000
Epoch [350/1000] Train Loss: 23196788.0000, Test Loss: 21209882.0000
Epoch [360/1000] Train Loss: 23103690.0000, Test Loss: 21122560.0000
Epoch [370/1000] Train Loss: 23014012.0000, Test Loss: 21039058.0000
Epoch [380/1000] Train Loss: 22927968.0000, Test Loss: 20958230.0000
Epoch [390/1000] Train Loss: 22844084.0000, Test Loss: 20874832.0000
Epoch [400/1000] Train Loss: 22762902.0000, Test Loss: 20799096.0000
Epoch [410/1000] Train Loss: 22683736.0000, Test Loss: 20721684.0000
Epoch [420/1000] Train Loss: 22606852.0000, Test Loss: 20647738.0000
Epoch [430/1000] Train Loss: 22532370.0000, Test Loss: 20578716.0000
Epoch [440/1000] Train Loss: 22460334.0000, Test Loss: 20513776.0000
Epoch [450/1000] Train Loss: 22390742.0000, Test Loss: 20452744.0000
Epoch [460/1000] Train Loss: 22323578.0000, Test Loss: 20394344.0000
Epoch [470/1000] Train Loss: 22259324.0000, Test Loss: 20340286.0000
Epoch [480/1000] Train Loss: 22197456.0000, Test Loss: 20290796.0000
Epoch [490/1000] Train Loss: 22138100.0000, Test Loss: 20243300.0000
Epoch [500/1000] Train Loss: 22080898.0000, Test Loss: 20198882.0000
Epoch [510/1000] Train Loss: 22025808.0000, Test Loss: 20156106.0000
Epoch [520/1000] Train Loss: 21972318.0000, Test Loss: 20115796.0000
Epoch [530/1000] Train Loss: 21921304.0000, Test Loss: 20075504.0000
Epoch [540/1000] Train Loss: 21872798.0000, Test Loss: 20037442.0000
Epoch [550/1000] Train Loss: 21826212.0000, Test Loss: 20002236.0000
Epoch [560/1000] Train Loss: 21781094.0000, Test Loss: 19969742.0000
Epoch [570/1000] Train Loss: 21737458.0000, Test Loss: 19939988.0000
Epoch [580/1000] Train Loss: 21695368.0000, Test Loss: 19912320.0000
Epoch [590/1000] Train Loss: 21654938.0000, Test Loss: 19887668.0000
Epoch [600/1000] Train Loss: 21616120.0000, Test Loss: 19863224.0000
Epoch [610/1000] Train Loss: 21578370.0000, Test Loss: 19841618.0000
Epoch [620/1000] Train Loss: 21541748.0000, Test Loss: 19821636.0000
Epoch [630/1000] Train Loss: 21506546.0000, Test Loss: 19800966.0000
Epoch [640/1000] Train Loss: 21472584.0000, Test Loss: 19783360.0000
Epoch [650/1000] Train Loss: 21439198.0000, Test Loss: 19765890.0000
Epoch [660/1000] Train Loss: 21407492.0000, Test Loss: 19748314.0000
Epoch [670/1000] Train Loss: 21376874.0000, Test Loss: 19730684.0000
Epoch [680/1000] Train Loss: 21347062.0000, Test Loss: 19715512.0000
Epoch [690/1000] Train Loss: 21318214.0000, Test Loss: 19700532.0000
Epoch [700/1000] Train Loss: 21290128.0000, Test Loss: 19685422.0000
Epoch [710/1000] Train Loss: 21262892.0000, Test Loss: 19671186.0000
Epoch [720/1000] Train Loss: 21235850.0000, Test Loss: 19656568.0000
Epoch [730/1000] Train Loss: 21208284.0000, Test Loss: 19643064.0000
Epoch [740/1000] Train Loss: 21180530.0000, Test Loss: 19631202.0000
Epoch [750/1000] Train Loss: 21153766.0000, Test Loss: 19616162.0000
Epoch [760/1000] Train Loss: 21127436.0000, Test Loss: 19601702.0000
Epoch [770/1000] Train Loss: 21101746.0000, Test Loss: 19586666.0000
Epoch [780/1000] Train Loss: 21076040.0000, Test Loss: 19572336.0000
Epoch [790/1000] Train Loss: 21050078.0000, Test Loss: 19558362.0000
Epoch [800/1000] Train Loss: 21024660.0000, Test Loss: 19544498.0000
Epoch [810/1000] Train Loss: 20999442.0000, Test Loss: 19531446.0000
Epoch [820/1000] Train Loss: 20973832.0000, Test Loss: 19519918.0000
Epoch [830/1000] Train Loss: 20948308.0000, Test Loss: 19509186.0000
Epoch [840/1000] Train Loss: 20923002.0000, Test Loss: 19499568.0000
Epoch [850/1000] Train Loss: 20898082.0000, Test Loss: 19491354.0000
Epoch [860/1000] Train Loss: 20873482.0000, Test Loss: 19483140.0000
Epoch [870/1000] Train Loss: 20849306.0000, Test Loss: 19473420.0000
Epoch [880/1000] Train Loss: 20825254.0000, Test Loss: 19462932.0000
Epoch [890/1000] Train Loss: 20800580.0000, Test Loss: 19453928.0000
Epoch [900/1000] Train Loss: 20775602.0000, Test Loss: 19445558.0000
Epoch [910/1000] Train Loss: 20749832.0000, Test Loss: 19434852.0000
Epoch [920/1000] Train Loss: 20724432.0000, Test Loss: 19427166.0000
Epoch [930/1000] Train Loss: 20699818.0000, Test Loss: 19423358.0000
Epoch [940/1000] Train Loss: 20675516.0000, Test Loss: 19414938.0000
Epoch [950/1000] Train Loss: 20650760.0000, Test Loss: 19407758.0000
Epoch [960/1000] Train Loss: 20625938.0000, Test Loss: 19404922.0000
Epoch [970/1000] Train Loss: 20601440.0000, Test Loss: 19404352.0000
Epoch [980/1000] Train Loss: 20577618.0000, Test Loss: 19403746.0000
Epoch [990/1000] Train Loss: 20554310.0000, Test Loss: 19401996.0000
Epoch [1000/1000] Train Loss: 20531334.0000, Test Loss: 19399806.0000
model.eval()InsuranceModel(
(fc1): Linear(in_features=6, out_features=128, bias=True)
(fc2): Linear(in_features=128, out_features=64, bias=True)
(fc3): Linear(in_features=64, out_features=32, bias=True)
(fc4): Linear(in_features=32, out_features=1, bias=True)
(relu): ReLU()
)
# Predict on the held-out set without tracking gradients; flatten to 1-D
# arrays for the comparison plot below.
with torch.no_grad():
    preds = model(X_test)
    y_pred = preds.numpy().flatten()
    y_true = y_test.numpy().flatten()

print("샘플 예측:", preds[:5].flatten())
print("실제 값:", y_test[:5].flatten())
실제 값: tensor([ 9095.0684, 5272.1758, 29330.9824, 9301.8936, 33750.2930])
# Actual vs predicted charges; the OLS trendline summarizes overall fit
# (perfect predictions would lie on the y = x diagonal).
result_df = pd.DataFrame({"Actual": y_true, "Predicted": y_pred})
fig = px.scatter(result_df, x="Actual", y="Predicted",
                 trendline="ols",
                 title="Actual vs Predicted Charges",
                 width=800, height=600)
fig.show()