Comparing MSELoss and BCELoss

Comparing the shapes of the two loss functions

import torch
import numpy as np
import matplotlib.pyplot as plt
torch.manual_seed(1)
X=torch.linspace(-1,1,2000).reshape(2000,1)
w0=-1.0 # true bias
w1=5.0  # true weight
u=w0+X*w1
v=torch.exp(u)/(1+torch.exp(u)) # sigmoid(u): the true success probability
y=torch.bernoulli(v)            # Bernoulli samples drawn with probability v
plt.scatter(X,y,alpha=0.01)
plt.plot(X,v)
[Figure: scatter of the Bernoulli samples (X, y) with the true probability curve v]
_w0=np.arange(-10,3,0.05) # start=-10, stop=3, step=0.05
_w1=np.arange(-1,10,0.05)
_w0.shape,_w1.shape
((260,), (220,))
_w0,_w1=np.meshgrid(_w0,_w1,indexing='ij') # turn the two axes into grid arrays
  • numpy's meshgrid function builds a rectangular grid from the two axis arrays
  • meshgrid's indexing argument (see the sketch after this list)
    • Cartesian ('xy'): x = columns, y = rows
    • Matrix ('ij'): i = rows, j = columns
  • meshgrid's sparse argument
    • saves memory by returning broadcastable views instead of full arrays
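A minimal sketch of the two options (standalone; the arrays a and b are illustrative):

import numpy as np
a=np.arange(3) # length 3
b=np.arange(4) # length 4
A,B=np.meshgrid(a,b,indexing='xy') # Cartesian: shapes are (len(b), len(a))
print(A.shape)                     # (4, 3)
A,B=np.meshgrid(a,b,indexing='ij') # Matrix: shapes are (len(a), len(b))
print(A.shape)                     # (3, 4)
A,B=np.meshgrid(a,b,indexing='ij',sparse=True) # broadcastable views save memory
print(A.shape,B.shape)             # (3, 1) (1, 4)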
_w0.shape,_w1.shape
((260, 220), (260, 220))
_w0=_w0.reshape(-1) # flatten so each entry is one (w0,w1) candidate
_w1=_w1.reshape(-1)
_w0.shape,_w1.shape
((57200,), (57200,))
def lossfn_crossenp(w0,w1): # BCE loss at the grid point (w0,w1)
    yhat=torch.exp(w0+w1*X)/(1+torch.exp(w0+w1*X)) # sigmoid
    loss= -torch.mean(y*torch.log(yhat)+(1-y)*torch.log(1-yhat))
    return loss.tolist()
def lossfn_mse(w0,w1): # MSE loss at the grid point (w0,w1)
    yhat=torch.exp(w0+w1*X)/(1+torch.exp(w0+w1*X)) # sigmoid
    loss= torch.mean((y-yhat)**2)
    return loss.tolist()
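As an aside, the hand-computed ratio exp(u)/(1+exp(u)) is mathematically the sigmoid; torch.sigmoid evaluates the same function in a numerically stabler way. A quick sanity check (the tensor u_test is illustrative):

u_test=torch.linspace(-5,5,11)
manual=torch.exp(u_test)/(1+torch.exp(u_test))
print(torch.allclose(manual,torch.sigmoid(u_test))) # expected: True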
_l1=list(map(lossfn_crossenp,_w0,_w1)) # BCE loss at each of the 57200 grid points
_l2=list(map(lossfn_mse,_w0,_w1))      # MSE loss at each grid point
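The map calls above evaluate the losses one grid point at a time. Memory permitting (the intermediate tensor is 2000×57200, roughly 0.5 GB in float32), the same surfaces could be computed in one broadcast pass; a sketch, not required for the rest of the notebook:

W0=torch.tensor(_w0,dtype=torch.float32) # (57200,)
W1=torch.tensor(_w1,dtype=torch.float32)
Yhat=torch.sigmoid(W0+W1*X)              # broadcasts to (2000, 57200)
L1=-(y*torch.log(Yhat)+(1-y)*torch.log(1-Yhat)).mean(dim=0) # BCE per grid point
L2=((y-Yhat)**2).mean(dim=0)                                # MSE per grid point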
fig=plt.figure()
ax1=fig.add_subplot(1,2,1,projection='3d')
ax2=fig.add_subplot(1,2,2,projection='3d')
ax1.elev=15
ax2.elev=15
ax1.azim=75
ax2.azim=75
fig.set_figheight(15)
fig.set_figwidth(15)
ax1.scatter(_w0,_w1,_l1,s=0.01)
ax2.scatter(_w0,_w1,_l2,s=0.01)
[Figure: BCE loss surface (left) and MSE loss surface (right)]
_w0[np.argmin(_l1)],_w1[np.argmin(_l1)] # close to the true values (-1, 5)
(-0.9999999999998721, 5.150000000000006)
_w0[np.argmin(_l2)],_w1[np.argmin(_l2)] # close to the true values (-1, 5)
(-0.9999999999998721, 5.100000000000005)
ax1.scatter(_w0[np.argmin(_l1)],_w1[np.argmin(_l1)],np.min(_l1),s=200,marker='*')
ax2.scatter(_w0[np.argmin(_l2)],_w1[np.argmin(_l2)],np.min(_l2),s=200,marker='*')
  • argmin: index of the minimum value
  • argmax: index of the maximum value
fig
  • The cross-entropy surface (left) looks easier to minimize than the MSE surface (right).
  • Convex means bowl-shaped: curving upward like a quadratic.
  • If you take a cross-section of the right-hand surface, you can get nearly flat stretches (the diagonal region on the right), where the gradient is close to zero.
  • For logistic regression, using Binary Cross Entropy rather than MSE gives a convex loss, so optimization does not get trapped in local minima or flat regions.
  • One can also argue for BCE because it is the maximum-likelihood (MLE) objective, as the derivation below shows.
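To spell out the MLE connection: the responses are Bernoulli, so the likelihood of the sample is

$$L(w_0,w_1)=\prod_{i=1}^{n}\hat{y}_i^{\,y_i}(1-\hat{y}_i)^{1-y_i},\qquad \hat{y}_i=\sigma(w_0+w_1x_i),$$

and the averaged negative log-likelihood is exactly the BCE loss used above:

$$-\frac{1}{n}\log L(w_0,w_1)=-\frac{1}{n}\sum_{i=1}^{n}\Big(y_i\log\hat{y}_i+(1-y_i)\log(1-\hat{y}_i)\Big).$$

Minimizing BCE is therefore the same as maximizing the likelihood.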

Architecture and optimizer

l1=torch.nn.Linear(in_features=1,out_features=1,bias=True) # linear layer: u = w1*x + w0
a1=torch.nn.Sigmoid()                                      # sigmoid activation
net=torch.nn.Sequential(l1,a1)                             # net(x) = sigmoid(l1(x))
optimizer=torch.optim.SGD(net.parameters(),lr=0.05)        # plain gradient descent
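Sequential simply chains the two modules, so net(X) is the sigmoid of the linear output; a quick check (x0 is an illustrative input):

x0=torch.tensor([[0.5]])
print(torch.allclose(net(x0),torch.sigmoid(l1(x0)))) # expected: True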

$$H(x)=\text{sigmoid}(Wx+b)=\frac{1}{1+e^{-(Wx+b)}}=\sigma(Wx+b)$$

Plug in the initial values $(w_0,w_1)=(-3,-1)$ and watch the convergence process as an animation.

- Set the parameter initial values to $(w_0,w_1)=(-3,-1)$

l1.bias.data, l1.weight.data
(tensor([0.0331]), tensor([[-0.1853]]))
l1.bias.data=torch.tensor([-3.0])
l1.weight.data=torch.tensor([[-1.0]]) 
l1.bias.data, l1.weight.data
(tensor([-3.]), tensor([[-1.]]))

- Train with BCELoss and record the trajectory

w0_bce=[] 
w1_bce=[]
loss_bce=[]
for epoc in range(1000): 
    ## 1 forward pass
    yhat=net(X) 
    ## 2 compute the BCE loss
    loss= - torch.mean(y*torch.log(yhat) + (1-y)*torch.log(1-yhat)) 
    ## 3 backpropagate
    loss.backward()
    ## 4 update the parameters, then reset the gradients
    optimizer.step()
    net.zero_grad()
    ## 5 record (w0, w1, loss) every 20 epochs -> 50 snapshots
    if epoc%20 == 0: 
        w0_bce.append(l1.bias.data.item())
        w1_bce.append(l1.weight.data.item())
        loss_bce.append(loss.item())
l1.bias.data,l1.weight.data
(tensor([-0.6726]), tensor([[3.3696]]))
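The hand-written loss in step 2 is exactly what PyTorch's built-in torch.nn.BCELoss computes; a minimal check:

bce=torch.nn.BCELoss()
yhat=net(X)
manual=-torch.mean(y*torch.log(yhat)+(1-y)*torch.log(1-yhat))
print(torch.allclose(bce(yhat,y),manual)) # expected: True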

- Reset the parameter initial values to $(w_0,w_1)=(-3,-1)$

l1.bias.data,l1.weight.data
(tensor([-0.6726]), tensor([[3.3696]]))
l1.bias.data=torch.tensor([-3.0])
l1.weight.data=torch.tensor([[-1.0]])
l1.bias.data,l1.weight.data
(tensor([-3.]), tensor([[-1.]]))

- Train with MSELoss and record the trajectory

w0_mse=[] 
w1_mse=[]
loss_mse=[]
for epoc in range(1000): 
    ## 1 forward pass
    yhat=net(X) 
    ## 2 compute the MSE loss
    loss= torch.mean((y-yhat)**2)
    ## 3 backpropagate
    loss.backward()
    ## 4 update the parameters, then reset the gradients
    optimizer.step()
    net.zero_grad()
    ## 5 record (w0, w1, loss) every 20 epochs -> 50 snapshots
    if epoc%20 == 0: 
        w0_mse.append(l1.bias.data.item())
        w1_mse.append(l1.weight.data.item())
        loss_mse.append(loss.item())
l1.bias.data,l1.weight.data
(tensor([-0.9688]), tensor([[0.7116]]))
  • After the same 1000 epochs, MSE has only reached $w_1\approx 0.71$, whereas BCE reached $w_1\approx 3.37$: MSE approaches the optimum much more slowly.

- plot

from matplotlib import animation
plt.rcParams["animation.html"] = "jshtml"
fig = plt.figure()
ax1= fig.add_subplot(2,2,1,projection='3d')
ax2= fig.add_subplot(2,2,2,projection='3d')
ax3= fig.add_subplot(2,2,3)
ax4= fig.add_subplot(2,2,4)
ax1.elev = 15
ax2.elev = 15
ax1.azim = 75
ax2.azim = 75
fig.set_figheight(15)
fig.set_figwidth(15)

### init plot 
ax1.scatter(_w0,_w1,_l1,s=0.05)
ax2.scatter(_w0,_w1,_l2,s=0.05)
ax1.scatter(-3,-1,lossfn_crossenp(-3,-1),color='gray') ## bceloss(binary cross entropy loss)
ax1.scatter(-1,5.1,lossfn_crossenp(-1,5.1),s=200,color='red',marker='*') ## bceloss
ax2.scatter(-3,-1,lossfn_mse(-3,-1),color='gray') ## mseloss 
ax2.scatter(-1,5.1,lossfn_mse(-1,5.1),s=200,color='red',marker='*') ## mseloss
ax3.scatter(X,y,alpha=0.01)
ax3.plot(X,v,'--')
line3, = ax3.plot(X,1/(1+torch.exp(-w0_bce[0]-w1_bce[0]*X)),'--')
ax4.scatter(X,y,alpha=0.01)
ax4.plot(X,v,'--')
line4, = ax4.plot(X,1/(1+torch.exp(-w0_mse[0]-w1_mse[0]*X)),'--')


### animation 
def animate(i):
    # drop a gray dot at the i-th recorded (w0,w1) on each loss surface
    ax1.scatter(w0_bce[i],w1_bce[i],lossfn_crossenp(w0_bce[i],w1_bce[i]),color='gray')
    ax2.scatter(w0_mse[i],w1_mse[i],lossfn_mse(w0_mse[i],w1_mse[i]),color='gray')
    # redraw the fitted sigmoid curve for the i-th snapshot
    line3.set_ydata(1/(1+torch.exp(-w0_bce[i]-w1_bce[i]*X)))
    line4.set_ydata(1/(1+torch.exp(-w0_mse[i]-w1_mse[i]*X)))
    return line3,line4

ani = animation.FuncAnimation(fig, animate, frames=50) # 50 frames = 50 recorded snapshots
plt.close()
ani

Now plug in the initial values $(w_0,w_1)=(-10,-1)$ and watch the convergence process as an animation.

- Set the parameter initial values to $(w_0,w_1)=(-10,-1)$

l1.bias.data, l1.weight.data
(tensor([-0.9688]), tensor([[0.7116]]))
l1.bias.data=torch.tensor([-10.0])
l1.weight.data=torch.tensor([[-1.0]])
l1.bias.data, l1.weight.data
(tensor([-10.]), tensor([[-1.]]))

- Train with BCELoss and record the trajectory

w0_bce=[] 
w1_bce=[]
loss_bce=[]
for epoc in range(1000): 
    ## 1 forward pass
    yhat=net(X) 
    ## 2 compute the BCE loss
    loss= - torch.mean(y*torch.log(yhat) + (1-y)*torch.log(1-yhat)) 
    ## 3 backpropagate
    loss.backward()
    ## 4 update the parameters, then reset the gradients
    optimizer.step()
    net.zero_grad()
    ## 5 record (w0, w1, loss) every 20 epochs
    if epoc%20 == 0: 
        w0_bce.append(l1.bias.data.item())
        w1_bce.append(l1.weight.data.item())
        loss_bce.append(loss.item())
l1.bias.data, l1.weight.data
(tensor([-0.8302]), tensor([[4.0264]]))

- Reset the parameter initial values to $(w_0,w_1)=(-10,-1)$

l1.bias.data, l1.weight.data
(tensor([-0.8302]), tensor([[4.0264]]))
l1.bias.data=torch.tensor([-10.0])
l1.weight.data=torch.tensor([[-1.0]])
l1.bias.data, l1.weight.data
(tensor([-10.]), tensor([[-1.]]))

- Train with MSELoss and record the trajectory

w0_mse=[] 
w1_mse=[]
loss_mse=[]
for epoc in range(1000): 
    ## 1 forward pass
    yhat=net(X) 
    ## 2 compute the MSE loss
    loss= torch.mean((y-yhat)**2)
    ## 3 backpropagate
    loss.backward()
    ## 4 update the parameters, then reset the gradients
    optimizer.step()
    net.zero_grad()
    ## 5 record (w0, w1, loss) every 20 epochs
    if epoc%20 == 0: 
        w0_mse.append(l1.bias.data.item())
        w1_mse.append(l1.weight.data.item())
        loss_mse.append(loss.item())
l1.bias.data, l1.weight.data
(tensor([-9.9990]), tensor([[-0.9995]]))
  • From this initial value MSE training barely moves: $(-10,-1)$ lies on a nearly flat region of the MSE surface, so the gradient is almost zero, while BCE starting from the same point still converged (see above).

- plot

fig = plt.figure()
ax1= fig.add_subplot(2,2,1,projection='3d')
ax2= fig.add_subplot(2,2,2,projection='3d')
ax3= fig.add_subplot(2,2,3)
ax4= fig.add_subplot(2,2,4)
ax1.elev = 15
ax2.elev = 15
ax1.azim = 75
ax2.azim = 75
fig.set_figheight(15)
fig.set_figwidth(15)

### init plot 
ax1.scatter(_w0,_w1,_l1,s=0.05)
ax2.scatter(_w0,_w1,_l2,s=0.05)
ax1.scatter(-10,-1,lossfn_crossenp(-10,-1),color='gray') ## bceloss
ax1.scatter(-1,5.1,lossfn_crossenp(-1,5.1),s=200,color='red',marker='*') ## bceloss
ax2.scatter(-10,-1,lossfn_mse(-10,-1),color='gray') ## mseloss 
ax2.scatter(-1,5.1,lossfn_mse(-1,5.1),s=200,color='red',marker='*') ## mseloss
ax3.scatter(X,y,alpha=0.01)
ax3.plot(X,v,'--')
line3, = ax3.plot(X,1/(1+torch.exp(-w0_bce[0]-w1_bce[0]*X)),'--')
ax4.scatter(X,y,alpha=0.01)
ax4.plot(X,v,'--')
line4, = ax4.plot(X,1/(1+torch.exp(-w0_mse[0]-w1_mse[0]*X)),'--')


### animation 
def animate(i):
    ax1.scatter(w0_bce[i],w1_bce[i],lossfn_crossenp(w0_bce[i],w1_bce[i]),color='gray')
    ax2.scatter(w0_mse[i],w1_mse[i],lossfn_mse(w0_mse[i],w1_mse[i]),color='gray')
    line3.set_ydata(1/(1+torch.exp(-w0_bce[i]-w1_bce[i]*X)))
    line4.set_ydata(1/(1+torch.exp(-w0_mse[i]-w1_mse[i]*X)))
    return line3,line4

ani = animation.FuncAnimation(fig, animate, frames=50)
plt.close()
ani
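If you want to keep either animation as a file, FuncAnimation can write a gif (assuming the pillow package is installed; the filename is illustrative):

ani.save("bce_vs_mse.gif", writer="pillow", fps=5)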