import torch
import numpy as np

Data

- model: $y_i= w_0+w_1 x_i +\epsilon_i = 2.5 + 4x_i +\epsilon_i, \quad i=1,2,\dots,n$

- model: ${\bf y}={\bf X}{\bf W} +\boldsymbol{\epsilon}$

${\bf y}=\begin{bmatrix} y_1 \\ y_2 \\ \vdots \\ y_n\end{bmatrix}, \quad {\bf X}=\begin{bmatrix} 1 & x_1 \\ 1 & x_2 \\ \vdots & \vdots \\ 1 & x_n\end{bmatrix}, \quad {\bf W}=\begin{bmatrix} 2.5 \\ 4 \end{bmatrix}, \quad \boldsymbol{\epsilon}= \begin{bmatrix} \epsilon_1 \\ \vdots \\ \epsilon_n\end{bmatrix}$

torch.manual_seed(202150754)
n=100
ones=torch.ones(n)
x,_=torch.randn(n).sort()
X=torch.vstack([ones,x]).T
W=torch.tensor([2.5,4])
ϵ=torch.randn(n)*0.4
y=X@W+ϵ
ytrue = X@W
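
- A quick sanity check of the generated objects (a minimal sketch; the expected shapes follow from the construction above):

X.shape, W.shape, y.shape # (torch.Size([100, 2]), torch.Size([2]), torch.Size([100]))
X[:3] # the first column is all ones, the second column holds the sorted x values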

- Looking at the for loop from last time, identify the steps it is made of

What=torch.tensor([-5.0,10.0],requires_grad=True)
alpha=0.001 # learning rate (an assumed value; not shown in this cell)
Whats,yhats,losses=[],[],[] # histories of the estimates, the predictions, and the losses
for epoc in range(30): 
    Whats=Whats+[What.data.tolist()] 
    What.grad=None
    yhat=X@What # we need to know (or build) the formula that computes yhat
    yhats=yhats+[yhat.data.tolist()]
    loss=torch.sum((y-yhat)**2) # we need a step that defines the loss!
    losses=losses+[loss.item()]
    loss.backward() # we need a differentiation step
    What.data=What.data-alpha*What.grad.data # and an update step that uses the gradient
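
- After the loop, What should be close to the true coefficients $(2.5, 4)$, assuming the learning rate above is small enough for convergence:

What.data # expected to be near tensor([2.5, 4.0])
losses[-1] # the final sum-of-squares loss should be roughly n*0.4**2 = 16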

Summary of the previous method

- step1: yhat

- step2: loss

- step3: differentiation

- step4: update

step1: yhat

- The process of designing a feedforward neural network (deep-learning terminology!)

  • Data enters at the input layer, passes through one or more hidden layers, and the final output layer emits the output values.
  • The output of one layer is weighted by the weights between the layers and then becomes the input of the next layer (a minimal sketch of this idea follows below).

- If this step is done correctly, we should be able to compute $\bf\hat{y}$ for any choice of ${\bf\hat{W}}$.
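- A minimal sketch of the feedforward idea with hypothetical layer sizes (2 → 3 → 1); it only illustrates how one layer's output, after being weighted, becomes the next layer's input, and is not the model we fit below:

l1 = torch.nn.Linear(2,3) # input layer (2 features) -> hidden layer (3 units)
l2 = torch.nn.Linear(3,1) # hidden layer (3 units) -> output layer (1 value)
h = l1(X) # hidden-layer values: each row of X weighted by l1.weight (activation omitted to keep the sketch minimal)
out = l2(h) # the hidden output, weighted again, becomes the final output
out.shape # torch.Size([100, 1])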

Method 1: declare directly and multiply by hand (we must know the formula ourselves)

- model: ${\bf y}={\bf X}{\bf W} +\boldsymbol{\epsilon}$

What=torch.tensor([-5.0,10.0],requires_grad=True)
yhat1=X@What
yhat1
tensor([-27.9716, -26.0391, -25.8951, -24.1830, -23.6405, -23.1161, -22.0441,
        -21.9913, -21.4959, -21.2860, -20.4771, -19.6991, -19.1434, -18.0758,
        -17.5390, -17.4888, -16.8212, -16.6630, -16.2503, -14.3326, -13.8527,
        -13.6397, -13.5228, -13.2096, -12.8514, -12.8461, -12.7527, -12.2431,
        -12.0267, -11.7990, -11.6495, -11.5587, -11.5497, -11.1709, -10.9643,
        -10.7969, -10.7696, -10.7324, -10.6567, -10.4404, -10.1049,  -9.9527,
         -9.7916,  -9.3899,  -9.2762,  -8.2773,  -8.0850,  -7.9550,  -7.8498,
         -7.7767,  -7.6419,  -7.2295,  -7.1686,  -6.9773,  -6.9454,  -6.6435,
         -5.6597,  -5.5200,  -5.4562,  -5.3640,  -4.9588,  -4.9111,  -4.5447,
         -3.9894,  -3.6367,  -3.0762,  -2.4928,  -2.4512,  -2.1695,  -2.0062,
         -1.7060,   0.1909,   0.5915,   0.9467,   1.3453,   1.4359,   2.0752,
          2.4723,   2.5368,   2.7189,   2.7902,   2.8337,   3.2249,   3.7238,
          3.8636,   3.9170,   3.9852,   5.0601,   5.7496,   6.0569,   7.0621,
          7.2674,   7.6805,   7.9669,   8.4266,   9.6044,   9.6791,  10.7418,
         12.6324,  18.9507], grad_fn=<MvBackward0>)
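
- As a quick check, each entry of yhat1 is just $-5 + 10\,x_i$; for example, using the first observation:

yhat1[0], -5.0 + 10.0*x[0] # the two values should agree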

Method 2: using torch.nn.Linear() (the method we will mainly use; the neural-network way)

Entering ? twice (i.e., ??) shows everything, down to the source code.

torch.nn.Linear??
Init signature:
torch.nn.Linear(
    in_features: int,
    out_features: int,
    bias: bool = True,
    device=None,
    dtype=None,
) -> None
Source:        
class Linear(Module):
    r"""Applies a linear transformation to the incoming data: :math:`y = xA^T + b`

    This module supports :ref:`TensorFloat32<tf32_on_ampere>`.

    Args:
        in_features: size of each input sample
        out_features: size of each output sample
        bias: If set to ``False``, the layer will not learn an additive bias.
            Default: ``True``

    Shape:
        - Input: :math:`(*, H_{in})` where :math:`*` means any number of
          dimensions including none and :math:`H_{in} = \text{in\_features}`.
        - Output: :math:`(*, H_{out})` where all but the last dimension
          are the same shape as the input and :math:`H_{out} = \text{out\_features}`.

    Attributes:
        weight: the learnable weights of the module of shape
            :math:`(\text{out\_features}, \text{in\_features})`. The values are
            initialized from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})`, where
            :math:`k = \frac{1}{\text{in\_features}}`
        bias:   the learnable bias of the module of shape :math:`(\text{out\_features})`.
                If :attr:`bias` is ``True``, the values are initialized from
                :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})` where
                :math:`k = \frac{1}{\text{in\_features}}`

    Examples::

        >>> m = nn.Linear(20, 30)
        >>> input = torch.randn(128, 20)
        >>> output = m(input)
        >>> print(output.size())
        torch.Size([128, 30])
    """
    __constants__ = ['in_features', 'out_features']
    in_features: int
    out_features: int
    weight: Tensor

    def __init__(self, in_features: int, out_features: int, bias: bool = True,
                 device=None, dtype=None) -> None:
        factory_kwargs = {'device': device, 'dtype': dtype}
        super(Linear, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.weight = Parameter(torch.empty((out_features, in_features), **factory_kwargs))
        if bias:
            self.bias = Parameter(torch.empty(out_features, **factory_kwargs))
        else:
            self.register_parameter('bias', None)
        self.reset_parameters()

    def reset_parameters(self) -> None:
        # Setting a=sqrt(5) in kaiming_uniform is the same as initializing with
        # uniform(-1/sqrt(in_features), 1/sqrt(in_features)). For details, see
        # https://github.com/pytorch/pytorch/issues/57109
        init.kaiming_uniform_(self.weight, a=math.sqrt(5))
        if self.bias is not None:
            fan_in, _ = init._calculate_fan_in_and_fan_out(self.weight)
            bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0
            init.uniform_(self.bias, -bound, bound)

    def forward(self, input: Tensor) -> Tensor:
        return F.linear(input, self.weight, self.bias)

    def extra_repr(self) -> str:
        return 'in_features={}, out_features={}, bias={}'.format(
            self.in_features, self.out_features, self.bias is not None
        )
File:           ~/anaconda3/envs/csy/lib/python3.8/site-packages/torch/nn/modules/linear.py
Type:           type
Subclasses:     NonDynamicallyQuantizableLinear, LazyLinear, Linear, Linear
  • ${\bf y}=\begin{bmatrix} y_1 \\ y_2 \\ \vdots \\ y_n\end{bmatrix}, \quad {\bf X}=\begin{bmatrix} 1 & x_1 \\ 1 & x_2 \\ \vdots & \vdots \\ 1 & x_n\end{bmatrix}, \quad {\bf W}=\begin{bmatrix} 2.5 \\ 4 \end{bmatrix}, \quad \boldsymbol{\epsilon}= \begin{bmatrix} \epsilon_1 \\ \vdots \\ \epsilon_n\end{bmatrix}$

net=torch.nn.Linear(in_features=2,out_features=1,bias=False) 

  • in_features – size of each input sample
  • out_features – size of each output sample
  • bias – If set to False, the layer will not learn an additive bias. Default: True
  • We use the Linear class with in_features=2, the input dimension, since each row of $X$ is $(1, x_i)$, and out_features=1, the output dimension, since the output is $\hat{y}_i$.
  • bias=False because the 1 in $(1, x_i)$ already plays the role of the bias, so a separate bias term is not needed.
net.weight.data # not the values we want to plug in (randomly initialized)
tensor([[-0.1629,  0.6929]])
net.weight.data=torch.tensor([[-5.0,10.0]])
net.weight.data
tensor([[-5., 10.]])
net(X)
tensor([[-27.9716],
        [-26.0391],
        [-25.8951],
        [-24.1830],
        [-23.6405],
        [-23.1161],
        [-22.0441],
        [-21.9913],
        [-21.4959],
        [-21.2860],
        [-20.4771],
        [-19.6991],
        [-19.1434],
        [-18.0758],
        [-17.5390],
        [-17.4888],
        [-16.8212],
        [-16.6630],
        [-16.2503],
        [-14.3326],
        [-13.8527],
        [-13.6397],
        [-13.5228],
        [-13.2096],
        [-12.8514],
        [-12.8461],
        [-12.7527],
        [-12.2431],
        [-12.0267],
        [-11.7990],
        [-11.6495],
        [-11.5587],
        [-11.5497],
        [-11.1709],
        [-10.9643],
        [-10.7969],
        [-10.7696],
        [-10.7324],
        [-10.6567],
        [-10.4404],
        [-10.1049],
        [ -9.9527],
        [ -9.7916],
        [ -9.3899],
        [ -9.2762],
        [ -8.2773],
        [ -8.0850],
        [ -7.9550],
        [ -7.8498],
        [ -7.7767],
        [ -7.6419],
        [ -7.2295],
        [ -7.1686],
        [ -6.9773],
        [ -6.9454],
        [ -6.6435],
        [ -5.6597],
        [ -5.5200],
        [ -5.4562],
        [ -5.3640],
        [ -4.9588],
        [ -4.9111],
        [ -4.5447],
        [ -3.9894],
        [ -3.6367],
        [ -3.0762],
        [ -2.4928],
        [ -2.4512],
        [ -2.1695],
        [ -2.0062],
        [ -1.7060],
        [  0.1909],
        [  0.5915],
        [  0.9467],
        [  1.3453],
        [  1.4359],
        [  2.0752],
        [  2.4723],
        [  2.5368],
        [  2.7189],
        [  2.7902],
        [  2.8337],
        [  3.2249],
        [  3.7238],
        [  3.8636],
        [  3.9170],
        [  3.9852],
        [  5.0601],
        [  5.7496],
        [  6.0569],
        [  7.0621],
        [  7.2674],
        [  7.6805],
        [  7.9669],
        [  8.4266],
        [  9.6044],
        [  9.6791],
        [ 10.7418],
        [ 12.6324],
        [ 18.9507]], grad_fn=<MmBackward0>)
yhat2=net(X)
  • We obtain $\hat{y}$ by feeding X into the network (a quick consistency check with method 1 follows below).
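
- The values agree with method 1; only the shape differs (yhat1 has shape (100,) while net(X) returns shape (100,1)). A minimal check:

yhat1.shape, yhat2.shape # (torch.Size([100]), torch.Size([100, 1]))
torch.allclose(yhat1, yhat2.flatten()) # should be True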

Method 3: using torch.nn.Linear() with bias=True

- model: $y_i= w_0+w_1 x_i +\epsilon_i = 2.5 + 4x_i +\epsilon_i, \quad i=1,2,\dots,n$

net=torch.nn.Linear(in_features=1,out_features=1,bias=True)
  • We use the Linear class with in_features=1, the input dimension, since the input is $x_i$, and out_features=1, the output dimension, since the output is $\hat{y}_i$.
  • bias=True because this time the network is defined with an explicit bias (intercept) term.
net.weight.data # since in_features=1, there is only a single weight value
tensor([[-0.9442]])
net.weight.data=torch.tensor([[10.0]]) # keep the same number of brackets as the original tensor
net.bias.data=torch.tensor([-5.0])
net.weight,net.bias
(Parameter containing:
 tensor([[10.]], requires_grad=True),
 Parameter containing:
 tensor([-5.], requires_grad=True))
  • Here we must feed $x$ (i.e., the $x_i$ values), not $X$.
  • In net(x.reshape(100,1)), x by itself has shape (100,); we reshape it to (100,1) to make the dimension explicit.
net(x.reshape(100,1))
tensor([[-27.9716],
        [-26.0391],
        [-25.8951],
        [-24.1830],
        [-23.6405],
        [-23.1161],
        [-22.0441],
        [-21.9913],
        [-21.4959],
        [-21.2860],
        [-20.4771],
        [-19.6991],
        [-19.1434],
        [-18.0758],
        [-17.5390],
        [-17.4888],
        [-16.8212],
        [-16.6630],
        [-16.2503],
        [-14.3326],
        [-13.8527],
        [-13.6397],
        [-13.5228],
        [-13.2096],
        [-12.8514],
        [-12.8461],
        [-12.7527],
        [-12.2431],
        [-12.0267],
        [-11.7990],
        [-11.6495],
        [-11.5587],
        [-11.5497],
        [-11.1709],
        [-10.9643],
        [-10.7969],
        [-10.7696],
        [-10.7324],
        [-10.6567],
        [-10.4404],
        [-10.1049],
        [ -9.9527],
        [ -9.7916],
        [ -9.3899],
        [ -9.2762],
        [ -8.2773],
        [ -8.0850],
        [ -7.9550],
        [ -7.8498],
        [ -7.7767],
        [ -7.6419],
        [ -7.2295],
        [ -7.1686],
        [ -6.9773],
        [ -6.9454],
        [ -6.6435],
        [ -5.6597],
        [ -5.5200],
        [ -5.4562],
        [ -5.3640],
        [ -4.9588],
        [ -4.9111],
        [ -4.5447],
        [ -3.9894],
        [ -3.6367],
        [ -3.0762],
        [ -2.4928],
        [ -2.4512],
        [ -2.1695],
        [ -2.0062],
        [ -1.7060],
        [  0.1909],
        [  0.5915],
        [  0.9467],
        [  1.3453],
        [  1.4359],
        [  2.0752],
        [  2.4723],
        [  2.5368],
        [  2.7189],
        [  2.7902],
        [  2.8337],
        [  3.2249],
        [  3.7238],
        [  3.8636],
        [  3.9170],
        [  3.9852],
        [  5.0601],
        [  5.7496],
        [  6.0569],
        [  7.0621],
        [  7.2674],
        [  7.6805],
        [  7.9669],
        [  8.4266],
        [  9.6044],
        [  9.6791],
        [ 10.7418],
        [ 12.6324],
        [ 18.9507]], grad_fn=<AddmmBackward0>)
  • Methods 2 and 3 give the same result, so we do not store method 3's $\hat{y}$ in a separate variable (a quick check follows below).
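
- A minimal check that the bias=True layer indeed computes $-5 + 10x_i$ and matches method 2 (using the weight and bias set above):

torch.allclose(net(x.reshape(100,1)).flatten(), -5.0 + 10.0*x) # should be True
torch.allclose(net(x.reshape(100,1)), yhat2) # should be True: same values as method 2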

step2: loss

Method 1: defining the loss function directly

loss=torch.mean((y-yhat1)**2)
loss
tensor(109.8145, grad_fn=<MeanBackward0>)
loss=torch.mean((y-yhat2)**2)
loss
tensor(187.0950, grad_fn=<MeanBackward0>)
  • The two values differ, and 187.0950 is the wrong result.
  • Pay close attention to the grad_fn= tag.
(y-yhat2).shape

torch.Size([100, 100])
y.shape

torch.Size([100])
yhat2.shape

torch.Size([100, 1])
  • Grasping how dimensions/shapes behave is extremely important $\star$
torch.mean((y-yhat2.flatten())**2) # way 1: flatten yhat2 into a vector

tensor(109.8145, grad_fn=<MeanBackward0>)
loss=torch.mean((y.reshape(100,1)-yhat2)**2) # way 2: reshape y to match yhat2
loss

tensor(109.8145, grad_fn=<MeanBackward0>)
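
- The (100, 100) shape comes from broadcasting: subtracting a (100, 1) column from a (100,) vector produces a full matrix of pairwise differences. A tiny sketch of the same pitfall:

a = torch.tensor([1.0, 2.0, 3.0]) # shape (3,)
b = torch.tensor([[1.0], [2.0], [3.0]]) # shape (3, 1)
(a - b).shape # torch.Size([3, 3]): every pair a[i]-b[j] is formed
(a - b.flatten()).shape # torch.Size([3]): the intended elementwise difference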

Method 2: using torch.nn.MSELoss() to define the loss function

torch.nn.MSELoss is a class (we instantiate it and then call the instance like a function).

    torch.nn.MSELoss??
    
    Init signature: torch.nn.MSELoss(size_average=None, reduce=None, reduction: str = 'mean') -> None
    Source:        
    class MSELoss(_Loss):
        r"""Creates a criterion that measures the mean squared error (squared L2 norm) between
        each element in the input :math:`x` and target :math:`y`.
    
        The unreduced (i.e. with :attr:`reduction` set to ``'none'``) loss can be described as:
    
        .. math::
            \ell(x, y) = L = \{l_1,\dots,l_N\}^\top, \quad
            l_n = \left( x_n - y_n \right)^2,
    
        where :math:`N` is the batch size. If :attr:`reduction` is not ``'none'``
        (default ``'mean'``), then:
    
        .. math::
            \ell(x, y) =
            \begin{cases}
                \operatorname{mean}(L), &  \text{if reduction} = \text{`mean';}\\
                \operatorname{sum}(L),  &  \text{if reduction} = \text{`sum'.}
            \end{cases}
    
        :math:`x` and :math:`y` are tensors of arbitrary shapes with a total
        of :math:`n` elements each.
    
        The mean operation still operates over all the elements, and divides by :math:`n`.
    
        The division by :math:`n` can be avoided if one sets ``reduction = 'sum'``.
    
        Args:
            size_average (bool, optional): Deprecated (see :attr:`reduction`). By default,
                the losses are averaged over each loss element in the batch. Note that for
                some losses, there are multiple elements per sample. If the field :attr:`size_average`
                is set to ``False``, the losses are instead summed for each minibatch. Ignored
                when :attr:`reduce` is ``False``. Default: ``True``
            reduce (bool, optional): Deprecated (see :attr:`reduction`). By default, the
                losses are averaged or summed over observations for each minibatch depending
                on :attr:`size_average`. When :attr:`reduce` is ``False``, returns a loss per
                batch element instead and ignores :attr:`size_average`. Default: ``True``
            reduction (string, optional): Specifies the reduction to apply to the output:
                ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied,
                ``'mean'``: the sum of the output will be divided by the number of
                elements in the output, ``'sum'``: the output will be summed. Note: :attr:`size_average`
                and :attr:`reduce` are in the process of being deprecated, and in the meantime,
                specifying either of those two args will override :attr:`reduction`. Default: ``'mean'``
    
        Shape:
            - Input: :math:`(*)`, where :math:`*` means any number of dimensions.
            - Target: :math:`(*)`, same shape as the input.
    
        Examples::
    
            >>> loss = nn.MSELoss()
            >>> input = torch.randn(3, 5, requires_grad=True)
            >>> target = torch.randn(3, 5)
            >>> output = loss(input, target)
            >>> output.backward()
        """
        __constants__ = ['reduction']
    
        def __init__(self, size_average=None, reduce=None, reduction: str = 'mean') -> None:
            super(MSELoss, self).__init__(size_average, reduce, reduction)
    
        def forward(self, input: Tensor, target: Tensor) -> Tensor:
            return F.mse_loss(input, target, reduction=self.reduction)
    File:           ~/anaconda3/envs/csy/lib/python3.8/site-packages/torch/nn/modules/loss.py
    Type:           type
    Subclasses:     
    

definition of torch.nn.MSELoss

  • Creates a criterion that measures the mean squared error (squared L2 norm) between each element in the input $x$ and target $y$.
lossfn=torch.nn.MSELoss() # simply instantiating the class gives us a loss function

loss=lossfn(y,yhat1)
loss

tensor(109.8145, grad_fn=<MseLossBackward0>)
loss=lossfn(y.reshape(100,1),yhat2)
loss

tensor(109.8145, grad_fn=<MseLossBackward0>)
  • Note that the autograd tag grad_fn= is attached.
  • In total, the loss can be computed in 6 combinations (the 3 ways of obtaining $\hat{y}$ in step 1 × the 2 ways of defining the loss); a sketch that runs through them follows below.
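
- A minimal sketch that runs through those combinations and confirms they all give the same value (assuming the shapes are aligned as above):

for yh in [yhat1, yhat2.flatten(), net(x.reshape(100,1)).flatten()]: # the three step-1 versions of yhat
    print(torch.mean((y-yh)**2).item(), lossfn(y,yh).item()) # each pair should print about 109.8145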

- model: $y_i= w_0+w_1 x_{i1}+w_2 x_{i2} +\epsilon_i = 2.5 + 4x_{i1} - 2x_{i2}+\epsilon_i, \quad i=1,2,\dots,n$

      torch.manual_seed(202150754)
      n=100
      ones=torch.ones(n)
      x1,_=torch.randn(n).sort()
      x2,_=torch.randn(n).sort()
      X=torch.vstack([ones,x1,x2]).T
      W=torch.tensor([2.5,4.0,-2.0])
      ϵ=torch.randn(n)*0.5
      y=X@W+ϵ
      ytrue=X@W
      
      net=torch.nn.Linear(in_features=3,out_features=1,bias=False)
      
      net.weight.data
      
      tensor([[0.1152, 0.4159, 0.3233]])
      net.weight.data=torch.tensor([[2.5,5.0,-2.0]])
      
      net.weight.data
      
      tensor([[ 2.5000,  5.0000, -2.0000]])
      net(X)
      
      tensor([[-2.6509],
              [-3.6075],
              [-3.5596],
              [-3.1381],
              [-3.3609],
              [-3.4499],
              [-3.2561],
              [-3.2889],
              [-3.2011],
              [-3.1503],
              [-2.7970],
              [-2.5058],
              [-2.3969],
              [-1.9746],
              [-1.7134],
              [-1.6971],
              [-1.4055],
              [-1.5425],
              [-1.4787],
              [-0.5325],
              [-0.5519],
              [-0.4983],
              [-0.5579],
              [-0.4252],
              [-0.3783],
              [-0.3975],
              [-0.4601],
              [-0.3162],
              [-0.2366],
              [-0.1835],
              [-0.1374],
              [-0.1168],
              [-0.1915],
              [-0.2164],
              [-0.1212],
              [-0.0441],
              [-0.0724],
              [-0.0834],
              [-0.0505],
              [-0.0193],
              [ 0.1136],
              [ 0.1171],
              [ 0.1026],
              [ 0.2901],
              [ 0.2309],
              [ 0.7120],
              [ 0.7832],
              [ 0.8135],
              [ 0.7912],
              [ 0.7812],
              [ 0.8180],
              [ 0.9363],
              [ 0.9441],
              [ 1.0259],
              [ 1.0002],
              [ 1.0770],
              [ 1.4815],
              [ 1.3693],
              [ 1.3779],
              [ 1.4225],
              [ 1.6098],
              [ 1.5941],
              [ 1.7537],
              [ 2.0001],
              [ 2.0957],
              [ 2.3735],
              [ 2.6115],
              [ 2.6187],
              [ 2.7009],
              [ 2.7603],
              [ 2.8919],
              [ 3.7759],
              [ 3.9732],
              [ 4.0655],
              [ 4.2620],
              [ 4.2635],
              [ 4.4946],
              [ 4.6567],
              [ 4.6651],
              [ 4.7379],
              [ 4.7674],
              [ 4.6795],
              [ 4.8486],
              [ 5.0364],
              [ 5.0838],
              [ 5.0771],
              [ 5.0922],
              [ 5.5662],
              [ 5.8656],
              [ 5.8907],
              [ 6.3554],
              [ 6.4389],
              [ 6.4179],
              [ 6.2463],
              [ 6.2930],
              [ 6.6489],
              [ 6.0923],
              [ 6.1693],
              [ 6.9698],
              [ 9.9439]], grad_fn=<MmBackward0>)
      yhat=net(X)
      
      lossfn=torch.nn.MSELoss()
      
      loss=lossfn(y,yhat)
      
      loss
      
      tensor(24.9339, grad_fn=<MseLossBackward0>)
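
- Note that y here has shape (100,) while yhat has shape (100, 1), so the value above appears to suffer from the same broadcasting issue discussed in step 2. A hedged check with the shapes aligned:

lossfn(y.reshape(100,1), yhat) # shape-aligned MSE; expected to be much smaller (the remaining error comes from using 5.0 instead of the true 4.0, plus the noise)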