import

import torch 
from fastai.collab import * 
from fastai.tabular.all import * 

data

path = untar_data(URLs.ML_100k) 

- First dataframe

path.ls()
(#23) [Path('/home/csy/.fastai/data/ml-100k/ua.test'),Path('/home/csy/.fastai/data/ml-100k/u4.base'),Path('/home/csy/.fastai/data/ml-100k/u.user'),Path('/home/csy/.fastai/data/ml-100k/README'),Path('/home/csy/.fastai/data/ml-100k/u2.base'),Path('/home/csy/.fastai/data/ml-100k/u1.base'),Path('/home/csy/.fastai/data/ml-100k/ub.test'),Path('/home/csy/.fastai/data/ml-100k/ub.base'),Path('/home/csy/.fastai/data/ml-100k/u3.base'),Path('/home/csy/.fastai/data/ml-100k/ua.base')...]
ratings=pd.read_csv(path/'u.data', delimiter='\t', header=None, names=['user','movie','rating','timestamp'])
ratings
user movie rating timestamp
0 196 242 3 881250949
1 186 302 3 891717742
2 22 377 1 878887116
3 244 51 2 880606923
4 166 346 1 886397596
... ... ... ... ...
99995 880 476 3 880175444
99996 716 204 5 879795543
99997 276 1090 1 874795795
99998 13 225 2 882399156
99999 12 203 3 879959583

100000 rows × 4 columns

  • The last column (timestamp) is meaningless for our purposes.

Shouldn't we at least know which movie each movie id refers to before building a model?

- Second dataframe

movies = pd.read_csv(path/'u.item', delimiter='|', encoding='latin-1', usecols=(0,1), names=('movie','title'), header=None)
movies
movie title
0 1 Toy Story (1995)
1 2 GoldenEye (1995)
2 3 Four Rooms (1995)
3 4 Get Shorty (1995)
4 5 Copycat (1995)
... ... ...
1677 1678 Mat' i syn (1997)
1678 1679 B. Monkey (1998)
1679 1680 Sliding Doors (1998)
1680 1681 You So Crazy (1994)
1681 1682 Scream of Stone (Schrei aus Stein) (1991)

1682 rows × 2 columns
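
Now that we have titles, a movie id can be resolved directly; e.g., movie 242 from the first ratings row (a quick sketch; as the merge below confirms, it is Kolya (1996)):

movies.loc[movies.movie == 242, 'title'].item()  # look up the title for movie id 242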

- Merge the two dataframes.

df = ratings.merge(movies)
df
user movie rating timestamp title
0 196 242 3 881250949 Kolya (1996)
1 63 242 3 875747190 Kolya (1996)
2 226 242 5 883888671 Kolya (1996)
3 154 242 3 879138235 Kolya (1996)
4 306 242 5 876503793 Kolya (1996)
... ... ... ... ... ...
99995 840 1674 4 891211682 Mamma Roma (1962)
99996 655 1640 3 888474646 Eighth Day, The (1996)
99997 655 1637 3 888984255 Girls Town (1996)
99998 655 1630 3 887428735 Silence of the Palace, The (Saimt el Qusur) (1994)
99999 655 1641 3 887427810 Dadetown (1995)

100000 rows × 5 columns

dls

bs = batch size

dls = CollabDataLoaders.from_df(df,bs=64,item_name='title') 
dls.show_batch()
user title rating
0 292 Sting, The (1973) 4
1 933 Chinatown (1974) 4
2 532 Escape from L.A. (1996) 2
3 29 L.A. Confidential (1997) 4
4 532 Top Gun (1986) 5
5 894 Austin Powers: International Man of Mystery (1997) 3
6 202 Apartment, The (1960) 1
7 291 Brady Bunch Movie, The (1995) 4
8 514 Jerry Maguire (1996) 4
9 373 Young Guns (1988) 3

learn

lrnr = collab_learner(dls, n_factors=10, y_range=(0,5)) 
lrnr.fit(13) 
epoch train_loss valid_loss time
0 1.141182 1.115039 00:05
1 0.922928 0.911456 00:05
2 0.861752 0.870785 00:04
3 0.816649 0.850283 00:05
4 0.776631 0.839327 00:05
5 0.777621 0.832916 00:06
6 0.784220 0.828082 00:05
7 0.779736 0.826487 00:05
8 0.773721 0.824381 00:06
9 0.760006 0.822208 00:05
10 0.730945 0.820367 00:04
11 0.718741 0.817455 00:04
12 0.713176 0.816797 00:05

The book says the range's upper bound is not included, which is why it uses 5.5 there instead of 5.

  • The textbook's loss is also around 0.82.
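
Why 5.5? fastai squashes the raw score with sigmoid_range, and a sigmoid never reaches its endpoints, so with y_range=(0,5) a perfect rating of 5 would be unattainable. A minimal sketch of what sigmoid_range computes (the function name below is ours, not fastai's):

def sigmoid_range_sketch(x, lo, hi):
    # squash x into the open interval (lo, hi); the endpoints are never reached
    return torch.sigmoid(x) * (hi - lo) + lo

sigmoid_range_sketch(torch.tensor([-10., 0., 10.]), 0, 5)
# ~ tensor([0.0002, 2.5000, 4.9998]) -- close to, but never exactly, 0 or 5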

- Let's look at the results.

lrnr.show_results()
user title rating rating_pred
0 197 622 4 4.597176
1 181 1153 1 2.050153
2 234 1290 3 3.270310
3 561 1565 3 2.120189
4 387 116 1 2.539219
5 280 1501 3 3.904207
6 272 457 4 4.706821
7 78 781 5 2.871252
8 442 1285 4 2.974716
  • Honestly, it doesn't feel like the model gets everything right.

Since this is real data, you can sense that users' tastes are more complex.

learn2

Wouldn't the model improve if we added a nonlinear mapping?
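
Under the hood, use_nn=True swaps the dot product for a small MLP over the concatenated user/item embeddings. A rough sketch of that idea (fastai's actual EmbeddingNN picks embedding sizes automatically; emb_dim=10 here is our assumption):

import torch
import torch.nn as nn

class CollabNNSketch(nn.Module):
    def __init__(self, n_users, n_items, emb_dim=10, y_range=(0, 5)):
        super().__init__()
        self.u_emb = nn.Embedding(n_users, emb_dim)   # user embedding
        self.i_emb = nn.Embedding(n_items, emb_dim)   # item embedding
        self.layers = nn.Sequential(                  # the layers=[20,10] part
            nn.Linear(2 * emb_dim, 20), nn.ReLU(),
            nn.Linear(20, 10), nn.ReLU(),
            nn.Linear(10, 1))
        self.y_range = y_range
    def forward(self, x):
        # x[:, 0] = user index, x[:, 1] = item index
        h = torch.cat([self.u_emb(x[:, 0]), self.i_emb(x[:, 1])], dim=1)
        lo, hi = self.y_range
        return torch.sigmoid(self.layers(h)) * (hi - lo) + lo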

lrnr2 = collab_learner(dls, use_nn=True, y_range=(0,5), layers=[20,10]) 
lrnr2.fit(8)
epoch train_loss valid_loss time
0 0.963282 0.927208 00:07
1 0.896801 0.891777 00:07
2 0.835703 0.876417 00:07
3 0.810163 0.866659 00:07
4 0.840598 0.873119 00:06
5 0.783492 0.871686 00:07
6 0.751926 0.883648 00:07
7 0.705028 0.886023 00:07

The validation loss falls and then rises again, which makes us suspect overfitting.
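
If we wanted to counter the overfitting, two standard options (a sketch we did not run here; lrnr2b and wd=0.1 are our assumptions) are early stopping on valid_loss and weight decay:

lrnr2b = collab_learner(dls, use_nn=True, y_range=(0,5), layers=[20,10])
lrnr2b.fit(8, wd=0.1,  # weight decay; stop once valid_loss stops improving
           cbs=EarlyStoppingCallback(monitor='valid_loss', patience=2))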

lrnr2.show_results()
user title rating rating_pred
0 377 1607 4 3.860305
1 493 572 5 3.343074
2 882 1102 2 4.298423
3 183 1398 4 3.070796
4 897 910 5 3.584368
5 257 1487 5 4.126847
6 542 93 4 4.080160
7 495 943 3 3.956127
8 921 144 1 2.097149
  • Reasonable, at a decent level.

bias

lrnr.model
EmbeddingDotBias(
  (u_weight): Embedding(944, 10)
  (i_weight): Embedding(1665, 10)
  (u_bias): Embedding(944, 1)
  (i_bias): Embedding(1665, 1)
)

A user embedding, an item embedding, and a bias for each.
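
For reference, the forward pass of EmbeddingDotBias is just the user-item dot product plus the two biases, squashed into y_range; roughly (a sketch, not fastai's exact source):

def dot_bias_sketch(model, x, y_range=(0, 5)):
    # x[:, 0] = user index, x[:, 1] = item index
    u, i = x[:, 0], x[:, 1]
    dot = (model.u_weight(u) * model.i_weight(i)).sum(dim=1)
    raw = dot + model.u_bias(u).squeeze() + model.i_bias(i).squeeze()
    lo, hi = y_range
    return torch.sigmoid(raw) * (hi - lo) + lo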

We call detach() to remove the tensor from the autograd graph.

lrnr.model.i_bias.weight.detach().to('cpu').squeeze()
tensor([-0.0086, -0.1385, -0.0024,  ..., -0.0054,  0.2142,  0.0920])
  • What does it mean? The item bias: some movies receive high ratings on average and others low ones, and the bias expresses that tendency as a single number per movie.

argsort gives us the indices in sorted order.

lrnr.model.i_bias.weight.detach().to('cpu').squeeze().argsort()
tensor([ 295,    9,  357,  ..., 1318,  830, 1282])
lrnr.model.i_bias.weight.detach().to('cpu').squeeze().argsort(descending=True)
tensor([1282,  830, 1318,  ...,  357,    9,  295])
lst1=lrnr.model.i_bias.weight.detach().to('cpu').squeeze().argsort()[:20].tolist()
lst2=lrnr.model.i_bias.weight.detach().to('cpu').squeeze().argsort(descending=True)[:20].tolist()
list(dls.classes['title'][lst1])
['Children of the Corn: The Gathering (1996)',
 '3 Ninjas: High Noon At Mega Mountain (1998)',
 'Crow: City of Angels, The (1996)',
 'Barb Wire (1996)',
 'Lawnmower Man 2: Beyond Cyberspace (1996)',
 'Jury Duty (1995)',
 'Amityville 3-D (1983)',
 'Beverly Hills Ninja (1997)',
 'Big Bully (1996)',
 'Amityville II: The Possession (1982)',
 'Free Willy 3: The Rescue (1997)',
 'Best of the Best 3: No Turning Back (1995)',
 'Amityville: A New Generation (1993)',
 'Mortal Kombat: Annihilation (1997)',
 'Being Human (1993)',
 'Bushwhacked (1995)',
 'Kansas City (1996)',
 'Getting Even with Dad (1994)',
 "Amityville 1992: It's About Time (1992)",
 'Body Parts (1991)']
  • the unpopular movies (lowest item bias)
list(dls.classes['title'][lst2])
["Schindler's List (1993)",
 'L.A. Confidential (1997)',
 'Shawshank Redemption, The (1994)',
 'Apt Pupil (1998)',
 'Good Will Hunting (1997)',
 'Wrong Trousers, The (1993)',
 'Close Shave, A (1995)',
 'Vertigo (1958)',
 'Rear Window (1954)',
 'North by Northwest (1959)',
 'As Good As It Gets (1997)',
 'Silence of the Lambs, The (1991)',
 'Titanic (1997)',
 'Grand Day Out, A (1992)',
 'Wallace & Gromit: The Best of Aardman Animation (1996)',
 'Pather Panchali (1955)',
 'Godfather, The (1972)',
 'Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1963)',
 "One Flew Over the Cuckoo's Nest (1975)",
 '12 Angry Men (1957)']
  • the popular movies (highest item bias)

- The model seems to have trained well.
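
The factor weights (i_weight) can be interpreted as well. Following the fastai book's approach, we can look for movies whose embeddings are closest to Titanic (1997) by cosine similarity (a sketch; the top-5 cut is arbitrary):

import torch.nn.functional as F

movie_factors = lrnr.model.i_weight.weight.detach().to('cpu')
idx = dls.classes['title'].o2i['Titanic (1997)']  # map a title back to its item index
sims = F.cosine_similarity(movie_factors, movie_factors[idx][None])
list(dls.classes['title'][sims.argsort(descending=True)[1:6].tolist()])  # [0] is the movie itself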

predict

lst2[12]
1501
lst1[10]
561

My model's list doesn't include RoboCop, hm...

- Let's focus on Titanic (1501) and RoboCop 3 (1251).
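
A quick way to double-check which titles those two indices actually refer to:

dls.classes['title'][1501], dls.classes['title'][1251]  # should print the two titles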

x,y = dls.one_batch()

Each row of x is a (user, item) pair of indices.

x[:5]
tensor([[ 582, 1634],
        [ 889, 1457],
        [   2,  830],
        [ 435,  181],
        [ 156,  768]])

- What would users 1 through 30 think of Titanic (1501)? They find it enjoyable.

xx = torch.tensor([[i,1501] for i in range(1,31)])
lrnr.model(xx.to("cuda:0"))
tensor([4.4197, 4.4980, 3.5523, 4.8458, 3.6919, 3.2887, 4.4829, 4.7335, 4.7594,
        4.5789, 4.1758, 4.8316, 4.2905, 4.4045, 3.9606, 4.9019, 3.6463, 4.0611,
        3.9284, 3.7203, 4.0472, 4.6268, 3.9310, 4.6264, 4.6121, 4.0433, 4.1479,
        4.4882, 4.2470, 4.4125], device='cuda:0', grad_fn=<AddBackward0>)
lrnr2.model(xx.to("cuda:0")).reshape(-1)
tensor([4.1886, 4.2200, 3.4936, 4.6669, 3.4627, 3.7298, 4.6952, 4.3740, 4.5935,
        4.4910, 4.3225, 4.7935, 4.0277, 4.4695, 4.0514, 4.5644, 3.5330, 4.3813,
        4.1142, 3.9044, 3.6935, 4.4045, 4.1479, 4.3640, 4.5667, 4.1902, 4.1374,
        4.5313, 4.0228, 4.4573], device='cuda:0',
       grad_fn=<ReshapeAliasBackward0>)

- What would users 1 through 30 think of RoboCop 3 (1251)? They find it boring.

xx = torch.tensor([[i,1251] for i in range(1,31)])
lrnr.model(xx.to("cuda:0"))
tensor([1.0677, 1.3579, 1.6918, 1.7708, 1.5957, 1.2411, 2.3900, 1.3234, 1.8278,
        2.1464, 2.1753, 1.7971, 1.1096, 2.0255, 1.5500, 1.5622, 1.6522, 1.7865,
        2.1084, 2.3826, 1.0598, 1.0457, 1.8304, 1.7866, 2.3392, 1.4715, 1.8355,
        1.8598, 1.9085, 2.2277], device='cuda:0', grad_fn=<AddBackward0>)
lrnr2.model(xx.to("cuda:0")).reshape(-1)
tensor([0.7704, 1.2472, 1.3592, 2.6368, 0.9860, 0.6288, 2.1226, 0.8837, 1.7301,
        1.5910, 1.9007, 2.4225, 0.7369, 1.9897, 0.9226, 1.8317, 0.6634, 1.5373,
        2.2758, 1.6569, 0.9513, 0.7802, 1.3581, 2.3710, 2.5675, 1.0284, 1.2341,
        1.2410, 1.8373, 1.7973], device='cuda:0',
       grad_fn=<ReshapeAliasBackward0>)
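
The same trick extends to a simple top-N recommender: score every movie for one user and sort. A sketch (user 1 and the top-10 cut are arbitrary choices):

n_items = len(dls.classes['title'])
xx = torch.tensor([[1, i] for i in range(n_items)])  # user 1 paired with every item index; index 0 is usually the '#na#' placeholder
scores = lrnr.model(xx.to("cuda:0")).detach().to('cpu')
list(dls.classes['title'][scores.argsort(descending=True)[:10].tolist()])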