import

import torch 
from fastai.collab import * 
from fastai.tabular.all import * 

data

path = untar_data(URLs.ML_100k) 

- First dataframe

path.ls()
(#23) [Path('/home/csy/.fastai/data/ml-100k/ua.test'),Path('/home/csy/.fastai/data/ml-100k/u4.base'),Path('/home/csy/.fastai/data/ml-100k/u.user'),Path('/home/csy/.fastai/data/ml-100k/README'),Path('/home/csy/.fastai/data/ml-100k/u2.base'),Path('/home/csy/.fastai/data/ml-100k/u1.base'),Path('/home/csy/.fastai/data/ml-100k/ub.test'),Path('/home/csy/.fastai/data/ml-100k/ub.base'),Path('/home/csy/.fastai/data/ml-100k/u3.base'),Path('/home/csy/.fastai/data/ml-100k/ua.base')...]
ratings=pd.read_csv(path/'u.data', delimiter='\t', header=None, names=['user','movie','rating','timestamp'])
ratings
user movie rating timestamp
0 196 242 3 881250949
1 186 302 3 891717742
2 22 377 1 878887116
3 244 51 2 880606923
4 166 346 1 886397596
... ... ... ... ...
99995 880 476 3 880175444
99996 716 204 5 879795543
99997 276 1090 1 874795795
99998 13 225 2 882399156
99999 12 203 3 879959583

100000 rows × 4 columns

  • The last column (timestamp) is meaningless for our purposes.

Shouldn't we at least know which movie each movie id refers to before building a model?

- Second dataframe

movies = pd.read_csv(path/'u.item', delimiter='|', encoding='latin-1', usecols=(0,1), names=('movie','title'), header=None)
movies
movie title
0 1 Toy Story (1995)
1 2 GoldenEye (1995)
2 3 Four Rooms (1995)
3 4 Get Shorty (1995)
4 5 Copycat (1995)
... ... ...
1677 1678 Mat' i syn (1997)
1678 1679 B. Monkey (1998)
1679 1680 Sliding Doors (1998)
1680 1681 You So Crazy (1994)
1681 1682 Scream of Stone (Schrei aus Stein) (1991)

1682 rows × 2 columns
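
Now that we have titles, a movie id can be resolved directly; e.g., movie 242 from the first ratings row (a quick sketch; as the merge below confirms, it is Kolya (1996)):

movies.loc[movies.movie == 242, 'title'].item()  # look up the title for movie id 242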

- Merge the two dataframes.

df = ratings.merge(movies)
df
user movie rating timestamp title
0 196 242 3 881250949 Kolya (1996)
1 63 242 3 875747190 Kolya (1996)
2 226 242 5 883888671 Kolya (1996)
3 154 242 3 879138235 Kolya (1996)
4 306 242 5 876503793 Kolya (1996)
... ... ... ... ... ...
99995 840 1674 4 891211682 Mamma Roma (1962)
99996 655 1640 3 888474646 Eighth Day, The (1996)
99997 655 1637 3 888984255 Girls Town (1996)
99998 655 1630 3 887428735 Silence of the Palace, The (Saimt el Qusur) (1994)
99999 655 1641 3 887427810 Dadetown (1995)

100000 rows × 5 columns

dls

bs = batch size

dls = CollabDataLoaders.from_df(df,bs=64,item_name='title') 
dls.show_batch()
user title rating
0 292 Sting, The (1973) 4
1 933 Chinatown (1974) 4
2 532 Escape from L.A. (1996) 2
3 29 L.A. Confidential (1997) 4
4 532 Top Gun (1986) 5
5 894 Austin Powers: International Man of Mystery (1997) 3
6 202 Apartment, The (1960) 1
7 291 Brady Bunch Movie, The (1995) 4
8 514 Jerry Maguire (1996) 4
9 373 Young Guns (1988) 3

learn

lrnr = collab_learner(dls, n_factors=10, y_range=(0,5)) 
lrnr.fit(13) 
epoch train_loss valid_loss time
0 1.141182 1.115039 00:05
1 0.922928 0.911456 00:05
2 0.861752 0.870785 00:04
3 0.816649 0.850283 00:05
4 0.776631 0.839327 00:05
5 0.777621 0.832916 00:06
6 0.784220 0.828082 00:05
7 0.779736 0.826487 00:05
8 0.773721 0.824381 00:06
9 0.760006 0.822208 00:05
10 0.730945 0.820367 00:04
11 0.718741 0.817455 00:04
12 0.713176 0.816797 00:05

The book says the range's upper bound is not included, which is why it uses 5.5 there instead of 5.

  • The textbook's loss is also around 0.82.
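
Why 5.5? fastai squashes the raw score with sigmoid_range, and a sigmoid never reaches its endpoints, so with y_range=(0,5) a perfect rating of 5 would be unattainable. A minimal sketch of what sigmoid_range computes (the function name below is ours, not fastai's):

def sigmoid_range_sketch(x, lo, hi):
    # squash x into the open interval (lo, hi); the endpoints are never reached
    return torch.sigmoid(x) * (hi - lo) + lo

sigmoid_range_sketch(torch.tensor([-10., 0., 10.]), 0, 5)
# ~ tensor([0.0002, 2.5000, 4.9998]) -- close to, but never exactly, 0 or 5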

- Let's look at the results.

lrnr.show_results()
user title rating rating_pred
0 197 622 4 4.597176
1 181 1153 1 2.050153
2 234 1290 3 3.270310
3 561 1565 3 2.120189
4 387 116 1 2.539219
5 280 1501 3 3.904207
6 272 457 4 4.706821
7 78 781 5 2.871252
8 442 1285 4 2.974716
  • Honestly, it doesn't feel like the model gets everything right.

Since this is real data, you can sense that users' tastes are more complex.

learn2

Wouldn't the model improve if we added a nonlinear mapping?
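
Under the hood, use_nn=True swaps the dot product for a small MLP over the concatenated user/item embeddings. A rough sketch of that idea (fastai's actual EmbeddingNN picks embedding sizes automatically; emb_dim=10 here is our assumption):

import torch
import torch.nn as nn

class CollabNNSketch(nn.Module):
    def __init__(self, n_users, n_items, emb_dim=10, y_range=(0, 5)):
        super().__init__()
        self.u_emb = nn.Embedding(n_users, emb_dim)   # user embedding
        self.i_emb = nn.Embedding(n_items, emb_dim)   # item embedding
        self.layers = nn.Sequential(                  # the layers=[20,10] part
            nn.Linear(2 * emb_dim, 20), nn.ReLU(),
            nn.Linear(20, 10), nn.ReLU(),
            nn.Linear(10, 1))
        self.y_range = y_range
    def forward(self, x):
        # x[:, 0] = user index, x[:, 1] = item index
        h = torch.cat([self.u_emb(x[:, 0]), self.i_emb(x[:, 1])], dim=1)
        lo, hi = self.y_range
        return torch.sigmoid(self.layers(h)) * (hi - lo) + lo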

lrnr2 = collab_learner(dls, use_nn=True, y_range=(0,5), layers=[20,10]) 
lrnr2.fit(8)
epoch train_loss valid_loss time
0 0.963282 0.927208 00:07
1 0.896801 0.891777 00:07
2 0.835703 0.876417 00:07
3 0.810163 0.866659 00:07
4 0.840598 0.873119 00:06
5 0.783492 0.871686 00:07
6 0.751926 0.883648 00:07
7 0.705028 0.886023 00:07

The validation loss falls and then rises again, which makes us suspect overfitting.
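
If we wanted to counter the overfitting, two standard options (a sketch we did not run here; lrnr2b and wd=0.1 are our assumptions) are early stopping on valid_loss and weight decay:

lrnr2b = collab_learner(dls, use_nn=True, y_range=(0,5), layers=[20,10])
lrnr2b.fit(8, wd=0.1,  # weight decay; stop once valid_loss stops improving
           cbs=EarlyStoppingCallback(monitor='valid_loss', patience=2))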

lrnr2.show_results()
user title rating rating_pred
0 377 1607 4 3.860305
1 493 572 5 3.343074
2 882 1102 2 4.298423
3 183 1398 4 3.070796
4 897 910 5 3.584368
5 257 1487 5 4.126847
6 542 93 4 4.080160
7 495 943 3 3.956127
8 921 144 1 2.097149
  • Reasonable, at a decent level.

bias

lrnr.model
EmbeddingDotBias(
  (u_weight): Embedding(944, 10)
  (i_weight): Embedding(1665, 10)
  (u_bias): Embedding(944, 1)
  (i_bias): Embedding(1665, 1)
)

A user embedding, an item embedding, and a bias for each.
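
For reference, the forward pass of EmbeddingDotBias is just the user-item dot product plus the two biases, squashed into y_range; roughly (a sketch, not fastai's exact source):

def dot_bias_sketch(model, x, y_range=(0, 5)):
    # x[:, 0] = user index, x[:, 1] = item index
    u, i = x[:, 0], x[:, 1]
    dot = (model.u_weight(u) * model.i_weight(i)).sum(dim=1)
    raw = dot + model.u_bias(u).squeeze() + model.i_bias(i).squeeze()
    lo, hi = y_range
    return torch.sigmoid(raw) * (hi - lo) + lo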

We call detach() to remove the tensor from the autograd graph.

lrnr.model.i_bias.weight.detach().to('cpu').squeeze()
tensor([-0.0086, -0.1385, -0.0024,  ..., -0.0054,  0.2142,  0.0920])
  • What does it mean? The item bias: some movies receive high ratings on average and others low ones, and the bias expresses that tendency as a single number per movie.

argsort gives us the indices in sorted order.

lrnr.model.i_bias.weight.detach().to('cpu').squeeze().argsort()
tensor([ 295,    9,  357,  ..., 1318,  830, 1282])
lrnr.model.i_bias.weight.detach().to('cpu').squeeze().argsort(descending=True)
tensor([1282,  830, 1318,  ...,  357,    9,  295])
lst1=lrnr.model.i_bias.weight.detach().to('cpu').squeeze().argsort()[:20].tolist()
lst2=lrnr.model.i_bias.weight.detach().to('cpu').squeeze().argsort(descending=True)[:20].tolist()
list(dls.classes['title'][lst1])
['Children of the Corn: The Gathering (1996)',
 '3 Ninjas: High Noon At Mega Mountain (1998)',
 'Crow: City of Angels, The (1996)',
 'Barb Wire (1996)',
 'Lawnmower Man 2: Beyond Cyberspace (1996)',
 'Jury Duty (1995)',
 'Amityville 3-D (1983)',
 'Beverly Hills Ninja (1997)',
 'Big Bully (1996)',
 'Amityville II: The Possession (1982)',
 'Free Willy 3: The Rescue (1997)',
 'Best of the Best 3: No Turning Back (1995)',
 'Amityville: A New Generation (1993)',
 'Mortal Kombat: Annihilation (1997)',
 'Being Human (1993)',
 'Bushwhacked (1995)',
 'Kansas City (1996)',
 'Getting Even with Dad (1994)',
 "Amityville 1992: It's About Time (1992)",
 'Body Parts (1991)']
  • the unpopular movies (lowest item bias)
list(dls.classes['title'][lst2])
["Schindler's List (1993)",
 'L.A. Confidential (1997)',
 'Shawshank Redemption, The (1994)',
 'Apt Pupil (1998)',
 'Good Will Hunting (1997)',
 'Wrong Trousers, The (1993)',
 'Close Shave, A (1995)',
 'Vertigo (1958)',
 'Rear Window (1954)',
 'North by Northwest (1959)',
 'As Good As It Gets (1997)',
 'Silence of the Lambs, The (1991)',
 'Titanic (1997)',
 'Grand Day Out, A (1992)',
 'Wallace & Gromit: The Best of Aardman Animation (1996)',
 'Pather Panchali (1955)',
 'Godfather, The (1972)',
 'Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1963)',
 "One Flew Over the Cuckoo's Nest (1975)",
 '12 Angry Men (1957)']
  • the popular movies (highest item bias)

- The model seems to have trained well.
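
The factor weights (i_weight) can be interpreted as well. Following the fastai book's approach, we can look for movies whose embeddings are closest to Titanic (1997) by cosine similarity (a sketch; the top-5 cut is arbitrary):

import torch.nn.functional as F

movie_factors = lrnr.model.i_weight.weight.detach().to('cpu')
idx = dls.classes['title'].o2i['Titanic (1997)']  # map a title back to its item index
sims = F.cosine_similarity(movie_factors, movie_factors[idx][None])
list(dls.classes['title'][sims.argsort(descending=True)[1:6].tolist()])  # [0] is the movie itself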

predict

lst2[12]
1501
lst1[10]
561

My model's list doesn't include RoboCop, hm...

- Let's focus on Titanic (1501) and RoboCop 3 (1251).
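
A quick way to double-check which titles those two indices actually refer to:

dls.classes['title'][1501], dls.classes['title'][1251]  # should print the two titles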

x,y = dls.one_batch()

Each row of x is a (user, item) pair of indices.

x[:5]
tensor([[ 582, 1634],
        [ 889, 1457],
        [   2,  830],
        [ 435,  181],
        [ 156,  768]])

- What would users 1 through 30 think of Titanic (1501)? They find it enjoyable.

xx = torch.tensor([[i,1501] for i in range(1,31)])
lrnr.model(xx.to("cuda:0"))
tensor([4.4197, 4.4980, 3.5523, 4.8458, 3.6919, 3.2887, 4.4829, 4.7335, 4.7594,
        4.5789, 4.1758, 4.8316, 4.2905, 4.4045, 3.9606, 4.9019, 3.6463, 4.0611,
        3.9284, 3.7203, 4.0472, 4.6268, 3.9310, 4.6264, 4.6121, 4.0433, 4.1479,
        4.4882, 4.2470, 4.4125], device='cuda:0', grad_fn=<AddBackward0>)
lrnr2.model(xx.to("cuda:0")).reshape(-1)
tensor([4.1886, 4.2200, 3.4936, 4.6669, 3.4627, 3.7298, 4.6952, 4.3740, 4.5935,
        4.4910, 4.3225, 4.7935, 4.0277, 4.4695, 4.0514, 4.5644, 3.5330, 4.3813,
        4.1142, 3.9044, 3.6935, 4.4045, 4.1479, 4.3640, 4.5667, 4.1902, 4.1374,
        4.5313, 4.0228, 4.4573], device='cuda:0',
       grad_fn=<ReshapeAliasBackward0>)

- What would users 1 through 30 think of RoboCop 3 (1251)? They find it boring.

xx = torch.tensor([[i,1251] for i in range(1,31)])
lrnr.model(xx.to("cuda:0"))
tensor([1.0677, 1.3579, 1.6918, 1.7708, 1.5957, 1.2411, 2.3900, 1.3234, 1.8278,
        2.1464, 2.1753, 1.7971, 1.1096, 2.0255, 1.5500, 1.5622, 1.6522, 1.7865,
        2.1084, 2.3826, 1.0598, 1.0457, 1.8304, 1.7866, 2.3392, 1.4715, 1.8355,
        1.8598, 1.9085, 2.2277], device='cuda:0', grad_fn=<AddBackward0>)
lrnr2.model(xx.to("cuda:0")).reshape(-1)
tensor([0.7704, 1.2472, 1.3592, 2.6368, 0.9860, 0.6288, 2.1226, 0.8837, 1.7301,
        1.5910, 1.9007, 2.4225, 0.7369, 1.9897, 0.9226, 1.8317, 0.6634, 1.5373,
        2.2758, 1.6569, 0.9513, 0.7802, 1.3581, 2.3710, 2.5675, 1.0284, 1.2341,
        1.2410, 1.8373, 1.7973], device='cuda:0',
       grad_fn=<ReshapeAliasBackward0>)
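
The same trick extends to a simple top-N recommender: score every movie for one user and sort. A sketch (user 1 and the top-10 cut are arbitrary choices):

n_items = len(dls.classes['title'])
xx = torch.tensor([[1, i] for i in range(n_items)])  # user 1 paired with every item index; index 0 is usually the '#na#' placeholder
scores = lrnr.model(xx.to("cuda:0")).detach().to('cpu')
list(dls.classes['title'][scores.argsort(descending=True)[:10].tolist()])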