imports

import torch
import numpy as np 
import pandas as pd
from fastai.collab import *

나는 솔로

주절주절 intro

- Data

df_view = pd.read_csv('https://raw.githubusercontent.com/guebin/STML2022/main/posts/V.%20RecSys/2022-12-21-rcmdsolo.csv',index_col=0)
df_view

	영식	영철	영호	광수	상철	영수
옥순	3.9	4.1	NaN	0.5	0.3	NaN
영자	4.5	NaN	3.7	0.5	NaN	0.2
정숙	NaN	4.9	4.7	NaN	1.2	1.3
영숙	0.6	0.2	NaN	4.1	4.3	NaN
순자	0.7	0.9	NaN	4.2	NaN	3.9
현숙	NaN	0.2	0.3	NaN	3.5	3.4

- 데이터를 이해할때 필요한 가정들 – 내맘대로 한 설정임.

(옥순,영자,정숙)은 (영식,영철,영호)와 성격이 잘 맞고 (영숙,순자,현숙)은 (광수,상철,영수)와 성격이 잘 맞음
((옥순,영자,정숙),(영식,영철,영호))은 MBTI가 I로 시작하고 ((영숙,순자,현숙),(광수,상철,영수))는 MBTI가 E로 시작한다.

- 목표: NaN 을 추론

- 수동추론:

(옥순,영호)가 만난다면? \(\to\) 둘 다 I성향이니까 잘 맞지 않을까? \(\to\) 4.0 정도?
(정숙,영식)조합은? \(\to\) 둘다 I성향이니까 잘 맞지 않을까? + 정숙은 다 잘맞던데..? \(\to\) 4.8 정도?
(현숙,영식)조합은? \(\to\) 현숙은 E성향인데 영식은 I성향이므로 잘 안맞을 것임 + 현숙은 원래 좀 눈이 높음 \(\to\) 0.25 정도?

- 좀 더 체계적인 추론

사람들이 가지고 있는 성향들을 두 개의 숫자로 표현하자.

옥순의 성향 = (I성향,E성향) = (1.9, 0.0)
영식의 성향 = (I성향,E성향) = (2.0, 0.1)
현숙의 성향 = (I성향,E성향) = (0.0, 1.5)

(1) 옥순과 영식의 궁합 \(\approx\) 옥순의I성향\(\times\)영식의I성향 \(+\) 옥순의E성향\(\times\)영식의E성향 // 적합

# Trait vectors written as (I-score, E-score) column vectors of shape (2, 1).
a1 = np.array([[1.9], [0.0]])  # traits of 옥순
b1 = np.array([[2.0], [0.1]])  # traits of 영식
# Compatibility score = inner product of the two trait vectors.
np.sum(a1 * b1)

3.8

(2) 현숙과 영식의 궁합 \(\approx\) 현숙의I성향\(\times\)영식의I성향 \(+\) 현숙의E성향\(\times\)영식의E성향 // 예측

# Column vector (I-score, E-score) used for the prediction example.
a6 = np.array([[0.0], [1.5]])
# Predicted compatibility with b1 = inner product of the trait vectors.
np.sum(a6 * b1)

0.15000000000000002

그럴듯함..

- 모델링

아래가 같음을 관찰하라. (차원만 다름)

(a1*b1).sum(), a1.T@b1

(3.8, array([[3.8]]))

(a6*b1).sum(), a6.T@b1

(0.15000000000000002, array([[0.15]]))

만약에 여자의성향, 남자의성향을 적당한 매트릭스로 정리할 수 있다면 궁합매트릭스를 만들 수 있음

# One (I-score, E-score) column vector of shape (2, 1) per woman.
a1 = np.array([[1.9], [0.0]])
a2 = np.array([[2.0], [0.1]])
a3 = np.array([[2.5], [1.0]])
a4 = np.array([[0.1], [1.9]])
a5 = np.array([[0.2], [2.1]])
a6 = np.array([[0.0], [1.5]])
# Place the columns side by side: A has shape (2, 6).
A = np.hstack((a1, a2, a3, a4, a5, a6))
A

array([[1.9, 2. , 2.5, 0.1, 0.2, 0. ],
       [0. , 0.1, 1. , 1.9, 2.1, 1.5]])

# One (I-score, E-score) column vector of shape (2, 1) per man.
b1 = np.array([[2.0], [0.1]])
b2 = np.array([[1.9], [0.2]])
b3 = np.array([[1.8], [0.3]])
b4 = np.array([[0.3], [2.1]])
b5 = np.array([[0.2], [2.0]])
b6 = np.array([[0.1], [1.9]])
# Place the columns side by side: B has shape (2, 6).
B = np.hstack((b1, b2, b3, b4, b5, b6))
B

array([[2. , 1.9, 1.8, 0.3, 0.2, 0.1],
       [0.1, 0.2, 0.3, 2.1, 2. , 1.9]])

A.T@B

array([[3.8 , 3.61, 3.42, 0.57, 0.38, 0.19],
       [4.01, 3.82, 3.63, 0.81, 0.6 , 0.39],
       [5.1 , 4.95, 4.8 , 2.85, 2.5 , 2.15],
       [0.39, 0.57, 0.75, 4.02, 3.82, 3.62],
       [0.61, 0.8 , 0.99, 4.47, 4.24, 4.01],
       [0.15, 0.3 , 0.45, 3.15, 3.  , 2.85]])

a1.T@b1, a2.T@b2, a3.T@b1

(array([[3.8]]), array([[3.82]]), array([[5.1]]))

결국 모형은 아래와 같다.

\[\text{궁합매트릭스} = {\bf A}^\top {\bf B} + \text{오차}\]

- 학습전략: 아래의 매트릭스중에서 어떤값은 관측하였고 어떤값은 관측하지 못함 \(\to\) 관측한 값들만 대충 비슷하게 하면 되는거 아니야?

A.T@B

array([[3.8 , 3.61, 3.42, 0.57, 0.38, 0.19],
       [4.01, 3.82, 3.63, 0.81, 0.6 , 0.39],
       [5.1 , 4.95, 4.8 , 2.85, 2.5 , 2.15],
       [0.39, 0.57, 0.75, 4.02, 3.82, 3.62],
       [0.61, 0.8 , 0.99, 4.47, 4.24, 4.01],
       [0.15, 0.3 , 0.45, 3.15, 3.  , 2.85]])

df_view

	영식	영철	영호	광수	상철	영수
옥순	3.9	4.1	NaN	0.5	0.3	NaN
영자	4.5	NaN	3.7	0.5	NaN	0.2
정숙	NaN	4.9	4.7	NaN	1.2	1.3
영숙	0.6	0.2	NaN	4.1	4.3	NaN
순자	0.7	0.9	NaN	4.2	NaN	3.9
현숙	NaN	0.2	0.3	NaN	3.5	3.4

- 자료를 아래와 같이 정리한다면?

# Reshape the wide score table into long format: one (row label, column label,
# score) record per observed cell. NaN cells (unobserved pairs) are left out.
observed = [
    (f, m, df_view.at[f, m])
    for f in df_view.index
    for m in df_view.columns
    if not np.isnan(df_view.at[f, m])
]
df = pd.DataFrame(observed)
df.columns = ['X1','X2','y']
df

	X1	X2	y
0	옥순	영식	3.9
1	옥순	영철	4.1
2	옥순	광수	0.5
3	옥순	상철	0.3
4	영자	영식	4.5
5	영자	영호	3.7
6	영자	광수	0.5
7	영자	영수	0.2
8	정숙	영철	4.9
9	정숙	영호	4.7
10	정숙	상철	1.2
11	정숙	영수	1.3
12	영숙	영식	0.6
13	영숙	영철	0.2
14	영숙	광수	4.1
15	영숙	상철	4.3
16	순자	영식	0.7
17	순자	영철	0.9
18	순자	광수	4.2
19	순자	영수	3.9
20	현숙	영철	0.2
21	현숙	영호	0.3
22	현숙	상철	3.5
23	현숙	영수	3.4

# Give every distinct name an integer id, numbered in order of first
# appearance in the corresponding column of df.
mapp1 = {name: idx for idx, name in enumerate(df.X1.unique())}
mapp2 = {name: idx for idx, name in enumerate(df.X2.unique())}
mapp1,mapp2

({'옥순': 0, '영자': 1, '정숙': 2, '영숙': 3, '순자': 4, '현숙': 5},
 {'영식': 0, '영철': 1, '광수': 2, '상철': 3, '영호': 4, '영수': 5})

# Look up the integer id of every name, then expand the ids into one-hot
# float rows (one row per observation in df).
X1 = torch.nn.functional.one_hot(
    torch.tensor([mapp1[name] for name in df.X1])
).float()
X2 = torch.nn.functional.one_hot(
    torch.tensor([mapp2[name] for name in df.X2])
).float()
# Observed scores as a float32 target vector.
y = torch.tensor(df.y).float()

X1

tensor([[1., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0.],
        [0., 0., 0., 1., 0., 0.],
        [0., 0., 0., 1., 0., 0.],
        [0., 0., 0., 1., 0., 0.],
        [0., 0., 0., 1., 0., 0.],
        [0., 0., 0., 0., 1., 0.],
        [0., 0., 0., 0., 1., 0.],
        [0., 0., 0., 0., 1., 0.],
        [0., 0., 0., 0., 1., 0.],
        [0., 0., 0., 0., 0., 1.],
        [0., 0., 0., 0., 0., 1.],
        [0., 0., 0., 0., 0., 1.],
        [0., 0., 0., 0., 0., 1.]])

- yhat을 구하는 과정..

# One linear layer per side: 6 input features -> 2 outputs, which the notes
# read as an I-tendency and an E-tendency score.
l1 = torch.nn.Linear(6, 2)  # female cast
l2 = torch.nn.Linear(6, 2)  # male cast

l1(X1) # traits of the women (옥순~현숙), one row per observation

tensor([[-0.1484,  0.1981],
        [-0.1484,  0.1981],
        [-0.1484,  0.1981],
        [-0.1484,  0.1981],
        [-0.1659,  0.7817],
        [-0.1659,  0.7817],
        [-0.1659,  0.7817],
        [-0.1659,  0.7817],
        [ 0.1933,  0.7089],
        [ 0.1933,  0.7089],
        [ 0.1933,  0.7089],
        [ 0.1933,  0.7089],
        [ 0.0649,  0.3645],
        [ 0.0649,  0.3645],
        [ 0.0649,  0.3645],
        [ 0.0649,  0.3645],
        [ 0.3438,  0.2501],
        [ 0.3438,  0.2501],
        [ 0.3438,  0.2501],
        [ 0.3438,  0.2501],
        [-0.2503,  0.4676],
        [-0.2503,  0.4676],
        [-0.2503,  0.4676],
        [-0.2503,  0.4676]], grad_fn=<AddmmBackward0>)

l2(X2) # traits of the men (영식~영수), one row per observation

tensor([[ 0.2230,  0.3115],
        [-0.1752, -0.0627],
        [ 0.2852,  0.4847],
        [-0.2893,  0.2159],
        [ 0.2230,  0.3115],
        [ 0.1466, -0.0453],
        [ 0.2852,  0.4847],
        [-0.4553,  0.3573],
        [-0.1752, -0.0627],
        [ 0.1466, -0.0453],
        [-0.2893,  0.2159],
        [-0.4553,  0.3573],
        [ 0.2230,  0.3115],
        [-0.1752, -0.0627],
        [ 0.2852,  0.4847],
        [-0.2893,  0.2159],
        [ 0.2230,  0.3115],
        [-0.1752, -0.0627],
        [ 0.2852,  0.4847],
        [-0.4553,  0.3573],
        [-0.1752, -0.0627],
        [ 0.1466, -0.0453],
        [-0.2893,  0.2159],
        [-0.4553,  0.3573]], grad_fn=<AddmmBackward0>)

- 몇개의 관측치만 생각해보자..

df.head()

	X1	X2	y
0	옥순	영식	3.9
1	옥순	영철	4.1
2	옥순	광수	0.5
3	옥순	상철	0.3
4	영자	영식	4.5

(l1(X1)[0]*l2(X2)[0]).sum() # (traits of 옥순 * traits of 영식).sum()

tensor(0.0286, grad_fn=<SumBackward0>)

이 값이 실제로는 3.9 이어야 한다.

(l1(X1)[1]*l2(X2)[1]).sum() # (traits of 옥순 * traits of 영철).sum()

tensor(0.0136, grad_fn=<SumBackward0>)

이 값이 실제로는 4.1 이어야 한다.

- yhat을 구하면!

yhat = (l1(X1) * l2(X2)).sum(axis=1) # row-wise inner product; .sum(axis=1) gives the same result as .sum(1)
yhat

tensor([ 0.0286,  0.0136,  0.0537,  0.0857,  0.2065, -0.0597,  0.3316,  0.3548,
        -0.0783, -0.0038,  0.0971,  0.1652,  0.1280, -0.0342,  0.1952,  0.0599,
         0.1546, -0.0759,  0.2193, -0.0672,  0.0145, -0.0579,  0.1734,  0.2810],
       grad_fn=<SumBackward1>)

yhat[:2],y[:2] # these values should be close to each other, but they are not yet

(tensor([0.0286, 0.0136], grad_fn=<SliceBackward0>), tensor([3.9000, 4.1000]))

- 0~5 까지의 범위로 고정되어 있으니까 아래와 같이 해도 되겠음..

sig = torch.nn.Sigmoid() # range: 0~1

yhat = sig((l1(X1) * l2(X2)).sum(axis=1))*5 # .sum(axis=1) is the same as .sum(1); sigmoid scaled by 5 -> range: 0~5
yhat

tensor([2.5357, 2.5170, 2.5671, 2.6071, 2.7572, 2.4254, 2.9108, 2.9389, 2.4021,
        2.4953, 2.6213, 2.7061, 2.6598, 2.4572, 2.7432, 2.5749, 2.6928, 2.4052,
        2.7730, 2.4161, 2.5182, 2.4277, 2.7162, 2.8490],
       grad_fn=<MulBackward0>)

loss = torch.mean((y-yhat)**2)
loss

tensor(3.4368, grad_fn=<MeanBackward0>)

torch를 이용한 학습

torch.manual_seed(43052)
# One 6 -> 2 linear layer per side (l1 is applied to X1, l2 to X2).
l1 = torch.nn.Linear(in_features=6, out_features=2)
l2 = torch.nn.Linear(in_features=6, out_features=2)
sig = torch.nn.Sigmoid()

loss_fn = torch.nn.MSELoss()
# A single Adam optimizer updates the parameters of both layers.
optimizr = torch.optim.Adam([*l1.parameters(), *l2.parameters()])

for epoch in range(5000):
    # 1) forward pass: row-wise inner product of the two trait vectors,
    #    squashed into the 0~5 score range.
    feature1, feature2 = l1(X1), l2(X2)
    matching_score = (feature1 * feature2).sum(dim=1)
    # If the scores ranged over 1~3 instead, "1 + sig(matching_score)*2"
    # would be the analogous scaling.
    yhat = 5 * sig(matching_score)
    # 2) loss against the observed scores
    loss = loss_fn(yhat, y)
    # 3) backpropagate
    loss.backward()
    # 4) parameter update, then clear gradients for the next iteration
    optimizr.step()
    optimizr.zero_grad()

yhat

tensor([3.9382, 4.0624, 0.4665, 0.3353, 4.5038, 3.6975, 0.3562, 0.3558, 4.8614,
        4.7208, 1.1813, 1.3158, 0.4606, 0.3573, 4.1288, 4.2734, 0.8611, 0.7347,
        4.0493, 4.0464, 0.1810, 0.3124, 3.5031, 3.3948],
       grad_fn=<MulBackward0>)

tensor([3.9000, 4.1000, 0.5000, 0.3000, 4.5000, 3.7000, 0.5000, 0.2000, 4.9000,
        4.7000, 1.2000, 1.3000, 0.6000, 0.2000, 4.1000, 4.3000, 0.7000, 0.9000,
        4.2000, 3.9000, 0.2000, 0.3000, 3.5000, 3.4000])

l1(X1) # the second column can be "interpreted" as the I-tendency score

tensor([[-1.4663,  0.2938],
        [-1.4663,  0.2938],
        [-1.4663,  0.2938],
        [-1.4663,  0.2938],
        [-1.7086,  0.6597],
        [-1.7086,  0.6597],
        [-1.7086,  0.6597],
        [-1.7086,  0.6597],
        [-0.8705,  1.2945],
        [-0.8705,  1.2945],
        [-0.8705,  1.2945],
        [-0.8705,  1.2945],
        [ 1.1046, -0.8298],
        [ 1.1046, -0.8298],
        [ 1.1046, -0.8298],
        [ 1.1046, -0.8298],
        [ 0.9880, -0.5193],
        [ 0.9880, -0.5193],
        [ 0.9880, -0.5193],
        [ 0.9880, -0.5193],
        [ 0.6834, -1.2201],
        [ 0.6834, -1.2201],
        [ 0.6834, -1.2201],
        [ 0.6834, -1.2201]], grad_fn=<AddmmBackward0>)

포인트: 여성출연자중, 정숙은 대체로 잘 맞춰주고 현숙은 그렇지 않았음.. \(\to\) 그러한 가중치가 잘 드러남!!

fastai를 이용한 학습

(1) dls

df.head() # product of the preprocessing done in the earlier step

	X1	X2	y
0	옥순	영식	3.9
1	옥순	영철	4.1
2	옥순	광수	0.5
3	옥순	상철	0.3
4	영자	영식	4.5

dls = CollabDataLoaders.from_df(df,bs=2,valid_pct=2/24) # bs: batch size; valid_pct=2/24 presumably holds out 2 of the 24 rows for validation

(2) lrnr 생성

lrnr = collab_learner(dls,n_factors=2,y_range=(0,5))

(3) 학습

lrnr.fit(30,lr=0.05)

epoch	train_loss	valid_loss	time
0	0.005521	0.306862	00:00
1	0.006144	0.246958	00:00
2	0.006997	0.300838	00:00
3	0.009465	0.193282	00:00
4	0.011386	0.157935	00:00
5	0.011837	0.273318	00:00
6	0.011834	0.170711	00:00
7	0.011649	0.245928	00:00
8	0.012505	0.198697	00:00
9	0.014821	0.153817	00:00
10	0.012487	0.144184	00:00
11	0.011637	0.164051	00:00
12	0.011798	0.189932	00:00
13	0.012036	0.163537	00:00
14	0.012818	0.203912	00:00
15	0.017325	0.210955	00:00
16	0.024745	0.143737	00:00
17	0.025496	0.172830	00:00
18	0.025869	0.138098	00:00
19	0.025482	0.151525	00:00
20	0.027537	0.193854	00:00
21	0.024163	0.109432	00:00
22	0.020186	0.167370	00:00
23	0.017565	0.107690	00:00
24	0.015754	0.160082	00:00
25	0.013752	0.115723	00:00
26	0.012612	0.105396	00:00
27	0.011966	0.094555	00:00
28	0.014367	0.162134	00:00
29	0.013150	0.175142	00:00

(4) 예측

적합값 확인

lrnr.show_results()

	X1	X2	y	y_pred
0	1.0	3.0	3.9	3.740652
1	6.0	2.0	3.5	4.069994

(옥순의 궁합)

df_new = pd.DataFrame({'X1':['옥순']*6, 'X2':['영식','영철','영호','광수','상철','영수']})
df_new

	X1	X2
0	옥순	영식
1	옥순	영철
2	옥순	영호
3	옥순	광수
4	옥순	상철
5	옥순	영수

lrnr.get_preds(dl=dls.test_dl(df_new))

(tensor([3.9063, 4.1200, 3.2875, 0.5278, 0.1878, 0.3123]), None)

비교를 위해서

df_view

	영식	영철	영호	광수	상철	영수
옥순	3.9	4.1	NaN	0.5	0.3	NaN
영자	4.5	NaN	3.7	0.5	NaN	0.2
정숙	NaN	4.9	4.7	NaN	1.2	1.3
영숙	0.6	0.2	NaN	4.1	4.3	NaN
순자	0.7	0.9	NaN	4.2	NaN	3.9
현숙	NaN	0.2	0.3	NaN	3.5	3.4

(정숙의 궁합)

df_new = pd.DataFrame({'X1':['정숙']*6, 'X2':['영식','영철','영호','광수','상철','영수']})
df_new

	X1	X2
0	정숙	영식
1	정숙	영철
2	정숙	영호
3	정숙	광수
4	정숙	상철
5	정숙	영수

lrnr.get_preds(dl=dls.test_dl(df_new))

(tensor([4.7749, 4.8766, 4.7028, 1.7205, 0.5784, 1.1272]), None)

비교를 위해서

df_view

	영식	영철	영호	광수	상철	영수
옥순	3.9	4.1	NaN	0.5	0.3	NaN
영자	4.5	NaN	3.7	0.5	NaN	0.2
정숙	NaN	4.9	4.7	NaN	1.2	1.3
영숙	0.6	0.2	NaN	4.1	4.3	NaN
순자	0.7	0.9	NaN	4.2	NaN	3.9
현숙	NaN	0.2	0.3	NaN	3.5	3.4

- Appendix: fastai 구조공부..

lrnr.model

EmbeddingDotBias(
  (u_weight): Embedding(7, 2)
  (i_weight): Embedding(7, 2)
  (u_bias): Embedding(7, 1)
  (i_bias): Embedding(7, 1)
)

lrnr.model.forward??

Signature: lrnr.model.forward(x)
Docstring:
Defines the computation performed at every call.
Should be overridden by all subclasses.
.. note::
    Although the recipe for forward pass needs to be defined within
    this function, one should call the :class:`Module` instance afterwards
    instead of this since the former takes care of running the
    registered hooks while the latter silently ignores them.
Source:   
    def forward(self, x):
        users,items = x[:,0],x[:,1]
        dot = self.u_weight(users)* self.i_weight(items)
        res = dot.sum(1) + self.u_bias(users).squeeze() + self.i_bias(items).squeeze()
        if self.y_range is None: return res
        return torch.sigmoid(res) * (self.y_range[1]-self.y_range[0]) + self.y_range[0]
File:      ~/anaconda3/envs/py37/lib/python3.7/site-packages/fastai/collab.py
Type:      method

bias를 제외하면 우리가 짠 모형과 같음!

커피 or 홍차

data

- 예전에 살펴본 예제

df = pd.read_csv('https://raw.githubusercontent.com/guebin/DL2022/main/posts/I.%20Overview/2022-09-08-rcmd_anal.csv')
df

	user	item	rating	item_name
0	1	15	1.084308	홍차5
1	1	1	4.149209	커피1
2	1	11	1.142659	홍차1
3	1	5	4.033415	커피5
4	1	4	4.078139	커피4
...	...	...	...	...
995	100	18	4.104276	홍차8
996	100	17	4.164773	홍차7
997	100	14	4.026915	홍차4
998	100	4	0.838720	커피4
999	100	7	1.094826	커피7

1000 rows × 4 columns

- 기억을 살리기 위해서..

df_view = pd.read_csv('https://raw.githubusercontent.com/guebin/DL2022/main/posts/I.%20Overview/2022-09-08-rcmd_view.csv')
df_view

	커피1	커피2	커피3	커피4	커피5	커피6	커피7	커피8	커피9	커피10	홍차1	홍차2	홍차3	홍차4	홍차5	홍차6	홍차7	홍차8	홍차9	홍차10
0	4.149209	NaN	NaN	4.078139	4.033415	4.071871	NaN	NaN	NaN	NaN	1.142659	1.109452	NaN	0.603118	1.084308	NaN	0.906524	NaN	NaN	0.903826
1	4.031811	NaN	NaN	3.822704	NaN	NaN	NaN	4.071410	3.996206	NaN	NaN	0.839565	1.011315	NaN	1.120552	0.911340	NaN	0.860954	0.871482	NaN
2	4.082178	4.196436	NaN	3.956876	NaN	NaN	NaN	4.450931	3.972090	NaN	NaN	NaN	NaN	0.983838	NaN	0.918576	1.206796	0.913116	NaN	0.956194
3	NaN	4.000621	3.895570	NaN	3.838781	3.967183	NaN	NaN	NaN	4.105741	1.147554	NaN	1.346860	NaN	0.614099	1.297301	NaN	NaN	NaN	1.147545
4	NaN	NaN	NaN	NaN	3.888208	NaN	3.970330	3.979490	NaN	4.010982	NaN	0.920995	1.081111	0.999345	NaN	1.195183	NaN	0.818332	1.236331	NaN
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
95	0.511905	1.066144	NaN	1.315430	NaN	1.285778	NaN	0.678400	1.023020	0.886803	NaN	4.055996	NaN	NaN	4.156489	4.127622	NaN	NaN	NaN	NaN
96	NaN	1.035022	NaN	1.085834	NaN	0.812558	NaN	1.074543	NaN	0.852806	3.894772	NaN	4.071385	3.935935	NaN	NaN	3.989815	NaN	NaN	4.267142
97	NaN	1.115511	NaN	1.101395	0.878614	NaN	NaN	NaN	1.329319	NaN	4.125190	NaN	4.354638	3.811209	4.144648	NaN	NaN	4.116915	3.887823	NaN
98	NaN	0.850794	NaN	NaN	0.927884	0.669895	NaN	NaN	0.665429	1.387329	NaN	NaN	4.329404	4.111706	3.960197	NaN	NaN	NaN	3.725288	4.122072
99	NaN	NaN	1.413968	0.838720	NaN	NaN	1.094826	0.987888	NaN	1.177387	3.957383	4.136731	NaN	4.026915	NaN	NaN	4.164773	4.104276	NaN	NaN

100 rows × 20 columns

모형

(편의상 바이어스를 제외하면)

- 특징벡터:

유저1의 취향 = [커피를 좋아하는 정도, 홍차를 좋아하는 정도]
아이템1의 특징 = [커피인 특징, 홍차인 특징]

- 평점

유저1이 아이템1을 먹었을 경우 평점: 유저1의 취향과 아이템1의 특징의 내적 = (유저1의 취향 \(\odot\) 아이템1의 특징).sum()

학습

(1) dls

dls = CollabDataLoaders.from_df(df)

dls.items

	user	item	rating	item_name
192	20	1	3.933610	커피1
794	80	12	4.125577	홍차2
554	56	17	3.826543	홍차7
524	53	3	1.170372	커피3
175	18	10	4.170460	커피10
...	...	...	...	...
896	90	12	4.391382	홍차2
849	85	3	0.693932	커피3
746	75	12	4.301711	홍차2
787	79	14	3.930048	홍차4
100	11	20	1.145191	홍차10

800 rows × 4 columns

(2) lrnr

lrnr = collab_learner(dls,n_factors=2) # the textbook sets y_range, but leaving it unset makes little difference to the fit

(3) fit

lrnr.fit(10,0.1)

epoch	train_loss	valid_loss	time
0	5.409556	3.313535	00:00
1	3.724468	2.569444	00:00
2	2.855262	1.630986	00:00
3	2.051633	0.469137	00:00
4	1.483525	0.264474	00:00
5	1.096932	0.178709	00:00
6	0.824759	0.117894	00:00
7	0.630313	0.081575	00:00
8	0.487037	0.076569	00:00
9	0.380992	0.076578	00:00

(4) predict

(적합된 값 확인)

lrnr.show_results() # the displayed rows differ on every run

	user	item	rating	rating_pred
0	61.0	19.0	4.160296	4.037053
1	22.0	4.0	4.192549	3.940574
2	17.0	17.0	1.096392	0.967445
3	14.0	4.0	3.826174	4.002016
4	88.0	5.0	1.197540	0.968678
5	53.0	15.0	3.859582	3.966616
6	83.0	5.0	0.752025	0.789191
7	10.0	11.0	0.676153	0.978221
8	46.0	17.0	0.833476	0.908008

(예측값)

df_new = pd.DataFrame({'user':[1,1,1,1], 'item':[9,10,11,12]})
df_new

	user	item
0	1	9
1	1	10
2	1	11
3	1	12

lrnr.get_preds(dl=dls.test_dl(df_new))

(tensor([4.0201, 4.0401, 0.9940, 0.8291]), None)