首先给出论文的地址:Deep Interest Network for Click-Through Rate Prediction
然后给出两篇对论文进行了详细介绍的文章:
王喆:推荐系统中的注意力机制——阿里深度兴趣网络(DIN)
梁勇:推荐系统遇上深度学习(十八)——探秘阿里深度兴趣网络浅析及实现
建议先读完上面两篇文章,对模型有大概的了解之后再来读本篇文章。本文主要从代码的层面出发,逐行去分析理解论文中思想的具体实现。
本文使用的代码地址为:zhougr1993/DeepInterestNetwork
开始!
源码介绍

论文中用的是 Amazon Product Data 数据,包含两个文件:reviews_Electronics_5.json 和 meta_Electronics.json,具体数据形式在本文最后有展示。文件格式链接中有说明,其中 reviews 主要是用户买了相关商品产生的上下文信息,包括商品 id、时间、评论等;meta 文件是关于商品本身的信息,包括商品 id、名称、类别、买了还买等信息。其中 meta 部分为商品信息,reviewer 为点击者信息,转换为 dataframe 格式之后的具体字段如下:
meta_df: ['asin', 'imUrl', 'description', 'categories', 'title', 'price', 'salesRank', 'related', 'brand']
reviews_df: ['reviewerID', 'asin', 'reviewerName', 'helpful', 'reviewText', 'overall', 'summary', 'unixReviewTime', 'reviewTime']
源码解析

一、数据预处理

二、生成正负样本

build_dataset.py 代码详解如下:
import random! e( C5 V7 c5 @( x& q
import pickle5 W( A+ R. e7 e0 F+ H
random.seed(1234)9 j* r: z" b: f% [5 O8 `
withopen('C:/Users/Administrator/Desktop/raw_data/remap.pkl','rb')asf:% x6 r5 u+ S& B
reviews_df=pickle.load(f)
( V* L, Z) B4 ]* O2 s cate_list=pickle.load(f)
' N2 H' E# j; x. z/ B# h user_count,item_count,cate_count,example_count=pickle.load(f)9 i4 z3 q2 i3 H
'''
9 M5 Q# f$ w+ ^$ \pos_list(每个点击者点击的商品 ID 组成的 list)例如: [8] [9,6,4,5] [3] [8] ) T! C7 S# J- H* S+ E7 K# D; T
'''
) {, G p) \' dtrain_set=[] 3 L8 Y# L9 J7 d3 l9 T; C
test_set=[]
7 I1 s. ]+ a& H+ D3 vfor reviewerID,histinreviews_df.groupby('reviewerID'):
/ l- F, O8 U4 |$ ~% h pos_list=hist['asin'].tolist()
; F6 g8 C" b7 p1 I/ l5 v defgen_neg():
; ?% L* a3 J) C: T #取每个用户点击列表的第一个商品 0 `' C; O1 v$ c5 g9 F
neg=pos_list[0]
3 O: p f- P# L. J while neg in pos_list:
: _8 M/ k% S4 B8 O8 q #随机初始化,即给点击者随机初始化一个商品,item_count-1 为商品数
' P3 ]% V, ]. ~% e neg=random.randint(0,item_count-1)
0 z9 t* q) r3 r) K4 ~) i return neg
+ g+ r8 z. X; }& W# b/ ` neg_list=[gen_neg() for i in range(len(pos_list))]
8 _ a/ E! C4 {
$ s* i' h1 z# b4 U8 `& H#如果用户点击的商品数大于 1,则循环 5 ^# u- F/ h7 x* D4 z
for i in range(1,len(pos_list)):
8 N# s8 T* D+ C& N, `. j" C- W hist=pos_list[:i] . f" M: Q' B$ B [& f
#print(hist)
8 }8 f X8 h: W9 y2 j& v! A! W ''' s' X* A0 i# I
#下面的 if 语句控制正负样本的个数和格式),例如某用户点击过 abcd 四 个商品,7 K+ W( |6 X ]; y
则最终生成的样本为:(其中 X 为随机初始化的某商品 ID) - p- C2 ]) _, u. R4 W# @
((user_id,a,(b,1)) (user_id,a,(X,0)) (user_id,(a,b),(c,1))
& b& [! [' m+ b# W, ^; g user_id,(a,b),(X0)) (user_id,(a,b,c),(d,1)) (user_id,(a,b,c),(X,0)) + F0 i1 X3 S* P
'''* D5 ^) r& ?' X3 a
if i != len(pos_list) - 1: - T9 M' Q3 h2 k! D
train_set.append((reviewerID,hist,pos_list,1))
: Y* q' L( [+ g" Q! { print(train_set)
! w) p2 C! Z: Z2 a& F4 a0 q train_set.append((reviewerID,hist,neg_list,0))
6 m, J1 L. U4 l/ ` print(train_set) 5 m5 [3 ?. ?4 f/ O3 v# ~/ ]9 q+ Y
#验证集格式(user_id,a,(b,X))
( k$ x8 d) G1 h. S6 \$ h else: ; A4 o0 }$ N- W3 o' \
label=(pos_list,neg_list) " k5 P1 e, D9 o! a' t
print(label)
4 C7 h( x2 y# ~# m test_set.append((reviewerID,hist,label)) 2 B( b& c ?# Y& l) B
print(test_set)
$ W9 T" y. [- L+ _8 N6 m! f, N) u- Y( ?
#最终的数据集里点击商品数小于 1 的数据删除掉了
9 W5 {5 _$ \$ } n3 s. X* Jrandom.shuffle(train_set) ( Q- W: l0 s, E6 y: r' l8 d" E( \
random.shuffle(test_set)
5 f- H2 x$ \4 T' y1 G' p9 o vassertlen(test_set)==user_count
G7 O& ?4 P, k: i/ q; d+ T9 I$ ?6 E$ E$ S+ V
with open('dataset.pkl','wb') as f:
( u2 P# _1 P1 @+ @pickle.dump(train_set,f,pickle.HIGHEST_PROTOCOL)
; b" C- ~9 U; I2 gpickle.dump(test_set,f,pickle.HIGHEST_PROTOCOL)
/ d* Z r6 F `- J& Bpickle.dump(cate_list,f,pickle.HIGHEST_PROTOCOL) - z; v! K$ B- ]* V7 M1 a; M' m
pickle.dump((user_count, item_count, cate_count), f, pickle.HIGHEST_PROTOCOL)三、模型部分(包括attention机制实现)
下面介绍本算法的核心 model.py 文件,每行关键的代码都给出了注释。
import tensorflow as tf
" S( ~0 R W; c, w) }: w3 l( tfrom Dice import dice
$ w- x. m p/ V4 | D
: {( _/ y+ ]- l/ X @( ^; m" q8 A' oclass Model(object):' r, [ ]$ A' h: u
def __init__(self,user_count,item_count,cate_count,cate_list):
* t) W: s* {8 X) H # shape: [B], user id。 (B:batch size)
4 t) @$ @- i- r2 X4 H8 F self.u = tf.placeholder(tf.int32, [None, ])& a% V$ ]% u- D5 I$ c9 _
# shape: [B] i: 正样本的item
' R" O7 o: C6 ]5 {7 E self.i = tf.placeholder(tf.int32, [None, ])
& \ f6 A5 i) j # shape: [B] j: 负样本的item. Y: O) { ?( C# r V# v
self.j = tf.placeholder(tf.int32, [None, ]); t# {. S8 g9 O
# shape: [B], y: label& X0 R) a/ R) K+ Y& H3 O
self.y = tf.placeholder(tf.float32, [None, ])
; U. \* q. z8 U: E4 j# X/ A # shape: [B, T] #用户行为特征(User Behavior)中的item序列。T为序列长度
6 F0 D# X6 l0 Q7 c* W4 H self.hist_i = tf.placeholder(tf.int32, [None, None])9 W9 s8 y8 |- w z) R3 O; a
# shape: [B]; sl:sequence length,User Behavior中序列的真实序列长度(?)
' ~- D/ ]! B0 o4 L( b self.sl = tf.placeholder(tf.int32, [None, ])
- a- q! I! \: E X3 M3 p #learning rate- c g: u$ i& Y1 p' S3 }7 c5 `
self.lr = tf.placeholder(tf.float64, [])% B4 K5 ^$ _: u" q4 Q' |+ F
$ c" Z: Y# i* v j hidden_units = 1281 H: w& M! U0 ^- c! j4 H. g
# shape: [U, H], user_id的embedding weight. U是user_id的hash bucket size
) j/ o$ ^: z. O- s user_emb_w = tf.get_variable("user_emb_w", [user_count, hidden_units])
3 Y6 ]- i6 Y! |! \1 N; F
& o" Z3 K( u' c0 k' d8 v8 d: m6 {. T/ L # shape: [I, H//2], item_id的embedding weight. I是item_id的hash bucket size
9 }5 B4 T' D3 z" H! L$ a& s item_emb_w = tf.get_variable("item_emb_w", [item_count, hidden_units // 2]) # [I, H//2]
& R6 |7 f; U2 W6 h" s+ D
4 _2 A6 [$ v5 k # shape: [I], bias
, Y4 [) W( r3 B% L item_b = tf.get_variable("item_b", [item_count],initializer=tf.constant_initializer(0.0))
0 C' t) l) v" _: q, A2 } # shape: [C, H//2], cate_id的embedding weight.( P) Z7 J' }; z7 Z: J- [
cate_emb_w = tf.get_variable("cate_emb_w", [cate_count, hidden_units // 2])8 b3 r) n, Y6 K4 B5 g$ x/ l
: B4 w4 a- |8 r( o. ~( f
# shape: [C, H//2]
$ {; ~5 |8 C1 J" u cate_list = tf.convert_to_tensor(cate_list, dtype=tf.int64)) Q9 z9 k8 w+ C
; V) w; z7 c5 ?5 L- W8 f* M # 从cate_list中取出正样本的cate
: f. X- ]! C7 h/ t' _- U ic = tf.gather(cate_list, self.i)! F7 ]' }7 a5 e( e/ y' v& T
# 正样本的embedding,正样本包括item和cate
& q( z$ m) j0 p+ y- R/ o+ H& h i_emb = tf.concat(values=[tf.nn.embedding_lookup(item_emb_w, self.i),tf.nn.embedding_lookup(cate_emb_w, ic),], axis=1)
) f' s% |8 \, m2 q% _ # 偏置b
9 b; T) j4 K6 g4 I) V8 I& i; S i_b = tf.gather(item_b, self.i)' c0 x& I) ~. V9 g: J4 m
9 S M/ S3 S7 B2 s& V$ m
# 从cate_list中取出负样本的cate3 a: C: B+ h# H' [( N& Z2 u/ e
jc = tf.gather(cate_list, self.j)$ Y7 o8 [8 C9 I5 e: ~% k$ G& P. E
# 负样本的embedding,负样本包括item和cate
& L# _7 P* m8 @/ s1 H8 f) ^ j_emb = tf.concat([tf.nn.embedding_lookup(item_emb_w, self.j),tf.nn.embedding_lookup(cate_emb_w, jc),], axis=1)
7 M9 q s, n; s. t4 h+ B5 t # 偏置b8 d% G1 d+ V) ]- h- X* _. w
j_b = tf.gather(item_b, self.j)
3 O6 {2 m( l% ]0 a
: h* _9 q5 @2 |/ i # 用户行为序列(User Behavior)中的cate序列
. |' k/ ]' B7 t0 w# O4 V hc = tf.gather(cate_list, self.hist_i)5 p% H$ L( l' y
/ i9 Y+ F. y Y # 用户行为序列(User Behavior)的embedding,包括item序列和cate序列1 B7 l; |# P, n% c" T
h_emb = tf.concat([tf.nn.embedding_lookup(item_emb_w, self.hist_i),tf.nn.embedding_lookup(cate_emb_w, hc),], axis=2)1 p# k. x4 E1 b" r0 Y* N3 ?
# attention操作9 C" S' T) \% p- P
hist_i = attention(i_emb, h_emb, self.sl)
" f9 x4 w% ~: `* a7 E # -- attention end ---
; \4 E6 r1 J' ?& V1 m) `0 J1 o. A. F& h! t3 P* F
hist = tf.layers.batch_normalization(inputs=hist)6 E# a1 x) S, b9 ?/ Z6 C
hist = tf.reshape(hist,[-1,hidden_units])
% H, z) p( d& D; A& p) P. D #添加一层全连接层,hist为输入,hidden_units为输出维数' {. l4 G' u, r" G4 X, M
hist = tf.layers.dense(hist,hidden_units)
E1 H/ F6 m8 V2 J$ s
7 q" a! k) d& F2 F% \ u_emb = hist
2 _1 Y; ~% J9 O3 J% b7 Z5 b: C* i
Q7 x1 P& p& a6 V2 b$ v #下面两个全连接用来计算y',i为正样本,j为负样本8 Q0 M6 p! _- D
# fcn begin
$ k; ~: e" j; w din_i = tf.concat([u_emb, i_emb], axis=-1)* c5 Z3 \) [- O& z
din_i = tf.layers.batch_normalization(inputs=din_i, name='b1')
6 i0 r& P6 V4 [0 T d_layer_1_i = tf.layers.dense(din_i, 80, activation=None, name='f1')7 G4 V0 t. S! T# [/ v3 F! p, {/ I9 G
d_layer_1_i = dice(d_layer_1_i, name='dice_1_i')( C: n6 O# y/ _" s+ h
d_layer_2_i = tf.layers.dense(d_layer_1_i, 40, activation=None, name='f2')
; j; i: G, H! } d_layer_2_i = dice(d_layer_2_i, name='dice_2_i')
5 H- R' Z6 O" h& G3 m d_layer_3_i = tf.layers.dense(d_layer_2_i, 1, activation=None, name='f3')
/ W! l* k8 J j' l6 {, O* u- V$ {# Q {% ]
din_j = tf.concat([u_emb, j_emb], axis=-1)
, X% P/ M/ G; E6 m: K din_j = tf.layers.batch_normalization(inputs=din_j, name='b1', reuse=True)
( x7 T. _$ `+ ]' q3 s# Y) ?$ k d_layer_1_j = tf.layers.dense(din_j, 80, activation=None, name='f1', reuse=True)
5 ]1 X$ y m+ v) U d_layer_1_j = dice(d_layer_1_j, name='dice_1_j')
7 }/ _$ {- U8 |, U& e4 `5 I d_layer_2_j = tf.layers.dense(d_layer_1_j, 40, activation=None, name='f2', reuse=True)
5 B, m4 m( m: Y, `' j d_layer_2_j = dice(d_layer_2_j, name='dice_2_j')
# f) M" V& m8 t d_layer_3_j = tf.layers.dense(d_layer_2_j, 1, activation=None, name='f3', reuse=True)
, V, Z$ y2 p# a5 q# J0 X- `( [6 ]# S" A; x& U# j
d_layer_3_i = tf.reshape(d_layer_3_i, [-1])% [: N6 T- Z" C8 q6 l$ V
d_layer_3_j = tf.reshape(d_layer_3_j, [-1])9 g% [3 m5 d( N9 _$ }+ w- L
4 V8 u- f' I: N- @4 @# Q- e1 y #预测的(y正-y负)
& f5 _8 K: s$ n0 J* r( U" g x = i_b - j_b + d_layer_3_i - d_layer_3_j # [B]
0 D: Q3 ~1 W5 q: t3 m) _: q #预测的(y正)6 u; b: v, H+ u, Z
self.logits = i_b + d_layer_3_i
8 j6 L) A1 x; _
8 \" j, R% k7 i( o
& x7 H! V9 P" [0 P/ c+ y! k6 z # logits for all item:
8 D0 I$ W1 Z2 X3 Q u_emb_all = tf.expand_dims(u_emb, 1), S4 \( H4 T/ g1 @3 \6 D7 L
u_emb_all = tf.tile(u_emb_all, [1, item_count, 1])/ O7 V$ z# q/ K' O- E9 }/ i
#将所有的除u_emb_all外的embedding,concat到一起
! f+ e, p5 V! U9 D% D8 q( f) T all_emb = tf.concat([item_emb_w,tf.nn.embedding_lookup(cate_emb_w, cate_list)], axis=1)
4 U; L8 o8 e4 F# V7 P5 P5 r all_emb = tf.expand_dims(all_emb, 0)
( K7 I2 e1 L2 W9 y all_emb = tf.tile(all_emb, [512, 1, 1])- z* A! d: N* y9 y
# 将所有的embedding,concat到一起
. s/ a, P9 W' x4 Z8 c din_all = tf.concat([u_emb_all, all_emb], axis=-1)
! V. k% G' Q5 N1 d& w! w/ Z din_all = tf.layers.batch_normalization(inputs=din_all, name='b1', reuse=True)
/ M% U; g0 T* C8 X' X# \% p, M( M0 _/ d d_layer_1_all = tf.layers.dense(din_all, 80, activation=None, name='f1', reuse=True)& O: q' `5 @& J4 A8 h9 C5 Q9 A
d_layer_1_all = dice(d_layer_1_all, name='dice_1_all')
$ l6 G0 M; T0 s. O d_layer_2_all = tf.layers.dense(d_layer_1_all, 40, activation=None, name='f2', reuse=True)0 a% w, A3 A2 W2 v
d_layer_2_all = dice(d_layer_2_all, name='dice_2_all')
& w& H; {- l" ` u d_layer_3_all = tf.layers.dense(d_layer_2_all, 1, activation=None, name='f3', reuse=True)
* D3 j& h' s6 _4 o5 ~ T/ a9 c d_layer_3_all = tf.reshape(d_layer_3_all, [-1, item_count])
9 s; N; L/ V0 m3 H g. M" a* g/ b2 \9 d( v
self.logits_all = tf.sigmoid(item_b + d_layer_3_all)
0 U' h( s: p; |9 M+ C# |$ U # -- fcn end -------
; f1 y: Q* m/ p6 d7 }
1 d" \- X6 M% [0 W self.mf_auc = tf.reduce_mean(tf.to_float(x > 0))
( o( Z" g+ p8 a8 v @+ u self.score_i = tf.sigmoid(i_b + d_layer_3_i)( n6 M; p4 K/ \5 Q3 j
self.score_j = tf.sigmoid(j_b + d_layer_3_j)$ N4 w" n0 X* f! m
self.score_i = tf.reshape(self.score_i, [-1, 1])
8 W' L) G! s! a self.score_j = tf.reshape(self.score_j, [-1, 1])
( o& a3 c( ?# H2 T7 ~6 q( K self.p_and_n = tf.concat([self.score_i, self.score_j], axis=-1)
G6 w& @2 S1 n8 s: R1 E* g. Q1 V& W4 S) }% {
# Step variable, M: p% X& p2 S
self.global_step = tf.Variable(0, trainable=False, name='global_step')! y9 D7 P7 t( ~6 u3 c. P
self.global_epoch_step = tf.Variable(0, trainable=False, name='global_epoch_step')
) t. n- W c4 \" l0 U3 C* ?3 C self.global_epoch_step_op = tf.assign(self.global_epoch_step, self.global_epoch_step + 1)
+ I. u* }0 H, `( G' ?# P3 `
! q. T" l$ Y1 r, z5 A1 Y* |- V # loss and train3 X; e! S) D# |' p
self.loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits,labels=self.y))
+ J+ a" ? S, \9 @6 C. e$ h trainable_params = tf.trainable_variables()
( {7 a k2 H& E self.train_op = tf.train.GradientDescentOptimizer(learning_rate=self.lr).minimize(self.loss)0 ?4 e. `" v& Q. Z7 S
c: `( s6 ^ m9 G2 k2 y5 U
def train(self,sess,uij,l):
2 H7 w4 @6 }1 x, T- D loss,_ = sess.run([self.loss,self.train_op],feed_dict={: L& Q# D1 C0 [' W1 a5 r
#self.u : uij[0],
1 ~- J3 R/ }- U9 F self.i : uij[1],- }" Z O9 H# o6 A" G4 n: c
self.y : uij[2],- W* k G' t% Q r
self.hist_i : uij[3],
2 X; q* p& `; E. z" ]& W self.sl : uij[4],$ X8 O- m( U2 ]+ h4 C3 b8 i0 l
self.lr : l
0 g j ?0 S n8 i })3 Z# h" k- t! d
return loss
' i* F8 q' o1 m; Z
0 `) c$ C I9 [ def eval(self, sess, uij):
! }* r( Z0 p1 I/ x4 W' i; v u_auc, socre_p_and_n = sess.run([self.mf_auc, self.p_and_n], feed_dict={
+ J0 i* c0 Y1 o1 f7 s #self.u: uij[0]," V/ L; t7 Q2 Q ?
self.i: uij[1],#正样本
: W. B/ W# J) @8 Z2 o$ X; x' o self.j: uij[2],#负样本; S# Q3 C. W: {$ t* h! i
self.hist_i: uij[3],$ [: a. {, v+ S1 B; M
self.sl: uij[4],/ t3 v5 V3 j: o. B5 d
})
) \2 l: T! x3 n: u0 V return u_auc, socre_p_and_n
( D3 m5 c. l1 i3 c3 r0 r* |) @7 B8 ]* M7 v' `! g \" K* m
def test(self, sess, uid, hist_i, sl):9 F: n2 q5 c- s
return sess.run(self.logits_all, feed_dict={* G1 X% M `1 [: B8 J7 m
self.u: uid,1 ~2 ~* i: v; s1 H5 Z, l! c2 P
self.hist_i: hist_i,
# h3 y! A- ^, a" W4 b* A. u$ ]5 | self.sl: sl,7 k) l. |: C3 G6 x4 ^
})
0 U; A; t' L+ h0 [( k9 L
5 }6 p6 G9 R( g2 a def save(self, sess, path):$ K8 Y! o L2 g; y6 D+ U K. u/ S
saver = tf.train.Saver(), G2 J! Y* Q; m5 V6 B
saver.save(sess, save_path=path)6 g6 C+ b) n# @0 Z5 B
$ }2 l1 y) X$ v2 H. i F2 q def restore(self, sess, path):2 I4 T6 n# U8 j+ ?6 [( P$ ~2 ^
saver = tf.train.Saver()
! \& \" y2 l9 c, k7 b) K saver.restore(sess, save_path=path)# P+ S+ }, x$ b( X/ X
def extract_axis_1(data, ind):
8 B6 @0 d* d' N4 u batch_range = tf.range(tf.shape(data)[0])1 g2 d+ l% K% ~; [& w
indices = tf.stack([batch_range, ind], axis=1)/ y8 G) N b0 B+ u T ?9 v
res = tf.gather_nd(data, indices)
5 ~1 X: E: b2 }* {6 b1 g# s return res
#item_embedding,history_behivior_embedding,sequence_length8 t/ d% u* V6 a, r
def attention(queries,keys,keys_length):
: x! c$ ~, t/ D. U. [4 y '''0 c. Y# M4 W/ u/ u* c7 m# A( t3 V
queries: [B, H] [batch_size,embedding_size]& U: C, y- z8 ]. l: \2 Y
keys: [B, T, H] [batch_size,T,embedding_size]( k Q, H1 b5 u- A" p
keys_length: [B] [batch_size]; R% r! [+ t& w+ H3 K
#T为历史行为序列长度
% Q3 G9 x/ [, ~. ~3 N8 O '''
0 P) }, ]/ { ?: ~/ E$ d# ~
: Q0 C. j( \) J" v #(?,32)->(None,32)->32- Z) s Q' H: v# s0 H' y
# tile()函数是用来对张量(Tensor)进行扩展的,其特点是对当前张量内的数据进行一定规则的复制。最终的输出张量维度不变
- G$ y6 [- B8 n' Q" ^, E* ~ # tf.shape(keys)[1]==T6 P: x2 t# W3 i* L/ m) x* L
# 对queries的维度进行reshape1 M& C0 [9 \# d7 L a
# (?,T,32)这里是为了让queries和keys的维度相同而做的操作
1 n9 w: s- S$ w& S # (?,T,128)把u和v以及u v的element wise差值向量合并起来作为输入,3 m" g( e7 {( p7 Q# A
# 然后喂给全连接层,最后得出两个item embedding,比如u和v的权重,即g(Vi,Va)+ ]# ?) n0 l; H) h3 U( z! y
% }! c. Q' }4 e6 y
queries_hidden_units = queries.get_shape().as_list()[-1]
H$ N, R+ p; p8 [1 t* p6 R queries = tf.tile(queries,[1,tf.shape(keys)[1]])9 u9 Z5 z1 X5 d
queries = tf.reshape(queries,[-1,tf.shape(keys)[1],queries_hidden_units])( H/ j _* @4 m& E8 {: J+ Y
din_all = tf.concat([queries,keys,queries-keys,queries * keys],axis=-1) # B*T*4H
" p. Y' x; g3 B& `" G
. t. q5 P" z3 K7 E8 J # 三层全链接(d_layer_3_all为训练出来的atteneion权重)2 P, }$ d, _6 J- G6 r* P9 t: v) P" C
d_layer_1_all = tf.layers.dense(din_all, 80, activation=tf.nn.sigmoid, name='f1_att')6 o& z o: N! u
d_layer_2_all = tf.layers.dense(d_layer_1_all, 40, activation=tf.nn.sigmoid, name='f2_att')
* _2 S0 q/ X) L6 R! K d_layer_3_all = tf.layers.dense(d_layer_2_all, 1, activation=None, name='f3_att') #B*T*1
. i+ | K& g1 q+ E2 p9 s, E D
3 i m6 V1 M3 E4 B$ p #为了让outputs维度和keys的维度一致
# A* c0 s; ]4 Q \+ {: d1 q outputs = tf.reshape(d_layer_3_all,[-1,1,tf.shape(keys)[1]]) #B*1*T- `: j" x' D9 X0 I7 O
% j- E& v i& p
# bool类型 tf.shape(keys)[1]为历史行为序列的最大长度,keys_length为人为设定的参数,
+ \, [5 G' f- J( ]( Y9 ] # 如tf.sequence_mask(5,3) 即为array[True,True,True,False,False]
4 V( R) {8 |: Q5 M # 函数的作用是为了后面补齐行为序列,获取等长的行为序列做铺垫! |( s8 x1 v: C
key_masks = tf.sequence_mask(keys_length,tf.shape(keys)[1])4 @# t% M6 Y: ~( j+ W4 p. h
+ C1 E2 j+ f; Q8 I8 s: D #在第二维增加一维,也就是由B*T变成B*1*T
( C/ e) s/ d5 K key_masks = tf.expand_dims(key_masks,1) # B*1*T
1 D4 p. m3 B6 f1 C3 E- n; n3 f0 v1 D+ X8 J: x4 w2 f
#tf.ones_like新建一个与output类型大小一致的tensor,设置填充值为一个很小的值,而不是0,padding的mask后补一个很小的负数,这样softmax之后就会接近0
( V0 n) ^# n# w- o( m paddings = tf.ones_like(outputs) * (-2 ** 32 + 1)2 V' I+ Y6 W* x% T# k
4 S% F n: F1 Q" ]6 g# S5 A6 |! u #填充,获取等长的行为序列
4 ^5 X7 }% _ [( |% O # tf.where(condition, x, y),condition是bool型值,True/False,返回值是对应元素,condition中元素为True的元素替换为x中的元素,为False的元素替换为y中对应元素
/ x( Q" \& z, T: M$ k1 \ #由于是替换,返回值的维度,和condition,x , y都是相等的。
* t8 u6 q0 q7 u7 I outputs = tf.where(key_masks,outputs,paddings) # B * 1 * T
: q N, k5 x3 N# [3 W( I
4 [# i3 l1 d# c8 [( I # Scale(缩放)
4 e B9 t* J3 K outputs = outputs / (keys.get_shape().as_list()[-1] ** 0.5)+ p% [; q; }# Y1 ?' _! P. k7 {
# Activation
" \6 B! B" ]5 J; X. E outputs = tf.nn.softmax(outputs) # B * 1 * T6 |$ ^2 ^2 T/ Z7 ^( [* O: `
# Weighted Sum outputs=g(Vi,Va) keys=Vi
/ K8 P- P5 w6 ^" I #这步为公式中的g(Vi*Va)*Vi9 Q0 `- a8 S& j
outputs = tf.matmul(outputs,keys) # B * 1 * H 三维矩阵相乘,相乘发生在后两维,即 B * (( 1 * T ) * ( T * H ))
( G, ` W# Z$ K) g1 f- d5 g
8 F- H7 O( q% ~3 k( q$ H% F g return outputs5 _$ D; |) b, K V- I% d" b
下一篇文章中将详细讲述在实际的业务中,如何应用DIN算法,敬请期待。 |