Press "Enter" to skip to content

基于TensorFlow的FM实现

本站内容均来自兴趣收集,如不慎侵害了您的相关权益,请留言告知,我们将尽快删除。谢谢。

这一篇文章主要是FM的代码实现,以MovieLens数据集为例进行说明。

 

MovieLens数据集(ml-100k)包含了10万条评分记录,其中涉及了943个用户和1682个item,这里使用的是<user, item, rating>这样的数据形式。

 

我这里导入的是tf2.x的环境,但代码是按tf1.x的方式写的,因为工作中还是用1.x比较多。那么如何在tf2.x的环境中使用1.x的功能呢?

 

# Turn off TF 2.x eager execution so the TF 1.x-style graph code in this post can be used.
tf.compat.v1.disable_eager_execution()

 

另外一个需要注意的点是,需要在每个epoch结束后都保存一次模型

 

for epoch in range(epochs):
    ... ...
    # Save the model (once per epoch)
    self.saver.save(self.sess, "{}/tf_with_1x".format(self.modelpath))

 

OK,看代码实现,首先定义工具类,主要包含了三个部分的功能:

 

加载数据

 

   def load_dataset(self, train_path, test_path, mode):
       """
       Load the train/test rating files and one-hot encode users and items.

       Both files are read as tab-separated text without a header row, with
       the columns user, item, rating, timestamp.

       :param train_path: path of the training file
       :param test_path: path of the test file
       :param mode: 'regression' uses the raw ratings as targets (1-D arrays);
                    'classification' builds (n, 1) targets where rating == 5
                    becomes 1 and rating < 5 becomes -1
       :return: tuple (X_train, y_train, X_test, y_test); the X values are the
                matrices returned by ``self.vectorize_dic``
       :raises ValueError: if ``mode`` is not one of the two supported values
       """
       # Fail fast instead of silently returning all-zero targets for a typo.
       if mode not in ('regression', 'classification'):
           raise ValueError(
               "mode must be 'regression' or 'classification', got {!r}".format(mode))

       cols = ['user', 'item', 'rating', 'timestamp']
       train = pd.read_csv(train_path, delimiter='\t', names=cols)
       test = pd.read_csv(test_path, delimiter='\t', names=cols)
       print(train.user.values)

       X_train, ix = self.vectorize_dic({'users': train.user.values, 'items': train.item.values})
       # Reuse the training index and column count so the test matrix lives
       # in the same feature space as the training matrix.
       X_test, ix = self.vectorize_dic({'users': test.user.values, 'items': test.item.values}, ix, X_train.shape[1])

       y1 = train.rating.values
       y2 = test.rating.values
       if mode == 'regression':
           y_train = y1.copy()
           y_test = y2.copy()
       else:
           # Targets start at 0; any rating that is neither == 5 nor < 5 keeps 0.
           y_train = np.zeros((len(y1), 1))
           y_test = np.zeros((len(y2), 1))
           y_train[np.where(y1 == 5)] = 1
           y_train[np.where(y1 < 5)] = -1
           y_test[np.where(y2 == 5)] = 1
           y_test[np.where(y2 < 5)] = -1
       return X_train, y_train, X_test, y_test

 

创建一个scipy csr matrix

 

    def vectorize_dic(self, dic, ix=None, p=None):
        """
        Create a one-hot scipy CSR matrix from per-feature value lists.

        Every (value, feature-name) pair is mapped to a column index through
        ``ix``; row r gets a 1 in the column of each feature value of sample r.

        :param dic: dictionary of feature lists. Keys are the names of the
                    features; all lists are expected to have the same length
        :param ix: index generator mapping ``str(value) + str(feature)`` to a
                   column index (default None: a fresh auto-incrementing
                   defaultdict is created). It must create an entry on a
                   missing key, and it is extended in place with unseen keys
        :param p: dimension of the feature space (number of columns in the
                  sparse matrix); defaults to the number of indexed keys
        :return: tuple (csr matrix of shape (n_samples, p), ix)
        """
        # Import the class directly: the ``scipy.sparse.csr`` submodule
        # namespace is deprecated in recent SciPy releases.
        from scipy.sparse import csr_matrix

        if ix is None:
            d = count(0)
            ix = defaultdict(lambda: next(d))
        # Number of samples
        n = len(next(iter(dic.values())))
        # Number of features
        g = len(dic)
        # Total length of the flattened (sample, feature) grid
        nz = n * g
        col_ix = np.empty(nz, dtype=int)
        for i, (k, lis) in enumerate(dic.items()):
            # Feature i fills positions i, i + g, i + 2g, ... so the g entries
            # of each sample are stored next to each other.
            col_ix[i::g] = [ix[str(el) + str(k)] for el in lis]
        # Each row index repeated g times, e.g. n=3, g=2 -> [0, 0, 1, 1, 2, 2]
        row_ix = np.repeat(np.arange(0, n), g)
        data = np.ones(nz)
        if p is None:
            p = len(ix)
        # Drop entries whose column falls outside the requested feature space
        # (values that were first seen after ``p`` was fixed).
        ixx = np.where(col_ix < p)
        return csr_matrix((data[ixx], (row_ix[ixx], col_ix[ixx])), shape=(n, p)), ix

 

batch数据的返回

 

  def get_batcher(self, X_, y_=None, batch_size=None):
      """
      Yield consecutive (X, y) mini-batches.

      :param X_: samples; must expose ``shape[0]`` and support row slicing
      :param y_: optional targets aligned with ``X_`` (default None)
      :param batch_size: rows per batch; None means one batch with all rows
      :return: generator of ``(ret_x, ret_y)`` tuples; ``ret_y`` is None when
               ``y_`` is None. The last batch may be smaller
      :raises ValueError: if ``batch_size`` is smaller than 1 (raised when the
                          generator is first advanced)
      """
      n_samples = X_.shape[0]
      if batch_size is None:
          # max(..., 1) keeps the range step valid for an empty input,
          # which then simply yields nothing.
          batch_size = max(n_samples, 1)
      if batch_size < 1:
          raise ValueError("batch_size must be a positive integer, got {!r}".format(batch_size))
      for i in range(0, n_samples, batch_size):
          upper_bound = min(i + batch_size, n_samples)
          ret_x = X_[i:upper_bound]
          ret_y = None
          if y_ is not None:
              ret_y = y_[i:upper_bound]
          # Yield for every batch, also when no targets were supplied.
          yield (ret_x, ret_y)

 

然后是定义FM模型,FM模型主要包含了几个部分:

 

加载数据

 

    def load_data(self):
        """
        Load the train/test data through ``self.util`` and densify the features.

        Reads ``self.trainPath``, ``self.testPath`` and ``self.mode``; stores
        the result in ``self.X_train``, ``self.y_train``, ``self.X_test`` and
        ``self.y_test`` and prints the shapes plus the first three rows.
        """
        self.X_train, self.y_train, self.X_test, self.y_test = self.util.load_dataset(
            self.trainPath, self.testPath, self.mode)
        # toarray() yields a plain ndarray; todense() would yield the legacy
        # np.matrix type, whose indexing/reshape semantics differ from ndarray.
        self.X_train = self.X_train.toarray()
        self.X_test = self.X_test.toarray()
        print("Train data shape: ", self.X_train.shape)
        print(self.X_train[:3])
        print("Test data shape: ", self.X_test.shape)
        print(self.X_test[:3])

 

创建模型

 

    def build_model(self):
        """
        Build the FM graph: placeholders, parameters, prediction, loss, optimizer.

        Reads ``self.X_train`` (for its shape), ``self.k`` (number of latent
        factors) and ``self.mode``. Sets ``self.row_num``, ``self.col_num``,
        ``self.X``, ``self.y``, ``self.V``, ``self.y_hat``, ``self.error``,
        ``self.loss`` and ``self.optimizer``.

        In 'classification' mode the targets fed into ``self.y`` are expected
        to be -1/+1; they are mapped to 0/1 before the sigmoid cross-entropy
        is computed, because that loss is defined for labels in [0, 1].

        :raises ValueError: if ``self.mode`` is not 'regression' or
                            'classification'
        """
        if self.mode not in ('regression', 'classification'):
            raise ValueError(
                "mode must be 'regression' or 'classification', got {!r}".format(self.mode))

        self.row_num, self.col_num = self.X_train.shape
        # Design matrix
        self.X = tf.compat.v1.placeholder('float', shape=[None, self.col_num])
        # Target vector
        self.y = tf.compat.v1.placeholder('float', shape=[None, 1])
        # Bias and linear weights
        w0 = tf.Variable(tf.zeros([1]))
        W = tf.Variable(tf.zeros([self.col_num]))
        # Latent factor matrix, one k-dimensional vector per feature column
        self.V = tf.Variable(tf.random.normal([self.k, self.col_num], stddev=0.01))

        # Linear part: w0 + sum_i W_i * x_i
        linear_terms = tf.add(w0, tf.reduce_sum(tf.multiply(W, self.X), 1, keepdims=True))
        # Pairwise interaction part, computed as
        # 0.5 * sum_f ((X V^T)_f^2 - (X^2 (V^2)^T)_f)
        # (derivation: https://mp.weixin.qq.com/s/mJpNwEDGqS7u-vtZ54zV6A)
        pair_interaction = (tf.multiply(0.5,
                                        tf.reduce_sum(
                                            tf.subtract(
                                                tf.pow(tf.matmul(self.X, tf.transpose(self.V)), 2),
                                                tf.matmul(tf.pow(self.X, 2), tf.transpose(tf.pow(self.V, 2)))
                                            ),
                                            1, keepdims=True)))
        self.y_hat = tf.add(linear_terms, pair_interaction)

        # L2 penalty; both coefficients are 0, so it is currently disabled.
        lambda_w = tf.constant(0.00, name='lambda_w')
        lambda_v = tf.constant(0.00, name='lambda_v')
        l2_norm = tf.add(
            tf.reduce_sum(tf.multiply(lambda_w, tf.pow(W, 2))),
            tf.reduce_sum(tf.multiply(lambda_v, tf.pow(self.V, 2)))
        )

        if self.mode == 'regression':
            # Mean squared error
            self.error = tf.reduce_mean(tf.square(tf.subtract(self.y, self.y_hat)))
        else:
            print(self.y.get_shape().as_list())
            print(self.y_hat.get_shape().as_list())
            # Map -1/+1 targets to 0/1 for the sigmoid cross-entropy.
            labels = tf.divide(tf.add(self.y, 1.0), 2.0)
            self.error = tf.reduce_mean(
                tf.nn.sigmoid_cross_entropy_with_logits(labels=labels, logits=self.y_hat))
        self.loss = tf.add(self.error, l2_norm)
        print(self.loss.get_shape().as_list())
        print(l2_norm.get_shape().as_list())
        self.optimizer = tf.compat.v1.train.GradientDescentOptimizer(learning_rate=0.001).minimize(self.loss)

 

模型训练

 

    def train(self, epochs=5, batch_size=256):
        """
        Train the model with mini-batch updates and checkpoint after each epoch.

        Creates ``self.init``, ``self.sess`` and ``self.saver``; the session is
        left open on ``self.sess``. Must be called after the graph has been
        built, because the Saver only covers variables that already exist.

        :param epochs: number of passes over the training data (default 5)
        :param batch_size: rows per mini-batch (default 256)
        """
        self.init = tf.compat.v1.global_variables_initializer()
        self.sess = tf.compat.v1.Session()
        self.saver = tf.compat.v1.train.Saver()  # create after the network is built
        self.sess.run(self.init)

        # Make sure the checkpoint directory exists before the first save.
        tf.io.gfile.makedirs(self.modelpath)
        checkpoint_prefix = "{}/tf_with_1x".format(self.modelpath)

        for epoch in range(epochs):
            # Reshuffle the samples at the start of every epoch.
            perm = np.random.permutation(self.X_train.shape[0])
            batches = self.util.get_batcher(self.X_train[perm], self.y_train[perm], batch_size)
            for step, (batchX, batchY) in enumerate(batches):
                _, loss = self.sess.run(
                    (self.optimizer, self.loss),
                    feed_dict={self.X: batchX.reshape(-1, self.col_num), self.y: batchY.reshape(-1, 1)})
                # NOTE(review): only the loss of the second batch of each epoch
                # is printed (nothing is printed for a single-batch epoch).
                if step == 1:
                    print("Epoch: %d, Loss: %.3f" % (epoch + 1, loss))
            # Save the model once per epoch
            self.saver.save(self.sess, checkpoint_prefix)

 

模型评估

 

    def evaluate(self):
        """
        Restore the saved checkpoint and print the test-set metric.

        Uses ``self.init`` and ``self.saver`` created by ``train``. Prints the
        RMSE in 'regression' mode and the accuracy in 'classification' mode,
        where a sigmoid probability above 0.5 is predicted as 1 and anything
        else as -1. Nothing is returned.
        """
        # The with-block closes the session on exit; no explicit close is needed.
        with tf.compat.v1.Session() as sess:
            sess.run(self.init)
            # Load the model
            self.saver.restore(sess, "{}/tf_with_1x".format(self.modelpath))
            print("模型加载成功 ...")
            if self.mode == 'regression':
                errors = []
                for batchX, batchY in self.util.get_batcher(self.X_test, self.y_test):
                    errors.append(sess.run(
                        self.error,
                        feed_dict={self.X: batchX.reshape(-1, self.col_num), self.y: batchY.reshape(-1, 1)}))
                RMSE = np.sqrt(np.array(errors).mean())
                print("RMSE: ", RMSE)
            elif self.mode == 'classification':
                # Collect predictions batch by batch so that row positions stay
                # aligned with self.y_test for any number of batches.
                batch_preds = []
                for batchX, batchY in self.util.get_batcher(self.X_test, self.y_test):
                    logits = sess.run(
                        self.y_hat,
                        feed_dict={self.X: batchX.reshape(-1, self.col_num), self.y: batchY.reshape(-1, 1)})
                    y_hat = self.util.sigmoid(logits)
                    batch_preds.append(np.where(y_hat > 0.5, 1, -1))
                pred = np.concatenate(batch_preds, axis=0)
                # Compare as column vectors to avoid accidental broadcasting.
                print("Accuracy: ", np.mean(np.reshape(self.y_test, (-1, 1)) == pred))

 

 

Be First to Comment

发表评论

您的电子邮箱地址不会被公开。 必填项已用*标注