计算机系统应用教程网站

网站首页 > 技术文章 正文

基于Movielens-1M数据集和相似性矩阵实现的电影推荐算法(附源码)

btikc 2024-09-24 08:27:12 技术文章 24 ℃ 0 评论

源码已经放在GitHub上。

源码地址:https://github.com/Lockvictor/MovieLens-RecSys

README.md

MovieLens-RecSys

基于Movielens-1M数据集实现的User Based Collaborative Filtering和Item Based Collaborative Filtering推荐算法

简介

项亮所著的《推荐系统实践》一书是非常优秀的推荐系统入门书籍,但书中只描述了单步的计算如何实现,缺乏一个完整的示例来展示推荐系统从建立数据集到评估模型的整个过程,初学者学起来容易迷茫,因此我基于Movielens 1M数据集分别实现了User Based Collaborative Filtering(以下简称UserCF)和Item Based Collaborative Filtering(以下简称ItemCF)两个算法,包含“切分训练集与测试集-训练模型-推荐-评估”一整套流程,可以帮助初学者更快速地理解推荐系统中的协同过滤算法。

程序最终给出的是Precision、Recall、Coverage、Popularity四项衡量模型质量的指标,而具体的电影推荐结果并未保留,如果需要此部分数据可自行修改代码。

核心源码:


#-*- coding: utf-8 -*-
'''
Created on 2015-06-22
@author: Lockvictor
'''
import sys
import random
import math
import os
from operator import itemgetter
random.seed(0)
class ItemBasedCF(object):
''' TopN recommendation - Item Based Collaborative Filtering '''
def __init__(self):
self.trainset = {}
self.testset = {}
self.n_sim_movie = 20
self.n_rec_movie = 10
self.movie_sim_mat = {}
self.movie_popular = {}
self.movie_count = 0
print('Similar movie number = %d' % self.n_sim_movie, file=sys.stderr)
print('Recommended movie number = %d' %
self.n_rec_movie, file=sys.stderr)
@staticmethod
def loadfile(filename):
''' load a file, return a generator. '''
fp = open(filename, 'r')
for i, line in enumerate(fp):
yield line.strip('\r\n')
if i % 100000 == 0:
print ('loading %s(%s)' % (filename, i), file=sys.stderr)
fp.close()
print ('load %s succ' % filename, file=sys.stderr)
def generate_dataset(self, filename, pivot=0.7):
''' load rating data and split it to training set and test set '''
trainset_len = 0
testset_len = 0
for line in self.loadfile(filename):
user, movie, rating, _ = line.split('::')
# split the data by pivot
if random.random() < pivot:
self.trainset.setdefault(user, {})
self.trainset[user][movie] = int(rating)
trainset_len += 1
else:
self.testset.setdefault(user, {})
self.testset[user][movie] = int(rating)
testset_len += 1
print ('split training set and test set succ', file=sys.stderr)
print ('train set = %s' % trainset_len, file=sys.stderr)
print ('test set = %s' % testset_len, file=sys.stderr)
def calc_movie_sim(self):
''' calculate movie similarity matrix '''
print('counting movies number and popularity...', file=sys.stderr)
for user, movies in self.trainset.items():
for movie in movies:
# count item popularity
if movie not in self.movie_popular:
self.movie_popular[movie] = 0
self.movie_popular[movie] += 1
print('count movies number and popularity succ', file=sys.stderr)
# save the total number of movies
self.movie_count = len(self.movie_popular)
print('total movie number = %d' % self.movie_count, file=sys.stderr)
# count co-rated users between items
itemsim_mat = self.movie_sim_mat
print('building co-rated users matrix...', file=sys.stderr)
for user, movies in self.trainset.items():
for m1 in movies:
for m2 in movies:
if m1 == m2:
continue
itemsim_mat.setdefault(m1, {})
itemsim_mat[m1].setdefault(m2, 0)
itemsim_mat[m1][m2] += 1
print('build co-rated users matrix succ', file=sys.stderr)
# calculate similarity matrix
print('calculating movie similarity matrix...', file=sys.stderr)
simfactor_count = 0
PRINT_STEP = 2000000
for m1, related_movies in itemsim_mat.items():
for m2, count in related_movies.items():
itemsim_mat[m1][m2] = count / math.sqrt(
self.movie_popular[m1] * self.movie_popular[m2])
simfactor_count += 1
if simfactor_count % PRINT_STEP == 0:
print('calculating movie similarity factor(%d)' %
simfactor_count, file=sys.stderr)
print('calculate movie similarity matrix(similarity factor) succ',
file=sys.stderr)
print('Total similarity factor number = %d' %
simfactor_count, file=sys.stderr)
def recommend(self, user):
''' Find K similar movies and recommend N movies. '''
K = self.n_sim_movie
N = self.n_rec_movie
rank = {}
watched_movies = self.trainset[user]
for movie, rating in watched_movies.items():
for related_movie, similarity_factor in sorted(self.movie_sim_mat[movie].items(),
key=itemgetter(1), reverse=True)[:K]:
if related_movie in watched_movies:
continue
rank.setdefault(related_movie, 0)
rank[related_movie] += similarity_factor * rating
# return the N best movies
return sorted(rank.items(), key=itemgetter(1), reverse=True)[:N]
def evaluate(self):
''' print evaluation result: precision, recall, coverage and popularity '''
print('Evaluation start...', file=sys.stderr)
N = self.n_rec_movie
# varables for precision and recall
hit = 0
rec_count = 0
test_count = 0
# varables for coverage
all_rec_movies = set()
# varables for popularity
popular_sum = 0
for i, user in enumerate(self.trainset):
if i % 500 == 0:
print ('recommended for %d users' % i, file=sys.stderr)
test_movies = self.testset.get(user, {})
rec_movies = self.recommend(user)
for movie, _ in rec_movies:
if movie in test_movies:
hit += 1
all_rec_movies.add(movie)
popular_sum += math.log(1 + self.movie_popular[movie])
rec_count += N
test_count += len(test_movies)
precision = hit / (1.0 * rec_count)
recall = hit / (1.0 * test_count)
coverage = len(all_rec_movies) / (1.0 * self.movie_count)
popularity = popular_sum / (1.0 * rec_count)
print ('precision=%.4f\trecall=%.4f\tcoverage=%.4f\tpopularity=%.4f' %
(precision, recall, coverage, popularity), file=sys.stderr)
if __name__ == '__main__':
ratingfile = os.path.join('ml-1m', 'ratings.dat')
itemcf = ItemBasedCF()
itemcf.generate_dataset(ratingfile)
itemcf.calc_movie_sim()
itemcf.evaluate()

#-*- coding: utf-8 -*-
'''
Created on 2015-06-22
@author: Lockvictor
'''
import sys
import random
import math
import os
from operator import itemgetter
random.seed(0)
class UserBasedCF(object):
''' TopN recommendation - User Based Collaborative Filtering '''
def __init__(self):
self.trainset = {}
self.testset = {}
self.n_sim_user = 20
self.n_rec_movie = 10
self.user_sim_mat = {}
self.movie_popular = {}
self.movie_count = 0
print ('Similar user number = %d' % self.n_sim_user, file=sys.stderr)
print ('recommended movie number = %d' %
self.n_rec_movie, file=sys.stderr)
@staticmethod
def loadfile(filename):
''' load a file, return a generator. '''
fp = open(filename, 'r')
for i, line in enumerate(fp):
yield line.strip('\r\n')
if i % 100000 == 0:
print ('loading %s(%s)' % (filename, i), file=sys.stderr)
fp.close()
print ('load %s succ' % filename, file=sys.stderr)
def generate_dataset(self, filename, pivot=0.7):
''' load rating data and split it to training set and test set '''
trainset_len = 0
testset_len = 0
for line in self.loadfile(filename):
user, movie, rating, _ = line.split('::')
# split the data by pivot
if random.random() < pivot:
self.trainset.setdefault(user, {})
self.trainset[user][movie] = int(rating)
trainset_len += 1
else:
self.testset.setdefault(user, {})
self.testset[user][movie] = int(rating)
testset_len += 1
print ('split training set and test set succ', file=sys.stderr)
print ('train set = %s' % trainset_len, file=sys.stderr)
print ('test set = %s' % testset_len, file=sys.stderr)
def calc_user_sim(self):
''' calculate user similarity matrix '''
 # build inverse table for item-users
 # key=movieID, value=list of userIDs who have seen this movie
print ('building movie-users inverse table...', file=sys.stderr)
movie2users = dict()
for user, movies in self.trainset.items():
for movie in movies:
# inverse table for item-users
if movie not in movie2users:
movie2users[movie] = set()
movie2users[movie].add(user)
# count item popularity at the same time
if movie not in self.movie_popular:
self.movie_popular[movie] = 0
self.movie_popular[movie] += 1
print ('build movie-users inverse table succ', file=sys.stderr)
# save the total movie number, which will be used in evaluation
self.movie_count = len(movie2users)
print ('total movie number = %d' % self.movie_count, file=sys.stderr)
# count co-rated items between users
usersim_mat = self.user_sim_mat
print ('building user co-rated movies matrix...', file=sys.stderr)
for movie, users in movie2users.items():
for u in users:
for v in users:
if u == v:
continue
usersim_mat.setdefault(u, {})
usersim_mat[u].setdefault(v, 0)
usersim_mat[u][v] += 1
print ('build user co-rated movies matrix succ', file=sys.stderr)
# calculate similarity matrix
print ('calculating user similarity matrix...', file=sys.stderr)
simfactor_count = 0
PRINT_STEP = 2000000
for u, related_users in usersim_mat.items():
for v, count in related_users.items():
usersim_mat[u][v] = count / math.sqrt(
len(self.trainset[u]) * len(self.trainset[v]))
simfactor_count += 1
if simfactor_count % PRINT_STEP == 0:
print ('calculating user similarity factor(%d)' %
simfactor_count, file=sys.stderr)
print ('calculate user similarity matrix(similarity factor) succ',
file=sys.stderr)
print ('Total similarity factor number = %d' %
simfactor_count, file=sys.stderr)
def recommend(self, user):
''' Find K similar users and recommend N movies. '''
K = self.n_sim_user
N = self.n_rec_movie
rank = dict()
watched_movies = self.trainset[user]
for similar_user, similarity_factor in sorted(self.user_sim_mat[user].items(),
key=itemgetter(1), reverse=True)[0:K]:
for movie in self.trainset[similar_user]:
if movie in watched_movies:
continue
# predict the user's "interest" for each movie
rank.setdefault(movie, 0)
rank[movie] += similarity_factor
# return the N best movies
return sorted(rank.items(), key=itemgetter(1), reverse=True)[0:N]
def evaluate(self):
''' print evaluation result: precision, recall, coverage and popularity '''
print ('Evaluation start...', file=sys.stderr)
N = self.n_rec_movie
# varables for precision and recall
hit = 0
rec_count = 0
test_count = 0
# varables for coverage
all_rec_movies = set()
# varables for popularity
popular_sum = 0
for i, user in enumerate(self.trainset):
if i % 500 == 0:
print ('recommended for %d users' % i, file=sys.stderr)
test_movies = self.testset.get(user, {})
rec_movies = self.recommend(user)
for movie, _ in rec_movies:
if movie in test_movies:
hit += 1
all_rec_movies.add(movie)
popular_sum += math.log(1 + self.movie_popular[movie])
rec_count += N
test_count += len(test_movies)
precision = hit / (1.0 * rec_count)
recall = hit / (1.0 * test_count)
coverage = len(all_rec_movies) / (1.0 * self.movie_count)
popularity = popular_sum / (1.0 * rec_count)
print ('precision=%.4f\trecall=%.4f\tcoverage=%.4f\tpopularity=%.4f' %
(precision, recall, coverage, popularity), file=sys.stderr)
if __name__ == '__main__':
ratingfile = os.path.join('ml-1m', 'ratings.dat')
usercf = UserBasedCF()
usercf.generate_dataset(ratingfile)
usercf.calc_user_sim()
usercf.evaluate()

运行

  1. 获取代码

    根据自己使用的Python版本获取相应的分支

    Python 3.x:

    git clone https://github.com/Lockvictor/MovieLens-RecSys.git

    Python 2.x:

    git clone -b python2 https://github.com/Lockvictor/MovieLens-RecSys.git

    如果不使用Git,也可在Github页面上手动选择分支然后下载。

  2. 下载数据集

    下载Movielens 1M数据集ml-1m.zip,并解压到项目MovieLens-RecSys文件夹下

  3. 运行代码

    以UserCF为例,直接在终端运行以下命令即可:

# 部分Linux上会同时存在Python的2和3两个版本,3.x版对应的命令是python3# Windows用户无论安装的是2或3,
命令都是pythonpython usercf.py#python3 usercf.py

Linux用户的话更推荐下面这个命令:

python usercf.py > run.log 2>&1 &#python3 usercf.py > run.log 2>&1 &

该命令会让程序在后台运行,可以等待运行结束再查看日志,或者通过tail -f run.log即时查看日志。

注意事项

UserCF算法中,由于用户数量多,生成的相似性矩阵也大,会占用比较多的内存,不过一般电脑都没问题。

ItemCF算法中,每次推荐都需要找出一个用户的所有电影,再为每一部电影找出最相似的电影,运算量比UserCF大,因此推荐的过程比较慢。



如在手机上遇到 代码混乱,无法查看,可以分享到 企鹅、或者wechat ,然后进行查看,或者在电脑上查看。

我也写了很多其他的非常简单的入门级的爬虫详细教程,关注后,点击我的头像,就可以查看到。

欢迎大家一起留言讨论和交流,谢谢!


Tags:

本文暂时没有评论,来添加一个吧(●'◡'●)

欢迎 发表评论:

最近发表
标签列表