UBCF和IBCF的python实现和比较
2015-08-01 15:08
911 查看
进行实验的源代码参考了项亮的《推荐系统实践》和http://my.oschina.net/zhangjiawen/blog/185625
数据集为movieLens的100k数据集,在Windows上实验,运行结果如下:
分析发现:
1)粗略看去UBCF和IBCF的各项评测指标相差不大,但是IBCF的运行时间大大超过了UBCF。
2.代码改进1
分析发现共现矩阵C是对称矩阵,占用了大量内存。所以在给C赋值时,保证物品i的id大于j的。节省了一半空间。
1.版本1——核心代码为书中代码
import random import math import time class UserBasedCF: def __init__(self,datafile = None): self.datafile = datafile self.readData() self.splitData(3,47) def readData(self,datafile = None): """ read the data from the data file which is a data set 把文件中的内容读到data中""" self.datafile = datafile or self.datafile self.data = [] for line in open(self.datafile): userid,itemid,record,_ = line.split() self.data.append((userid,itemid,int(record))) def splitData(self,k,seed,data=None,M = 8): """ split the data set testdata is a test data set traindata is a train set test data set / train data set is 1:M-1 """ self.testdata = {} self.traindata = {} data = data or self.data random.seed(seed) for user,item, record in self.data: if random.randint(0,M) == k: self.testdata.setdefault(user,{}) self.testdata[user][item] = record else: self.traindata.setdefault(user,{}) self.traindata[user][item] = record def userSimilarity(self,train = None): train = train or self.traindata self.userSim = dict() for u in train.keys(): for v in train.keys(): if u == v: continue self.userSim.setdefault(u,{}) self.userSim[u][v] = len(set(train[u].keys()) & set(train[v].keys())) self.userSim[u][v] /=math.sqrt(len(train[u]) * len(train[v]) *1.0) def userSimilarityBest(self,train = None): """ the other method of getting user similarity which is better than above you can get the method on page 46 In this experiment,we use this method """ train = train or self.traindata self.userSimBest = dict() item_users = dict() for u,item in train.items(): for i in item.keys(): item_users.setdefault(i,set()) item_users[i].add(u) user_item_count = dict() count = dict() for item,users in item_users.items(): for u in users: user_item_count.setdefault(u,0) user_item_count[u] += 1 for v in users: if u == v:continue count.setdefault(u,{}) count[u].setdefault(v,0) count[u][v] += 1 for u ,related_users in count.items(): self.userSimBest.setdefault(u,dict()) for v, cuv in related_users.items(): self.userSimBest[u][v] = cuv / math.sqrt(user_item_count[u] * user_item_count[v] * 1.0) def recommend(self,user,train = None,k = 8,nitem = 40): train = train or self.traindata rank = dict() interacted_items = train.get(user,{}) for v ,wuv in sorted(self.userSimBest[user].items(),key = lambda x : x[1],reverse = True)[0:k]: for i , rvi in train[v].items(): if i in interacted_items: continue rank.setdefault(i,0) rank[i] += wuv*rvi return dict(sorted(rank.items(),key = lambda x :x[1],reverse = True)[0:nitem]) def recallAndPrecision(self,train = None,test = None,k = 8,nitem = 10): """ Get the recall and precision, the method you want to know is listed in the page 43 """ train = train or self.traindata test = test or self.testdata hit = 0 recall = 0 precision = 0 for user in train.keys(): tu = test.get(user,{}) rank = self.recommend(user, train = train,k = k,nitem = nitem) for item,_ in rank.items(): if item in tu: hit += 1 recall += len(tu) precision += nitem return (hit / (recall * 1.0),hit / (precision * 1.0)) def coverage(self,train = None,test = None,k = 8,nitem = 10): train = train or self.traindata test = test or self.testdata recommend_items = set() all_items = set() for user in train.keys(): for item in train[user].keys(): all_items.add(item) rank = self.recommend(user, train, k = k, nitem = nitem) for item,_ in rank.items(): recommend_items.add(item) return len(recommend_items) / (len(all_items) * 1.0) def popularity(self,train = None,test = None,k = 8,nitem = 10): """ Get the popularity the algorithm on page 44 """ train = train or self.traindata test = test or self.testdata item_popularity = dict() for user ,items in train.items(): for item in items.keys(): item_popularity.setdefault(item,0) item_popularity[item] += 1 ret = 0 n = 0 for user in train.keys(): rank = self.recommend(user, train, k = k, nitem = nitem) for item ,_ in rank.items(): ret += math.log(1+item_popularity[item]) n += 1 return ret / (n * 1.0) class ItemBasedCF(object): def __init__(self,datafile = None): print("init:") self.datafile = datafile self.readData(self.datafile) self.splitData(3,47) def readData(self,datafile = None): """ read the data from the data file which is a data set """ print("ireadData:") self.datafile = datafile or self.datafile self.data = [] for line in open(self.datafile): userid,itemid,record,_ = line.split() self.data.append((userid,itemid,int(record))) # 格式 [('196', '242', 3), ('186', '302', 3), ('22', '377', 1)] def splitData(self,k,seed,data=None,M = 8): """ split the data set testdata is a test data set traindata is a train set test data set / train data set is 1:M-1 """ self.testdata = {} self.traindata = {} data = data or self.data random.seed(seed) for user,item, record in self.data: if random.randint(0,M) == k: self.testdata.setdefault(user,{}) self.testdata[user][item] = record else: self.traindata.setdefault(user,{}) self.traindata[user][item] = record #print(self.testdata) # 格式{'291': {'1042': 4, '118': 2}, '200': {'222': 5}, '308': {'1': 4}, '167': {'486': 4}, '122': {'387': 5}, '210': {'40': 3}, def ItemSimilarity(self,train = None): """ the other method of getting user similarity which is better than above you can get the method on page 46 In this experiment,we use this method """ train = train or self.traindata self.itemSimBest = dict() N = dict() C = dict() for u,items in train.items(): for i in items: N.setdefault(i,0) N[i] += 1 for j in items: if i == j: continue C.setdefault(i,{}) C[i].setdefault(j,0) C[i][j] += 1 for i,related_items in C.items(): self.itemSimBest.setdefault(i,dict()) for j,cij in related_items.items(): self.itemSimBest[i][j] = cij / math.sqrt(N[i] * N[j] * 1.0) def recommend(self,user,train = None,k = 8,nitem = 10): train = train or self.traindata rank = dict() ru = train.get(user,{}) for i,pi in ru.items(): for j ,wj in sorted(self.itemSimBest[i].items(),key = lambda x : x[1],reverse = True)[0:k]: if j in ru: continue rank.setdefault(j,0) rank[j] += pi * wj return dict(sorted(rank.items(),key = lambda x :x[1],reverse = True)[0:nitem]) def recallAndPrecision(self,train = None,test = None,k = 8,nitem = 10): """ Get the recall and precision, the method you want to know is listed in the page 43 """ train = train or self.traindata test = test or self.testdata hit = 0 recall = 0 precision = 0 for user in train.keys(): tu = test.get(user,{}) rank = self.recommend(user, train = train,k = k,nitem = nitem) for item,_ in rank.items(): if item in tu: hit += 1 recall += len(tu) precision += nitem return (hit / (recall * 1.0),hit / (precision * 1.0)) def coverage(self,train = None,test = None,k = 8,nitem = 10): train = train or self.traindata test = test or self.testdata recommend_items = set() all_items = set() for user in train.keys(): for item in train[user].keys(): all_items.add(item) rank = self.recommend(user, train, k = k, nitem = nitem) for item,_ in rank.items(): recommend_items.add(item) return len(recommend_items) / (len(all_items) * 1.0) def popularity(self,train = None,test = None,k = 8,nitem = 10): """ Get the popularity the algorithm on page 44 """ train = train or self.traindata test = test or self.testdata item_popularity = dict() for user ,items in train.items(): for item in items.keys(): item_popularity.setdefault(item,0) item_popularity[item] += 1 ret = 0 n = 0 #对每一个user进行推荐 计算其流行度 for user in train.keys(): rank = self.recommend(user, train, k = k, nitem = nitem) for item ,_ in rank.items(): ret += math.log(1+item_popularity[item]) n += 1 return ret / (n * 1.0) def testUBCFRecommend(): ubcf = ItemBasedCF('./ml-100k/u.data') ubcf.readData() ubcf.splitData(4,100) ubcf.ItemSimilarity() user = "345" rank = ubcf.recommend(user,k = 3) for i,rvi in rank.items(): items = ubcf.testdata.get(user,{}) record = items.get(i,0) print ("%5s: %.4f--%.4f" %(i,rvi,record)) def testUserBasedCF(): startTime = time.clock() cf = UserBasedCF('./ml-100k/u.data') cf.userSimilarityBest() print ("%3s%20s%20s%20s%20s%20s" % ('K',"recall",'precision','coverage','popularity','time')) for k in [5,10,20,40,80,160]: recall,precision = cf.recallAndPrecision( k = k) coverage = cf.coverage(k = k) popularity = cf.popularity(k = k) print ("%3d%19.3f%%%19.3f%%%19.3f%%%20.3f%19.3fs" % (k,recall * 100,precision * 100,coverage * 100,popularity,time.clock()-startTime)) def testIBCFRecommend(): ibcf = ItemBasedCF('./ml-100k/u.data') ibcf.readData() ibcf.splitData(4,100) ibcf.ItemSimilarity() user = "345" rank = ibcf.recommend(user,k = 3) for i,rvi in rank.items(): items = ibcf.testdata.get(user,{}) record = items.get(i,0) print ("%5s: %.4f--%.4f" %(i,rvi,record)) def testItemBasedCF(): startTime = time.clock() cf = ItemBasedCF('./ml-100k/u.data') cf.ItemSimilarity() print ("%3s%20s%20s%20s%20s%20s" % ('K',"recall",'precision','coverage','popularity','time')) for k in [5,10,20,40,80,160]: recall,precision = cf.recallAndPrecision( k = k) coverage = cf.coverage(k = k) popularity = cf.popularity(k = k) print ("%3d%19.3f%%%19.3f%%%19.3f%%%20.3f%19.3fs" % (k,recall * 100,precision * 100,coverage * 100,popularity,time.clock()-startTime)) if __name__ == "__main__": testItemBasedCF() testUserBasedCF()
数据集为movieLens的100k数据集,在Windows上实验,运行结果如下:
分析发现:
1)粗略看去UBCF和IBCF的各项评测指标相差不大,但是IBCF的运行时间大大超过了UBCF。
2.代码改进1
分析发现共现矩阵C是对称矩阵,占用了大量内存。所以在给C赋值时,保证物品i的id大于j的。节省了一半空间。
相关文章推荐
- [python]如何快速地翻转字符串
- Python进阶之偏函数
- [python 爬虫学习]利用cookie模拟网站登录
- python模块 - 常用模块推荐
- python模块 - 其它常用模块推荐
- 装饰模式和python装饰器
- 利用Python监测MySQL主从状态
- python 中文乱码问题2
- python菜鸟日记1
- Ubuntu下使用Python+flask+MySQL的架构搭建Web框架
- Python进阶之装饰器@decorator
- python去重(针对密码)
- virtualenv
- PYTHON 双引号 单引号 多引号区别
- Python中何时使用断言 assert
- python 抓取天气时碰到的一点正则问题 (.*)和(.*?)
- 详解Python3中的Sequence type的使用
- python调用Java-JPype使用介绍
- python matplotlib 基础
- Python基础教程笔记——第1章