局部敏感哈希(原始LSH)python实现
2016-01-27 18:12
816 查看
最近短期计划是学习一下Python,最好的学习方式当然是实践了,今天用Python实现了下lsh算法,代码比较简陋。。。(2016.1.17)
origionalLSH.py: import random class Bucket: features=[] name=[] def __init__(self): self.features=[] self.name=[] def addFeature(self,feature,nameID): self.features.append(feature) self.name.append(nameID) def size(self): return len(self.features) class Table: buckets=[] size=0 def __init__(self,tableSize): for i in xrange(tableSize): bucket=Bucket() self.buckets.append(bucket) def addFeature(self,feature,bucketID,nameID): self.buckets[bucketID].addFeature(feature,nameID) def size(self): return self.size class LSH: __m_k = 0 __m_d = 0 __m_l = 0 __m_M = 0 __m_FeatureDims = 0 __m_hashFun_level1Subset =[] __m_hashFun_level2=[] __m_table = Table(0) __m_MaxValue = 0 def __init__(self,maxValue,l,m,ratio,featureDims): self.__m_k = int(maxValue*featureDims*ratio) self.__m_d = maxValue*featureDims self.__m_l = l self.__m_M = m self.__m_FeatureDims = featureDims self.__m_MaxValue = maxValue self.__m_table = Table(self.__m_M) # __m_hashFun_level1Subset用初始化么? for i in xrange(m): temp0 =[] for j in xrange(featureDims): tem1 = [] temp0.append(tem1) self.__m_hashFun_level1Subset.append(temp0) for i in xrange(self.__m_k): self.__m_hashFun_level2.append(random.randint(0,self.__m_M-1)) def __GetHashFun_level1(self): #生成一级哈希函数 for i in xrange(self.__m_M): for j in xrange(self.__m_k): val = random.randint(0,self.__m_d-1)#随机选取位置 pos = int(val/self.__m_MaxValue) #对应原始特征的哪一维 self.__m_hashFun_level1Subset[i][pos].append(val) def __Hash_level1(self,feature,tableID): value = [] table = self.__m_hashFun_level1Subset[tableID] for i in xrange(len(table)): val0 = feature[i] one_num = 0 zero_num = 0 for j in xrange(len(table[i])): val1 = table[i][j]-self.__m_d*i if val1<=val0: one_num +=1 else: zero_num +=1 while one_num > 0: value.append(1) one_num -=1 while zero_num > 0: value.append(0) zero_num -=1 return value def __HashLevel2(self,value,): butketID = -1 temp = 0 for i in xrange(len(value)): temp += self.__m_hashFun_level2[i]*value[i] bucketID = temp % self.__m_M return bucketID def train(self,features): self.__GetHashFun_level1() num = len(features) #特征个数 for i in xrange(num): feature = features[i] for j in xrange(self.__m_l): value = self.__Hash_level1(feature,j) bucketID = self.__HashLevel2(value) self.__m_table.addFeature(feature,bucketID,i) def search(self,feature): name = -1 dist = -1 minDist = 10000000 buckets = [] #hash 获取所有候选bucket for i in xrange(self.__m_l): value = self.__Hash_level1(feature,i) bucketID = self.__HashLevel2(value) buckets.append(bucketID) print"查找时遍历痛的个数为: %d" %(len(buckets)) for i in xrange(len(buckets)):#遍历候选bucket tempFeatures = self.__m_table.buckets[i].features tempName = self.__m_table.buckets[i].name num = len(tempFeatures) print "该桶含有的特征个数为:%d" %(num) for j in xrange(num): dist = self.__calcDist(feature,tempFeatures[j]) if dist < minDist: minDist = dist name = tempName[j] return name,minDist def __calcDist(self,feature0,feature1): dist = 0 length = len(feature0) for i in xrange(length): dist += abs(feature0[i]-feature1[i]) return dist # code:utf-8
test.py: from origionalLSH import * featureNum = 10000 featureLength = 40 #step1: 生成特征 print "step1: 生成特征" features = [] for i in xrange(featureNum): temp = [] for j in xrange(featureLength): temp.append(random.randint(0,255)) features.append(temp) #step2: LSH初始化 print "step2: LSH初始化" #LSH lsh(255,10,100,0.1,featureLength) lsh =LSH(255,10,100,0.12,featureLength) #step3: 开始训练 print "step3: 开始训练" lsh.train(features) #step4: search: print"step4: search:" name,dist = lsh.search(features[457]) print "最近的距离为:%d, 行号为%d" %(dist,name)
相关文章推荐
- python安装tar.gz和whl模块
- Python import
- Python学习之路-初学篇之Python的基本数据类型(一)
- python excel操作
- 构建Python+Selenium2自动化测试环境<一>
- python 脚本查看微信把你删除的好友--win系统版
- python学习新得
- Python字符串笔录
- python 类的简单介绍
- Python 第八章 进程和线程
- day2 学习python 字符及字符串
- 模重复平方计算法(快速幂)【Python实现】<信安数论>
- Python & JSON
- Python2.7 读取中文,显示为乱码的问题
- ubuntu14.04 源码安装 Python2.7.11
- Black Hat python-2.1
- python Pillow 模块安装
- 1.27 python21行代码拼写检测 50行代码检测语言
- Python 7.4 序列化
- map_reduce的简单运用