您的位置:首页 > 其它

布隆过滤器

2016-05-11 22:47 253 查看
(一)原理

布隆过滤器的原理实际上是位图与哈希函数的结合:位图节省空间,哈希定位节省时间。插入字符串 K 时,用多个哈希函数把 K 映射到位图中的多个位并全部置 1;查询时,只要其中任何一位为 0,K 就一定不存在;若全部为 1,则 K 可能存在(存在一定的误判率)。

(二)代码实现如下

BitMap.h

#pragma once

#include <vector>

// Fixed-size bit set stored in a vector of size_t words.
//
// Bit number `num` is kept in word `num / kBitsPerWord`, at bit position
// `num % kBitsPerWord` of that word. The map also keeps a running count of
// the bits that are currently set.
class BitMap
{
public:
    // Creates a map with no storage; Resize() must be called before any bit
    // can be set.
    BitMap()
        : _size(0)
    {}

    // Creates a map with room for at least `size` bits, all cleared.
    BitMap(size_t size)
        : _arrays(WordCount(size), 0)
        , _size(0)
    {}

    // Sets bit `num`.
    // Returns true when the bit changed from 0 to 1; returns false when it was
    // already set or when `num` lies outside the allocated storage.
    bool Set(size_t num)
    {
        const size_t index = num / kBitsPerWord;
        if (index >= _arrays.size())
        {
            return false;
        }

        const size_t mask = Mask(num);
        if (_arrays[index] & mask)
        {
            // Already set: leave the counter untouched so it stays accurate.
            return false;
        }

        _arrays[index] |= mask;
        ++_size;
        return true;
    }

    // Clears bit `num`.
    // Returns true when the bit changed from 1 to 0; returns false when it was
    // already clear or when `num` lies outside the allocated storage.
    bool ReSet(size_t num)
    {
        const size_t index = num / kBitsPerWord;
        if (index >= _arrays.size())
        {
            return false;
        }

        const size_t mask = Mask(num);
        if ((_arrays[index] & mask) == 0)
        {
            return false;
        }

        _arrays[index] &= ~mask;
        --_size;
        return true;
    }

    // Returns true when bit `num` is set; out-of-range bits read as unset.
    bool Test(size_t num) const
    {
        const size_t index = num / kBitsPerWord;
        if (index >= _arrays.size())
        {
            return false;
        }
        return (_arrays[index] & Mask(num)) != 0;
    }

    // Returns the number of bits currently set.
    size_t Size() const
    {
        return _size;
    }

    // Clears every bit while keeping the allocated storage.
    void Clear()
    {
        _arrays.assign(_arrays.size(), 0);
        _size = 0;
    }

    // Changes the storage so that it holds at least `size` bits. Bits that
    // still fit keep their value; newly added bits are cleared.
    void Resize(size_t size)
    {
        const size_t oldWords = _arrays.size();
        _arrays.resize(WordCount(size));
        if (_arrays.size() < oldWords)
        {
            // Shrinking may have discarded set bits, so recount.
            _size = CountSetBits();
        }
    }

protected:
    std::vector<size_t> _arrays; // bit storage, kBitsPerWord bits per element
    size_t _size;                // number of bits currently set

private:
    // Bits stored per vector element (assumes 8-bit bytes).
    static const size_t kBitsPerWord = sizeof(size_t) * 8;

    // Number of words needed for `bits` bits. The "+ 1" guarantees at least
    // one word even when `bits` is smaller than a full word.
    static size_t WordCount(size_t bits)
    {
        return bits / kBitsPerWord + 1;
    }

    // Single-bit mask for `num` inside its word. The shift is done on a
    // size_t so the top bit of the word never goes through a signed int.
    static size_t Mask(size_t num)
    {
        return static_cast<size_t>(1) << (num % kBitsPerWord);
    }

    // Counts the set bits across all words.
    size_t CountSetBits() const
    {
        size_t total = 0;
        for (size_t i = 0; i < _arrays.size(); ++i)
        {
            // `word &= word - 1` removes the lowest set bit on each pass.
            for (size_t word = _arrays[i]; word != 0; word &= word - 1)
            {
                ++total;
            }
        }
        return total;
    }
};

void Test1()
{
BitMap bm(65);
bm.Set(1);
bm.Set(4);
bm.Set(33);

cout << "1?" << bm.Test(1) << endl;
cout << "2?" << bm.Test(2) << endl;
cout << "4?" << bm.Test(4) << endl;
cout << "33?" << bm.Test(33) << endl;

bm.ReSet(33);
bm.ReSet(4);

cout << "1?" << bm.Test(1) << endl;
cout << "2?" << bm.Test(2) << endl;
cout << "4?" << bm.Test(4) << endl;
cout << "33?" << bm.Test(33) << endl;
}

// Large-capacity case: builds a bitmap sized for the maximum size_t value
// and sets a single bit in it.
// NOTE(review): this asks for storage proportional to SIZE_MAX bits; the
// allocation may fail on targets with a 64-bit size_t — confirm the intended
// platform.
void Test2()
{
    // -1 converted to size_t yields the largest representable value.
    const size_t everyBit = static_cast<size_t>(-1);
    BitMap bm(everyBit);
    bm.Set(10000000);
}
Bloom.h

#include <string>

#include "BitMap.h"
// Bloom filter for strings.
//
// Every key is run through five string hash functions; each hash value,
// reduced modulo the filter capacity, selects one bit of the underlying
// BitMap. Inserting a key sets its five bits. A lookup reports "present"
// only when all five bits are set, so a key that was inserted is always
// found, while a key that was never inserted may occasionally be reported
// as present (false positive). Keys are hashed as NUL-terminated strings.
class Bloom
{
private:
    BitMap Map;       // bit storage
    size_t _capacity; // number of addressable bits (a prime from the table)

    static const size_t kHashCount = 5;
    typedef size_t (*HashFn)(const char *);

public:
    // Builds a filter whose capacity is the first table prime greater than
    // `size` (or the largest table prime when none is greater).
    Bloom(size_t size)
        : _capacity(_GetNextPrime(size))
    {
        Map.Resize(_capacity);
    }

    // Inserts the NUL-terminated string `key`. A null pointer is ignored.
    void Set(const char *key)
    {
        if (!key)
        {
            return;
        }
        for (size_t i = 0; i < kHashCount; ++i)
        {
            Map.Set(HashAt(i)(key) % _capacity);
        }
    }

    // Inserts `key`; only the characters before the first NUL are hashed.
    void Set(const std::string &key)
    {
        Set(key.c_str());
    }

    // Returns false when the NUL-terminated string `key` was definitely never
    // inserted, true when it may have been. A null pointer yields false.
    bool IsIn(const char *key)
    {
        if (!key)
        {
            return false;
        }
        for (size_t i = 0; i < kHashCount; ++i)
        {
            // Stop at the first unset bit; later hashes are not computed.
            if (!Map.Test(HashAt(i)(key) % _capacity))
            {
                return false;
            }
        }
        return true;
    }

    // Same as IsIn(const char *) for a std::string key.
    bool IsIn(const std::string &key)
    {
        return IsIn(key.c_str());
    }

private:
    // Returns the i-th hash function, i in [0, kHashCount).
    static HashFn HashAt(size_t i)
    {
        static const HashFn table[kHashCount] =
        {
            BKDRHash, SDBMHash, RSHash, APHash, JSHash
        };
        return table[i];
    }

    // NOTE: the hash functions below feed plain `char` values into the
    // arithmetic, so results for bytes outside the ASCII range depend on
    // whether `char` is signed on the target.

    // BKDR hash; the result is masked to 31 bits.
    static size_t BKDRHash(const char *str)
    {
        const unsigned int seed = 131; // 31 131 1313 13131 131313
        unsigned int hash = 0;
        while (*str)
        {
            hash = hash * seed + (*str++);
        }
        return (hash & 0x7FFFFFFF);
    }

    // SDBM hash: hash = 65599 * hash + ch for every character.
    static size_t SDBMHash(const char *str)
    {
        size_t hash = 0;
        while (size_t ch = static_cast<size_t>(*str++))
        {
            hash = 65599 * hash + ch;
        }
        return hash;
    }

    // RS hash: multiplicative hash whose multiplier changes per character.
    static size_t RSHash(const char *str)
    {
        size_t hash = 0;
        size_t magic = 63689;
        while (size_t ch = static_cast<size_t>(*str++))
        {
            hash = hash * magic + ch;
            magic *= 378551;
        }
        return hash;
    }

    // AP hash: alternates between two mixing steps on even and odd positions.
    static size_t APHash(const char *str)
    {
        size_t hash = 0;
        long position = 0;
        while (size_t ch = static_cast<size_t>(*str++))
        {
            if ((position & 1) == 0)
            {
                hash ^= ((hash << 7) ^ ch ^ (hash >> 3));
            }
            else
            {
                hash ^= (~((hash << 11) ^ ch ^ (hash >> 5)));
            }
            ++position;
        }
        return hash;
    }

    // JS hash; the empty string hashes to 0.
    static size_t JSHash(const char *str)
    {
        if (!*str)
        {
            return 0;
        }

        size_t hash = 1315423911;
        while (size_t ch = static_cast<size_t>(*str++))
        {
            hash ^= ((hash << 5) + ch + (hash >> 2));
        }
        return hash;
    }

protected:
    // Returns the first prime in the table that is greater than `num`. When
    // `num` is not smaller than the largest entry, the largest entry is
    // returned instead of reading past the end of the table.
    // (The leading-underscore name is kept so existing callers still compile.)
    unsigned long _GetNextPrime(unsigned long num)
    {
        static const unsigned long primes[] =
        {
            53ul, 97ul, 193ul, 389ul, 769ul,
            1543ul, 3079ul, 6151ul, 12289ul, 24593ul,
            49157ul, 98317ul, 196613ul, 393241ul, 786433ul,
            1572869ul, 3145739ul, 6291469ul, 12582917ul, 25165843ul,
            50331653ul, 100663319ul, 201326611ul, 402653189ul, 805306457ul,
            1610612741ul, 3221225473ul, 4294967291ul
        };
        const size_t primeCount = sizeof(primes) / sizeof(primes[0]);

        for (size_t i = 0; i < primeCount; ++i)
        {
            if (primes[i] > num)
            {
                return primes[i];
            }
        }
        return primes[primeCount - 1];
    }

};


# include<iostream>
using namespace std;
# include"Bloom.h"
// Demo: inserts one key into a Bloom filter, then queries the inserted key
// and a key that was never inserted.
int main()
{
    Bloom b(90);

    // A string literal is an array of const char, so it must be held through
    // a pointer to const; binding it to a plain char* is ill-formed since
    // C++11.
    const char *p = "afshrajsys";
    b.Set(p);

    cout << b.IsIn(p) << endl;    // inserted key: prints 1
    cout << b.IsIn("qq") << endl; // never inserted: 0 unless a false positive occurs
    return 0;
}
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: