您的位置：首页 > 其它

提取某日访问次数最多的那个IP

2011-11-17 22:12 393 查看

海量数据日志中，提取出某日访问次数最多的那个IP。

思路：对于海量数据的处理，主要采取的策略就是分而治之，即缩减问题的规模，将一个大的问题划分成若干等价的小问题。然后解决这些小问题，最后将获得的小问题解综合起来，得出原问题的解。用到比较多的技术主要有散列、位图、堆、trie树、mapreduce、K路归并（败者树）等。其中散列用的尤为多。

对于本问题，假定某日访问的IP地址已经从数据日志中提取出来，存放在一个大的二进制文件中。下面的工作主要是找目标IP——文件中出现次数最多的那个IP。这个文件很大，内存无法完全放下，内排序的方法行不通。可以采取如下措施：

（1）利用散列函数，将大文件中的IP地址散列到若干个文件中。相同的IP地址肯定在同一个文件中。

（2）处理每个小文件，找到该文件中出现次数最多的那个IP，记录下IP地址和出现次数。可以用hash_map，IP地址为键值、出现次数为数值。

（3）将第（2）步中找到的IP地址及出现次数综合起来，找到这些IP地址中出现次数最多的那个IP。

简单实现：接下来给出一种简单的实现，效率比较低。测试中，从一个含4亿个IP地址的文件中提取目标IP，一共用了52分钟。其中大量的时间用于文件的读写，约为30分钟。另外有7分钟用于产生含4亿个随机数的文件。真正用于计算的时间为15分钟。由于C++标准STL中没有hash_map，因此改用map实现第（2）步；如果换成hash_map，应该能减少部分计算的时间。

另外，如果设置读写缓冲区，经过测试，缓冲区为128字节时，读写文件的时间从原来的30分钟减为25分钟左右。进一步增大缓冲区大小，速度的提升非常小，原因有待进一步分析。这里所说的设置缓冲区，不是指下面这种方式：

char buffer[1024];

streambuf* ptrbuf = outFile.rdbuf();

ptrbuf->pubsetbuf(buffer,1024);

而是定义一个整型数组，每次读写时，读写一块数据而不是一个整数。

单个读写
outFile.write((char*)&x,sizeof(unsigned));

块读写
outFile.write((char *)buffer,BUFFER_SIZE*sizeof(unsigned));

VC6.0下编译运行通过

view
plainprint ?

#pragma warning(disable:4786) //VC6.0中忽略警告

#include <fstream>

#include <iostream>

#include <map>

#include <string>

#include <ctime>

using namespace std;

const unsigned N=400000000; // number of random IP addresses to generate

const unsigned FILE_NUM=16; // number of small files produced

const unsigned HASH_SHIFT=28; // right-shift amount applied to the hash product

inline unsigned HashInt(unsigned value); // hashes an integer into the range 0 to FILE_NUM

bool ProduceIP(string fileName); // randomly generates IP addresses, each treated as a 32-bit unsigned number

bool DecomposeFile(string fileName); // divide and conquer: splits the big file into several small files

bool FindTargetIP(unsigned result[2]); // finds the IP that occurs most often

// Program entry point.
//
// Runs the three phases in order -- ProduceIP(), DecomposeFile(),
// FindTargetIP() -- sampling clock() around each one, then prints the elapsed
// time of every phase, the winning IP in dotted-decimal form and its
// occurrence count.
//
// Returns 0 on success, or 1 as soon as any phase returns false.
int main()
{
    const string name="IP.bin";  // the big file holding all generated IPs
    unsigned result[2]={0,0};    // result[0] = IP, result[1] = occurrence count

    // clock() yields clock_t ticks; keep that type instead of narrowing the
    // samples to unsigned.
    const clock_t start=clock();

    // Phase 1: randomly generate a large number of IPs.
    const clock_t start1=clock();
    if(!ProduceIP(name))
        return 1;
    const clock_t end1=clock();

    // Phase 2: divide and conquer -- split the big file into small files.
    const clock_t start2=clock();
    if(!DecomposeFile(name))
        return 1;
    const clock_t end2=clock();

    // Phase 3: find the IP that occurs most often.
    const clock_t start3=clock();
    if(!FindTargetIP(result))
        return 1;
    const clock_t end3=clock();

    const clock_t end=clock();

    // Convert ticks to seconds with CLOCKS_PER_SEC. Dividing by a literal
    // 1000 is only correct where CLOCKS_PER_SEC happens to be 1000.
    const double ticksPerSecond=static_cast<double>(CLOCKS_PER_SEC);

    // Print the results.
    cout<<"total run time : "<<(end-start)/ticksPerSecond<<endl;
    cout<<"ProduceIP() run time : "<<(end1-start1)/ticksPerSecond<<endl;
    cout<<"DecomposeFile() run time : "<<(end2-start2)/ticksPerSecond<<endl;
    cout<<"FindTargetIP() run time : "<<(end3-start3)/ticksPerSecond<<endl;

    // The IP is stored with its first octet in the top 8 bits.
    cout<<"IP : "<<(result[0]>>24)<<'.'<<((result[0]&0x00ff0000)>>16)<<'.';
    cout<<((result[0]&0x0000ff00)>>8)<<'.'<<((result[0]&0x000000ff))<<endl;
    cout<<"appear time : "<<result[1]<<endl;

    return 0;
}

// Hashes an integer to a bucket index using Fibonacci hashing:
//
//     hash_key = (value * M) >> S
//
//   value is a 16-bit integer: M = 40503
//   value is a 32-bit integer: M = 2654435769
//   value is a 64-bit integer: M = 11400714819323198485
//
// S depends on the number of buckets: for 32-bit integers
// S = 32 - log2(number of buckets), so 16 buckets give S = 28.
//
// Assumes unsigned is 32 bits wide; under that assumption the result lies in
// [0, 2^(32 - HASH_SHIFT)).
inline unsigned HashInt(unsigned value)
{
    // The 'u' suffix is essential. Without it the literal does not fit in int
    // and is given a wider signed type (64-bit on current compilers), so the
    // product is no longer reduced modulo 2^32: the shifted result can exceed
    // the bucket count, and the multiplication itself can overflow a signed
    // type. With the suffix the arithmetic is unsigned and wraps as intended.
    return (value*2654435769u)>>HASH_SHIFT;
}

// Randomly generates N IP addresses, each treated as a 32-bit unsigned
// number, and writes them to fileName as raw binary values.
//
// Returns true when every value was written and the file closed cleanly.
// Returns false (after reporting on cerr) when the file cannot be opened or
// a write fails.
bool ProduceIP(string fileName)
{
    ofstream outFile(fileName.c_str(),ios::binary);
    if(!outFile)
    {
        cerr<<"error: unable to open output file : "<<fileName<<endl;
        return false;
    }

    srand(static_cast<unsigned>(time(0)));

    for(unsigned i=0;i<N;i++)
    {
        // Build one large integer out of four random octets to simulate an IP.
        // Each octet is converted to unsigned BEFORE shifting: shifting the
        // int returned by rand() left by 24 can run into the sign bit.
        // Separate statements also fix the order of the rand() calls.
        const unsigned b3=static_cast<unsigned>(rand()%256);
        const unsigned b2=static_cast<unsigned>(rand()%256);
        const unsigned b1=static_cast<unsigned>(rand()%256);
        const unsigned b0=static_cast<unsigned>(rand()%256);
        const unsigned x=(b3<<24)|(b2<<16)|(b1<<8)|b0;

        // Stop at the first failed write instead of looping on a dead stream.
        if(!outFile.write(reinterpret_cast<const char*>(&x),sizeof(unsigned)))
        {
            cerr<<"error: failed to write output file : "<<fileName<<endl;
            return false;
        }
    }

    // close() flushes the remaining data; a failure here still means the
    // file is incomplete.
    outFile.close();
    if(!outFile)
    {
        cerr<<"error: failed to write output file : "<<fileName<<endl;
        return false;
    }

    return true;
}

// Divide and conquer: splits the big file into FILE_NUM small files.
//
// Reads fileName as a sequence of raw unsigned values and appends each value
// to tmp<k>.bin, where k = HashInt(value); equal values therefore always end
// up in the same small file.
//
// Returns true on success. Returns false (after reporting on cerr) when the
// input or any output file cannot be opened, when HashInt() yields an index
// outside [0, FILE_NUM), or when reading or writing fails.
bool DecomposeFile(string fileName)
{
    // Open the input first so that a missing input is reported instead of
    // silently producing FILE_NUM empty files and returning true.
    ifstream inFile(fileName.c_str(),ios::binary);
    if(!inFile)
    {
        cerr<<"error: unable to open input file :"<<fileName<<endl;
        return false;
    }

    ofstream outFiles[FILE_NUM];
    unsigned i;  // declared once and reused by both loops below

    for(i=0;i<FILE_NUM;i++)
    {
        // Name of the small file: "tmp" + decimal index + ".bin". The digits
        // are produced by hand because itoa() is a non-standard extension.
        string digits;
        unsigned rest=i;
        do
        {
            digits.insert(digits.begin(),static_cast<char>('0'+rest%10));
            rest/=10;
        } while(rest!=0);
        const string name="tmp"+digits+".bin";

        // Open the small file.
        outFiles[i].open(name.c_str(),ios::binary);
        if(!outFiles[i])
        {
            cerr<<"error: unable to open output file :"<<name<<endl;
            return false;
        }
    }

    // Hash every value into its small file.
    unsigned value=0;
    while(inFile.read(reinterpret_cast<char*>(&value),sizeof(unsigned)))
    {
        const unsigned bucket=HashInt(value);
        if(bucket>=FILE_NUM)
        {
            // Guards the array access below should the hash range and
            // FILE_NUM ever disagree.
            cerr<<"error: hash value out of range :"<<bucket<<endl;
            return false;
        }
        outFiles[bucket].write(reinterpret_cast<const char*>(&value),sizeof(unsigned));
    }

    bool ok=true;

    // Reaching end of file sets eofbit/failbit only; badbit means a real
    // read error.
    if(inFile.bad())
    {
        cerr<<"error: failed to read input file :"<<fileName<<endl;
        ok=false;
    }

    // Close the files. An earlier failed write or a failed close leaves the
    // stream in a failed state, which is checked here.
    inFile.close();
    for(i=0;i<FILE_NUM;i++)
    {
        outFiles[i].close();
        if(!outFiles[i])
        {
            cerr<<"error: failed to write output file number :"<<i<<endl;
            ok=false;
        }
    }

    return ok;
}

// Finds the IP that occurs most often across the small files
// tmp0.bin .. tmp<FILE_NUM-1>.bin.
//
// result[0] receives the IP and result[1] its occurrence count; both are
// reset to 0 first. When several IPs share the highest count, the first one
// encountered is kept (files in index order, IPs in ascending order within a
// file).
//
// Returns true on success. Returns false (after reporting on cerr) when a
// small file cannot be opened or a read error occurs.
bool FindTargetIP(unsigned result[2])
{
    result[0]=0;
    result[1]=0;

    for(unsigned i=0;i<FILE_NUM;i++)
    {
        // Name of the small file: "tmp" + decimal index + ".bin". The digits
        // are produced by hand because itoa() is a non-standard extension.
        string digits;
        unsigned rest=i;
        do
        {
            digits.insert(digits.begin(),static_cast<char>('0'+rest%10));
            rest/=10;
        } while(rest!=0);
        const string name="tmp"+digits+".bin";

        // Process each small file.
        ifstream inFile(name.c_str(),ios::binary);
        if(!inFile)
        {
            cerr<<"error: unable to open input file :"<<name<<endl;
            return false;
        }

        // Core step: count occurrences per IP. map is used because the
        // original toolchain offered no hash_map.
        map<unsigned,unsigned> ip_count;
        unsigned key=0;
        while(inFile.read(reinterpret_cast<char*>(&key),sizeof(unsigned)))
        {
            ++ip_count[key];
        }

        // End of file sets eofbit/failbit only; badbit means a real read
        // error, in which case the counts cannot be trusted.
        if(inFile.bad())
        {
            cerr<<"error: failed to read input file :"<<name<<endl;
            return false;
        }

        // Keep the best candidate seen so far; strict '>' preserves the
        // first IP among equal counts.
        map<unsigned,unsigned>::const_iterator it=ip_count.begin();
        for(;it!=ip_count.end();++it)
        {
            if(it->second>result[1])
            {
                result[0]=it->first;
                result[1]=it->second;
            }
        }
        // inFile is closed by its destructor at the end of each iteration.
    }

    return true;
}

设置缓冲区后的代码。问题描述见 “ 解题笔记（9）——提取某日访问次数最多的那个IP ”

view
plainprint ?

#pragma warning(disable:4786) //VC6.0中忽略警告

#include <fstream>

#include <iostream>

#include <map>

#include <string>

#include <ctime>

using namespace std;

const unsigned N=400000000; // number of random IP addresses to generate

const unsigned FILE_NUM=16; // number of small files produced

const unsigned HASH_SHIFT=28; // right-shift amount applied to the hash product

const unsigned BUFFER_SIZE=32; // number of unsigned values held by each block read/write buffer

inline unsigned HashInt(unsigned value); // hashes an integer into the range 0 to FILE_NUM

bool ProduceIP(string fileName); // randomly generates IP addresses, each treated as a 32-bit unsigned number

bool DecomposeFile(string fileName); // divide and conquer: splits the big file into several small files

bool FindTargetIP(unsigned result[2]); // finds the IP that occurs most often

// Program entry point.
//
// Runs the three phases in order -- ProduceIP(), DecomposeFile(),
// FindTargetIP() -- sampling clock() around each one, then prints the elapsed
// time of every phase, the winning IP in dotted-decimal form and its
// occurrence count.
//
// Returns 0 on success, or 1 as soon as any phase returns false.
int main()
{
    const string name="IP.bin";  // the big file holding all generated IPs
    unsigned result[2]={0,0};    // result[0] = IP, result[1] = occurrence count

    // clock() yields clock_t ticks; keep that type instead of narrowing the
    // samples to unsigned.
    const clock_t start=clock();

    // Phase 1: randomly generate a large number of IPs.
    const clock_t start1=clock();
    if(!ProduceIP(name))
        return 1;
    const clock_t end1=clock();

    // Phase 2: divide and conquer -- split the big file into small files.
    const clock_t start2=clock();
    if(!DecomposeFile(name))
        return 1;
    const clock_t end2=clock();

    // Phase 3: find the IP that occurs most often.
    const clock_t start3=clock();
    if(!FindTargetIP(result))
        return 1;
    const clock_t end3=clock();

    const clock_t end=clock();

    // Convert ticks to seconds with CLOCKS_PER_SEC. Dividing by a literal
    // 1000 is only correct where CLOCKS_PER_SEC happens to be 1000.
    const double ticksPerSecond=static_cast<double>(CLOCKS_PER_SEC);

    // Print the results.
    cout<<"total run time : "<<(end-start)/ticksPerSecond<<endl;
    cout<<"ProduceIP() run time : "<<(end1-start1)/ticksPerSecond<<endl;
    cout<<"DecomposeFile() run time : "<<(end2-start2)/ticksPerSecond<<endl;
    cout<<"FindTargetIP() run time : "<<(end3-start3)/ticksPerSecond<<endl;

    // The IP is stored with its first octet in the top 8 bits.
    cout<<"IP : "<<(result[0]>>24)<<'.'<<((result[0]&0x00ff0000)>>16)<<'.';
    cout<<((result[0]&0x0000ff00)>>8)<<'.'<<((result[0]&0x000000ff))<<endl;
    cout<<"appear time : "<<result[1]<<endl;

    return 0;
}

// Hashes an integer to a bucket index using Fibonacci hashing:
//
//     hash_key = (value * M) >> S
//
//   value is a 16-bit integer: M = 40503
//   value is a 32-bit integer: M = 2654435769
//   value is a 64-bit integer: M = 11400714819323198485
//
// S depends on the number of buckets: for 32-bit integers
// S = 32 - log2(number of buckets), so 16 buckets give S = 28.
//
// Assumes unsigned is 32 bits wide; under that assumption the result lies in
// [0, 2^(32 - HASH_SHIFT)).
inline unsigned HashInt(unsigned value)
{
    // The 'u' suffix is essential. Without it the literal does not fit in int
    // and is given a wider signed type (64-bit on current compilers), so the
    // product is no longer reduced modulo 2^32: the shifted result can exceed
    // the bucket count, and the multiplication itself can overflow a signed
    // type. With the suffix the arithmetic is unsigned and wraps as intended.
    return (value*2654435769u)>>HASH_SHIFT;
}

// Randomly generates N IP addresses, each treated as a 32-bit unsigned
// number, and writes them to fileName as raw binary values. Values are
// collected in a BUFFER_SIZE-element array and written one block at a time.
//
// Returns true when every value was written and the file closed cleanly.
// Returns false (after reporting on cerr) when the file cannot be opened or
// a write fails.
bool ProduceIP(string fileName)
{
    ofstream outFile(fileName.c_str(),ios::binary);
    if(!outFile)
    {
        cerr<<"error: unable to open output file : "<<fileName<<endl;
        return false;
    }

    srand(static_cast<unsigned>(time(0)));

    unsigned buffer[BUFFER_SIZE];
    unsigned filled=0;  // number of valid entries currently in buffer

    for(unsigned i=0;i<N;i++)
    {
        // Build one large integer out of four random octets to simulate an IP.
        // Each octet is converted to unsigned BEFORE shifting: shifting the
        // int returned by rand() left by 24 can run into the sign bit.
        // Separate statements also fix the order of the rand() calls.
        const unsigned b3=static_cast<unsigned>(rand()%256);
        const unsigned b2=static_cast<unsigned>(rand()%256);
        const unsigned b1=static_cast<unsigned>(rand()%256);
        const unsigned b0=static_cast<unsigned>(rand()%256);
        buffer[filled++]=(b3<<24)|(b2<<16)|(b1<<8)|b0;

        if(filled==BUFFER_SIZE)
        {
            // Stop at the first failed write instead of looping on a dead
            // stream.
            if(!outFile.write(reinterpret_cast<const char*>(buffer),BUFFER_SIZE*sizeof(unsigned)))
            {
                cerr<<"error: failed to write output file : "<<fileName<<endl;
                return false;
            }
            filled=0;
        }
    }

    // Write whatever is left in a partially filled buffer (possibly nothing).
    outFile.write(reinterpret_cast<const char*>(buffer),filled*sizeof(unsigned));

    // close() flushes the remaining data; a failure here still means the
    // file is incomplete.
    outFile.close();
    if(!outFile)
    {
        cerr<<"error: failed to write output file : "<<fileName<<endl;
        return false;
    }

    return true;
}

//分而治之将大文件分为若干个小文件

bool DecomposeFile(string fileName)

{

ofstream outFiles[FILE_NUM];

int i;

for(i=0;i<FILE_NUM;i++)

{

//小文件的名称

char str[10];

string name="tmp";

itoa(i,str,10);

name=name+str+".bin";

//打开小文件

outFiles[i].open(name.c_str(),ios::binary);

if(!outFiles[i])

{

cerr<<"error: unable to open output file :"<<name<<endl;

return false;

}

}

ifstream inFile(fileName.c_str(),ios::binary);

unsigned buffer[FILE_NUM][BUFFER_SIZE];

unsigned j[FILE_NUM]={0};

while(inFile.good())

{

unsigned value;

if(inFile.read((char*)&value,sizeof(unsigned)))

{

unsigned h=HashInt(value);

buffer[h][j[h]++]=value;

if(BUFFER_SIZE==j[h])

{

outFiles[h].write((char *)buffer[h],BUFFER_SIZE*sizeof(unsigned));

j[h]=0;

}

}

}

for(i=0;i<FILE_NUM;i++)

outFiles[i].write((char *)buffer[i],j[i]*sizeof(unsigned));

//关闭文件

inFile.close();

for(i=0;i<FILE_NUM;i++)

outFiles[i].close();

return true;

}

// Finds the IP that occurs most often across the small files
// tmp0.bin .. tmp<FILE_NUM-1>.bin, reading each file in blocks of up to
// BUFFER_SIZE values.
//
// result[0] receives the IP and result[1] its occurrence count; both are
// reset to 0 first. When several IPs share the highest count, the first one
// encountered is kept (files in index order, IPs in ascending order within a
// file).
//
// Returns true on success. Returns false (after reporting on cerr) when a
// small file cannot be opened or a read error occurs.
bool FindTargetIP(unsigned result[2])
{
    result[0]=0;
    result[1]=0;

    for(unsigned i=0;i<FILE_NUM;i++)
    {
        // Name of the small file: "tmp" + decimal index + ".bin". The digits
        // are produced by hand because itoa() is a non-standard extension.
        string digits;
        unsigned rest=i;
        do
        {
            digits.insert(digits.begin(),static_cast<char>('0'+rest%10));
            rest/=10;
        } while(rest!=0);
        const string name="tmp"+digits+".bin";

        // Process each small file.
        ifstream inFile(name.c_str(),ios::binary);
        if(!inFile)
        {
            cerr<<"error: unable to open input file :"<<name<<endl;
            return false;
        }

        // Core step: count occurrences per IP. map is used because the
        // original toolchain offered no hash_map.
        map<unsigned,unsigned> ip_count;
        unsigned buffer[BUFFER_SIZE];

        while(inFile.good())
        {
            inFile.read(reinterpret_cast<char*>(buffer),BUFFER_SIZE*sizeof(unsigned));

            // The last block may be short, so count what was actually read.
            // Divide by sizeof(unsigned), matching the read request above,
            // rather than assuming 4-byte values with a fixed shift. Trailing
            // bytes that do not form a whole value are ignored.
            const unsigned readNum=static_cast<unsigned>(inFile.gcount())/sizeof(unsigned);
            for(unsigned j=0;j<readNum;j++)
            {
                ++ip_count[buffer[j]];
            }
        }

        // End of file sets eofbit/failbit only; badbit means a real read
        // error, in which case the counts cannot be trusted.
        if(inFile.bad())
        {
            cerr<<"error: failed to read input file :"<<name<<endl;
            return false;
        }

        // Keep the best candidate seen so far; strict '>' preserves the
        // first IP among equal counts.
        map<unsigned,unsigned>::const_iterator it=ip_count.begin();
        for(;it!=ip_count.end();++it)
        {
            if(it->second>result[1])
            {
                result[0]=it->first;
                result[1]=it->second;
            }
        }
        // inFile is closed by its destructor at the end of each iteration.
    }

    return true;
}

内容来自用户分享和网络整理，不保证内容的准确性，如有侵权内容，可联系管理员处理

标签：

相关文章推荐

新的分享

章节导航