您的位置：首页 > 其它

用hash_map统计出现次数最多的前N个URL

2013-11-25 18:24 323 查看

海量数据统计频率最高词汇的常规办法之一是先通过一个hash函数处理数据然后取模N，拆分为N个小文件，对每一个小文件进行词频统计和排序处理，然后归并N个小文件取频率最大的M个数。

下面程序是利用hash_map处理小文件词频的实现（堆排序部分的代码没加上，可以参见http://blog.csdn.net/wodet/article/details/16948511）

关于hash_map和map的选择使用有几点注意的，hash_map是hash表的形式实现的，map是红黑树的结构，时间复杂度前者为N*(logN)，后者为O(log2N)以内.从稳定性来说map占优,从平均性能来看hash_map占优，还有hash_map目前没有纳入C++标准库，但是各个版本的STL都提供了实现。具体情况具体选择咯。。

#include <iostream>
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <hash_map.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

using namespace std;

class HashFunction {
public:
size_t operator()(const string& s) const {
unsigned long __h=0;
for(size_t i=0;i<s.size();i++)
__h=5*__h+s[i];
return size_t(__h);
}
};

class Compare {
public:
bool operator()(const string& str1,const string& str2)const {
return str1==str2;
}
};

typedef hash_map<string,int,HashFunction,Compare> HashMap;

int main(int argc, char* argv[]) {
printf("%s","-=-=-=-=-=-=-=-=-=-=hash_map测试-=-=-=-=-=-=-=-=-=-=-=-=\n");
HashMap obj;
/*
obj["10010"]="联通客服";
obj["10086"]="移动客服";
obj["1368351111"]="电话号码";
obj["123456"]="你的密码";
*/
//构造关键字与次数的hash_map，即统计词频
int ai[]={22,41,22,46,13,13,22,44,44};
for(int i=0;i<9;i++) {
char aa[12]={0};
sprintf(aa,"%d",ai[i]);
obj[aa]++;
cout<<aa<<" ,count="<<obj[aa]<<endl;
}
//将hash_map数据放入结构数组里
struct tmp {
int count;
char str[12];
};
struct tmp stmp[9];
memset(stmp,0x0,sizeof(tmp)*9);
hash_map<string,int,HashFunction,Compare>::iterator itor=obj.begin();
int j=0;
for(;itor!=obj.end();itor++,j++) {
sprintf(stmp[j].str,"%s",itor->first.c_str());
stmp[j].count=itor->second;
cout<<stmp[j].str<<"	"<<stmp[j].count<<endl;
}
//可以根据堆排序stmp[]数组,取前N个最多出现的字段
//省略
return 0;
}

内容来自用户分享和网络整理，不保证内容的准确性，如有侵权内容，可联系管理员处理

标签： hash_map 海量数据处理 hash_map实现词频统计堆排序

相关文章推荐

新的分享

章节导航