您的位置:首页 > 其它

TF-IDF 的计算二

2011-08-29 22:51 281 查看
TF(Term Frequency)计算公式: $TF_{i,j} = \dfrac{Freq_{i,j}}{\max Freq_{j}}$

以上公式中,Freq i,j 是词 i 在文件 dj 中的出现次数,max Freq j 是文件 dj 中出现次数最多的那个词的出现次数(即该文件内所有词的出现次数中的最大值,而不是出现次数之和)。

/**
 * Computes max-normalized term frequencies (TF) for a collection of tokenized documents.
 *
 * <p>For term i in document j: TF(i,j) = freq(i,j) / maxFreq(j), where freq(i,j) is the number
 * of times term i occurs in document j and maxFreq(j) is the largest occurrence count of any
 * single term in document j. Every TF value is therefore in the range (0, 1].
 */
class TFs
{
    // Result of the most recent PrintTFs() call: one term -> TF map per document, in input order.
    ArrayList<HashMap<String,Double>> TFsList = new ArrayList<HashMap<String, Double>>();

    // Input documents, each a list of term tokens. Stored by reference, not copied.
    ArrayList<ArrayList<String>> TFsMainFileList = new ArrayList<ArrayList<String>>();

    /**
     * @param tf the tokenized documents; each inner list holds the terms of one document
     */
    public TFs(ArrayList<ArrayList<String>> tf)
    {
        TFsMainFileList = tf;
    }

    /**
     * Computes the TF map of every document.
     *
     * <p>The result is rebuilt from scratch on each call, so repeated calls return the same
     * content instead of accumulating duplicates. An empty document yields an empty map.
     *
     * @return one map per input document (same order), mapping each distinct term to its
     *         max-normalized term frequency
     */
    public ArrayList<HashMap<String,Double>> PrintTFs()
    {
        ArrayList<HashMap<String,Double>> result =
                new ArrayList<HashMap<String,Double>>(TFsMainFileList.size());

        for (ArrayList<String> document : TFsMainFileList)
        {
            // First pass: raw occurrence count per term, tracking the largest count seen.
            HashMap<String,Integer> counts = new HashMap<String,Integer>();
            int maxFreq = 0;
            for (String term : document)
            {
                Integer previous = counts.get(term);
                int current = (previous == null) ? 1 : previous + 1;
                counts.put(term, current);
                // Updated on EVERY occurrence (including the first), so a document whose terms
                // all occur once gets maxFreq == 1 rather than 0 (which caused division by zero).
                if (current > maxFreq)
                {
                    maxFreq = current;
                }
            }

            // Second pass: normalize each distinct term's count by the document maximum.
            // maxFreq is >= 1 whenever counts is non-empty, so the division is always defined.
            HashMap<String,Double> normalized = new HashMap<String,Double>();
            for (String term : counts.keySet())
            {
                normalized.put(term, counts.get(term) / (double) maxFreq);
            }
            result.add(normalized);
        }

        TFsList = result;
        return TFsList;
    }
}
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: