
Java: stream-style XML reading. Preprocessing a large file for data mining.

2014-04-01 20:09
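The pipeline below extracts author lists from dblp.xml, drops single-author papers, repeatedly filters out authors below a minimum support count, and finally writes a Weka-readable ARFF file. For reference, a dblp.xml record looks roughly like this (illustrative sample with hypothetical names; the code below assumes each <author> tag sits on its own line):

    <article key="...">
        <author>Jane Doe</author>
        <author>John Smith</author>
        <title>...</title>
    </article>

After step 1, db_pre.arff holds one comma-separated author list per paper, e.g. the line Jane Doe,John Smith.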
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;

/**
 * @author gjf
 * db_pre.arff stores the author information extracted from the XML file.
 */
public class ElmAuth {
    Map<String, Integer> map = new HashMap<String, Integer>();

    // Step 1: extract the author info from the XML file, write it to
    // db_pre.arff, and replace special characters along the way.
    public void settleXml(String src, String dst) { // src=dblp.xml dst=db_pre.arff
        File file = new File(src);
        File fl = new File(dst);
        try {
            FileReader fr = new FileReader(file);
            FileWriter fw = new FileWriter(fl);
            BufferedReader br = new BufferedReader(fr);
            BufferedWriter bw = new BufferedWriter(fw);
            String line;
            boolean flag = true;
            int loc_st;
            int loc_end;
            int len = 0, max = 0;
            while ((line = br.readLine()) != null) {
                loc_st = line.indexOf("<author>");
                if (loc_st != -1) {
                    loc_end = line.indexOf("</author>");
                    // The text between <author> and </author>: one author's name.
                    line = line.substring(loc_st + 8, loc_end);
                    // Replace characters that would break the ARFF format later.
                    line = line.replace('&', ' ');
                    line = line.replace('$', ' ');
                    line = line.replace("' ", " ");
                    line = line.replace("'", " ");
                    // flag marks paper boundaries: within the same paper,
                    // flag=false and the authors are written on one line.
                    if (flag) {
                        bw.write("\n");
                        bw.write(line);
                    } else {
                        bw.write(",");
                        bw.write(line);
                    }
                    len++; // one more author written for this paper
                    flag = false;
                } else {
                    flag = true;
                    if (max < len) max = len; // keep the largest author count
                    len = 0;
                }
            }
            if (max < len) max = len; // in case the file ends on an author line
            bw.flush();
            bw.close();
            br.close();
            System.out.println("Step 1 - largest number of authors on one paper: " + max);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    // Step 2: remove single-author items; delete every line of db_pre.arff
    // that lists only one author.
    public void elimate_one(String src, String dst) { // src=db_pre.arff dst=db_elone.arff
        try {
            File file = new File(src);
            FileReader fr = new FileReader(file);
            BufferedReader br = new BufferedReader(fr);
            File filew = new File(dst);
            FileWriter fw = new FileWriter(filew);
            BufferedWriter bw = new BufferedWriter(fw);
            String line;
            int res = 0;
            while ((line = br.readLine()) != null) {
                String[] arrLine = line.split(",");
                // Authors are separated by ","; if splitting yields fewer than
                // two fields, the line has a single author and is not written.
                if (arrLine.length > 1) {
                    bw.write(line);
                    bw.write("\n");
                    res++;
                }
            }
            bw.flush();
            bw.close();
            br.close();
            System.out.println("Rows remaining after removing single-author papers: " + res);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
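    // Illustration (hypothetical names): given db_pre.arff lines
    //   Jane Doe,John Smith
    //   Alice Liu
    // only the first line survives elimate_one; the second has one author.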

    // Store the remaining authors in the HashMap: key = author name,
    // value = number of occurrences (the support count).
    public void createMap(String src) { // src=db_elone.arff
        try {
            File file = new File(src);
            FileReader fr = new FileReader(file);
            BufferedReader br = new BufferedReader(fr);

            String line;
            while ((line = br.readLine()) != null) {
                String[] arrLine = line.split(",");
                for (int i = 0; i < arrLine.length; ++i) {
                    if (map.get(arrLine[i]) == null) {
                        map.put(arrLine[i], 1);
                    } else {
                        map.put(arrLine[i], map.get(arrLine[i]) + 1);
                    }
                }
            }
            br.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    // Remove from the HashMap every author whose support count is below
    // minsup; the run below uses a support count of 100.
    public void settleMap(int minsup) {
        Iterator<String> it = map.keySet().iterator();
        while (it.hasNext()) {
            String str = it.next();
            if (map.get(str) < minsup) {
                it.remove();
            }
        }
        System.out.println("Map size - number of authors with support >= minsup: " + map.size());
    }
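    // Illustration (hypothetical counts): with minsup=100, an author seen 250
    // times stays in the map, while one seen 40 times is removed.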

    // Write the authors at or above minsup to db_minsup.arff; it stores the
    // authors that pass the filter.
    public void updateMap(String src, String dst) { // src=db_elone.arff dst=db_minsup.arff
        try {
            File filer = new File(src);
            FileReader fr = new FileReader(filer);
            BufferedReader br = new BufferedReader(fr);

            File filew = new File(dst);
            FileWriter fw = new FileWriter(filew);
            BufferedWriter bw = new BufferedWriter(fw);

            String line;
            int res = 0;
            boolean flag = true;
            while ((line = br.readLine()) != null) {
                String[] arrLine = line.split(",");
                if (!flag) res++; // the previous line produced output: one more paper
                flag = true;
                for (int i = 0; i < arrLine.length; ++i) {
                    if (map.get(arrLine[i]) != null) {
                        if (flag) {
                            bw.write("\n" + arrLine[i]);
                            flag = false;
                        } else {
                            bw.write("," + arrLine[i]);
                        }
                    }
                }
            }
            if (!flag) res++; // count the last paper written, if any
            bw.flush();
            System.out.println("Papers co-written by the authors that passed the filter: " + res);
            bw.close();
            br.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
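    // Illustration (hypothetical names): if only "Jane Doe" and "John Smith"
    // remain in the map, the line "Jane Doe,Alice Liu,John Smith" is rewritten
    // as "Jane Doe,John Smith".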

    // Generate the file that Weka can read. dst=db.arff
    public void createWekaFile(String src, String dst) { // src=db_minsup.arff dst=db.arff
        try {
            File filer = new File(src);
            FileReader fr = new FileReader(filer);
            BufferedReader br = new BufferedReader(fr);

            File filew = new File(dst);
            FileWriter fw = new FileWriter(filew);
            BufferedWriter bw = new BufferedWriter(fw);
            bw.write("@relation db" + "\n");
            Iterator<String> it = map.keySet().iterator();
            while (it.hasNext()) {
                String str = it.next();
                // Escape single quotes for ARFF; String.replace returns a new
                // string, so the result must be reassigned.
                str = str.replace("'", "\\'");
                bw.write("@attribute '" + str + "' { t}\n");
            }
            bw.write("@data" + "\n");

            String line;
            boolean flag = true;
            while ((line = br.readLine()) != null) {
                if (line.isEmpty()) continue; // skip the leading blank line left by updateMap
                flag = true;
                char ch;
                it = map.keySet().iterator();
                while (it.hasNext()) {
                    String str = it.next();
                    // Substring match: 't' if this author appears on the line,
                    // '?' (missing value) otherwise.
                    if (line.indexOf(str) >= 0) {
                        ch = 't';
                    } else {
                        ch = '?';
                    }
                    if (flag) {
                        bw.write(ch);
                    } else {
                        bw.write("," + ch);
                    }
                    flag = false;
                }
                bw.write("\n");
            }
            bw.flush();
            bw.close();
            br.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
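    // Illustration (hypothetical authors): with "Jane Doe" and "John Smith" in
    // the map, the generated db.arff begins roughly like this:
    //   @relation db
    //   @attribute 'Jane Doe' { t}
    //   @attribute 'John Smith' { t}
    //   @data
    //   t,t
    //   t,?
    // where '?' marks a missing value - a common market-basket encoding for
    // association-rule mining in Weka.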

    public void clearMap() {
        map.clear();
    }

    public static void main(String[] args) {
        ElmAuth elmauth = new ElmAuth();
        elmauth.settleXml("dblp.xml", "db_pre.arff");
        elmauth.elimate_one("db_pre.arff", "db_elone.arff");
        elmauth.createMap("db_elone.arff");
        elmauth.settleMap(100); // the chosen minimum support count
        elmauth.updateMap("db_elone.arff", "db_minsup.arff");

        // Filtering authors can create new single-author lines, and removing
        // those can push other authors below minsup, so the two steps are
        // repeated until the data stabilizes (20 rounds is taken as enough).
        for (int i = 0; i < 20; ++i) {
            System.out.println();
            elmauth.elimate_one("db_minsup.arff", "db_minsup_elone.arff");
            elmauth.clearMap();
            elmauth.createMap("db_minsup_elone.arff");
            elmauth.settleMap(100);
            elmauth.updateMap("db_minsup_elone.arff", "db_minsup.arff");
        }

        elmauth.createWekaFile("db_minsup.arff", "db.arff");
    }
}
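The scan above treats dblp.xml as plain text, one line at a time, which only works while each <author> tag sits on its own line. A more robust way to read the file as a true XML stream is StAX. Below is a minimal sketch of the same author extraction (the class name and setup are my own, not part of the program above); it assumes dblp.dtd is available next to dblp.xml, since the DBLP dump declares its character entities in that DTD.

import java.io.FileInputStream;

import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamConstants;
import javax.xml.stream.XMLStreamReader;

public class StaxAuthorDump {
    public static void main(String[] args) throws Exception {
        XMLInputFactory factory = XMLInputFactory.newInstance();
        // Merge adjacent character events so getElementText() returns a whole
        // name even when entity references split the text.
        factory.setProperty(XMLInputFactory.IS_COALESCING, true);
        FileInputStream in = new FileInputStream("dblp.xml");
        XMLStreamReader reader = factory.createXMLStreamReader(in);
        while (reader.hasNext()) {
            if (reader.next() == XMLStreamConstants.START_ELEMENT
                    && "author".equals(reader.getLocalName())) {
                // getElementText() reads up to the matching </author>.
                System.out.println(reader.getElementText());
            }
        }
        reader.close();
        in.close();
    }
}

The names printed here could feed the same settleXml-style output; the advantage is that the parser, not the line format, decides where an author element starts and ends.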