您的位置:首页 > 数据库 > MySQL

搜狗引擎查询日志的数据入库(Mysql…

2015-10-14 10:07 585 查看
为了进行 Hive 与 Spark 的开发测试,需要一个大规模数据集,于是找到了搜狗搜索引擎的查询日志数据。网上公开的数据约为一个月的量,共 5000 多万条,用于测试应该是满足要求的。

搜索引擎查询日志库设计为包括约1个月(2008年6月)Sogou搜索引擎部分网页查询需求及用户点击情况的网页查询日志数据集合。为进行中文搜索引擎用户行为分析的研究者提供基准研究语料。(网址为:http://www.sogou.com/labs/dl/q.html)

在百度云盘上找到了一个分享,于是转载到自己云盘里,也在这边分享一下:链接:http://pan.baidu.com/s/12VPue
密码:jn39。

下面把数据入库到 MySQL。由于一直在 Ubuntu 环境上做实验,于是采用 Eclipse + Java 来开发,虽然效率比较低,但是将就用吧。下附主要代码。由于日志按文本行的方式来处理,部分行在切分字段时会报错,因此采取简单的策略,直接过滤掉不满足要求的行。之后又把数据迁移到 Hive 上做了实验,效率还是挺高的。

public static void main(String [] args) {

// The name of the file to open.

String fileName =
"/home/Downloads/SougouQ/access_log.20080629.decode.filter";

// The name of the file to open.

String newfileName =
"/home/Downloads/SougouQ/20080629.csv";

// This will reference one line at a time

String line = null;

String filePath =
"/home/Downloads/SougouQ";

File logifles = new File(filePath);

File[] fs = logifles.listFiles();

String dateString ="",newdatestring;

Connection
con = null;

Statement
st = null;

String url
= "jdbc:mysql://127.0.0.1:3306/dblog";

String
user = "root";

String
password = "ndscbigdata";

try

{

//## connect DB

// con =
DriverManager.getConnection(url, user, password);

con=DriverManager.getConnection(url+"?user="+user+"&password="+password+"&useUnicode=true&characterEncoding=utf-8");

System.out.println("connect db success!");

//
String
insertsql = "INSERT INTO
sougouquery(visitTime,userID,
visitKeyword,rankIndex,clickIndex,clickUrl)"
//
+ "
values(?,?,?,?,?,?)";

String
insertsql2 = "INSERT INTO
sougouquery(visitTime,userID,
visitKeyword,rankIndex,clickIndex,clickUrl)
values('%s','%s','%s',%d,%d,'%s')";

SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");

String
sqlex="";

PreparedStatement preparedStmt = null;

String[]
parts = null;

//## for
each files (30 days log file)

for(int
i=0; i

{

fileName =
fs[i].getName();

if(fileName.length()<10)

continue;

System.out.println(fs[i].getAbsolutePath());

System.out.println(fileName);

//##
extract date(20080601)

dateString
= fileName.substring(11, 19);

newdatestring = String.format("%s-%s-%s",
dateString.substring(0,4),dateString.substring(4,6),dateString.substring(6,8));

System.out.println(dateString);

System.out.println(newdatestring);

//
FileReader reads text files in the default encoding.

//## read
file

FileInputStream fis = new
FileInputStream(fs[i].getAbsolutePath());

InputStreamReader isr = new InputStreamReader(fis,
"GB2312");

BufferedReader bufferedReader =
new BufferedReader(isr);
//

//##
new file name

newfileName
= filePath+"/"+dateString+".csv";

System.out.println(newfileName);

//## write
file
//

FileOutputStream fos = new
FileOutputStream(newfileName);
//

OutputStreamWriter osw = new
OutputStreamWriter(fos, "GB2312");
//

BufferedWriter bufferedWriter = new
BufferedWriter(osw);

while((line
= bufferedReader.readLine()) != null)

{

//System.out.println(line);
//

bufferedWriter.write(newdatestring +" "+line);
//

System.out.println(newdatestring +"
"+line);
//

bufferedWriter.newLine();
//

break;

//
java.sql.Date startDate = new java.sql.Date()

parts =
line.split("\\s");

if(parts.length!=6)

continue;

// the mysql insert
statement

// create the mysql insert
preparedstatement
//

preparedStmt =
con.prepareStatement(insertsql);
//

preparedStmt.setDate(1,sdf.parse(newdatestring+"
"+parts[0]));
//

preparedStmt.setString
(2,parts[1]);
//

preparedStmt.setString (3,
parts[2]);
//

preparedStmt.setInt(4,
Integer.parseInt(parts[3]));
//

preparedStmt.setInt(5,
Integer.parseInt(parts[4]));
//

preparedStmt.setString (6,
parts[5]);

sqlex =
String.format(insertsql2, newdatestring+"
"+parts[0],parts[1],parts[2],Integer.parseInt(parts[3]),Integer.parseInt(parts[4]),parts[5]);

try

{

st =
con.createStatement();

//System.out.println(sqlex);

st.executeUpdate(sqlex);

// execute the
preparedstatement

//preparedStmt.execute();

}

catch (SQLException ex)
{

System.out.println(ex.getMessage());

continue;

}

}

//
// Always
close files.

bufferedReader.close();

//
// Always close files.
//
bufferedWriter.close();

}

}

// Note that write() does not
automatically

// append a newline character.
//
bufferedWriter.write("Hello there,");
//
bufferedWriter.write(" here is some
text.");
//
bufferedWriter.newLine();
//
bufferedWriter.write("We are writing");
//
bufferedWriter.write(" the text to the
file.");

catch(FileNotFoundException ex) {

System.out.println(

"Unable to open file '"
+

fileName + "'");

}

catch(IOException ex) {

System.out.println(

"Error reading file
'"

+ fileName + "'");

// Or we
could just do this:

//
ex.printStackTrace();

}

catch (SQLException ex) {

System.out.println(ex.getMessage());

}

finally {

try
{

if (st != null) {

st.close();

}

if (con != null) {

con.close();

}

} catch
(SQLException ex) {

System.out.println(ex.getMessage());

}

}
}



内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: