
2018-08-24: A MapReduce Programming Example on the HBase Database


package cn.songjq.hbase.mr.emp;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.MasterNotRunningException;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.ZooKeeperConnectionException;
import org.apache.hadoop.hbase.client.HBaseAdmin;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Mutation;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.ResultScanner;
import org.apache.hadoop.hbase.client.RetriesExhaustedWithDetailsException;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.hbase.mapreduce.TableReducer;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.junit.Test;

/**
 * A MapReduce programming example on top of the HBase database. It does the following:
 * 1. Reads the employee data from the HBase table "emp".
 * 2. Computes the average salary of each department from that data.
 * 3. Writes the results to another HBase table.
 *
 * The Mapper, Reducer and Job are all defined inside this single class
 * (the Mapper and Reducer as static nested classes).
 * @author songjq
 */

public class DeptAvgSalary {

    /**
     * Data preparation.
     * Before writing the MapReduce program, load the Emp table of the relational database
     * (the emp.csv file) into the HBase table "emp", and create the table "statistics"
     * that will hold the processed results.
     * @throws IOException
     * @throws ZooKeeperConnectionException
     * @throws MasterNotRunningException
     */
    @Test
    public void init() throws MasterNotRunningException, ZooKeeperConnectionException, IOException {
        // Build the HBase client configuration
        Configuration conf = HBaseConfiguration.create();
        conf.set("hbase.zookeeper.quorum", "hadoop-server01:2181,hadoop-server02:2181,hadoop-server03:2181");
        HBaseAdmin hbaseAdmin = new HBaseAdmin(conf);
        // Create the emp table and load the emp.csv data
        if(!hbaseAdmin.tableExists("emp")) {
            // Table does not exist: create it and load the initial data
            TableName tableName = TableName.valueOf("emp");
            // Table descriptor
            HTableDescriptor empTable = new HTableDescriptor(tableName);
            // Column family descriptor
            HColumnDescriptor f1 = new HColumnDescriptor("empinfo");
            // Keep up to 3 versions of each cell
            f1.setMaxVersions(3);
            empTable.addFamily(f1);
            hbaseAdmin.createTable(empTable);
            // Load the initial data
            HTable hTable = new HTable(conf, "emp");
            putDataToEmp(hTable);
        }
        // Create the statistics table
        if(!hbaseAdmin.tableExists("statistics")) {
            // Table does not exist: create it
            TableName tableName = TableName.valueOf("statistics");
            // Table descriptor
            HTableDescriptor statisticsTable = new HTableDescriptor(tableName);
            // Column family descriptor
            HColumnDescriptor f1 = new HColumnDescriptor("emp_stat");
            // Keep up to 3 versions of each cell
            f1.setMaxVersions(3);
            statisticsTable.addFamily(f1);
            hbaseAdmin.createTable(statisticsTable);
        }
        hbaseAdmin.close();
    }

    /**
     * Insert the data.
     * @throws IOException
     * @throws RetriesExhaustedWithDetailsException
     */
    public static void putDataToEmp(HTable hTable) throws RetriesExhaustedWithDetailsException, IOException {
        List<Put> puts = new ArrayList<>();
        // Open a stream over the csv file holding the emp table data
        FileInputStream filein = new FileInputStream("D:\\test\\hbase\\emp.csv");
        InputStreamReader fileinReader = new InputStreamReader(filein);
        BufferedReader br = new BufferedReader(fileinReader);
        String line = null;
        while((line=br.readLine())!=null) {
            // Parse one line, e.g. 7499,ALLEN,SALESMAN,7698,1981/2/20,1600,300,30
            String[] split = line.split(",");
            int empno = Integer.valueOf(split[0]);
            String ename = split[1];
            String job = split[2];
            int mgr = 0;
            try {
                mgr = Integer.valueOf(split[3]);
            } catch (Exception e) {
                mgr = 0;
            }
            String hiredate = split[4];
            float salary = Float.valueOf(split[5]);
            float comm = 0f;
            try {
                comm = Float.valueOf(split[6]);
            } catch (Exception e) {
                comm = 0f;
            }
            int deptno = Integer.valueOf(split[7]);
            // The rowkey is the employee number
            Put put = new Put(Bytes.toBytes(empno));
            // Columns in the "empinfo" column family
            put.add(Bytes.toBytes("empinfo"), Bytes.toBytes("ename"), Bytes.toBytes(ename));
            put.add(Bytes.toBytes("empinfo"), Bytes.toBytes("job"), Bytes.toBytes(job));
            put.add(Bytes.toBytes("empinfo"), Bytes.toBytes("mgr"), Bytes.toBytes(mgr));
            put.add(Bytes.toBytes("empinfo"), Bytes.toBytes("hiredate"), Bytes.toBytes(hiredate));
            put.add(Bytes.toBytes("empinfo"), Bytes.toBytes("salary"), Bytes.toBytes(salary));
            put.add(Bytes.toBytes("empinfo"), Bytes.toBytes("comm"), Bytes.toBytes(comm));
            put.add(Bytes.toBytes("empinfo"), Bytes.toBytes("deptno"), Bytes.toBytes(deptno));
            puts.add(put);
        }
        hTable.put(puts);
        br.close();
        fileinReader.close();
        filein.close();
    }

    /**
     * Mapper side:
     * Instead of extending Mapper directly, the class extends TableMapper<KEYOUT, VALUEOUT>.
     * KEYOUT: the output key k2
     * VALUEOUT: the output value v2
     * The input is one HBase rowkey record, so there is no explicit <k1,v1> pair to declare.
     * @author songjq
     */
    static class DeptAvgSalaryMapper extends TableMapper<IntWritable, FloatWritable> {
        /*
         * The map method is called once for every rowkey record read from the table.
         * (non-Javadoc)
         * @see org.apache.hadoop.mapreduce.Mapper#map(KEYIN, VALUEIN, org.apache.hadoop.mapreduce.Mapper.Context)
         */
        @Override
        protected void map(ImmutableBytesWritable rowkey, Result rs, Context context)
                throws IOException, InterruptedException {
            // Read the employee's department number
            int deptno = Bytes.toInt(rs.getValue(Bytes.toBytes("empinfo"), Bytes.toBytes("deptno")));
            // Read the employee's salary
            float salary = Bytes.toFloat(rs.getValue(Bytes.toBytes("empinfo"), Bytes.toBytes("salary")));
            // Emit <deptno, salary>
            context.write(new IntWritable(deptno), new FloatWritable(salary));
        }
    }

    /**
     * Reducer side:
     * Instead of extending Reducer directly, the class extends HBase's TableReducer<KEYIN, VALUEIN, KEYOUT>.
     * KEYIN: the Mapper output key k2
     * VALUEIN: the collection of Mapper output values v2
     * KEYOUT: the Reducer output, written to the HBase table keyed by rowkey
     * @author songjq
     */
    static class DeptAvgSalaryReducer extends TableReducer<IntWritable, FloatWritable, ImmutableBytesWritable> {
        /*
         * The reduce method is called once for each distinct key.
         * (non-Javadoc)
         * @see org.apache.hadoop.mapreduce.Reducer#reduce(KEYIN, java.lang.Iterable, org.apache.hadoop.mapreduce.Reducer.Context)
         */
        @Override
        protected void reduce(IntWritable k3, Iterable<FloatWritable> v3, Context ctx)
                throws IOException, InterruptedException {
            // Average and total salary of the department
            float deptAvgSal = 0f;
            float deptTotalSal = 0f;
            int count = 0;
            Iterator<FloatWritable> iterator = v3.iterator();
            while(iterator.hasNext()) {
                FloatWritable sal = iterator.next();
                deptTotalSal += sal.get();
                count++;
            }
            // Compute the department's average salary
            deptAvgSal = deptTotalSal/count;
            /*
             * Write the result to the HBase statistics table.
             */
            // Create a Put and use the department number deptno as the rowkey
            Put put = new Put(Bytes.toBytes(k3.get()));
            // Add the column to insert: emp_stat -> column family, dept_avg_sal -> department average salary column
            put.add(Bytes.toBytes("emp_stat"), Bytes.toBytes("dept_avg_sal"), Bytes.toBytes(deptAvgSal));
            // Write to HBase: specify the output rowkey and the Put
            ctx.write(new ImmutableBytesWritable(Bytes.toBytes(k3.get())), put);
        }
    }

    /**
     * The main method that submits the Job.
     * Submitting to Hadoop may fail with: Exception in thread "main" java.lang.NoClassDefFoundError: org/apache/hadoop/hbase/client/HTable
     * If that error occurs, set export HADOOP_CLASSPATH=$HBASE_HOME/lib/*:$CLASSPATH first
     * (see the submission sketch after the class).
     * @param args
     * @throws Exception
     */
    public static void main(String[] args) throws Exception {
        // Configure the HBase client connection
        Configuration conf = HBaseConfiguration.create();
        conf.set("hbase.zookeeper.quorum", "hadoop-server01:2181,hadoop-server02:2181,hadoop-server03:2181");
        Job job = Job.getInstance(conf);
        job.setJarByClass(DeptAvgSalary.class);
        // Set the job's Mapper
        /*
         * TableMapReduceUtil.initTableMapperJob(
         *     table,            the input table
         *     scan,             the scanner to use
         *     mapper,           the Mapper class
         *     outputKeyClass,   the output key type
         *     outputValueClass, the output value type
         *     job               the Job
         * );
         */
        // Create a scanner
        Scan scan = new Scan();
        // Select only the columns the job needs. If the table is wide, do not feed every column
        // to the Mapper; here only deptno and salary are scanned as the Mapper input.
        scan.addColumn(Bytes.toBytes("empinfo"), Bytes.toBytes("deptno"));
        scan.addColumn(Bytes.toBytes("empinfo"), Bytes.toBytes("salary"));
        TableMapReduceUtil.initTableMapperJob(Bytes.toBytes("emp"),
                scan,
                DeptAvgSalaryMapper.class,
                IntWritable.class,
                FloatWritable.class,
                job);
        // Set the job's Reducer
        /*
         * TableMapReduceUtil.initTableReducerJob(
         *     table,   the output table
         *     reducer, the Reducer class
         *     job      the Job
         * );
         */
        TableMapReduceUtil.initTableReducerJob(
                "statistics",
                DeptAvgSalaryReducer.class,
                job);
        // Submit the job and wait for completion
        job.waitForCompletion(true);
    }

    /**
     * Inspect the data written to HBase.
     * Result of the run:
     * Dept No.   Avg Salary
     * 10         2916.6667
     * 20         2576.5
     * 30         1729.5834
     * @throws Exception
     */
    @Test
    public void scanStatisticsInfo() throws Exception {
        Configuration conf = HBaseConfiguration.create();
        conf.set("hbase.zookeeper.quorum", "hadoop-server01:2181,hadoop-server02:2181,hadoop-server03:2181");
        HTable hTable = new HTable(conf, "statistics");
        Scan scan = new Scan();
        ResultScanner scanner = hTable.getScanner(scan);
        Iterator<Result> iterator = scanner.iterator();
        System.out.println("Dept No.\t\tAvg Salary");
        while(iterator.hasNext()) {
            Result rs = iterator.next();
            System.out.println(Bytes.toInt(rs.getRow())+"\t\t"+
                    Bytes.toFloat(rs.getValue(Bytes.toBytes("emp_stat"), Bytes.toBytes("dept_avg_sal"))));
        }
        hTable.close();
    }

}
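
To actually run the job on the cluster, package the class into a jar and submit it with the hadoop jar command; as noted in the Javadoc of main, the HBase client jars must first be added to Hadoop's classpath. A minimal sketch, where the jar name dept-avg-salary.jar is only a placeholder for whatever your build actually produces:

# Make the HBase client classes visible to Hadoop (same export as quoted in the Javadoc of main)
export HADOOP_CLASSPATH=$HBASE_HOME/lib/*:$CLASSPATH
# Submit the MapReduce job; the jar name below is a placeholder for the jar your build produces
hadoop jar dept-avg-salary.jar cn.songjq.hbase.mr.emp.DeptAvgSalary

After the job finishes, the scanStatisticsInfo() test above prints the per-department averages stored in the statistics table.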


Tags: HBase, MapReduce