您的位置:首页 > 编程语言 > Java开发

基于Java使用Flink读取CSV文件:在批处理模式下,分别用Table API(SQL)和DataSet的Join方法两种方式实现多表关联处理,并将结果写回CSV文件

2020-01-15 00:58 447 查看

Maven依赖

源头

<!-- First dependency variant: every Flink artifact below is pinned to 1.8.0 (the _2.11 suffix selects the Scala 2.11 builds). -->
<dependencies>
<!-- Lombok provides the @Data annotation used by the POJO classes of this example. -->
<dependency>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
<version>1.18.8</version>
</dependency>

<!-- Table/SQL planner; needed for the sqlQuery(...) calls in FlinkCsv.t5(). -->
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-table-planner_2.11</artifactId>
<version>1.8.0</version>
</dependency>

<!-- Java bridge of the Table API (org.apache.flink.table.api.java.BatchTableEnvironment is imported by FlinkCsv). -->
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-table-api-java-bridge_2.11</artifactId>
<version>1.8.0</version>
</dependency>

<!-- NOTE(review): the example code only uses the Java batch API; presumably this artifact is listed for its transitive Flink dependencies - confirm. -->
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-streaming-scala_2.11</artifactId>
<version>1.8.0</version>
</dependency>

<!-- Common Table API classes (this artifact carries no Scala suffix). -->
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-table-common</artifactId>
<version>1.8.0</version>
</dependency>
</dependencies>

改版

<!-- Second ("revised") dependency variant: a single flink-table artifact replaces the three table modules of the first variant. -->
<!-- NOTE(review): flink-table is pinned to 1.7.2 while flink-streaming-scala is 1.8.0; mixing Flink versions on one classpath is risky - confirm this is intended. -->
<dependencies>
<!-- Lombok provides the @Data annotation used by the POJO classes of this example. -->
<dependency>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
<version>1.18.8</version>
</dependency>

<!-- Table API / SQL support (version 1.7.2). -->
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-table_2.11</artifactId>
<version>1.7.2</version>
</dependency>

<!-- NOTE(review): the example code only uses the Java batch API; presumably listed for its transitive Flink dependencies - confirm. -->
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-streaming-scala_2.11</artifactId>
<version>1.8.0</version>
</dependency>
</dependencies>

SQL语句

-- Reference queries against the five source tables that the Flink job reads as CSV exports.
-- The number after each statement is presumably the result the author observed; the comment
-- line under each COUNT lists that table's columns.
-- NOTE(review): the asterisks in front of LINK_ID presumably mark it as unused; the Java loader
-- skips the first column of both link files.
SELECT COUNT(*) FROM T13_REF_AIRPORT_SAT;--11008
--HUB_ID IATA_CD NAME_CN NAME_EN
SELECT COUNT(*) FROM T13_REF_AIRPORT_CITY_LINK;--9676
--*******LINK_ID AIRPORT_HUB_ID CITY_HUB_ID
SELECT COUNT(*) FROM T13_REF_CITY_SAT;--9624
--HUB_ID CITY_CD NAME_CN NAME_EN
SELECT COUNT(*) FROM T13_REF_CITY_COUNTRY_LINK;--9062
--*******LINK_ID COUNTRY_HUB_ID CITY_HUB_ID
SELECT COUNT(*) FROM T13_REF_COUNTRY_SAT;--356
--HUB_ID  COUNTRY_CD NAME_CN NAME_EN

-- Full five-way join: airport -> airport/city link -> city -> city/country link -> country.
SELECT *
FROM T13_REF_AIRPORT_SAT X1,T13_REF_AIRPORT_CITY_LINK X2,T13_REF_CITY_SAT X3,T13_REF_CITY_COUNTRY_LINK X4,T13_REF_COUNTRY_SAT X5
WHERE X1.HUB_ID=X2.AIRPORT_HUB_ID
AND X2.CITY_HUB_ID=X3.HUB_ID
AND X3.HUB_ID=X4.CITY_HUB_ID
AND X4.COUNTRY_HUB_ID=X5.HUB_ID;

-- Number of rows produced by the same join.
SELECT COUNT(*)
FROM T13_REF_AIRPORT_SAT X1,T13_REF_AIRPORT_CITY_LINK X2,T13_REF_CITY_SAT X3,T13_REF_CITY_COUNTRY_LINK X4,T13_REF_COUNTRY_SAT X5
WHERE X1.HUB_ID=X2.AIRPORT_HUB_ID
AND X2.CITY_HUB_ID=X3.HUB_ID
AND X3.HUB_ID=X4.CITY_HUB_ID
AND X4.COUNTRY_HUB_ID=X5.HUB_ID;--16759

-- Joined airport rows counted per country (grouped by the country's Chinese name), largest first.
SELECT X5.NAME_CN COUNTRY_CN_NAME,COUNT(X1.HUB_ID) COUNT_AIRPORT
FROM T13_REF_AIRPORT_SAT X1,T13_REF_AIRPORT_CITY_LINK X2,T13_REF_CITY_SAT X3,T13_REF_CITY_COUNTRY_LINK X4,T13_REF_COUNTRY_SAT X5
WHERE X1.HUB_ID=X2.AIRPORT_HUB_ID
AND X2.CITY_HUB_ID=X3.HUB_ID
AND X3.HUB_ID=X4.CITY_HUB_ID
AND X4.COUNTRY_HUB_ID=X5.HUB_ID
GROUP BY X5.NAME_CN
ORDER BY COUNT_AIRPORT DESC;--254

-- Joined airport rows counted per (country, city) combination, largest first.
SELECT
X5.COUNTRY_CD,
X5.NAME_CN COUNTRY_NAME_CN,
X5.NAME_EN COUNTRY_NAME_EN,
X3.CITY_CD,
X3.NAME_CN CITY_CN_NAME,
X3.NAME_EN CITY_EN_NAME,
COUNT(X1.HUB_ID) COUNT_AIRPORT
FROM T13_REF_AIRPORT_SAT X1,T13_REF_AIRPORT_CITY_LINK X2,T13_REF_CITY_SAT X3,T13_REF_CITY_COUNTRY_LINK X4,T13_REF_COUNTRY_SAT X5
WHERE X1.HUB_ID=X2.AIRPORT_HUB_ID
AND X2.CITY_HUB_ID=X3.HUB_ID
AND X3.HUB_ID=X4.CITY_HUB_ID
AND X4.COUNTRY_HUB_ID=X5.HUB_ID
GROUP BY X5.COUNTRY_CD,X5.NAME_CN,X5.NAME_EN,X3.CITY_CD,X3.NAME_CN,X3.NAME_EN
ORDER BY COUNT_AIRPORT DESC;--13030

-- Same per-city aggregation, restricted to cities whose English name is NULL (data-quality check).
SELECT
X5.COUNTRY_CD,
X5.NAME_CN COUNTRY_NAME_CN,
X5.NAME_EN COUNTRY_NAME_EN,
X3.CITY_CD,
X3.NAME_CN CITY_CN_NAME,
X3.NAME_EN CITY_EN_NAME,
COUNT(X1.HUB_ID) COUNT_AIRPORT
FROM T13_REF_AIRPORT_SAT X1,T13_REF_AIRPORT_CITY_LINK X2,T13_REF_CITY_SAT X3,T13_REF_CITY_COUNTRY_LINK X4,T13_REF_COUNTRY_SAT X5
WHERE X1.HUB_ID=X2.AIRPORT_HUB_ID
AND X2.CITY_HUB_ID=X3.HUB_ID
AND X3.HUB_ID=X4.CITY_HUB_ID
AND X4.COUNTRY_HUB_ID=X5.HUB_ID
AND X3.NAME_EN IS NULL
GROUP BY X5.COUNTRY_CD,X5.NAME_CN,X5.NAME_EN,X3.CITY_CD,X3.NAME_CN,X3.NAME_EN
ORDER BY COUNT_AIRPORT DESC;

-- NOTE(review): the figures below are presumably the number of result rows in which the named column is NULL - confirm.
--COUNTRY_NAME_EN=NULL 19
--CITY_CN_NAME=NULL 1
--CITY_EN_NAME=NULL 1501
[/code]

Airport_Sat

import lombok.Data;

/**
 * Row type for the T13_REF_AIRPORT_SAT CSV file; FlinkCsv loads only its first column into it.
 * <p>
 * Lombok's {@code @Data} generates the getter and setter. The field name must stay identical to
 * the name passed to {@code pojoType(AirportSat.class, "hub_id")} and used as join key in FlinkCsv.
 */
@Data
public class AirportSat
{
// Airport hub id; joined against AirportCityLink.airport_hub_id.
private String hub_id;
}
[/code]

Airport_City_Link

import lombok.Data;

/**
 * Row type for the T13_REF_AIRPORT_CITY_LINK CSV file, which links an airport to a city.
 * <p>
 * FlinkCsv skips the file's first column and maps the next two onto these fields. Field names
 * must match the names given to {@code pojoType(...)} there; Lombok's {@code @Data} generates
 * the accessors such as {@code getCity_hub_id()}.
 */
@Data
public class AirportCityLink
{
// Hub id of the airport; joined against AirportSat.hub_id.
private String airport_hub_id;
// Hub id of the city; joined against CitySat.hub_id.
private String city_hub_id;
}
[/code]

City_Sat

import lombok.Data;

/**
 * Row type for the T13_REF_CITY_SAT CSV file; FlinkCsv maps the file's first four columns onto
 * these fields, in declaration order.
 * <p>
 * Field names must match the names given to {@code pojoType(...)} in FlinkCsv; Lombok's
 * {@code @Data} generates the accessors such as {@code getHub_id()} and {@code getCity_cd()}.
 */
@Data
public class CitySat
{
// City hub id; join key towards both link tables.
private String hub_id;
// City code.
private String city_cd;
// City name, Chinese.
private String name_cn;
// City name, English.
private String name_en;
}
[/code]

City_Country_Link

import lombok.Data;

/**
 * Row type for the T13_REF_CITY_COUNTRY_LINK CSV file, which links a city to a country.
 * <p>
 * FlinkCsv skips the file's first column and maps the next two onto these fields. Field names
 * must match the names given to {@code pojoType(...)} there; Lombok's {@code @Data} generates
 * the accessors such as {@code getCountry_hub_id()}.
 */
@Data
public class CityCountryLink
{
// Hub id of the country; joined against CountrySat.hub_id.
private String country_hub_id;
// Hub id of the city; joined against CitySat.hub_id.
private String city_hub_id;
}
[/code]

Country_Sat

import lombok.Data;

/**
 * Row type for the T13_REF_COUNTRY_SAT CSV file.
 * <p>
 * FlinkCsv reads columns 1, 2, 5 and 6 of the file (columns 3 and 4 are skipped) into these
 * fields, in declaration order. Field names must match the names given to {@code pojoType(...)}
 * there; Lombok's {@code @Data} generates the accessors such as {@code getCountry_cd()}.
 */
@Data
public class CountrySat
{
// Country hub id; joined against CityCountryLink.country_hub_id.
private String hub_id;
// Country code.
private String country_cd;
// Country name, Chinese.
private String name_cn;
// Country name, English.
private String name_en;
}
[/code]

Flink_Csv


点击查看Flink_Csv代码

import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.common.operators.Order;
import org.apache.flink.api.common.typeinfo.TypeHint;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.api.java.operators.MapOperator;
import org.apache.flink.api.java.operators.SortPartitionOperator;
import org.apache.flink.api.java.tuple.Tuple1;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.api.java.tuple.Tuple7;
import org.apache.flink.core.fs.FileSystem;
import org.apache.flink.table.api.Table;
import org.apache.flink.table.api.java.BatchTableEnvironment;

import java.text.SimpleDateFormat;
import java.util.Date;

public class FlinkCsv
{
public static void main(String[] args) throws Exception
{
long s4 = System.currentTimeMillis();
t4();
System.out.println((System.currentTimeMillis() - s4) + "u");
long s5 = System.currentTimeMillis();
t5();
System.out.println((System.currentTimeMillis() - s5) + "d");
}

private static void t5() throws Exception
{
ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
BatchTableEnvironment table_env = BatchTableEnvironment.getTableEnvironment(env);
SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH-mm-ss SSS");

DataSet<AirportSat> data_airportsat = env.readCsvFile("D:\\T13_REF_AIRPORT_SAT.csv")
.fieldDelimiter(",").ignoreFirstLine().includeFields(true/*, true, false, true, true*/)
.pojoType(AirportSat.class, "hub_id"/*, "iata_cd", "name_cn", "name_en"*/);

DataSet<AirportCityLink> data_airportcitylink = env.readCsvFile("D:\\T13_REF_AIRPORT_CITY_LINK.csv")
.fieldDelimiter(",").ignoreFirstLine().includeFields(false, true, true)
.pojoType(AirportCityLink.class, "airport_hub_id", "city_hub_id");

DataSet<CitySat> data_citysat = env.readCsvFile("D:\\T13_REF_CITY_SAT.csv")
.fieldDelimiter(",").ignoreFirstLine().includeFields(true, true, true, true)
.pojoType(CitySat.class, "hub_id", "city_cd", "name_cn", "name_en");

DataSet<CityCountryLink> data_citycountrylink = env.readCsvFile("D:\\T13_REF_CITY_COUNTRY_LINK.csv")
.fieldDelimiter(",").ignoreFirstLine().includeFields(false, true, true)
.pojoType(CityCountryLink.class, "country_hub_id", "city_hub_id");

DataSet<CountrySat> data_countrysat = env.readCsvFile("D:\\T13_REF_COUNTRY_SAT.csv")
.fieldDelimiter(",").ignoreFirstLine().includeFields(true, true, false, false, true, true)
.pojoType(CountrySat.class, "hub_id", "country_cd", "name_cn", "name_en");

table_env.registerTable("t13_ref_airport_sat", table_env.fromDataSet(data_airportsat));
table_env.registerTable("t13_ref_airport_city_link", table_env.fromDataSet(data_airportcitylink));
table_env.registerTable("t13_ref_city_sat", table_env.fromDataSet(data_citysat));
table_env.registerTable("t13_ref_city_country_link", table_env.fromDataSet(data_citycountrylink));
table_env.registerTable("t13_ref_country_sat", table_env.fromDataSet(data_countrysat));

String sql = "select count(*) \n" +
"\tfrom t13_ref_airport_sat x1,t13_ref_airport_city_link x2,\n" +
"\tt13_ref_city_sat x3,t13_ref_city_country_link x4,t13_ref_country_sat x5\n" +
"\twhere x1.hub_id=x2.airport_hub_id\n" +
"\t\tand x2.city_hub_id=x3.hub_id\n" +
"\t\tand x3.hub_id=x4.city_hub_id\n" +
"\t\tand x4.country_hub_id=x5.hub_id";

String sql_country = "select x5.name_cn country_cn_name,count(x1.hub_id) count_airport\n" +
"\tfrom t13_ref_airport_sat x1,t13_ref_airport_city_link x2,\n" +
"\tt13_ref_city_sat x3,t13_ref_city_country_link x4,t13_ref_country_sat x5\n" +
"\twhere x1.hub_id=x2.airport_hub_id\n" +
"\t\tand x2.city_hub_id=x3.hub_id\n" +
"\t\tand x3.hub_id=x4.city_hub_id\n" +
"\t\tand x4.country_hub_id=x5.hub_id\n" +
"\tgroup by x5.name_cn\n" +
"\torder by count_airport desc";

String sql_all = "select \n" +
"\tx5.country_cd,\n" +
"\tx5.name_cn country_name_cn,\n" +
"\tx5.name_en country_name_en,\n" +
"\tx3.city_cd,\n" +
"\tx3.name_cn city_cn_name,\n" +
"\tx3.name_en city_en_name,\n" +
"count(x1.hub_id) count_airport\n" +
"\tfrom t13_ref_airport_sat x1,t13_ref_airport_city_link x2,t13_ref_city_sat x3,t13_ref_city_country_link x4,t13_ref_country_sat x5\n" +
"\twhere x1.hub_id=x2.airport_hub_id\n" +
"\t\tand x2.city_hub_id=x3.hub_id\n" +
"\t\tand x3.hub_id=x4.city_hub_id\n" +
"\t\tand x4.country_hub_id=x5.hub_id\n" +
"\tgroup by x5.country_cd,x5.name_cn,x5.name_en,x3.city_cd,x3.name_cn,x3.name_en\n" +
"\torder by count_airport desc";

DataSet<Tuple1<Long>> map = table_env.toDataSet(table_env.sqlQuery(sql),
TypeInformation.of(new TypeHint<Tuple1<Long>>()
{
}));
map.print();

DataSet<Tuple2<String, Long>> map_country = table_env.toDataSet(table_env.sqlQuery(sql_country),
TypeInformation.of(new TypeHint<Tuple2<String, Long>>()
{
}));
System.out.println(map_country.count());
map_country.print();

Table result_country = table_env.sqlQuery(sql_country);
DataSet<Tuple7<String, String, String, String, String, String, Long>> map_all = table_env.toDataSet(table_env.sqlQuery(sql_all),
TypeInformation.of(new TypeHint<Tuple7<String, String, String, String, String, String, Long>>()
{
}));
System.out.println(map_all.count());
map_all.print();

map.writeAsCsv("D:\\Flink_CSV\\" + sdf.format(new Date()) + "______map.csv",
"\n", ",", FileSystem.WriteMode.OVERWRITE).setParallelism(1);
System.out.println("T打印完成______map...");
map_country.writeAsCsv("D:\\Flink_CSV\\" + sdf.format(new Date()) + "______map_country.csv",
"\n", ",", FileSystem.WriteMode.OVERWRITE).setParallelism(1);
System.out.println("T打印完成______map_country...");
map_all.writeAsCsv("D:\\Flink_CSV\\" + sdf.format(new Date()) + "______map_all.csv",
"\n", ",", FileSystem.WriteMode.OVERWRITE).setParallelism(1);
System.out.println("T打印完成______map_all...");

env.execute("Hello!@ Fuck...");
}

private static void t4() throws Exception
{
ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH-mm-ss SSS");

DataSet<AirportSat> data_airportsat = env.readCsvFile("D:\\T13_REF_AIRPORT_SAT.csv")
.fieldDelimiter(",").ignoreFirstLine().includeFields(true/*, true, false, true, true*/)
.pojoType(AirportSat.class, "hub_id"/*, "iata_cd", "name_cn", "name_en"*/);

DataSet<AirportCityLink> data_airportcitylink = env.readCsvFile("D:\\T13_REF_AIRPORT_CITY_LINK.csv")
.fieldDelimiter(",").ignoreFirstLine().includeFields(false, true, true)
.pojoType(AirportCityLink.class, "airport_hub_id", "city_hub_id");

DataSet<CitySat> data_citysat = env.readCsvFile("D:\\T13_REF_CITY_SAT.csv")
.fieldDelimiter(",").ignoreFirstLine().includeFields(true, true, true, true)
.pojoType(CitySat.class, "hub_id", "city_cd", "name_cn", "name_en");

DataSet<CityCountryLink> data_citycountrylink = env.readCsvFile("D:\\T13_REF_CITY_COUNTRY_LINK.csv")
.fieldDelimiter(",").ignoreFirstLine().includeFields(false, true, true)
.pojoType(CityCountryLink.class, "country_hub_id", "city_hub_id");

DataSet<CountrySat> data_countrysat = env.readCsvFile("D:\\T13_REF_COUNTRY_SAT.csv")
.fieldDelimiter(",").ignoreFirstLine().includeFields(true, true, false, false, true, true)
.pojoType(CountrySat.class, "hub_id", "country_cd", "name_cn", "name_en");

MapOperator<Tuple2<Tuple2<Tuple2<Tuple2<AirportSat, AirportCityLink>, CitySat>, CityCountryLink>, CountrySat>,
Tuple7<String, String, String, String, String, String, Long>> map = data_airportsat
.join(data_airportcitylink).where("hub_id").equalTo("airport_hub_id")
.join(data_citysat).where(new KeySelector<Tuple2<AirportSat, AirportCityLink>, String>()
{
@Override
public String getKey(Tuple2<AirportSat, AirportCityLink> t) throws Exception
{
return t.f1.getCity_hub_id();
}
}).equalTo("hub_id")
.join(data_citycountrylink).where(new KeySelector<Tuple2<Tuple2<AirportSat, AirportCityLink>, CitySat>, String>()
{
@Override
public String getKey(Tuple2<Tuple2<AirportSat, AirportCityLink>, CitySat> t) throws Exception
{
return t.f1.getHub_id();
}
}).equalTo("city_hub_id")
.join(data_countrysat).where(new KeySelector<Tuple2<Tuple2<Tuple2<AirportSat, AirportCityLink>, CitySat>, CityCountryLink>, String>()
{
@Override
public String getKey(Tuple2<Tuple2<Tuple2<AirportSat, AirportCityLink>, CitySat>, CityCountryLink> t) throws Exception
{
return t.f1.getCountry_hub_id();
}
}).equalTo("hub_id")
.map(new MapFunction<Tuple2<Tuple2<Tuple2<Tuple2<AirportSat, AirportCityLink>, CitySat>, CityCountryLink>, CountrySat>,
Tuple7<String, String, String, String, String, String, Long>>()
{

@Override
public Tuple7<String, String, String, String, String, String, Long> map(
Tuple2<Tuple2<Tuple2<Tuple2<AirportSat, AirportCityLink>, CitySat>, CityCountryLink>, CountrySat> t) throws Exception
{
String country_cd = t.f1.getCountry_cd();
String country_cn_name = t.f1.getName_cn();
String country_en_name = t.f1.getName_en();
String city_cd = t.f0.f0.f1.getCity_cd();
String city_cn_name = t.f0.f0.f1.getName_cn();
String city_en_name = t.f0.f0.f1.getName_en();
long airport = 1L;
return new Tuple7<>(country_cd, country_cn_name, country_en_name, city_cd, city_cn_name, city_en_name, airport);
}
});
//--------------------------------------------------------------------------------------------------------------
System.out.println("总数量: " + map.count());
SortPartitionOperator<Tuple2<String, Long>> map_country = map
.map(new MapFunction<Tuple7<String, String, String, String, String, String, Long>, Tuple2<String, Long>>()
{
@Override
public Tuple2<String, Long> map(Tuple7<String, String, String, String, String, String, Long> t) throws Exception
{
return new Tuple2<>(t.f1, t.f6);
}
}).groupBy(0).sum(1).sortPartition(1, Order.DESCENDING);
System.out.println("国家分总数量: " + map_country.count());
//map_country.print();
SortPartitionOperator<Tuple7<String, String, String, String, String, String, Long>> map_all = map
.groupBy(0, 1, 2, 3, 4, 5).sum(6).sortPartition(6, Order.DESCENDING);
System.out.println("全分总数量: " + map_all.count());
//map_all.print();

map.writeAsCsv("D:\\Flink_CSV\\" + sdf.format(new Date()) + "______map.csv",
"\n", ",", FileSystem.WriteMode.OVERWRITE).setParallelism(1);
System.out.println("打印完成______map...");
map_country.writeAsCsv("D:\\Flink_CSV\\" + sdf.format(new Date()) + "______map_country.csv",
"\n", ",", FileSystem.WriteMode.OVERWRITE).setParallelism(1);
System.out.println("打印完成______map_country...");
map_all.writeAsCsv("D:\\Flink_CSV\\" + sdf.format(new Date()) + "______map_all.csv",
"\n", ",", FileSystem.WriteMode.OVERWRITE).setParallelism(1);
System.out.println("打印完成______map_all...");

env.execute("Hello!@ Fuck...");
}
}
[/code]
  • 点赞
  • 收藏
  • 分享
  • 文章举报
陶攀峰 发布了62 篇原创文章 · 获赞 2 · 访问量 1977 私信 关注
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: 
相关文章推荐