您的位置:首页 > 数据库

GREENPLUM介绍之数据库管理(二)-创建一个演示系统

2011-05-26 23:30 471 查看
GREENPLUM介绍之数据库管理(二)-创建一个演示系统(转)
下面介绍一个把ORACLE数据库中的DEMO,SALES HISTORY移植到GREENPLUM的过程。这样大家可以在GP中有些DEMO数据,深入学习GP的各项功能。
首先,创建目录
mkdir -p /stage/sales_data
我们需要把ORACLE SH SCHEMA下的数据导成平面文件,比较快捷高效的方式是通过sqlplus完成这个工作,把用来查询的SQL写入一个文件,比如
vi batch_exp_sales.sh

sqlplus -S sh/sh <<EOF
set arraysize 10000;
set heading off;
set feeback off;
alter session set nls_date_format='YYYY-MM-DD';
select '"'||prod_id||'","'||CUST_ID||'","'||TIME_ID||'","'||CHANNEL_ID||'","'||PROMO_ID||'","'||QUANTITY_SOLD||'","'||AMOUNT_SOLD||'"' as text from sales;
set heading on;
set feedback on;
quit;
EOF
exit

chmod 777 batch_exp_sales.sh
nohup ./batch_exp_sales.sh >> sales.csv &
在生产应用中对于大表,可以在sql中加入条件,采用并行方式,按照范围导出数据,这也是从ORACLE中把数据导成文本的最快方式。你可以用这种方式把其它几张表
CHANNELS
COSTS
COUNTRIES
CUSTOMERS
PRODUCTS T
PROMOTIONS
TIMES
中的数据也进行导出。

接下来以超级管理员身份连接GREENPLUM,创建用户sh, 并赋予它登录系统和创建数据库的权限
su - gpadmin
gpadmin@mdw:~> psql -d template1
psql (8.2.15)
Type "help" for help.
template1=# create role sh with password 'sh' login createdb;
NOTICE: resource queue required -- using default resource queue "pg_default"
CREATE ROLE
template1=#
如果直接以SH登录,会得到如下错误
gpadmin@mdw:/data/gpmaster/gpseg-1> psql -d template1 -U sh -h mdw
psql: FATAL: no pg_hba.conf entry for host "192.168.10.10", user "sh", database "template1", SSL off

进入master的数据目录
vi /data/gpmaster/gpseg-1/pg_hba.conf
添加一行(请根据自己客户端的网段设置)
host all sh 192.168.10.10/32 md5

gpstop -u ,重新load该配置文件。
再以sh用户身份登录数据库
gpadmin@mdw:/data/gpmaster/gpseg-1> psql -d template1 -U sh -h mdw
Password for user sh:
psql (8.2.15)
Type "help" for help.
template1=>

接下来就可以创建自己的数据库了。创建数据库的句法如下
CREATE DATABASE name [ [WITH] [OWNER [=] dbowner]
[TEMPLATE [=] template]
[ENCODING [=] encoding]
[TABLESPACE [=] tablespace]
[CONNECTION LIMIT [=] connlimit ] ]
最简单的只要给个数据库名字就好了。比如
template1=> create database sales_history;
CREATE DATABASE
接下来可以,连接到建好的数据库上创建schema,如果不创建自己的schema,在建库时系统创建了一个叫做public的schema。在GP中,如果要跨schema访问对象,
必须使用schema做前缀,比如schema.table的形式。如果不加前缀默认查找当前schema。
template1=> /c sales_history
You are now connected to database "sales_history" as user "sh".
sales_history=> create schema sales_history ;
CREATE SCHEMA
如果要永久改变schema的访问顺序,用下面的语句
sales_history=> alter database sales_history set search_path to sales_history,public;
ALTER DATABASE

如果是改变当前客户端的schema 访问顺序
sales_history=> set search_path to sales_history;
SET

下面就可以建表了,注意一下映射关系,一般oracle number数据类型对应gp的numeric类型,oracle date数据类型对应gp的timestamp类型,oracle varchar2数据类型对应gp的varchar类型,另外,GP在建表的时候,还应该指定数据的分布方法。一种是HASH算法,一种是round-robin(另写文章对GP的表进行详细介绍),例子全部使用HASH方式建表。比如sales_demo表的GP建句法。

CREATE TABLE SALES
( PROD_ID numeric NOT NULL ,
CUST_ID numeric NOT NULL ,
TIME_ID DATE NOT NULL ,
CHANNEL_ID numeric NOT NULL ,
PROMO_ID numeric NOT NULL ,
QUANTITY_SOLD numeric(10,2) NOT NULL ,
AMOUNT_SOLD numeric(10,2) NOT NULL )
distributed by (prod_id,cust_id,time_id,channel_id,promo_id);

CREATE TABLE TIMES
( TIME_ID DATE NOT NULL,
DAY_NAME VARCHAR(9) NOT NULL,
DAY_NUMBER_IN_WEEK numeric(1,0) NOT NULL,
DAY_NUMBER_IN_MONTH numeric(2,0) NOT NULL,
CALENDAR_WEEK_NUMBER numeric(2,0) NOT NULL,
FISCAL_WEEK_NUMBER numeric(2,0) NOT NULL,
WEEK_ENDING_DAY DATE NOT NULL,
WEEK_ENDING_DAY_ID numeric NOT NULL,
CALENDAR_MONTH_NUMBER numeric(2,0) NOT NULL,
FISCAL_MONTH_NUMBER numeric(2,0) NOT NULL,
CALENDAR_MONTH_DESC VARCHAR(8) NOT NULL,
CALENDAR_MONTH_ID numeric NOT NULL,
FISCAL_MONTH_DESC VARCHAR(8) NOT NULL,
FISCAL_MONTH_ID numeric NOT NULL,
DAYS_IN_CAL_MONTH numeric NOT NULL,
DAYS_IN_FIS_MONTH numeric NOT NULL,
END_OF_CAL_MONTH DATE NOT NULL,
END_OF_FIS_MONTH DATE NOT NULL,
CALENDAR_MONTH_NAME VARCHAR(9) NOT NULL,
FISCAL_MONTH_NAME VARCHAR(9) NOT NULL,
CALENDAR_QUARTER_DESC CHAR(7) NOT NULL,
CALENDAR_QUARTER_ID numeric NOT NULL,
FISCAL_QUARTER_DESC CHAR(7) NOT NULL,
FISCAL_QUARTER_ID numeric NOT NULL,
DAYS_IN_CAL_QUARTER numeric NOT NULL,
DAYS_IN_FIS_QUARTER numeric NOT NULL,
END_OF_CAL_QUARTER DATE NOT NULL ,
END_OF_FIS_QUARTER DATE NOT NULL ,
CALENDAR_QUARTER_NUMBER numeric(1,0) NOT NULL,
FISCAL_QUARTER_NUMBER numeric(1,0) NOT NULL,
CALENDAR_YEAR numeric(4,0) NOT NULL,
CALENDAR_YEAR_ID numeric NOT NULL ,
FISCAL_YEAR numeric(4,0) NOT NULL,
FISCAL_YEAR_ID numeric NOT NULL,
DAYS_IN_CAL_YEAR numeric NOT NULL,
DAYS_IN_FIS_YEAR numeric NOT NULL,
END_OF_CAL_YEAR DATE NOT NULL ,
END_OF_FIS_YEAR DATE NOT NULL)
distributed by (time_id);

CREATE TABLE CHANNELS
(CHANNEL_ID numeric NOT NULL,
CHANNEL_DESC VARCHAR(20) NOT NULL,
CHANNEL_CLASS VARCHAR(20) NOT NULL,
CHANNEL_CLASS_ID numeric NOT NULL,
CHANNEL_TOTAL VARCHAR(13) NOT NULL,
CHANNEL_TOTAL_ID numeric NOT NULL,
CONSTRAINT CHANNELS_PK PRIMARY KEY (CHANNEL_ID))
distributed by (channel_id);

CREATE TABLE COSTS
( PROD_ID numeric NOT NULL,
TIME_ID DATE NOT NULL,
PROMO_ID numeric NOT NULL ,
CHANNEL_ID numeric NOT NULL,
UNIT_COST numeric(10,2) NOT NULL ,
UNIT_PRICE numeric(10,2) NOT NULL)
distributed by (prod_id,time_id,promo_id,channel_id);

CREATE TABLE COUNTRIES
( COUNTRY_ID numeric NOT NULL,
COUNTRY_ISO_CODE CHAR(2) NOT NULL,
COUNTRY_NAME VARCHAR(40) NOT NULL,
COUNTRY_SUBREGION VARCHAR(30) NOT NULL,
COUNTRY_SUBREGION_ID numeric NOT NULL,
COUNTRY_REGION VARCHAR(20) NOT NULL,
COUNTRY_REGION_ID numeric NOT NULL,
COUNTRY_TOTAL VARCHAR(11) NOT NULL,
COUNTRY_TOTAL_ID numeric NOT NULL,
COUNTRY_NAME_HIST numeric(40),
CONSTRAINT COUNTRIES_PK PRIMARY KEY (COUNTRY_ID))
distributed by (country_id);

CREATE TABLE CUSTOMERS
( CUST_ID numeric NOT NULL,
CUST_FIRST_NAME VARCHAR(20) NOT NULL,
CUST_LAST_NAME VARCHAR(40) NOT NULL,
CUST_GENDER CHAR(1) NOT NULL,
CUST_YEAR_OF_BIRTH numeric(4,0) NOT NULL,
CUST_MARITAL_STATUS VARCHAR(20),
CUST_STREET_ADDRESS VARCHAR(40) NOT NULL,
CUST_POSTAL_CODE VARCHAR(10) NOT NULL,
CUST_CITY VARCHAR(30) NOT NULL,
CUST_CITY_ID numeric NOT NULL,
CUST_STATE_PROVINCE VARCHAR(40) NOT NULL,
CUST_STATE_PROVINCE_ID numeric NOT NULL,
COUNTRY_ID numeric NOT NULL ,
CUST_MAIN_PHONE_NUMBER VARCHAR(25) NOT NULL,
CUST_INCOME_LEVEL VARCHAR(30),
CUST_CREDIT_LIMIT numeric,
CUST_EMAIL VARCHAR(30),
CUST_TOTAL VARCHAR(14) NOT NULL,
CUST_TOTAL_ID numeric NOT NULL,
CUST_SRC_ID numeric,
CUST_EFF_FROM DATE,
CUST_EFF_TO DATE,
CUST_VALID VARCHAR(1),
CONSTRAINT CUSTOMERS_PK PRIMARY KEY (CUST_ID))
distributed by (cust_id);

CREATE TABLE PRODUCTS
( PROD_ID numeric(6,0) NOT NULL,
PROD_NAME VARCHAR(50) NOT NULL,
PROD_DESC VARCHAR(4000) NOT NULL,
PROD_SUBCATEGORY VARCHAR(50) NOT NULL,
PROD_SUBCATEGORY_ID numeric NOT NULL,
PROD_SUBCATEGORY_DESC VARCHAR(2000) NOT NULL,
PROD_CATEGORY VARCHAR(50) NOT NULL,
PROD_CATEGORY_ID numeric NOT NULL,
PROD_CATEGORY_DESC VARCHAR(2000) NOT NULL ,
PROD_WEIGHT_CLASS numeric(3,0) NOT NULL ,
PROD_UNIT_OF_MEASURE VARCHAR(20),
PROD_PACK_SIZE VARCHAR(30 ) NOT NULL ,
SUPPLIER_ID numeric(6,0) NOT NULL ,
PROD_STATUS VARCHAR(20) NOT NULL,
PROD_LIST_PRICE numeric(8,2) NOT NULL ,
PROD_MIN_PRICE numeric(8,2) NOT NULL ,
PROD_TOTAL VARCHAR(13) NOT NULL ,
PROD_TOTAL_ID numeric NOT NULL ,
PROD_SRC_ID numeric,
PROD_EFF_FROM DATE,
PROD_EFF_TO DATE,
PROD_VALID VARCHAR(1),
CONSTRAINT PRODUCTS_PK PRIMARY KEY (PROD_ID))
distributed by (prod_id);

CREATE TABLE PROMOTIONS
(PROMO_ID numeric,
PROMO_NAME text ,
PROMO_SUBCATEGORY text ,
PROMO_SUBCATEGORY_ID numeric,
PROMO_CATEGORY text ,
PROMO_CATEGORY_ID numeric ,
PROMO_COST numeric,
PROMO_BEGIN_DATE date ,
PROMO_END_DATE date ,
PROMO_TOTAL text ,
PROMO_TOTAL_ID numeric )
distributed by (promo_id);

创建对应的外部表,访问导出CSV文件。使用GP的外部表,需要先把gpfdist这个工具(默认路径/usr/local/greenplum-db/bin)拷贝到存放csv文件的文件服务器上。启动gpfdist服务(类似一个http服务),比如文件在/stage/data目录的子目录下,我们可以启动它
nohup gpfdist -d /stage -p 8081 -l gpfdist.log &

另外,要允许用户创建外部表,非则建外部表时会得到错误
ERROR: permission denied: no privilege to create a readable gpfdist external table

修改配置文件,添加参数
gp_external_enable_exec = on # enable external tables with EXECUTE.
gp_external_grant_privileges = on #enable create http/gpfdist for non su's
允许非超级管理员创建外部表,必须重启数据库服务,才能生效。

下面是创建外部表的句法,可以使用*作为通配符指定特定目录下的多个文件,这些文件也可以分布到多个文件服务器上,以提高数据的加载效率。

create external table sales_ext
(PROD_ID numeric,
CUST_ID numeric,
TIME_ID date,
CHANNEL_ID numeric ,
PROMO_ID numeric ,
QUANTITY_SOLD numeric(10,2),
AMOUNT_SOLD numeric(10,2))
LOCATION ('gpfdist://elt1:8081/sales/*')
FORMAT 'CSV'
log errors into err_sales_ext segment reject limit 7000000 rows;

CREATE EXTERNAL TABLE TIMES_EXT
( TIME_ID TEXT ,
DAY_NAME VARCHAR(9),
DAY_NUMBER_IN_WEEK numeric(1,0),
DAY_NUMBER_IN_MONTH numeric(2,0),
CALENDAR_WEEK_NUMBER numeric(2,0),
FISCAL_WEEK_NUMBER numeric(2,0),
WEEK_ENDING_DAY TEXT,
WEEK_ENDING_DAY_ID numeric,
CALENDAR_MONTH_NUMBER numeric(2,0),
FISCAL_MONTH_NUMBER numeric(2,0),
CALENDAR_MONTH_DESC VARCHAR(8),
CALENDAR_MONTH_ID numeric,
FISCAL_MONTH_DESC VARCHAR(8),
FISCAL_MONTH_ID numeric,
DAYS_IN_CAL_MONTH numeric,
DAYS_IN_FIS_MONTH numeric,
END_OF_CAL_MONTH TEXT ,
END_OF_FIS_MONTH TEXT ,
CALENDAR_MONTH_NAME VARCHAR(9),
FISCAL_MONTH_NAME VARCHAR(9),
CALENDAR_QUARTER_DESC CHAR(7),
CALENDAR_QUARTER_ID numeric,
FISCAL_QUARTER_DESC CHAR(7),
FISCAL_QUARTER_ID numeric,
DAYS_IN_CAL_QUARTER numeric ,
DAYS_IN_FIS_QUARTER numeric ,
END_OF_CAL_QUARTER TEXT,
END_OF_FIS_QUARTER TEXT,
CALENDAR_QUARTER_NUMBER numeric(1,0),
FISCAL_QUARTER_NUMBER numeric(1,0),
CALENDAR_YEAR numeric(4,0),
CALENDAR_YEAR_ID numeric,
FISCAL_YEAR numeric(4,0),
FISCAL_YEAR_ID numeric,
DAYS_IN_CAL_YEAR numeric,
DAYS_IN_FIS_YEAR numeric,
END_OF_CAL_YEAR TEXT ,
END_OF_FIS_YEAR TEXT)
LOCATION ('gpfdist://elt1:8081/times/*')
FORMAT 'CSV'
log errors into err_times_ext segment reject limit 7000000 rows;

CREATE EXTERNAL TABLE CHANNELS_EXT
(CHANNEL_ID numeric ,
CHANNEL_DESC VARCHAR(20),
CHANNEL_CLASS VARCHAR(20),
CHANNEL_CLASS_ID numeric ,
CHANNEL_TOTAL VARCHAR(13) ,
CHANNEL_TOTAL_ID numeric)
LOCATION ('gpfdist://elt1:8081/channels/*')
FORMAT 'CSV'
log errors into err_channels_ext segment reject limit 10000 rows;

CREATE EXTERNAL TABLE COSTS_EXT
( PROD_ID numeric ,
TIME_ID TEXT ,
PROMO_ID numeric ,
CHANNEL_ID numeric ,
UNIT_COST numeric(10,2) ,
UNIT_PRICE numeric(10,2))
LOCATION ('gpfdist://elt1:8081/costs/*')
FORMAT 'CSV'
log errors into err_costs_ext segment reject limit 1000000 rows;

CREATE EXTERNAL TABLE COUNTRIES_EXT
( COUNTRY_ID numeric ,
COUNTRY_ISO_CODE CHAR(2) ,
COUNTRY_NAME VARCHAR(40),
COUNTRY_SUBREGION VARCHAR(30) ,
COUNTRY_SUBREGION_ID numeric ,
COUNTRY_REGION VARCHAR(20) ,
COUNTRY_REGION_ID numeric ,
COUNTRY_TOTAL VARCHAR(11),
COUNTRY_TOTAL_ID numeric,
COUNTRY_NAME_HIST numeric(40))
LOCATION ('gpfdist://elt1:8081/countries/*')
FORMAT 'CSV'
log errors into err_countries_ext segment reject limit 1000000 rows;

CREATE EXTERNAL TABLE CUSTOMERS_EXT
( CUST_ID numeric,
CUST_FIRST_NAME VARCHAR(20),
CUST_LAST_NAME VARCHAR(40) ,
CUST_GENDER CHAR(1) ,
CUST_YEAR_OF_BIRTH numeric(4,0) ,
CUST_MARITAL_STATUS VARCHAR(20),
CUST_STREET_ADDRESS VARCHAR(40) ,
CUST_POSTAL_CODE VARCHAR(10) ,
CUST_CITY VARCHAR(30) ,
CUST_CITY_ID numeric ,
CUST_STATE_PROVINCE VARCHAR(40) ,
CUST_STATE_PROVINCE_ID numeric ,
COUNTRY_ID numeric ,
CUST_MAIN_PHONE_NUMBER VARCHAR(25),
CUST_INCOME_LEVEL VARCHAR(30),
CUST_CREDIT_LIMIT numeric,
CUST_EMAIL VARCHAR(30),
CUST_TOTAL VARCHAR(14),
CUST_TOTAL_ID numeric ,
CUST_SRC_ID TEXT,
CUST_EFF_FROM TEXT,
CUST_EFF_TO TEXT,
CUST_VALID VARCHAR(1))
LOCATION ('gpfdist://elt1:8081/customers/*')
FORMAT 'CSV'
log errors into err_customers_ext segment reject limit 1000000 rows;

CREATE EXTERNAL TABLE PRODUCTS_EXT
( PROD_ID numeric(6,0),
PROD_NAME VARCHAR(50) ,
PROD_DESC VARCHAR(4000) ,
PROD_SUBCATEGORY VARCHAR(50) ,
PROD_SUBCATEGORY_ID numeric ,
PROD_SUBCATEGORY_DESC VARCHAR(2000) ,
PROD_CATEGORY VARCHAR(50) ,
PROD_CATEGORY_ID numeric ,
PROD_CATEGORY_DESC VARCHAR(2000) ,
PROD_WEIGHT_CLASS numeric(3,0) ,
PROD_UNIT_OF_MEASURE VARCHAR(20),
PROD_PACK_SIZE VARCHAR(30) ,
SUPPLIER_ID numeric(6,0) ,
PROD_STATUS VARCHAR(20) ,
PROD_LIST_PRICE numeric(8,2),
PROD_MIN_PRICE numeric(8,2) ,
PROD_TOTAL VARCHAR(13) ,
PROD_TOTAL_ID numeric ,
PROD_SRC_ID text,
PROD_EFF_FROM TEXT,
PROD_EFF_TO TEXT,
PROD_VALID VARCHAR(1))
LOCATION ('gpfdist://elt1:8081/products/*')
FORMAT 'CSV'
log errors into err_PRODUCTS_EXT segment reject limit 1000000 rows;

CREATE EXTERNAL TABLE PROMOTIONS_EXT
(PROMO_ID numeric,
PROMO_NAME text ,
PROMO_SUBCATEGORY text ,
PROMO_SUBCATEGORY_ID numeric,
PROMO_CATEGORY text ,
PROMO_CATEGORY_ID numeric ,
PROMO_COST numeric,
PROMO_BEGIN_DATE text ,
PROMO_END_DATE text ,
PROMO_TOTAL text ,
PROMO_TOTAL_ID numeric )
LOCATION ('gpfdist://elt1:8081/promotions/*')
FORMAT 'CSV'
log errors into err_promotions_ext segment reject limit 1000000 rows;

接下来就可以使用子查询加载数据了
sales_history=> insert into sales select * from sales_ext;
NOTICE: Found 70681 data formatting errors (70681 or more input rows). Rejected related input data.
INSERT 0 918843
sales_history=> select * from err_sales_ext;
sales_history=> select count(*) from sales;
count
--------
918843
(1 row)

insert into sales select * from sales_ext;
对于有问题的记录,加载过程中会抛到定义外部表的时指定的错误表中。
与其它数据库一样,在加载大量数据后,应该收集表的统计信息
sales_history=> vacuum analyze sales;
VACUUM
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: 
相关文章推荐