DataX下载安装
[toc]
# 01.Clickhouse到HBase(Phoenix)数据导入 DataX
# 介绍
DataX 是一个异构数据源离线同步工具,致力于实现包括关系型数据库(MySQL、Oracle等)、HDFS、Hive、ODPS、HBase、FTP等各种异构数据源之间稳定高效的数据同步功能。
# 下载
# 官方版本
wget http://datax-opensource.oss-cn-hangzhou.aliyuncs.com/datax.tar.gz
# 编译好ClickHouse和Hbase2的版本
链接:https://pan.baidu.com/s/1IYU93oGOnvcx34HJaPDudQ
提取码:bool
解压
tar -zxvf datax.tar.gz
解压后目录结构如下
[root@dev-bigdata-24-146 datax]# ls
bin conf job lib log log_perf plugin script tmp
# 执行同步的组件
reader下是支持读的组件
writer下是支持写的组件
[root@dev-bigdata-24-146 datax]# tree -L 2 plugin/
plugin/
├── reader
│ ├── cassandrareader
│ ├── clickhousereader
│ ├── dbffilereader
│ ├── drdsreader
│ ├── elasticsearchreader
│ ├── ftpreader
│ ├── gaussdbreader
│ ├── gbasereader
│ ├── gdbreader
│ ├── hbase094xreader
│ ├── hbase11xkerberosreader
│ ├── hbase11xreader
│ ├── hbase11xsqlreader
│ ├── hbase20xsqlreader
│ ├── hdfsreader
│ ├── httpreader
│ ├── influxdbreader
│ ├── jsonfilereader
│ ├── kingbaseesreader
│ ├── kudureader
│ ├── mongodbreader
│ ├── mysql8reader
│ ├── mysqlreader
│ ├── odpsreader
│ ├── opentsdbreader
│ ├── oraclereader
│ ├── oscarreader
│ ├── ossreader
│ ├── otsreader
│ ├── otsstreamreader
│ ├── postgresqlreader
│ ├── rdbmsreader
│ ├── redisreader
│ ├── sqlserverreader
│ ├── streamreader
│ ├── tdenginereader
│ └── txtfilereader
└── writer
├── adbpgwriter
├── adswriter
├── cassandrawriter
├── clickhousewriter
├── dbffilewriter
├── dorisdbwriter
├── doriswriter
├── drdswriter
├── elasticsearchwriter
├── ftpwriter
├── gaussdbwriter
├── gbasewriter
├── gdbwriter
├── greenplumwriter
├── hbase094xwriter
├── hbase11xkerberoswriter
├── hbase11xsqlwriter
├── hbase11xwriter
├── hbase20xsqlwriter
├── hdfswriter
├── influxdbwriter
├── kingbaseeswriter
├── kuduwriter
├── mongodbwriter
├── mysql8writer
├── mysqlwriter
├── ocswriter
├── odpswriter
├── oraclewriter
├── oscarwriter
├── osswriter
├── otswriter
├── postgresqlwriter
├── rdbmswriter
├── rediswriter
├── sqlserverwriter
├── streamwriter
├── tdenginewriter
├── tsdbwriter
└── txtfilewriter
# 配置数据同步
本文使用的是CDH 6.3.2自带的HBase 2.1.0版本,需要安装Phoenix
Phoenix安装方法参考:https://datamining.blog.csdn.net/article/details/105572156
# 查看官方读写配置样例
命令:python bin/datax.py -r clickhousereader -w hbase20xsqlwriter
[root@jast datax]# python bin/datax.py -r clickhousereader -w hbase20xsqlwriter
DataX (DATAX-OPENSOURCE-3.0), From Alibaba !
Copyright (C) 2010-2017, Alibaba Group. All Rights Reserved.
Please refer to the clickhousereader document:
https://github.com/alibaba/DataX/blob/master/clickhousereader/doc/clickhousereader.md
Please refer to the hbase20xsqlwriter document:
https://github.com/alibaba/DataX/blob/master/hbase20xsqlwriter/doc/hbase20xsqlwriter.md
Please save the following configuration as a json file and use
python {DATAX_HOME}/bin/datax.py {JSON_FILE_NAME}.json
to run the job.
{
"job": {
"content": [
{
"reader": {
"name": "clickhousereader",
"parameter": {
"column": [],
"connection": [
{
"jdbcUrl": [],
"table": []
}
],
"password": "",
"username": "",
"where": ""
}
},
"writer": {
"name": "hbase20xsqlwriter",
"parameter": {
"batchSize": "100",
"column": [],
"nullMode": "skip",
"queryServerAddress": "",
"schema": "",
"serialization": "PROTOBUF",
"table": ""
}
}
}
],
"setting": {
"speed": {
"channel": ""
}
}
}
}
# 创建Hbase和Phoenix表
DataX中Hbase2是通过Phoenix插入,所以也要同步创建Phoenix表
Hbase表创建
create 'test_datax',{NAME=>'cf'}
Phoenix表创建
create table "test_datax"("rowkey" varchar primary key, "cf"."data_id" varchar , "cf"."user_id" varchar, "cf"."name" varchar, "cf"."phone" varchar, "cf"."pt" varchar) column_encoded_bytes=0;
# 创建ClickHouse表
create table test_datax_ck
(
data_id String,
user_id Nullable(String),
name Nullable(String),
phone Nullable(String),
pt String
)
engine = MergeTree PARTITION BY pt
ORDER BY (pt)
SETTINGS index_granularity = 8192;
# 写入ClickHouse测试数据
insert into test_datax_ck("data_id","user_id","name","phone","pt") values ('1','1','张三1','13577665544','20210101');
insert into test_datax_ck("data_id","user_id","name","phone","pt") values ('2','2','张三2','13577665546','20210101');
insert into test_datax_ck("data_id","user_id","name","phone","pt") values ('3','3','张三3','13577665545','20210101');
insert into test_datax_ck("data_id","user_id","name","phone","pt") values ('4','4','张三4','13577665543','20210102');
# 编写ClickHouse2Hbase配置文件
创建配置文件vim ck2hbase.json
内容如下
{
"job":{
"content":[
{
"reader":{
"name":"clickhousereader",
"parameter":{
"column":[
"data_id",
"data_id",
"user_id",
"name",
"phone",
"pt"
],
"connection":[
{
"jdbcUrl":[
"jdbc:clickhouse://172.16.24.150:8123/ms_db?socket_timeout=7200000"
],
"table":[
"test_datax_ck"
]
}
],
"password":"default",
"username":"default",
"where":""
}
},
"writer":{
"name":"hbase20xsqlwriter",
"parameter":{
"batchSize":"100",
"column":[
"rowkey",
"data_id",
"user_id",
"name",
"phone",
"pt"
],
"nullMode":"skip",
"queryServerAddress":"http://172.16.24.146:8765",
"serialization":"PROTOBUF",
"table":"test_datax"
}
}
}
],
"setting":{
"speed":{
"channel":1
}
}
}
}
# 执行同步命令
python bin/datax.py ck2hbase.json
2022-04-29 15:31:51.894 [job-0] INFO JobContainer -
任务启动时刻 : 2022-04-29 15:31:48
任务结束时刻 : 2022-04-29 15:31:51
任务总计耗时 : 2s
任务平均流量 : 176B/s
记录写入速度 : 12rec/s
读出记录总数 : 12
读写失败总数 : 0
https://blog.csdn.net/BOOLTechnology/article/details/120534070
# 拓展
# ClickHouse同步到MySQL配置文件
{
"job": {
"content": [
{
"reader": {
"name": "clickhousereader",
"parameter": {
"column": [
"one_id",
"aid",
"name",
"phone"
],
"connection": [
{
"jdbcUrl": ["jdbc:clickhouse://172.16.24.171:8123/default?socket_timeout=7200000"],
"table": ["ads_user_profile_test"]
}
],
"password": "",
"username": "",
"where": ""
}
},
"writer": {
"name": "mysqlwriter",
"parameter": {
"print": true,
"column": [
"one_id",
"aid",
"name",
"phone"
],
"connection": [
{
"jdbcUrl": ["jdbc:mysql://172.16.24.143:3306/test"],
"table": ["test20220429"]
}
],
"password": "root",
"preSql": [],
"session": [],
"username": "root",
"writeMode": ""
}
}
}
],
"setting": {
"speed": {
"channel": ""
}
}
}
}
# 插件开发
https://blog.csdn.net/shujuelin/article/details/120725121
上次更新: 2023/05/11, 16:05:55