Apache Kudu Tables
By default, Impala tables are stored on HDFS using data files in various file formats. HDFS files are ideal for bulk loads and for queries using full-table scans, but they do not support in-place updates or deletes. Kudu is an alternative storage engine used by Impala that can handle both real-time updates (for mixed read/write workloads) and fast scans (for data-warehouse and analytic operations). Using Kudu tables with Impala can also simplify the ETL pipeline, by avoiding extra steps to segregate and reorganize newly arrived data.
Certain Impala SQL statements and clauses, such as DELETE, UPDATE, UPSERT, and PRIMARY KEY, work only with Kudu tables. Other statements and clauses, such as LOAD DATA, TRUNCATE TABLE, and INSERT OVERWRITE, do not apply to Kudu tables.
The -kudu_master_hosts configuration property must be set correctly for the impalad daemon for CREATE TABLE ... STORED AS KUDU statements to connect to the appropriate Kudu server. Typically, the required value for this setting is kudu_host:7051. In a high-availability Kudu deployment, specify the names of multiple Kudu hosts separated by commas. If the -kudu_master_hosts configuration property is not set, you can still associate the appropriate value with each table by specifying a TBLPROPERTIES('kudu.master_addresses') clause in the CREATE TABLE statement, or change it later with an ALTER TABLE statement.
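For example, here is a minimal sketch of setting the master address per table rather than through the startup flag (the host names and the table are illustrative):

CREATE TABLE kudu_no_flag (id BIGINT PRIMARY KEY, s STRING)
  PARTITION BY HASH(id) PARTITIONS 2
  STORED AS KUDU
  TBLPROPERTIES ('kudu.master_addresses'='kudu_host:7051');

-- Re-point an existing table at a different Kudu master:
ALTER TABLE kudu_no_flag SET TBLPROPERTIES ('kudu.master_addresses'='other_host:7051');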
You can set the replication factor for a Kudu table by including a TBLPROPERTIES('kudu.num_tablet_replicas'='n') clause in the CREATE TABLE statement. The kudu.num_tablet_replicas property must be set at table creation time; altering it afterward with an ALTER TABLE statement currently has no effect.
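A minimal sketch of creating a table with three replicas per tablet (the table name is illustrative):

CREATE TABLE triple_replicated (id BIGINT PRIMARY KEY, s STRING)
  PARTITION BY HASH(id) PARTITIONS 2
  STORED AS KUDU
  TBLPROPERTIES ('kudu.num_tablet_replicas'='3');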
Primary Key Columns for Kudu Tables

Kudu tables introduce the notion of primary keys to Impala. The primary key is made up of one or more columns, whose combined values must be unique within the table and cannot contain any NULL values. If an INSERT statement supplies a duplicate value for the primary key columns, the conflicting row is discarded and the INSERT operation continues with a warning. To insert a new row, or update the non-key columns of an existing row with the same key, in a single operation, use the UPSERT statement instead of INSERT.
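The following sketch illustrates the difference, assuming a hypothetical pk_demo table with columns (id BIGINT PRIMARY KEY, s STRING):

INSERT INTO pk_demo VALUES (1, 'original');
INSERT INTO pk_demo VALUES (1, 'ignored');      -- duplicate key: row discarded, warning issued
UPSERT INTO pk_demo VALUES (1, 'replacement');  -- key exists, so the existing row is updated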
Note: The notion of a primary key applies only to Kudu tables, and every Kudu table requires one. Specifying a column in the PRIMARY KEY clause implicitly adds the NOT NULL attribute to that column.
Kudu-Specific Column Attributes for CREATE TABLE

The column list in a CREATE TABLE statement can include the following attributes, which apply only to Kudu tables:
PRIMARY KEY
| [NOT] NULL
| ENCODING codec
| COMPRESSION algorithm
| DEFAULT constant_expression
| BLOCK_SIZE number
PRIMARY KEY Attribute

The primary key for a Kudu table is a column, or set of columns, that uniquely identifies every row; its values also define the natural sort order for the table. Because all of the primary key columns must have non-NULL values, the NOT NULL attribute is implied for them. The primary key columns must be the first ones specified in the CREATE TABLE statement. For a single-column primary key, you can include a PRIMARY KEY attribute inline with the column definition; for a multi-column primary key, include a PRIMARY KEY (c1, c2, ...) clause as a separate entry at the end of the column list. The following examples show both placements of the PRIMARY KEY attribute:
CREATE TABLE pk_inline
(
col1 BIGINT PRIMARY KEY,
col2 STRING,
col3 BOOLEAN
) PARTITION BY HASH(col1) PARTITIONS 2 STORED AS KUDU;
CREATE TABLE pk_at_end
(
col1 BIGINT,
col2 STRING,
col3 BOOLEAN,
PRIMARY KEY (col1)
) PARTITION BY HASH(col1) PARTITIONS 2 STORED AS KUDU;
CREATE TABLE pk_multiple_columns
(
col1 BIGINT,
col2 STRING,
col3 BOOLEAN,
PRIMARY KEY (col1, col2)
) PARTITION BY HASH(col2) PARTITIONS 2 STORED AS KUDU;
When the primary key is a single column, the two forms are equivalent. The SHOW CREATE TABLE statement always represents the PRIMARY KEY specification as a separate item at the end of the column list:
CREATE TABLE inline_pk_rewritten (id BIGINT PRIMARY KEY, s STRING)
PARTITION BY HASH(id) PARTITIONS 2 STORED AS KUDU;
SHOW CREATE TABLE inline_pk_rewritten;
+------------------------------------------------------------------------------+
| result                                                                       |
+------------------------------------------------------------------------------+
| CREATE TABLE user.inline_pk_rewritten (                                      |
|   id BIGINT NOT NULL ENCODING AUTO_ENCODING COMPRESSION DEFAULT_COMPRESSION, |
|   s STRING NULL ENCODING AUTO_ENCODING COMPRESSION DEFAULT_COMPRESSION,      |
|   PRIMARY KEY (id)                                                           |
| )                                                                            |
| PARTITION BY HASH (id) PARTITIONS 2                                          |
| STORED AS KUDU                                                               |
| TBLPROPERTIES ('kudu.master_addresses'='host.example.com')                   |
+------------------------------------------------------------------------------+
The contents of the primary key columns cannot be changed by an UPDATE or UPSERT statement.

NULL | NOT NULL Attribute

For Kudu tables, you can specify which columns can contain NULL values and which cannot. A NOT NULL constraint offers an extra level of consistency enforcement: if an application requires a field to always be specified, include a NOT NULL clause in the corresponding column definition and make sure the application always supplies a value. Because primary key columns can never contain NULL values, the NOT NULL clause is not required for them, although including it makes the table definition self-describing. In the following example, the latitude and longitude of a location must always be known, while the remaining non-key columns can be NULL:
CREATE TABLE required_columns
(
id BIGINT PRIMARY KEY,
latitude DOUBLE NOT NULL,
longitude DOUBLE NOT NULL,
place_name STRING,
altitude DOUBLE,
population BIGINT
) PARTITION BY HASH(id) PARTITIONS 2 STORED AS KUDU;
During performance optimization, Kudu can use the knowledge that NULL values are not allowed in a column to skip certain checks on each input row, speeding up queries and join operations; therefore, specify NOT NULL constraints when appropriate. The NULL attribute is the default for all columns that are not part of the primary key; you can state it explicitly to document a deliberate design decision to allow NULL values in a column.
DEFAULT Attribute

You can specify a default value for columns in Kudu tables. The DEFAULT value must be a constant expression, such as a combination of literals, arithmetic, and string operations; it cannot contain references to other columns or non-deterministic function calls. This lets you fill in a placeholder value such as NULL, an empty string, 0, -1, or 'N/A', but you cannot use the DEFAULT clause to derive a value from other columns. The following example shows different kinds of expressions in DEFAULT clauses:
CREATE TABLE default_vals
(
id BIGINT PRIMARY KEY,
name STRING NOT NULL DEFAULT 'unknown',
address STRING DEFAULT upper('no fixed address'),
age INT DEFAULT -1,
earthling BOOLEAN DEFAULT TRUE,
planet_of_origin STRING DEFAULT 'Earth',
optional_col STRING DEFAULT NULL
) PARTITION BY HASH(id) PARTITIONS 2 STORED AS KUDU;
Note: If a column's value is often unknown or missing, prefer a DEFAULT of NULL over a placeholder string, because rows with missing values can then be located efficiently with the IS NULL and IS NOT NULL operators, and NULL is the universal convention for missing data.
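For example, a quick sketch of such queries against the default_vals table above:

-- Rows where optional_col was stored as NULL can be found without string comparisons.
SELECT count(*) FROM default_vals WHERE optional_col IS NULL;
SELECT count(*) FROM default_vals WHERE optional_col IS NOT NULL;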
ENCODING Attribute

Each column in a Kudu table can optionally use an encoding, a low-overhead form of compression that reduces the size of the data on disk at the cost of additional CPU cycles to reconstruct the original values during queries. The encoding keywords that Impala recognizes are:

- AUTO_ENCODING: use the default encoding based on the column type: bitshuffle for numeric columns and dictionary for string columns.
- PLAIN_ENCODING: leave the value in its original binary format.
- RLE: compress repeated values (when sorted in primary key order) by including a count.
- DICT_ENCODING: when the number of distinct string values is low, replace each original string with a numeric ID.
- BIT_SHUFFLE: rearrange the bits of the values to efficiently compress sequences of values that are identical or vary only slightly based on primary key order; the encoded data is also compressed with LZ4.
- PREFIX_ENCODING: compress common prefixes in string values.

The following example shows the Impala keywords for each encoding type, and the DESCRIBE output shows how the encoding is reported after the table is created. Omitting the ENCODING clause, as for the id column here, is equivalent to accepting the default (DEFAULT_ENCODING), which DESCRIBE reports as AUTO_ENCODING.
CREATE TABLE various_encodings
(
id BIGINT PRIMARY KEY,
c1 BIGINT ENCODING PLAIN_ENCODING,
c2 BIGINT ENCODING AUTO_ENCODING,
c3 TINYINT ENCODING BIT_SHUFFLE,
c4 DOUBLE ENCODING BIT_SHUFFLE,
c5 BOOLEAN ENCODING RLE,
c6 STRING ENCODING DICT_ENCODING,
c7 STRING ENCODING PREFIX_ENCODING
) PARTITION BY HASH(id) PARTITIONS 2 STORED AS KUDU;
-- Some columns are omitted from the output for readability.
describe various_encodings;
+------+---------+-------------+----------+-----------------+
| name | type    | primary_key | nullable | encoding        |
+------+---------+-------------+----------+-----------------+
| id   | bigint  | true        | false    | AUTO_ENCODING   |
| c1   | bigint  | false       | true     | PLAIN_ENCODING  |
| c2   | bigint  | false       | true     | AUTO_ENCODING   |
| c3   | tinyint | false       | true     | BIT_SHUFFLE     |
| c4   | double  | false       | true     | BIT_SHUFFLE     |
| c5   | boolean | false       | true     | RLE             |
| c6   | string  | false       | true     | DICT_ENCODING   |
| c7   | string  | false       | true     | PREFIX_ENCODING |
+------+---------+-------------+----------+-----------------+
COMPRESSION Attribute

You can specify a compression algorithm to use for each column in a Kudu table. This attribute imposes more CPU overhead when retrieving the values than the ENCODING attribute does; therefore, use it primarily for columns with long strings that do not benefit much from the less-expensive ENCODING attribute. The choices for the COMPRESSION attribute are LZ4, SNAPPY, and ZLIB.
Note: Columns that use the BITSHUFFLE encoding are already compressed with LZ4, so they typically do not need an additional COMPRESSION attribute.
The following example shows design considerations for several STRING columns with different distribution characteristics, leading to choices for both the ENCODING and COMPRESSION attributes. A column with a small set of repeated values, such as a country name or a user ID drawn from a limited population, suits dictionary encoding; the post_id column holds ascending integers that compress well with bitshuffle encoding; and the body column (and its translated counterparts) holds long, mostly unique strings that do not benefit from encoding, so it relies on the COMPRESSION attribute instead:
CREATE TABLE blog_posts
(
user_id STRING ENCODING DICT_ENCODING,
post_id BIGINT ENCODING BIT_SHUFFLE,
subject STRING ENCODING PLAIN_ENCODING,
body STRING COMPRESSION LZ4,
spanish_translation STRING COMPRESSION SNAPPY,
esperanto_translation STRING COMPRESSION ZLIB,
PRIMARY KEY (user_id, post_id)
) PARTITION BY HASH(user_id, post_id) PARTITIONS 2 STORED AS KUDU;
BLOCK_SIZE Attribute

Although Kudu does not use HDFS files internally, and thus is not affected by the HDFS block size, it does have an underlying unit of I/O called the block size. The BLOCK_SIZE attribute lets you set the block size for any column. The block size is a relatively advanced feature; refer to the Kudu documentation for usage details.
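A minimal sketch of the syntax (the table name and the 4096-byte value are purely illustrative):

CREATE TABLE block_size_demo
(
  id BIGINT PRIMARY KEY,
  payload STRING BLOCK_SIZE 4096
) PARTITION BY HASH(id) PARTITIONS 2 STORED AS KUDU;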
Partitioning for Kudu Tables

Kudu tables use special mechanisms to distribute data among the underlying tablet servers. Although we refer to such tables as partitioned tables, they are distinguished from traditional Impala partitioned tables by the use of different clauses on the CREATE TABLE statement. Kudu tables use PARTITION BY, HASH, RANGE, and range specification clauses, rather than the PARTITIONED BY clause used for HDFS-backed tables, which specifies only a column name and creates a new partition for each different value.

Note: The Impala DDL syntax for Kudu tables is different than in early Kudu versions, which used an experimental fork of the Impala code: the DISTRIBUTE BY clause is now PARTITION BY, the INTO n BUCKETS clause is now PARTITIONS n, and the SPLIT ROWS clause is replaced by the range partitioning syntax shown below.

Hash partitioning is the simplest type of partitioning for Kudu tables. Inserted rows are divided up between a fixed number of buckets by applying a hash function to the values of the columns specified in the HASH clause. Hashing spreads new rows evenly across the buckets, letting insertions work in parallel across multiple tablet servers, although queries with range-based predicates might have to read multiple tablets to retrieve all the relevant values.
-- 1M rows with 50 hash partitions = approximately 20,000 rows per partition.
-- The values in each partition are not sequential, but rather based on a hash function.
-- Rows 1, 99999, and 123456 might be in the same partition.
CREATE TABLE million_rows (id string primary key, s string)
PARTITION BY HASH(id) PARTITIONS 50
STORED AS KUDU;
-- Because the ID values are unique, we expect the rows to be roughly
-- evenly distributed between the buckets in the destination table.
INSERT INTO million_rows SELECT * FROM billion_rows ORDER BY id LIMIT 1e6;
Note: For simplicity, many of the CREATE TABLE statements in this section use PARTITIONS 2, the minimum, to illustrate the syntax. For large tables in a real deployment, choose a number of buckets appropriate to the number of tablet servers in the cluster.
Range partitioning lets you specify partitioning precisely, based on single values or ranges of values within one or more columns. You add one or more RANGE clauses to the CREATE TABLE statement, following the PARTITION BY clause. Range specifications use combinations of constant expressions, the VALUE or VALUES keywords, and comparison operators. (The SPLIT ROWS clause used with early Kudu versions is no longer supported.)
-- 50 buckets, all for IDs beginning with a lowercase letter.
-- Having only a single range enforces the allowed range of values
-- but does not add any extra parallelism.
create table million_rows_one_range (id string primary key, s string)
partition by hash(id) partitions 50,
range (partition 'a' <= values < '{')
stored as kudu;
-- 50 buckets for IDs beginning with a lowercase letter
-- plus 50 buckets for IDs beginning with an uppercase letter.
-- Total number of buckets = number in the PARTITIONS clause x number of ranges.
-- We are still enforcing constraints on the primary key values
-- allowed in the table, and the 2 ranges provide better parallelism
-- as rows are inserted or the table is scanned.
create table million_rows_two_ranges (id string primary key, s string)
partition by hash(id) partitions 50,
range (partition 'a' <= values < '{', partition 'A' <= values < '[')
stored as kudu;
-- Same as previous table, with an extra range covering the single key value '00000'.
create table million_rows_three_ranges (id string primary key, s string)
partition by hash(id) partitions 50,
range (partition 'a' <= values < '{', partition 'A' <= values < '[', partition value = '00000')
stored as kudu;
-- The range partitioning can be displayed with a SHOW command in impala-shell.
show range partitions million_rows_three_ranges;
+---------------------+
| RANGE (id) |
+---------------------+
| VALUE = "00000"     |
| "A" <= VALUES < "[" |
| "a" <= VALUES < "{" |
+---------------------+
Note: When defining ranges, be careful to avoid "fencepost errors" where values at the extreme ends might be included or omitted by accident. For example, in the tables above, the range "a" <= VALUES < "{" ensures that any values starting with z, such as za or zzz or zzz-ZZZz, are all included, because { is the first character greater than z. Also note that for a range-partitioned table, a matching range must exist before a row can be stored: any INSERT, UPDATE, or UPSERT statement fails if it tries to create a primary key value that falls outside all of the specified ranges.
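For instance, a quick sketch using the million_rows_three_ranges table above (the key value is illustrative):

-- The key '0001' does not fall within any declared range, so this statement fails.
INSERT INTO million_rows_three_ranges VALUES ('0001', 'x');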
Ranges can be non-contiguous, and can pin down single values as well as spans of values:

partition by range (year) (partition 1885 <= values <= 1889, partition 1893 <= values <= 1897)
partition by range (letter_grade) (partition value = 'A', partition value = 'B',
  partition value = 'C', partition value = 'D', partition value = 'F')
After a table is created, you can add or remove range partitions with the ALTER TABLE statement, using ADD RANGE PARTITION and DROP RANGE PARTITION clauses. The general syntax, followed by concrete examples:
ALTER TABLE foo ADD RANGE PARTITION 30 <= VALUES < 50;
ALTER TABLE foo DROP RANGE PARTITION 1 <= VALUES < 5;
alter table test_scores add range partition value = 'E';
alter table year_ranges add range partition 1890 <= values < 1893;
alter table test_scores drop range partition value = 'E';
alter table year_ranges drop range partition 1890 <= values < 1893;
Hash and range partitioning can be combined, so that rows are hashed into buckets within each range:

partition by hash (school) partitions 10,
range (letter_grade) (partition value = 'A', partition value = 'B',
  partition value = 'C', partition value = 'D', partition value = 'F')
To see the current partitioning scheme for a Kudu table, use the SHOW CREATE TABLE or SHOW PARTITIONS statement. The CREATE TABLE syntax displayed by SHOW CREATE TABLE includes all the hash and range clauses that reflect the original table structure plus any subsequent ALTER TABLE statements that changed it. For Kudu tables, SHOW TABLE STATS and SHOW PARTITIONS display the same information.
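For example, a quick sketch using the million_rows table defined earlier:

-- Either statement shows the hash buckets and any range partitions.
SHOW PARTITIONS million_rows;
SHOW CREATE TABLE million_rows;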
Handling Date, Time, or Timestamp Data with Kudu

In Impala 2.9 and higher, you can include TIMESTAMP columns in Kudu tables, instead of representing date/time values as BIGINT numbers. The TIMESTAMP behavior for Kudu tables has some special considerations:

- Any nanoseconds in the original 96-bit Impala value are not stored, because Kudu represents date/time columns using 64-bit values. The nanosecond portion is rounded, not truncated, so a TIMESTAMP stored in a Kudu table might not be bit-for-bit identical to the original value.
- Converting between the Impala 96-bit representation and the Kudu 64-bit representation adds overhead when reading or writing TIMESTAMP columns; for performance-critical applications, you might continue to represent date/time values as BIGINT.
- The Impala TIMESTAMP type has a narrower range of years than the underlying Kudu type. If an out-of-range value is written to a Kudu table by a non-Impala client, Impala returns NULL by default when reading that TIMESTAMP value, or fails the query if the ABORT_ON_ERROR query option is enabled.
-- Make a table representing a date/time value as TIMESTAMP.
-- The strings representing the partition bounds are automatically
-- cast to TIMESTAMP values.
create table native_timestamp(id bigint, when_exactly timestamp, event string, primary key (id, when_exactly))
partition by hash (id) partitions 20,
range (when_exactly)
(
partition '2015-01-01' <= values < '2016-01-01',
partition '2016-01-01' <= values < '2017-01-01',
partition '2017-01-01' <= values < '2018-01-01'
)
stored as kudu;
insert into native_timestamp values (12345, now(), 'Working on doc examples');
select * from native_timestamp;
+-------+-------------------------------+-------------------------+
| id    | when_exactly                  | event                   |
+-------+-------------------------------+-------------------------+
| 12345 | 2017-05-31 16:27:42.667542000 | Working on doc examples |
+-------+-------------------------------+-------------------------+
Because Kudu tables have some performance overhead to convert TIMESTAMP columns to and from the Impala 96-bit internal representation, for performance-critical applications you might store date/time information as the number of seconds, milliseconds, or microseconds since the Unix epoch. Specify the column as BIGINT in the Impala CREATE TABLE statement; it corresponds to a 64-bit integer (an int64 column) in the underlying Kudu table. Then use date/time conversion functions as necessary to produce a numeric, TIMESTAMP, or STRING value depending on the context. The unix_timestamp() function accepts either a TIMESTAMP, such as the result of now(), or an equivalent string literal, and returns the corresponding number of seconds; string literals in the standard format can be cast to TIMESTAMP, and a TIMESTAMP can be converted to a BIGINT number of seconds, in the usual Impala fashion.
-- now() returns a TIMESTAMP and shows the format for string literals you can cast to TIMESTAMP.
select now();
+-------------------------------+
| now()                         |
+-------------------------------+
| 2017-01-25 23:50:10.132385000 |
+-------------------------------+
-- unix_timestamp() accepts either a TIMESTAMP or an equivalent string literal.
select unix_timestamp(now());
+-----------------------+
| unix_timestamp(now()) |
+-----------------------+
| 1485386670            |
+-----------------------+
select unix_timestamp('2017-01-01');
+------------------------------+
| unix_timestamp('2017-01-01') |
+------------------------------+
| 1483228800                   |
+------------------------------+
-- Make a table representing a date/time value as BIGINT.
-- Construct 1 range partition and 20 associated hash partitions for each year.
-- Use date/time conversion functions to express the ranges as human-readable dates.
create table time_series(id bigint, when_exactly bigint, event string, primary key (id, when_exactly))
partition by hash (id) partitions 20,
range (when_exactly)
(
partition unix_timestamp('2015-01-01') <= values < unix_timestamp('2016-01-01'),
partition unix_timestamp('2016-01-01') <= values < unix_timestamp('2017-01-01'),
partition unix_timestamp('2017-01-01') <= values < unix_timestamp('2018-01-01')
)
stored as kudu;
-- On insert, we can transform a human-readable date/time into a numeric value.
insert into time_series values (12345, unix_timestamp('2017-01-25 23:24:56'), 'Working on doc examples');
-- On retrieval, we can examine the numeric date/time value or turn it back into a string for readability.
select id, when_exactly, from_unixtime(when_exactly) as 'human-readable date/time', event
from time_series order by when_exactly limit 100;
+-------+--------------+--------------------------+-------------------------+
| id    | when_exactly | human-readable date/time | event                   |
+-------+--------------+--------------------------+-------------------------+
| 12345 | 1485386696   | 2017-01-25 23:24:56      | Working on doc examples |
+-------+--------------+--------------------------+-------------------------+
Note: For date/time values stored with sub-second precision, such as microseconds, cast to DECIMAL before dividing, so that the fractional portion of a second is preserved exactly:
-- 1 million and 1 microseconds = 1.000001 seconds.
select microseconds,
cast (microseconds as decimal(20,7)) / 1e6 as fractional_seconds
from table_with_microsecond_column;
+--------------+----------------------+
| microseconds | fractional_seconds   |
+--------------+----------------------+
| 1000001      | 1.000001000000000000 |
+--------------+----------------------+
Note: Much of the metadata for Kudu tables is handled by the underlying storage layer, so Kudu tables need less metadata caching on the Impala side, and the REFRESH and INVALIDATE METADATA statements are needed less frequently than for HDFS-backed tables. Neither statement is needed when data is added to, removed from, or updated in a Kudu table, even when the changes are made directly through the Kudu API. Run REFRESH table_name or INVALIDATE METADATA table_name for a Kudu table only after making a change to the Kudu table schema, such as adding or dropping a column, by a mechanism other than an Impala ALTER TABLE statement.
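For example (the table name is illustrative):

-- After a column was added to my_kudu_table through the Kudu API:
REFRESH my_kudu_table;
-- Or, for more extensive metadata changes:
INVALIDATE METADATA my_kudu_table;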
Kudu tables created through Impala use a special naming scheme on the Kudu side: the internal Kudu table name has the form impala::db_name.table_name. You can see the Kudu-assigned name in the kudu.table_name field of the DESCRIBE FORMATTED output:
create database some_database;
use some_database;
create table table_name_demo (x int primary key, y int)
partition by hash (x) partitions 2 stored as kudu;
describe formatted table_name_demo;
...
kudu.table_name | impala::some_database.table_name_demo
To access a table that was created directly through the Kudu API or another non-Impala mechanism, create an external table that points at the underlying Kudu table name; when metadata changes are made entirely outside of Impala, run the INVALIDATE METADATA statement so that Impala picks up the current definition.
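A minimal sketch of mapping an existing Kudu table into Impala (the Kudu-side table name is illustrative):

CREATE EXTERNAL TABLE myextkudu
STORED AS KUDU
TBLPROPERTIES ('kudu.table_name'='some_kudu_table');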
Loading Data into Kudu Tables

Kudu tables are well-suited to use cases where data arrives continuously, in small or moderate volumes. To bring data into Kudu tables, use the Impala INSERT and UPSERT statements rather than the file-oriented LOAD DATA statement.
Impala DML Support for Kudu Tables

Impala supports certain DML statements for Kudu tables only. The UPDATE and DELETE statements let you modify data within Kudu tables without rewriting substantial amounts of table data, and the UPSERT statement acts as a combination of INSERT and UPDATE, inserting rows where the primary key does not already exist and updating the non-primary-key columns where it does. Familiar INSERT ... SELECT syntax works for bulk copies from other tables, and the WHERE clause on UPDATE and DELETE statements lets you modify or remove arbitrary sets of rows, as shown in the sketch below.
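A quick sketch using the blog_posts table defined earlier (the data values are illustrative):

UPDATE blog_posts SET subject = upper(subject) WHERE user_id = 'alice';
DELETE FROM blog_posts WHERE post_id < 100;
-- Inserts the row if ('alice', 1234) is a new key; otherwise updates subject and body.
UPSERT INTO blog_posts (user_id, post_id, subject, body)
  VALUES ('alice', 1234, 'Hello', 'First post');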
The INSERT statement for Kudu tables honors the unique and NOT NULL requirements for the primary key columns. Because Impala and Kudu do not support transactions, the effects of an INSERT, UPDATE, or DELETE statement are not rolled back if the statement fails part way through: any rows changed before the failure stay changed. For example, an UPDATE statement that fails midway leaves only some of the matching rows updated, and an interrupted INSERT ... SELECT operation writes only the portion of the SELECT results processed before the failure.
Note: The LOAD DATA statement, which involves manipulation of HDFS data files, does not apply to Kudu tables.
In Impala 2.10 and higher, INSERT and UPSERT operations into Kudu tables automatically add an exchange and a sort node to the plan that partition and sort the rows according to the partitioning and primary key scheme of the target table (unless the number of rows is small enough to trigger single-node execution). Because Kudu partitions and sorts rows on write, this pre-sorting takes some load off of Kudu and helps large INSERT operations complete without timing out, but it can slow down the end-to-end performance of the statement. You can use the /*+NOCLUSTERED*/ and /*+NOSHUFFLE*/ hints together to disable this partitioning and sorting. Also, because the sort can consume a large amount of memory, consider setting the MEM_LIMIT query option for such statements.
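A quick sketch of both techniques, reusing the million_rows and billion_rows tables from earlier (the memory limit value is illustrative):

SET MEM_LIMIT=2g;
INSERT INTO million_rows /*+NOCLUSTERED,NOSHUFFLE*/ SELECT * FROM billion_rows;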
Rows that would violate the NOT NULL or duplicate-primary-key constraints are skipped rather than inserted, and the INSERT statement finishes with a warning instead of an error; details about such skipped rows appear in the query PROFILE output.
Security Considerations for Kudu Tables

Access to Kudu tables must be granted to and revoked from roles with the following considerations: only users with the ALL privilege on SERVER can create external Kudu tables, and the ALL privilege on SERVER is also required to specify the kudu.master_addresses property in CREATE TABLE statements for managed tables as well as external tables.
Access to Kudu tables is enforced at the table level and at the column level. The SELECT- and INSERT-specific permissions are supported; the DELETE, UPDATE, and UPSERT operations require the ALL privilege.
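A sketch of granting access under Sentry (the role names are illustrative):

GRANT ALL ON SERVER TO ROLE kudu_admins;    -- can create external Kudu tables
GRANT SELECT ON TABLE some_database.table_name_demo TO ROLE readers;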
Impala Query Performance for Kudu Tables

For queries involving Kudu tables, Impala can delegate much of the work of filtering the result set to Kudu, avoiding some of the I/O involved in full table scans of tables containing HDFS data files. This type of optimization is especially effective for partitioned Kudu tables, where the Impala query WHERE clause refers to one or more primary key columns that are also used as partition key columns. For example, if a partitioned Kudu table uses a HASH clause for col1 and a RANGE clause for col2, a query using a clause such as WHERE col1 IN (1, 2, 3) AND col2 > 100 can determine exactly which tablet servers contain relevant data, and therefore parallelize the query efficiently.
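You can check whether predicates are pushed down by examining the EXPLAIN output; a quick sketch using the million_rows table defined earlier:

-- Look for the predicates listed under the Kudu scan node in the plan.
EXPLAIN SELECT * FROM million_rows WHERE id = '000000';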
In Impala 2.11 and higher, Impala can also push down additional information to optimize join queries involving Kudu tables. If the join clause contains predicates of the form column = expression, after Impala constructs a hash table of possible matching values for the join columns from the bigger table, it can push down the minimum and maximum matching column values to Kudu, so that Kudu can more efficiently locate matching rows in the second (smaller) table. These min/max filters are affected by the RUNTIME_FILTER_MODE, RUNTIME_FILTER_WAIT_TIME_MS, and DISABLE_ROW_RUNTIME_FILTERING query options, but not by the RUNTIME_BLOOM_FILTER_SIZE, RUNTIME_FILTER_MIN_SIZE, RUNTIME_FILTER_MAX_SIZE, or MAX_NUM_RUNTIME_FILTERS query options.
The TABLESAMPLE clause of the SELECT statement does not apply to Kudu tables.