NULL
ALTER TABLE ... REPLACE COLUMNS
|
|
|
|
|
|
|
|
|
。
|
CREATE TABLE
INSERT
INSERT ... SELECT
笔记:bzip2
pbzip2
fs.s3a.block.size
fs.s3a.block.size
fs.s3a.block.size
要使用文本数据文件创建表格:
CREATE TABLE
create table my_table(id int, s string, n int, t timestamp, b boolean);
INSERT
FIELDS TERMINATED BY
ROW FORMAT DELIMITED
STORED AS TEXTFILE
create table csv(id int, s string, n int, t timestamp, b boolean)
row format delimited
fields terminated by ',';
create table tsv(id int, s string, n int, t timestamp, b boolean)
row format delimited
fields terminated by '\t';
create table pipe_separated(id int, s string, n int, t timestamp, b boolean)
row format delimited
fields terminated by '|'
stored as textfile;
INSERT ... SELECT
'\0'
nul
create table nul_separated(id int, s string, n int, t timestamp, b boolean)
row format delimited
fields terminated by '\0'
stored as textfile;
笔记:CREATE TABLE
ESCAPED BY
DESCRIBE FORMATTED table_name
复杂类型注意事项:ARRAY
STRUCT
MAP
COUNT(*)
.tmp
.copying
.copying
.COPYING
.bz2
.deflate
.gz
.snappy
.zst
CREATE EXTERNAL TABLE
LOAD DATA
INSERT
INSERT ... SELECT
SELECT
INSERT ... VALUES
INSERT ... VALUES
INSERT ... SELECT
文本数据文件中的特殊值:
inf
nan
FLOAT
DOUBLE
\N
NULL
--null-non-string
--null-string
NULL
\N
--null-string '\\N' --null-non-string '\\N'
NULL
null
ALTER TABLE name SET TBLPROPERTIES("serialization.null.format"="null")
skip.header.line.count
TBLPROPERTIES
create table header_line(first_name string, age int)
row format delimited fields terminated by ',';
-- Back in the shell, load data into the table with commands such as:
-- cat >data.csv
-- Name,Age
-- Alice,25
-- Bob,19
-- hdfs dfs -put data.csv /user/hive/warehouse/header_line
refresh header_line;
-- Initially, the Name,Age header line is treated as a row of the table.
select * from header_line limit 10;
+------------+------+
| first_name | age |
+------------+------+
| Name | NULL |
| Alice | 25 |
| Bob | 19 |
+------------+------+
alter table header_line set tblproperties('skip.header.line.count'='1');
-- Once the table property is set, queries skip the specified number of lines
-- at the beginning of each text data file. Therefore, all the files in the table
-- should follow the same convention for header lines.
select * from header_line limit 10;
+------------+-----+
| first_name | age |
+------------+-----+
| Alice | 25 |
| Bob | 19 |
+------------+-----+
LOAD DATA
LOAD DATA
-- Text table with default delimiter, the hex 01 character.
CREATE TABLE text_table AS SELECT * FROM other_file_format_table;
-- Text table with user-specified delimiter. Currently, you cannot specify
-- the delimiter as part of CREATE TABLE LIKE or CREATE TABLE AS SELECT.
-- But you can change an existing text table to have a different delimiter.
CREATE TABLE csv LIKE other_file_format_table;
ALTER TABLE csv SET SERDEPROPERTIES ('serialization.format'=',', 'field.delim'=',');
INSERT INTO csv SELECT * FROM other_file_format_table;
DESCRIBE FORMATTED
hdfs dfs -ls hdfs_directory
hdfs dfs -cat hdfs_file
INSERT ... VALUES
INSERT INTO text_table VALUES ('string_literal',100,hex('hello world'));
注意:INSERT ... VALUES
INSERT ... VALUES
LOAD DATA
INSERT ... SELECT
将 HBase 表INSERT
遇到
\N
NULL
NULL
NULL
NULL
hdfs dfs -put
hdfs dfs -cp
REFRESH table_name
CREATE TABLE
CREATE TABLE
INSERT
Hive wiki 上的 LZO 页面lzop
配置 Impala 以使用 LZO:
一个
笔记:hadoop-lzo
。
对于 RHEL/CentOS 系统:$ sudo yum update
$ sudo yum install hadoop-lzo
$ sudo yum install impala-lzo
对于 SUSE 系统:$ sudo zypper update
$ sudo zypper install hadoop-lzo
$ sudo zypper install impala-lzo
对于 Debian/Ubuntu 系统:$ sudo apt-get update
$ sudo apt-get install hadoop-lzo
$ sudo apt-get install impala-lzo
笔记:impala-lzo
impala-lzo
core-site.xml
和com.hadoop.compression.lzo.LzopCodec
<property>
<name>io.compression.codecs</name>
<value>org.apache.hadoop.io.compress.DefaultCodec,org.apache.hadoop.io.compress.GzipCodec,
org.apache.hadoop.io.compress.BZip2Codec,org.apache.hadoop.io.compress.DeflateCodec,
org.apache.hadoop.io.compress.SnappyCodec,com.hadoop.compression.lzo.LzopCodec</value>
</property>
笔记:
$ ls -l /etc/hadoop
total 8
lrwxrwxrwx. 1 root root 29 Feb 26 2013 conf -> /etc/alternatives/hadoop-conf
lrwxrwxrwx. 1 root root 10 Feb 26 2013 conf.dist -> conf.empty
drwxr-xr-x. 2 root root 4096 Feb 26 2013 conf.empty
drwxr-xr-x. 2 root root 4096 Oct 28 15:46 conf.pseudo
io.compression.codecs
com.hadoop.compression.lzo.LzopCodec
STORED AS
INPUTFORMAT 'com.hadoop.mapred.DeprecatedLzoTextInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
hive> SET mapreduce.output.fileoutputformat.compress=true;
hive> SET hive.exec.compress.output=true;
hive> SET mapreduce.output.fileoutputformat.compress.codec=com.hadoop.compression.lzo.LzopCodec;
hive> CREATE TABLE lzo_t (s string) STORED AS
> INPUTFORMAT 'com.hadoop.mapred.DeprecatedLzoTextInputFormat'
> OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat';
hive> INSERT INTO TABLE lzo_t SELECT col1, col2 FROM uncompressed_text_table;
INSERT ... SELECT
.lzo
INSERT
com.hadoop.compression.lzo.DistributedLzoIndexer
hadoop-lzo
$ hadoop jar /usr/lib/hadoop/lib/hadoop-lzo-version-gplextras.jar
com.hadoop.compression.lzo.DistributedLzoIndexer /hdfs_location_of_table/
注意:
.index
INVALIDATE METADATA
INVALIDATE METADATA
ROW FORMAT
LOAD DATA
CREATE EXTERNAL TABLE
LOCATION
create table csv_compressed (a string, b string, c string)
row format delimited fields terminated by ",";
insert into csv_compressed values
('one - uncompressed', 'two - uncompressed', 'three - uncompressed'),
('abc - uncompressed', 'xyz - uncompressed', '123 - uncompressed');
...make equivalent .bz2, .gz, .snappy, and .zst files and load them into same table directory...
select * from csv_compressed;
+--------------------+--------------------+----------------------+
| a | b | c |
+--------------------+--------------------+----------------------+
| one - snappy | two - snappy | three - snappy |
| one - uncompressed | two - uncompressed | three - uncompressed |
| abc - uncompressed | xyz - uncompressed | 123 - uncompressed |
| one - bz2 | two - bz2 | three - bz2 |
| abc - bz2 | xyz - bz2 | 123 - bz2 |
| one - gzip | two - gzip | three - gzip |
| abc - gzip | xyz - gzip | 123 - gzip |
| one - zstd | two - zstd | three - zstd |
| abc - zstd | xyz - zstd | 123 - zstd |
| one - deflate | two - deflate | three - deflate |
| abc - deflate | xyz - deflate | 123 - deflate |
+--------------------+--------------------+----------------------+
$ hdfs dfs -ls 'hdfs://127.0.0.1:8020/user/hive/warehouse/file_formats.db/csv_compressed/';
...truncated for readability...
75 hdfs://127.0.0.1:8020/user/hive/warehouse/file_formats.db/csv_compressed/csv_compressed.snappy
79 hdfs://127.0.0.1:8020/user/hive/warehouse/file_formats.db/csv_compressed/csv_compressed_bz2.csv.bz2
80 hdfs://127.0.0.1:8020/user/hive/warehouse/file_formats.db/csv_compressed/csv_compressed_gzip.csv.gz
58 hdfs://127.0.0.1:8020/user/hive/warehouse/file_formats.db/csv_compressed/csv_compressed_zstd.csv.zst
48 hdfs://127.0.0.1:8020/user/hive/warehouse/file_formats.db/csv_compressed/csv_compressed_deflate.csv.deflate
116 hdfs://127.0.0.1:8020/user/hive/warehouse/file_formats.db/csv_compressed/dd414df64d67d49b_data.0.