JSON、CSV TSV
{"timestamp": "2013-08-31T01:02:33Z", "page": "Gypsy Danger", "language" : "en", "user" : "nuclear", "unpatrolled" : "true", "newPage" : "true", "robot": "false", "anonymous": "false", "namespace":"article", "continent":"North America", "country":"United States", "region":"Bay Area", "city":"San Francisco", "added": 57, "deleted": 200, "delta": -143}
{"timestamp": "2013-08-31T03:32:45Z", "page": "Striker Eureka", "language" : "en", "user" : "speed", "unpatrolled" : "false", "newPage" : "true", "robot": "true", "anonymous": "false", "namespace":"wikipedia", "continent":"Australia", "country":"Australia", "region":"Cantebury", "city":"Syndey", "added": 459, "deleted": 129, "delta": 330}
{"timestamp": "2013-08-31T07:11:21Z", "page": "Cherno Alpha", "language" : "ru", "user" : "masterYi", "unpatrolled" : "false", "newPage" : "true", "robot": "true", "anonymous": "false", "namespace":"article", "continent":"Asia", "country":"Russia", "region":"Oblast", "city":"Moscow", "added": 123, "deleted": 12, "delta": 111}
{"timestamp": "2013-08-31T11:58:39Z", "page": "Crimson Typhoon", "language" : "zh", "user" : "triplets", "unpatrolled" : "true", "newPage" : "false", "robot": "true", "anonymous": "false", "namespace":"wikipedia", "continent":"Asia", "country":"China", "region":"Shanxi", "city":"Taiyuan", "added": 905, "deleted": 5, "delta": 900}
{"timestamp": "2013-08-31T12:41:27Z", "page": "Coyote Tango", "language" : "ja", "user" : "cancer", "unpatrolled" : "true", "newPage" : "false", "robot": "true", "anonymous": "false", "namespace":"wikipedia", "continent":"Asia", "country":"Japan", "region":"Kanto", "city":"Tokyo", "added": 1, "deleted": 10, "delta": -9}
2013-08-31T01:02:33Z,"Gypsy Danger","en","nuclear","true","true","false","false","article","North America","United States","Bay Area","San Francisco",57,200,-143
2013-08-31T03:32:45Z,"Striker Eureka","en","speed","false","true","true","false","wikipedia","Australia","Australia","Cantebury","Syndey",459,129,330
2013-08-31T07:11:21Z,"Cherno Alpha","ru","masterYi","false","true","true","false","article","Asia","Russia","Oblast","Moscow",123,12,111
2013-08-31T11:58:39Z,"Crimson Typhoon","zh","triplets","true","false","true","false","wikipedia","Asia","China","Shanxi","Taiyuan",905,5,900
2013-08-31T12:41:27Z,"Coyote Tango","ja","cancer","true","false","true","false","wikipedia","Asia","Japan","Kanto","Tokyo",1,10,-9
2013-08-31T01:02:33Z "Gypsy Danger" "en" "nuclear" "true" "true" "false" "false" "article" "North America" "United States" "Bay Area" "San Francisco" 57 200 -143
2013-08-31T03:32:45Z "Striker Eureka" "en" "speed" "false" "true" "true" "false" "wikipedia" "Australia" "Australia" "Cantebury" "Syndey" 459 129 330
2013-08-31T07:11:21Z "Cherno Alpha" "ru" "masterYi" "false" "true" "true" "false" "article" "Asia" "Russia" "Oblast" "Moscow" 123 12 111
2013-08-31T11:58:39Z "Crimson Typhoon" "zh" "triplets" "true" "false" "true" "false" "wikipedia" "Asia" "China" "Shanxi" "Taiyuan" 905 5 900
2013-08-31T12:41:27Z "Coyote Tango" "ja" "cancer" "true" "false" "true" "false" "wikipedia" "Asia" "Japan" "Kanto" "Tokyo" 1 10 -9
二进制格式
Regex解析器 JavaScript解析器
"ioConfig": {
"inputFormat": {
"type": "json"
},
...
}
字段 |
类型 |
描述 |
是否必填 |
|
|
|
|
|
|
|
|
|
|
|
|
"ioConfig": {
"inputFormat": {
"type": "csv",
"columns" : ["timestamp","page","language","user","unpatrolled","newPage","robot","anonymous","namespace","continent","country","region","city","added","deleted","delta"]
},
...
}
字段 |
类型 |
描述 |
是否必填 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"ioConfig": {
"inputFormat": {
"type": "tsv",
"columns" : ["timestamp","page","language","user","unpatrolled","newPage","robot","anonymous","namespace","continent","country","region","city","added","deleted","delta"],
"delimiter":"|"
},
...
}
字段 |
类型 |
描述 |
是否必填 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"ioConfig": {
"inputFormat": {
"type": "orc",
"flattenSpec": {
"useFieldDiscovery": true,
"fields": [
{
"type": "path",
"name": "nested",
"expr": "$.path.to.nested"
}
]
}
"binaryAsString": false
},
...
}
字段 |
类型 |
描述 |
是否必填 |
|
|
|
|
|
|
|
|
|
|
|
|
"ioConfig": {
"inputFormat": {
"type": "parquet",
"flattenSpec": {
"useFieldDiscovery": true,
"fields": [
{
"type": "path",
"name": "nested",
"expr": "$.path.to.nested"
}
]
}
"binaryAsString": false
},
...
}
字段 |
类型 |
描述 |
是否必填 |
|
|
|
|
|
|
|
|
|
|
|
|
"flattenSpec": {
"useFieldDiscovery": true,
"fields": [
{ "name": "baz", "type": "root" },
{ "name": "foo_bar", "type": "path", "expr": "$.foo.bar" },
{ "name": "first_food", "type": "jq", "expr": ".thing.food[1]" }
]
}
|
|
|
|
|
|
|
|
|