Monday, June 22, 2020

Hive FileFormats


TextFile Format

hive (cchitra)> create table olympics(athelete String,age INT,county STRING, year STRING,closing STRING, sport STRING,gold INT,silver INT,bronze INT,total INT) row format delimited fields terminated by '\t' stored as textfile;
OK
Time taken: 0.985 seconds

hive (cchitra)> show create table cchitra.olympics;
OK
CREATE TABLE `cchitra.olympics`(
  `athelete` string,
  `age` int,
  `county` string,
  `year` string,
  `closing` string,
  `sport` string,
  `gold` int,
  `silver` int,
  `bronze` int,
  `total` int)
ROW FORMAT DELIMITED
  FIELDS TERMINATED BY '\t'
STORED AS INPUTFORMAT
  'org.apache.hadoop.mapred.TextInputFormat'
OUTPUTFORMAT
  'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION
  'hdfs://nn01.itversity.com:8020/apps/hive/warehouse/cchitra.db/olympics'
TBLPROPERTIES (
  'numFiles'='1',
  'numRows'='0',
  'rawDataSize'='0',
  'totalSize'='510053',
  'transient_lastDdlTime'='1592698122')
Time taken: 0.248 seconds, Fetched: 25 row(s)


hive (cchitra)> load data local inpath '/home/nareshjella/ChitraFolder/hive/olympix_data.csv' into table cchitra.olympics;
Loading data to table cchitra.olympics
Table cchitra.olympics stats: [numFiles=1, numRows=0, totalSize=510053, rawDataSize=0]
OK
Time taken: 1.832 seconds

hive (cchitra)> describe cchitra.olympics;
OK
athelete                string                                   
age                     int                                       
county                  string                                   
year                    string                                   
closing                 string                                   
sport                   string                                   
gold                    int                                       
silver                  int                                       
bronze                  int                                       
total                   int                                       
Time taken: 0.347 seconds, Fetched: 10 row(s)


hive (cchitra)> select * from cchitra.olympics limit 10;
OK
Michael Phelps  23      United States   2008    08-24-08        Swimming        8       0       0       8
Michael Phelps  19      United States   2004    08-29-04        Swimming        6       0       2       8
Michael Phelps  27      United States   2012    08-12-12        Swimming        4       2       0       6
Natalie Coughlin        25      United States   2008    08-24-08        Swimming        1       2       3       6
Aleksey Nemov   24      Russia  2000    10-01-00        Gymnastics      2       1       3       6
Alicia Coutts   24      Australia       2012    08-12-12        Swimming        1       3       1       5
Missy Franklin  17      United States   2012    08-12-12        Swimming        4       0       1       5
Ryan Lochte     27      United States   2012    08-12-12        Swimming        2       2       1       5
Allison Schmitt 22      United States   2012    08-12-12        Swimming        3       1       1       5
Natalie Coughlin        21      United States   2004    08-29-04        Swimming        2       2       1       5
Time taken: 0.654 seconds, Fetched: 10 row(s)


[nareshjella@gw02 hive]$ hadoop fs -ls /apps/hive/warehouse/cchitra.db/olympics/
Found 1 items
-rwxrwxrwx   2 nareshjella hdfs     510053 2020-06-20 20:08 /apps/hive/warehouse/cchitra.db/olympics/olympix_data.csv

[nareshjella@gw02 hive]$ hadoop fsck - /apps/hive/warehouse/cchitra.db/olympics/olympix_data.csv/ -files -blocks -locations
DEPRECATED: Use of this script to execute hdfs command is deprecated.
Instead use the hdfs command for it.

Connecting to namenode via http://172.16.1.101:50070/fsck?ugi=nareshjella&files=1&blocks=1&locations=1&path=%2Fapps%2Fhive%2Fwarehouse%2Fcchitra.db%2Folympics%2Folympix_data.csv
FSCK started by nareshjella (auth:SIMPLE) from /172.16.1.109 for path /apps/hive/warehouse/cchitra.db/olympics/olympix_data.csv at Sat Jun 20 20:28:07 EDT 2020
/apps/hive/warehouse/cchitra.db/olympics/olympix_data.csv 510053 bytes, 1 block(s):  OK
0. BP-292116404-172.16.1.101-1479167821718:blk_1109867799_36146486 len=510053 repl=2 [DatanodeInfoWithStorage[172.16.1.108:50010,DS-698dde50-a336-4e00-bc8f-a9e1a5cc76f4,DISK], DatanodeInfoWithStorage[172.16.1.104:50010,DS-f4667aac-0f2c-463c-9584-d625928b9af5,DISK]]

Status: HEALTHY
 Total size:    510053 B
 Total dirs:    0
 Total files:   1
 Total symlinks:                0
 Total blocks (validated):      1 (avg. block size 510053 B)
 Minimally replicated blocks:   1 (100.0 %)
 Over-replicated blocks:        0 (0.0 %)
 Under-replicated blocks:       0 (0.0 %)
 Mis-replicated blocks:         0 (0.0 %)
 Default replication factor:    2
 Average block replication:     2.0
 Corrupt blocks:                0
 Missing replicas:              0 (0.0 %)
 Number of data-nodes:          5
 Number of racks:               1
FSCK ended at Sat Jun 20 20:28:07 EDT 2020 in 1 milliseconds


The filesystem under path '/apps/hive/warehouse/cchitra.db/olympics/olympix_data.csv' is HEALTHY

SequenceFile Format

create table olympics_sequence(athelete String,age INT,county STRING, year STRING,closing STRING, sport STRING,gold INT,silver INT,bronze INT,total INT) row format delimited fields terminated by '\t' stored as sequencefile;
OK
Time taken: 0.282 seconds
scribe olympics_sequence;
OK
athelete                string                                   
age                     int                                       
county                  string                                   
year                    string                                   
closing                 string                                   
sport                   string                                   
gold                    int                                       
silver                  int                                       
bronze                  int                                       
total                   int                                       
Time taken: 0.37 seconds, Fetched: 10 row(s)
how create table cchitra.olympics_sequence;
OK
CREATE TABLE `cchitra.olympics_sequence`(
  `athelete` string,
  `age` int,
  `county` string,
  `year` string,
  `closing` string,
  `sport` string,
  `gold` int,
  `silver` int,
  `bronze` int,
  `total` int)
ROW FORMAT DELIMITED
  FIELDS TERMINATED BY '\t'
STORED AS INPUTFORMAT
  'org.apache.hadoop.mapred.SequenceFileInputFormat'
OUTPUTFORMAT
  'org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat'
LOCATION
  'hdfs://nn01.itversity.com:8020/apps/hive/warehouse/cchitra.db/olympics_sequence'
TBLPROPERTIES (
  'COLUMN_STATS_ACCURATE'='{\"BASIC_STATS\":\"true\"}',
  'numFiles'='1',
  'numRows'='8620',
  'rawDataSize'='501497',
  'totalSize'='619664',
  'transient_lastDdlTime'='1592699915')
Time taken: 0.161 seconds, Fetched: 26 row(s)

hive (cchitra)> insert overwrite table olympics_sequence
              > select * from olympics;
Query ID = nareshjella_20200620203816_485dc31f-55f6-4ce2-9834-f410f1886ec2
Total jobs = 3
Launching Job 1 out of 3
Number of reduce tasks is set to 0 since there's no reduce operator
Starting Job = job_1589064448439_16792, Tracking URL = http://rm01.itversity.com:19088/proxy/application_1589064448439_16792/
Kill Command = /usr/hdp/2.6.5.0-292/hadoop/bin/hadoop job  -kill job_1589064448439_16792
Hadoop job information for Stage-1: number of mappers: 1; number of reducers: 0
2020-06-20 20:38:25,967 Stage-1 map = 0%,  reduce = 0%
2020-06-20 20:38:32,257 Stage-1 map = 100%,  reduce = 0%, Cumulative CPU 2.81 sec
MapReduce Total cumulative CPU time: 2 seconds 810 msec
Ended Job = job_1589064448439_16792
Stage-4 is selected by condition resolver.
Stage-3 is filtered out by condition resolver.
Stage-5 is filtered out by condition resolver.
Moving data to directory hdfs://nn01.itversity.com:8020/apps/hive/warehouse/cchitra.db/olympics_sequence/.hive-staging_hive_2020-06-20_20-38-16_923_2235400357686054193-1/-ext-10000
Loading data to table cchitra.olympics_sequence
Table cchitra.olympics_sequence stats: [numFiles=1, numRows=8620, totalSize=619664, rawDataSize=501497]
MapReduce Jobs Launched:
Stage-Stage-1: Map: 1   Cumulative CPU: 2.81 sec   HDFS Read: 515505 HDFS Write: 619751 SUCCESS
Total MapReduce CPU Time Spent: 2 seconds 810 msec
OK
Time taken: 18.569 seconds

hive (cchitra)> select * from olympics_sequence limit 10;
OK
Michael Phelps  23      United States   2008    08-24-08        Swimming        8       0       0       8
Michael Phelps  19      United States   2004    08-29-04        Swimming        6       0       2       8
Michael Phelps  27      United States   2012    08-12-12        Swimming        4       2       0       6
Natalie Coughlin        25      United States   2008    08-24-08        Swimming        1       2       3       6
Aleksey Nemov   24      Russia  2000    10-01-00        Gymnastics      2       1       3       6
Alicia Coutts   24      Australia       2012    08-12-12        Swimming        1       3       1       5
Missy Franklin  17      United States   2012    08-12-12        Swimming        4       0       1       5
Ryan Lochte     27      United States   2012    08-12-12        Swimming        2       2       1       5
Allison Schmitt 22      United States   2012    08-12-12        Swimming        3       1       1       5
Natalie Coughlin        21      United States   2004    08-29-04        Swimming        2       2       1       5
Time taken: 0.242 seconds, Fetched: 10 row(s)

nareshjella@gw02 hive]$ hadoop fs -ls /apps/hive/warehouse/cchitra.db/olympics_sequence
Found 1 items
-rwxrwxrwx   2 nareshjella hdfs     619664 2020-06-20 20:38 /apps/hive/warehouse/cchitra.db/olympics_sequence/000000_0

[nareshjella@gw02 hive]$ hadoop fsck - /apps/hive/warehouse/cchitra.db/olympics_sequence/000000_0/ -files -blocks -locations
DEPRECATED: Use of this script to execute hdfs command is deprecated.
Instead use the hdfs command for it.

Connecting to namenode via http://172.16.1.101:50070/fsck?ugi=nareshjella&files=1&blocks=1&locations=1&path=%2Fapps%2Fhive%2Fwarehouse%2Fcchitra.db%2Folympics_sequence%2F000000_0
FSCK started by nareshjella (auth:SIMPLE) from /172.16.1.109 for path /apps/hive/warehouse/cchitra.db/olympics_sequence/000000_0 at Sat Jun 20 20:48:14 EDT 2020
/apps/hive/warehouse/cchitra.db/olympics_sequence/000000_0 619664 bytes, 1 block(s):  OK
0. BP-292116404-172.16.1.101-1479167821718:blk_1109867871_36146558 len=619664 repl=2 [DatanodeInfoWithStorage[172.16.1.103:50010,DS-1f4edfab-2926-45f9-a37c-ae9d1f542680,DISK], DatanodeInfoWithStorage[172.16.1.102:50010,DS-1edb1d35-81bf-471b-be04-11d973e2a832,DISK]]

Status: HEALTHY
 Total size:    619664 B
 Total dirs:    0
 Total files:   1
 Total symlinks:                0
 Total blocks (validated):      1 (avg. block size 619664 B)
 Minimally replicated blocks:   1 (100.0 %)
 Over-replicated blocks:        0 (0.0 %)
 Under-replicated blocks:       0 (0.0 %)
 Mis-replicated blocks:         0 (0.0 %)
 Default replication factor:    2
 Average block replication:     2.0
 Corrupt blocks:                0
 Missing replicas:              0 (0.0 %)
 Number of data-nodes:          5
 Number of racks:               1
FSCK ended at Sat Jun 20 20:48:14 EDT 2020 in 0 milliseconds


The filesystem under path '/apps/hive/warehouse/cchitra.db/olympics_sequence/000000_0' is HEALTHY







No comments:

Post a Comment