Monday, June 22, 2020

Hive File Formats


TextFile Format

hive (cchitra)> create table olympics(athelete String,age INT,county STRING, year STRING,closing STRING, sport STRING,gold INT,silver INT,bronze INT,total INT) row format delimited fields terminated by '\t' stored as textfile;
OK
Time taken: 0.985 seconds

hive (cchitra)> show create table cchitra.olympics;
OK
CREATE TABLE `cchitra.olympics`(
  `athelete` string,
  `age` int,
  `county` string,
  `year` string,
  `closing` string,
  `sport` string,
  `gold` int,
  `silver` int,
  `bronze` int,
  `total` int)
ROW FORMAT DELIMITED
  FIELDS TERMINATED BY '\t'
STORED AS INPUTFORMAT
  'org.apache.hadoop.mapred.TextInputFormat'
OUTPUTFORMAT
  'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION
  'hdfs://nn01.itversity.com:8020/apps/hive/warehouse/cchitra.db/olympics'
TBLPROPERTIES (
  'numFiles'='1',
  'numRows'='0',
  'rawDataSize'='0',
  'totalSize'='510053',
  'transient_lastDdlTime'='1592698122')
Time taken: 0.248 seconds, Fetched: 25 row(s)


hive (cchitra)> load data local inpath '/home/nareshjella/ChitraFolder/hive/olympix_data.csv' into table cchitra.olympics;
Loading data to table cchitra.olympics
Table cchitra.olympics stats: [numFiles=1, numRows=0, totalSize=510053, rawDataSize=0]
OK
Time taken: 1.832 seconds

hive (cchitra)> describe cchitra.olympics;
OK
athelete                string                                   
age                     int                                       
county                  string                                   
year                    string                                   
closing                 string                                   
sport                   string                                   
gold                    int                                       
silver                  int                                       
bronze                  int                                       
total                   int                                       
Time taken: 0.347 seconds, Fetched: 10 row(s)


hive (cchitra)> select * from cchitra.olympics limit 10;
OK
Michael Phelps  23      United States   2008    08-24-08        Swimming        8       0       0       8
Michael Phelps  19      United States   2004    08-29-04        Swimming        6       0       2       8
Michael Phelps  27      United States   2012    08-12-12        Swimming        4       2       0       6
Natalie Coughlin        25      United States   2008    08-24-08        Swimming        1       2       3       6
Aleksey Nemov   24      Russia  2000    10-01-00        Gymnastics      2       1       3       6
Alicia Coutts   24      Australia       2012    08-12-12        Swimming        1       3       1       5
Missy Franklin  17      United States   2012    08-12-12        Swimming        4       0       1       5
Ryan Lochte     27      United States   2012    08-12-12        Swimming        2       2       1       5
Allison Schmitt 22      United States   2012    08-12-12        Swimming        3       1       1       5
Natalie Coughlin        21      United States   2004    08-29-04        Swimming        2       2       1       5
Time taken: 0.654 seconds, Fetched: 10 row(s)


[nareshjella@gw02 hive]$ hadoop fs -ls /apps/hive/warehouse/cchitra.db/olympics/
Found 1 items
-rwxrwxrwx   2 nareshjella hdfs     510053 2020-06-20 20:08 /apps/hive/warehouse/cchitra.db/olympics/olympix_data.csv

[nareshjella@gw02 hive]$ hadoop fsck /apps/hive/warehouse/cchitra.db/olympics/olympix_data.csv -files -blocks -locations
DEPRECATED: Use of this script to execute hdfs command is deprecated.
Instead use the hdfs command for it.

Connecting to namenode via http://172.16.1.101:50070/fsck?ugi=nareshjella&files=1&blocks=1&locations=1&path=%2Fapps%2Fhive%2Fwarehouse%2Fcchitra.db%2Folympics%2Folympix_data.csv
FSCK started by nareshjella (auth:SIMPLE) from /172.16.1.109 for path /apps/hive/warehouse/cchitra.db/olympics/olympix_data.csv at Sat Jun 20 20:28:07 EDT 2020
/apps/hive/warehouse/cchitra.db/olympics/olympix_data.csv 510053 bytes, 1 block(s):  OK
0. BP-292116404-172.16.1.101-1479167821718:blk_1109867799_36146486 len=510053 repl=2 [DatanodeInfoWithStorage[172.16.1.108:50010,DS-698dde50-a336-4e00-bc8f-a9e1a5cc76f4,DISK], DatanodeInfoWithStorage[172.16.1.104:50010,DS-f4667aac-0f2c-463c-9584-d625928b9af5,DISK]]

Status: HEALTHY
 Total size:    510053 B
 Total dirs:    0
 Total files:   1
 Total symlinks:                0
 Total blocks (validated):      1 (avg. block size 510053 B)
 Minimally replicated blocks:   1 (100.0 %)
 Over-replicated blocks:        0 (0.0 %)
 Under-replicated blocks:       0 (0.0 %)
 Mis-replicated blocks:         0 (0.0 %)
 Default replication factor:    2
 Average block replication:     2.0
 Corrupt blocks:                0
 Missing replicas:              0 (0.0 %)
 Number of data-nodes:          5
 Number of racks:               1
FSCK ended at Sat Jun 20 20:28:07 EDT 2020 in 1 milliseconds


The filesystem under path '/apps/hive/warehouse/cchitra.db/olympics/olympix_data.csv' is HEALTHY

SequenceFile Format

hive (cchitra)> create table olympics_sequence(athelete String,age INT,county STRING, year STRING,closing STRING, sport STRING,gold INT,silver INT,bronze INT,total INT) row format delimited fields terminated by '\t' stored as sequencefile;
OK
Time taken: 0.282 seconds
hive (cchitra)> describe olympics_sequence;
OK
athelete                string                                   
age                     int                                       
county                  string                                   
year                    string                                   
closing                 string                                   
sport                   string                                   
gold                    int                                       
silver                  int                                       
bronze                  int                                       
total                   int                                       
Time taken: 0.37 seconds, Fetched: 10 row(s)
hive (cchitra)> show create table cchitra.olympics_sequence;
OK
CREATE TABLE `cchitra.olympics_sequence`(
  `athelete` string,
  `age` int,
  `county` string,
  `year` string,
  `closing` string,
  `sport` string,
  `gold` int,
  `silver` int,
  `bronze` int,
  `total` int)
ROW FORMAT DELIMITED
  FIELDS TERMINATED BY '\t'
STORED AS INPUTFORMAT
  'org.apache.hadoop.mapred.SequenceFileInputFormat'
OUTPUTFORMAT
  'org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat'
LOCATION
  'hdfs://nn01.itversity.com:8020/apps/hive/warehouse/cchitra.db/olympics_sequence'
TBLPROPERTIES (
  'COLUMN_STATS_ACCURATE'='{\"BASIC_STATS\":\"true\"}',
  'numFiles'='1',
  'numRows'='8620',
  'rawDataSize'='501497',
  'totalSize'='619664',
  'transient_lastDdlTime'='1592699915')
Time taken: 0.161 seconds, Fetched: 26 row(s)

hive (cchitra)> insert overwrite table olympics_sequence
              > select * from olympics;
Query ID = nareshjella_20200620203816_485dc31f-55f6-4ce2-9834-f410f1886ec2
Total jobs = 3
Launching Job 1 out of 3
Number of reduce tasks is set to 0 since there's no reduce operator
Starting Job = job_1589064448439_16792, Tracking URL = http://rm01.itversity.com:19088/proxy/application_1589064448439_16792/
Kill Command = /usr/hdp/2.6.5.0-292/hadoop/bin/hadoop job  -kill job_1589064448439_16792
Hadoop job information for Stage-1: number of mappers: 1; number of reducers: 0
2020-06-20 20:38:25,967 Stage-1 map = 0%,  reduce = 0%
2020-06-20 20:38:32,257 Stage-1 map = 100%,  reduce = 0%, Cumulative CPU 2.81 sec
MapReduce Total cumulative CPU time: 2 seconds 810 msec
Ended Job = job_1589064448439_16792
Stage-4 is selected by condition resolver.
Stage-3 is filtered out by condition resolver.
Stage-5 is filtered out by condition resolver.
Moving data to directory hdfs://nn01.itversity.com:8020/apps/hive/warehouse/cchitra.db/olympics_sequence/.hive-staging_hive_2020-06-20_20-38-16_923_2235400357686054193-1/-ext-10000
Loading data to table cchitra.olympics_sequence
Table cchitra.olympics_sequence stats: [numFiles=1, numRows=8620, totalSize=619664, rawDataSize=501497]
MapReduce Jobs Launched:
Stage-Stage-1: Map: 1   Cumulative CPU: 2.81 sec   HDFS Read: 515505 HDFS Write: 619751 SUCCESS
Total MapReduce CPU Time Spent: 2 seconds 810 msec
OK
Time taken: 18.569 seconds

hive (cchitra)> select * from olympics_sequence limit 10;
OK
Michael Phelps  23      United States   2008    08-24-08        Swimming        8       0       0       8
Michael Phelps  19      United States   2004    08-29-04        Swimming        6       0       2       8
Michael Phelps  27      United States   2012    08-12-12        Swimming        4       2       0       6
Natalie Coughlin        25      United States   2008    08-24-08        Swimming        1       2       3       6
Aleksey Nemov   24      Russia  2000    10-01-00        Gymnastics      2       1       3       6
Alicia Coutts   24      Australia       2012    08-12-12        Swimming        1       3       1       5
Missy Franklin  17      United States   2012    08-12-12        Swimming        4       0       1       5
Ryan Lochte     27      United States   2012    08-12-12        Swimming        2       2       1       5
Allison Schmitt 22      United States   2012    08-12-12        Swimming        3       1       1       5
Natalie Coughlin        21      United States   2004    08-29-04        Swimming        2       2       1       5
Time taken: 0.242 seconds, Fetched: 10 row(s)

[nareshjella@gw02 hive]$ hadoop fs -ls /apps/hive/warehouse/cchitra.db/olympics_sequence
Found 1 items
-rwxrwxrwx   2 nareshjella hdfs     619664 2020-06-20 20:38 /apps/hive/warehouse/cchitra.db/olympics_sequence/000000_0

[nareshjella@gw02 hive]$ hadoop fsck /apps/hive/warehouse/cchitra.db/olympics_sequence/000000_0 -files -blocks -locations
DEPRECATED: Use of this script to execute hdfs command is deprecated.
Instead use the hdfs command for it.

Connecting to namenode via http://172.16.1.101:50070/fsck?ugi=nareshjella&files=1&blocks=1&locations=1&path=%2Fapps%2Fhive%2Fwarehouse%2Fcchitra.db%2Folympics_sequence%2F000000_0
FSCK started by nareshjella (auth:SIMPLE) from /172.16.1.109 for path /apps/hive/warehouse/cchitra.db/olympics_sequence/000000_0 at Sat Jun 20 20:48:14 EDT 2020
/apps/hive/warehouse/cchitra.db/olympics_sequence/000000_0 619664 bytes, 1 block(s):  OK
0. BP-292116404-172.16.1.101-1479167821718:blk_1109867871_36146558 len=619664 repl=2 [DatanodeInfoWithStorage[172.16.1.103:50010,DS-1f4edfab-2926-45f9-a37c-ae9d1f542680,DISK], DatanodeInfoWithStorage[172.16.1.102:50010,DS-1edb1d35-81bf-471b-be04-11d973e2a832,DISK]]

Status: HEALTHY
 Total size:    619664 B
 Total dirs:    0
 Total files:   1
 Total symlinks:                0
 Total blocks (validated):      1 (avg. block size 619664 B)
 Minimally replicated blocks:   1 (100.0 %)
 Over-replicated blocks:        0 (0.0 %)
 Under-replicated blocks:       0 (0.0 %)
 Mis-replicated blocks:         0 (0.0 %)
 Default replication factor:    2
 Average block replication:     2.0
 Corrupt blocks:                0
 Missing replicas:              0 (0.0 %)
 Number of data-nodes:          5
 Number of racks:               1
FSCK ended at Sat Jun 20 20:48:14 EDT 2020 in 0 milliseconds


The filesystem under path '/apps/hive/warehouse/cchitra.db/olympics_sequence/000000_0' is HEALTHY
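
Since the SequenceFile is binary, a quick way to spot-check its contents on HDFS (not part of the session above) is hadoop fs -text, which decodes SequenceFiles to plain text:

hadoop fs -text /apps/hive/warehouse/cchitra.db/olympics_sequence/000000_0 | head -5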







Thursday, June 11, 2020




[e023586@sandbox-hdp ~]$ spark-shell
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
Spark context Web UI available at http://sandbox-hdp.hortonworks.com:4040
Spark context available as 'sc' (master = yarn, app id = application_1591918179668_0006).
Spark session available as 'spark'.
Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /___/ .__/\_,_/_/ /_/\_\   version 2.3.1.3.0.1.0-187
      /_/

Using Scala version 2.11.8 (OpenJDK 64-Bit Server VM, Java 1.8.0_191)
Type in expressions to have them evaluated.
Type :help for more information.

scala> :paste
// Entering paste mode (ctrl-D to finish)

val df = spark.read.options(Map("header" -> "true",
"inferSchema" -> "true",
"nullValue" -> "NA",
"timestampFormat" -> "yyyy-MM-dd'T'HH:mm:ss",
"mode" -> "failfast")
).csv("/user/e023586/notebook/spark/survey.csv")

// Exiting paste mode, now interpreting.

df: org.apache.spark.sql.DataFrame = [Timestamp: timestamp, Age: bigint ... 25 more fields]

scala>

scala>

scala>

scala>

scala> :paste
// Entering paste mode (ctrl-D to finish)

val df = spark.read
.format("csv")
.option("header","true")
.option("inferSchema","true")
.option("nullValue","NA")
.option("timestampFormat","yyyy-MM-dd'T'HH:mm:ss")
.option("mode","failfast")
.option("path","/user/e023586/notebook/spark/survey.csv")
.load()

// Exiting paste mode, now interpreting.

df: org.apache.spark.sql.DataFrame = [Timestamp: timestamp, Age: bigint ... 25 more fields]

scala> df.rdd.getNumPartitions
res0: Int = 1

scala> val df3 = df.repartition(3).toDF
df3: org.apache.spark.sql.DataFrame = [Timestamp: timestamp, Age: bigint ... 25 more fields]

scala> df3.rdd.getNumPartitions
res1: Int = 3

scala> df.select("Timestamp","Age","remote_work","leave").filter("Age >30").show
20/06/12 02:18:04 WARN Utils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.debug.maxToStringFields' in SparkEnv.conf.
+-------------------+---+-----------+------------------+
|          Timestamp|Age|remote_work|             leave|
+-------------------+---+-----------+------------------+
|2014-08-27 11:29:31| 37|         No|     Somewhat easy|
|2014-08-27 11:29:37| 44|         No|        Don't know|
|2014-08-27 11:29:44| 32|         No|Somewhat difficult|
|2014-08-27 11:29:46| 31|         No|Somewhat difficult|
|2014-08-27 11:30:22| 31|        Yes|        Don't know|
|2014-08-27 11:31:22| 33|         No|        Don't know|
|2014-08-27 11:31:50| 35|        Yes|Somewhat difficult|
|2014-08-27 11:32:05| 39|        Yes|        Don't know|
|2014-08-27 11:32:39| 42|         No|    Very difficult|
|2014-08-27 11:32:44| 31|        Yes|        Don't know|
|2014-08-27 11:33:23| 42|         No|Somewhat difficult|
|2014-08-27 11:33:26| 36|         No|        Don't know|
|2014-08-27 11:34:37| 32|         No|        Don't know|
|2014-08-27 11:34:53| 46|        Yes|         Very easy|
|2014-08-27 11:35:08| 36|        Yes|     Somewhat easy|
|2014-08-27 11:35:24| 31|        Yes|Somewhat difficult|
|2014-08-27 11:35:48| 46|        Yes|        Don't know|
|2014-08-27 11:36:24| 41|         No|        Don't know|
|2014-08-27 11:36:48| 33|         No|        Don't know|
|2014-08-27 11:37:08| 35|         No|         Very easy|
+-------------------+---+-----------+------------------+
only showing top 20 rows


scala> df.printSchema
root
 |-- Timestamp: timestamp (nullable = true)
 |-- Age: long (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- state: string (nullable = true)
 |-- self_employed: string (nullable = true)
 |-- family_history: string (nullable = true)
 |-- treatment: string (nullable = true)
 |-- work_interfere: string (nullable = true)
 |-- no_employees: string (nullable = true)
 |-- remote_work: string (nullable = true)
 |-- tech_company: string (nullable = true)
 |-- benefits: string (nullable = true)
 |-- care_options: string (nullable = true)
 |-- wellness_program: string (nullable = true)
 |-- seek_help: string (nullable = true)
 |-- anonymity: string (nullable = true)
 |-- leave: string (nullable = true)
 |-- mental_health_consequence: string (nullable = true)
 |-- phys_health_consequence: string (nullable = true)
 |-- coworkers: string (nullable = true)
 |-- supervisor: string (nullable = true)
 |-- mental_health_interview: string (nullable = true)
 |-- phys_health_interview: string (nullable = true)
 |-- mental_vs_physical: string (nullable = true)
 |-- obs_consequence: string (nullable = true)
 |-- comments: string (nullable = true)


scala>












[nareshjella@gw02 ~]$
[nareshjella@gw02 ~]$ spark-shell --packages com.databricks:spark-csv_2.10:1.5.0
Multiple versions of Spark are installed but SPARK_MAJOR_VERSION is not set
Spark1 will be picked by default
Ivy Default Cache set to: /home/nareshjella/.ivy2/cache
The jars for the packages stored in: /home/nareshjella/.ivy2/jars
:: loading settings :: url = jar:file:/usr/hdp/2.6.5.0-292/spark/lib/spark-assembly-1.6.3.2.6.5.0-292-hadoop2.7.3.2.6.5.0-292.jar!/org/apache/ivy/core/settings/ivysettings.xml
com.databricks#spark-csv_2.10 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent;1.0
        confs: [default]
        found com.databricks#spark-csv_2.10;1.5.0 in central
        found org.apache.commons#commons-csv;1.1 in central
        found com.univocity#univocity-parsers;1.5.1 in central
downloading https://repo1.maven.org/maven2/com/databricks/spark-csv_2.10/1.5.0/spark-csv_2.10-1.5.0.jar ...
        [SUCCESSFUL ] com.databricks#spark-csv_2.10;1.5.0!spark-csv_2.10.jar (36ms)
downloading https://repo1.maven.org/maven2/org/apache/commons/commons-csv/1.1/commons-csv-1.1.jar ...
        [SUCCESSFUL ] org.apache.commons#commons-csv;1.1!commons-csv.jar (14ms)
downloading https://repo1.maven.org/maven2/com/univocity/univocity-parsers/1.5.1/univocity-parsers-1.5.1.jar ...
        [SUCCESSFUL ] com.univocity#univocity-parsers;1.5.1!univocity-parsers.jar (28ms)
:: resolution report :: resolve 1500ms :: artifacts dl 82ms
        :: modules in use:
        com.databricks#spark-csv_2.10;1.5.0 from central in [default]
        com.univocity#univocity-parsers;1.5.1 from central in [default]
        org.apache.commons#commons-csv;1.1 from central in [default]
        ---------------------------------------------------------------------
        |                  |            modules            ||   artifacts   |
        |       conf       | number| search|dwnlded|evicted|| number|dwnlded|
        ---------------------------------------------------------------------
        |      default     |   3   |   3   |   3   |   0   ||   3   |   3   |
        ---------------------------------------------------------------------
:: retrieving :: org.apache.spark#spark-submit-parent
        confs: [default]
        3 artifacts copied, 0 already retrieved (342kB/6ms)
Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /___/ .__/\_,_/_/ /_/\_\   version 1.6.3
      /_/

Using Scala version 2.10.5 (OpenJDK 64-Bit Server VM, Java 1.8.0_222)
Type in expressions to have them evaluated.
Type :help for more information.
Spark context available as sc.
SQL context available as sqlContext.

scala> import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.SQLContext

scala> val df = sqlContext.read.format("com.databricks.spark.csv").option("header", "true").option("inferSchema", "true").load("/home/nareshjella/notebook/spark/survey.csv")
org.apache.hadoop.mapred.InvalidInputException: Input path does not exist: hdfs://nn01.itversity.com:8020/home/nareshjella/notebook/spark/survey.csv
        at org.apache.hadoop.mapred.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:287)
        at org.apache.hadoop.mapred.FileInputFormat.listStatus(FileInputFormat.java:229)
        at org.apache.hadoop.mapred.FileInputFormat.getSplits(FileInputFormat.java:315)
        at org.apache.spark.rdd.HadoopRDD.getPartitions(HadoopRDD.scala:202)
        at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:242)
        at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:240)
        at scala.Option.getOrElse(Option.scala:120)
        at org.apache.spark.rdd.RDD.partitions(RDD.scala:240)
        at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:35)
        at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:242)
        at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:240)
        at scala.Option.getOrElse(Option.scala:120)
        at org.apache.spark.rdd.RDD.partitions(RDD.scala:240)
        at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:35)
        at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:242)
        at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:240)
        at scala.Option.getOrElse(Option.scala:120)
        at org.apache.spark.rdd.RDD.partitions(RDD.scala:240)
        at org.apache.spark.rdd.RDD$$anonfun$take$1.apply(RDD.scala:1314)
        at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:150)
        at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:111)
        at org.apache.spark.rdd.RDD.withScope(RDD.scala:323)
        at org.apache.spark.rdd.RDD.take(RDD.scala:1309)
        at org.apache.spark.rdd.RDD$$anonfun$first$1.apply(RDD.scala:1349)
        at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:150)
        at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:111)
        at org.apache.spark.rdd.RDD.withScope(RDD.scala:323)
        at org.apache.spark.rdd.RDD.first(RDD.scala:1348)
        at com.databricks.spark.csv.CsvRelation.firstLine$lzycompute(CsvRelation.scala:269)
        at com.databricks.spark.csv.CsvRelation.firstLine(CsvRelation.scala:265)
        at com.databricks.spark.csv.CsvRelation.inferSchema(CsvRelation.scala:242)
        at com.databricks.spark.csv.CsvRelation.<init>(CsvRelation.scala:74)
        at com.databricks.spark.csv.DefaultSource.createRelation(DefaultSource.scala:171)
        at com.databricks.spark.csv.DefaultSource.createRelation(DefaultSource.scala:44)
        at org.apache.spark.sql.execution.datasources.ResolvedDataSource$.apply(ResolvedDataSource.scala:158)
        at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:119)
        at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:109)
        at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:26)
        at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:31)
        at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:33)
        at $iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:35)
        at $iwC$$iwC$$iwC$$iwC.<init>(<console>:37)
        at $iwC$$iwC$$iwC.<init>(<console>:39)
        at $iwC$$iwC.<init>(<console>:41)
        at $iwC.<init>(<console>:43)
        at <init>(<console>:45)
        at .<init>(<console>:49)
        at .<clinit>(<console>)
        at .<init>(<console>:7)
        at .<clinit>(<console>)
        at $print(<console>)
        at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
        at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
        at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
        at java.lang.reflect.Method.invoke(Method.java:498)
        at org.apache.spark.repl.SparkIMain$ReadEvalPrint.call(SparkIMain.scala:1065)
        at org.apache.spark.repl.SparkIMain$Request.loadAndRun(SparkIMain.scala:1346)
        at org.apache.spark.repl.SparkIMain.loadAndRunReq$1(SparkIMain.scala:840)
        at org.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:871)
        at org.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:819)
        at org.apache.spark.repl.SparkILoop.reallyInterpret$1(SparkILoop.scala:857)
        at org.apache.spark.repl.SparkILoop.interpretStartingWith(SparkILoop.scala:902)
        at org.apache.spark.repl.SparkILoop.command(SparkILoop.scala:814)
        at org.apache.spark.repl.SparkILoop.processLine$1(SparkILoop.scala:657)
        at org.apache.spark.repl.SparkILoop.innerLoop$1(SparkILoop.scala:665)
        at org.apache.spark.repl.SparkILoop.org$apache$spark$repl$SparkILoop$$loop(SparkILoop.scala:670)
        at org.apache.spark.repl.SparkILoop$$anonfun$org$apache$spark$repl$SparkILoop$$process$1.apply$mcZ$sp(SparkILoop.scala:997)
        at org.apache.spark.repl.SparkILoop$$anonfun$org$apache$spark$repl$SparkILoop$$process$1.apply(SparkILoop.scala:945)
        at org.apache.spark.repl.SparkILoop$$anonfun$org$apache$spark$repl$SparkILoop$$process$1.apply(SparkILoop.scala:945)
        at scala.tools.nsc.util.ScalaClassLoader$.savingContextLoader(ScalaClassLoader.scala:135)
        at org.apache.spark.repl.SparkILoop.org$apache$spark$repl$SparkILoop$$process(SparkILoop.scala:945)
        at org.apache.spark.repl.SparkILoop.process(SparkILoop.scala:1059)
        at org.apache.spark.repl.Main$.main(Main.scala:31)
        at org.apache.spark.repl.Main.main(Main.scala)
        at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
        at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
        at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
        at java.lang.reflect.Method.invoke(Method.java:498)
        at org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:750)
        at org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:181)
        at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:206)
        at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:121)
        at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)


scala> val df = sqlContext.read.format("com.databricks.spark.csv").option("header", "true").option("inferSchema", "true").load("/user/nareshjella/notebook/spark/survey.csv")
df: org.apache.spark.sql.DataFrame = [Timestamp: timestamp, Age: bigint, Gender: string, Country: string, state: string, self_employed: string, family_history: string, treatment: string, work_interfere: string, no_employees: string, remote_work: string, tech_company: string, benefits: string, care_options: string, wellness_program: string, seek_help: string, anonymity: string, leave: string, mental_health_consequence: string, phys_health_consequence: string, coworkers: string, supervisor: string, mental_health_interview: string, phys_health_interview: string, mental_vs_physical: string, obs_consequence: string, comments: string]

scala> df.select("Timestamp","Age","remote_work","leave").filter("Age >30").show
+--------------------+---+-----------+------------------+
|           Timestamp|Age|remote_work|             leave|
+--------------------+---+-----------+------------------+
|2014-08-27 11:29:...| 37|         No|     Somewhat easy|
|2014-08-27 11:29:...| 44|         No|        Don't know|
|2014-08-27 11:29:...| 32|         No|Somewhat difficult|
|2014-08-27 11:29:...| 31|         No|Somewhat difficult|
|2014-08-27 11:30:...| 31|        Yes|        Don't know|
|2014-08-27 11:31:...| 33|         No|        Don't know|
|2014-08-27 11:31:...| 35|        Yes|Somewhat difficult|
|2014-08-27 11:32:...| 39|        Yes|        Don't know|
|2014-08-27 11:32:...| 42|         No|    Very difficult|
|2014-08-27 11:32:...| 31|        Yes|        Don't know|
|2014-08-27 11:33:...| 42|         No|Somewhat difficult|
|2014-08-27 11:33:...| 36|         No|        Don't know|
|2014-08-27 11:34:...| 32|         No|        Don't know|
|2014-08-27 11:34:...| 46|        Yes|         Very easy|
|2014-08-27 11:35:...| 36|        Yes|     Somewhat easy|
|2014-08-27 11:35:...| 31|        Yes|Somewhat difficult|
|2014-08-27 11:35:...| 46|        Yes|        Don't know|
|2014-08-27 11:36:...| 41|         No|        Don't know|
|2014-08-27 11:36:...| 33|         No|        Don't know|
|2014-08-27 11:37:...| 35|         No|         Very easy|
+--------------------+---+-----------+------------------+
only showing top 20 rows


scala> 

HDFS Admin Commands:



[hdfs@sandbox-hdp ~]$ hdfs cacheadmin
Usage: bin/hdfs cacheadmin [COMMAND]
          [-addDirective -path <path> -pool <pool-name> [-force] [-replication <replication>] [-ttl <time-to-live>]]
          [-modifyDirective -id <id> [-path <path>] [-force] [-replication <replication>] [-pool <pool-name>] [-ttl <time-to-live>]]
          [-listDirectives [-stats] [-path <path>] [-pool <pool>] [-id <id>]]
          [-removeDirective <id>]
          [-removeDirectives -path <path>]
          [-addPool <name> [-owner <owner>] [-group <group>] [-mode <mode>] [-limit <limit>] [-defaultReplication <defaultReplication>] [-maxTtl <maxTtl>]]
          [-modifyPool <name> [-owner <owner>] [-group <group>] [-mode <mode>] [-limit <limit>] [-defaultReplication <defaultReplication>] [-maxTtl <maxTtl>]]
          [-removePool <name>]
          [-listPools [-stats] [<name>]]
          [-help <command-name>]

Generic options supported are:
-conf <configuration file>        specify an application configuration file
-D <property=value>               define a value for a given property
-fs <file:///|hdfs://namenode:port> specify default filesystem URL to use, overrides 'fs.defaultFS' property from configurations.
-jt <local|resourcemanager:port>  specify a ResourceManager
-files <file1,...>                specify a comma-separated list of files to be copied to the map reduce cluster
-libjars <jar1,...>               specify a comma-separated list of jar files to be included in the classpath
-archives <archive1,...>          specify a comma-separated list of archives to be unarchived on the compute machines

The general command line syntax is:
command [genericOptions] [commandOptions]
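
For example, to cache a directory (the pool and path names below are placeholders, not from the cluster above):

hdfs cacheadmin -addPool testpool
hdfs cacheadmin -addDirective -path /tmp/olympics -pool testpool -replication 2
hdfs cacheadmin -listDirectives -pool testpool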



[hdfs@sandbox-hdp ~]$ hdfs crypto
Usage: bin/hdfs crypto [COMMAND]
          [-createZone -keyName <keyName> -path <path>]
          [-listZones]
          [-provisionTrash -path <path>]
          [-getFileEncryptionInfo -path <path>]
          [-reencryptZone <action> -path <zone>]
          [-listReencryptionStatus]
          [-help <command-name>]

Generic options supported are:
-conf <configuration file>        specify an application configuration file
-D <property=value>               define a value for a given property
-fs <file:///|hdfs://namenode:port> specify default filesystem URL to use, overrides 'fs.defaultFS' property from configurations.
-jt <local|resourcemanager:port>  specify a ResourceManager
-files <file1,...>                specify a comma-separated list of files to be copied to the map reduce cluster
-libjars <jar1,...>               specify a comma-separated list of jar files to be included in the classpath
-archives <archive1,...>          specify a comma-separated list of archives to be unarchived on the compute machines

The general command line syntax is:
command [genericOptions] [commandOptions]
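
A typical encryption-zone setup (this assumes a KMS is already configured; the key name and path are placeholders, and the target directory must exist and be empty):

hadoop key create mykey
hdfs dfs -mkdir /secure_zone
hdfs crypto -createZone -keyName mykey -path /secure_zone
hdfs crypto -listZones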



[hdfs@sandbox-hdp ~]$ hdfs debug
Usage: hdfs debug <command> [arguments]

These commands are for advanced users only.

Incorrect usages may result in data loss. Use at your own risk.

verifyMeta -meta <metadata-file> [-block <block-file>]
computeMeta -block <block-file> -out <output-metadata-file>
recoverLease -path <path> [-retries <num-retries>]
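
recoverLease is the most commonly used of these, e.g. to force-close a file left open by a dead client (the path below is a placeholder):

hdfs debug recoverLease -path /tmp/stuck_file -retries 3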



[hdfs@sandbox-hdp ~]$ hdfs dfsadmin
Usage: hdfs dfsadmin
Note: Administrative commands can only be run as the HDFS superuser.
        [-report [-live] [-dead] [-decommissioning] [-enteringmaintenance] [-inmaintenance]]
        [-safemode <enter | leave | get | wait>]
        [-saveNamespace [-beforeShutdown]]
        [-rollEdits]
        [-restoreFailedStorage true|false|check]
        [-refreshNodes]
        [-setQuota <quota> <dirname>...<dirname>]
        [-clrQuota <dirname>...<dirname>]
        [-setSpaceQuota <quota> [-storageType <storagetype>] <dirname>...<dirname>]
        [-clrSpaceQuota [-storageType <storagetype>] <dirname>...<dirname>]
        [-finalizeUpgrade]
        [-rollingUpgrade [<query|prepare|finalize>]]
        [-upgrade <query | finalize>]
        [-refreshServiceAcl]
        [-refreshUserToGroupsMappings]
        [-refreshSuperUserGroupsConfiguration]
        [-refreshCallQueue]
        [-refresh <host:ipc_port> <key> [arg1..argn]
        [-reconfig <namenode|datanode> <host:ipc_port> <start|status|properties>]
        [-printTopology]
        [-refreshNamenodes datanode_host:ipc_port]
        [-getVolumeReport datanode_host:ipc_port]
        [-deleteBlockPool datanode_host:ipc_port blockpoolId [force]]
        [-setBalancerBandwidth <bandwidth in bytes per second>]
        [-getBalancerBandwidth <datanode_host:ipc_port>]
        [-fetchImage <local directory>]
        [-allowSnapshot <snapshotDir>]
        [-disallowSnapshot <snapshotDir>]
        [-shutdownDatanode <datanode_host:ipc_port> [upgrade]]
        [-evictWriters <datanode_host:ipc_port>]
        [-getDatanodeInfo <datanode_host:ipc_port>]
        [-metasave filename]
        [-triggerBlockReport [-incremental] <datanode_host:ipc_port>]
        [-listOpenFiles [-blockingDecommission] [-path <path>]]
        [-help [cmd]]

Generic options supported are:
-conf <configuration file>        specify an application configuration file
-D <property=value>               define a value for a given property
-fs <file:///|hdfs://namenode:port> specify default filesystem URL to use, overrides 'fs.defaultFS' property from configurations.
-jt <local|resourcemanager:port>  specify a ResourceManager
-files <file1,...>                specify a comma-separated list of files to be copied to the map reduce cluster
-libjars <jar1,...>               specify a comma-separated list of jar files to be included in the classpath
-archives <archive1,...>          specify a comma-separated list of archives to be unarchived on the compute machines

The general command line syntax is:
command [genericOptions] [commandOptions]
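
A few commonly used invocations (the quota path is just an example):

hdfs dfsadmin -report -live
hdfs dfsadmin -safemode get
hdfs dfsadmin -setSpaceQuota 10g /user/nareshjella
hdfs dfsadmin -clrSpaceQuota /user/nareshjella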



[hdfs@sandbox-hdp ~]$ hdfs dfsrouteradmin
Not enough parameters specified
Federation Admin Tools:
        [-add <source> <nameservice1, nameservice2, ...> <destination> [-readonly] [-order HASH|LOCAL|RANDOM|HASH_ALL] -owner <owner> -group <group> -mode <mode>]
        [-update <source> <nameservice1, nameservice2, ...> <destination> [-readonly] [-order HASH|LOCAL|RANDOM|HASH_ALL] -owner <owner> -group <group> -mode <mode>]
        [-rm <source>]
        [-ls <path>]
        [-setQuota <path> -nsQuota <nsQuota> -ssQuota <quota in bytes or quota size string>]
        [-clrQuota <path>]
        [-safemode enter | leave | get]
        [-nameservice enable | disable <nameservice>]
        [-getDisabledNameservices]



[hdfs@sandbox-hdp ~]$ hdfs ec
Usage: bin/hdfs ec [COMMAND]
          [-listPolicies]
          [-addPolicies -policyFile <file>]
          [-getPolicy -path <path>]
          [-removePolicy -policy <policy>]
          [-setPolicy -path <path> [-policy <policy>] [-replicate]]
          [-unsetPolicy -path <path>]
          [-listCodecs]
          [-enablePolicy -policy <policy>]
          [-disablePolicy -policy <policy>]
          [-help <command-name>]

Generic options supported are:
-conf <configuration file>        specify an application configuration file
-D <property=value>               define a value for a given property
-fs <file:///|hdfs://namenode:port> specify default filesystem URL to use, overrides 'fs.defaultFS' property from configurations.
-jt <local|resourcemanager:port>  specify a ResourceManager
-files <file1,...>                specify a comma-separated list of files to be copied to the map reduce cluster
-libjars <jar1,...>               specify a comma-separated list of jar files to be included in the classpath
-archives <archive1,...>          specify a comma-separated list of archives to be unarchived on the compute machines

The general command line syntax is:
command [genericOptions] [commandOptions]
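
For example, to put a cold-data directory under erasure coding (the directory name is a placeholder; RS-6-3-1024k is one of the built-in policies reported by -listPolicies):

hdfs ec -listPolicies
hdfs ec -enablePolicy -policy RS-6-3-1024k
hdfs ec -setPolicy -path /cold_data -policy RS-6-3-1024k
hdfs ec -getPolicy -path /cold_data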



[hdfs@sandbox-hdp ~]$ hdfs fsck
Usage: hdfs fsck <path> [-list-corruptfileblocks | [-move | -delete | -openforwrite] [-files [-blocks [-locations | -racks | -replicaDetails | -upgradedomains]]]] [-includeSnapshots] [-showprogress] [-storagepolicies] [-maintenance] [-blockId <blk_Id>]
        <path>  start checking from this path
        -move   move corrupted files to /lost+found
        -delete delete corrupted files
        -files  print out files being checked
        -openforwrite   print out files opened for write
        -includeSnapshots       include snapshot data if the given path indicates a snapshottable directory or there are snapshottable directories under it
        -list-corruptfileblocks print out list of missing blocks and files they belong to
        -files -blocks  print out block report
        -files -blocks -locations       print out locations for every block
        -files -blocks -racks   print out network topology for data-node locations
        -files -blocks -replicaDetails  print out each replica details
        -files -blocks -upgradedomains  print out upgrade domains for every block
        -storagepolicies        print out storage policy summary for the blocks
        -maintenance    print out maintenance state node details
        -showprogress   show progress in output. Default is OFF (no progress)
        -blockId        print out which file this blockId belongs to, locations (nodes, racks) of this block, and other diagnostics info (under replicated, corrupted or not, etc)

Please Note:
        1. By default fsck ignores files opened for write, use -openforwrite to report such files. They are usually  tagged CORRUPT or HEALTHY depending on their block allocation status
        2. Option -includeSnapshots should not be used for comparing stats, should be used only for HEALTH check, as this may contain duplicates if the same file present in both original fs tree and inside snapshots.

Generic options supported are:
-conf <configuration file>        specify an application configuration file
-D <property=value>               define a value for a given property
-fs <file:///|hdfs://namenode:port> specify default filesystem URL to use, overrides 'fs.defaultFS' property from configurations.
-jt <local|resourcemanager:port>  specify a ResourceManager
-files <file1,...>                specify a comma-separated list of files to be copied to the map reduce cluster
-libjars <jar1,...>               specify a comma-separated list of jar files to be included in the classpath
-archives <archive1,...>          specify a comma-separated list of archives to be unarchived on the compute machines

The general command line syntax is:
command [genericOptions] [commandOptions]
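
Besides the per-file checks run earlier, a couple of useful cluster-wide variants are:

hdfs fsck / -list-corruptfileblocks
hdfs fsck /apps/hive/warehouse -files -blocks -locations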



[hdfs@sandbox-hdp ~]$ hdfs haadmin
Usage: haadmin [-ns <nameserviceId>]
    [-transitionToActive [--forceactive] <serviceId>]
    [-transitionToStandby <serviceId>]
    [-failover [--forcefence] [--forceactive] <serviceId> <serviceId>]
    [-getServiceState <serviceId>]
    [-getAllServiceState]
    [-checkHealth <serviceId>]
    [-help <command>]

Generic options supported are:
-conf <configuration file>        specify an application configuration file
-D <property=value>               define a value for a given property
-fs <file:///|hdfs://namenode:port> specify default filesystem URL to use, overrides 'fs.defaultFS' property from configurations.
-jt <local|resourcemanager:port>  specify a ResourceManager
-files <file1,...>                specify a comma-separated list of files to be copied to the map reduce cluster
-libjars <jar1,...>               specify a comma-separated list of jar files to be included in the classpath
-archives <archive1,...>          specify a comma-separated list of archives to be unarchived on the compute machines

The general command line syntax is:
command [genericOptions] [commandOptions]
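
For example, to check which NameNode is active (service IDs such as nn1 come from dfs.ha.namenodes.<nameservice> and are placeholders here):

hdfs haadmin -getAllServiceState
hdfs haadmin -getServiceState nn1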


[hdfs@sandbox-hdp ~]$ hdfs jmxget
init: server=localhost;port=;service=NameNode;localVMUrl=null

Domains:
        Domain = JMImplementation
        Domain = com.sun.management
        Domain = java.lang
        Domain = java.nio
        Domain = java.util.logging

MBeanServer default domain = DefaultDomain

MBean count = 22

Query MBeanServer MBeans:
List of all the available keys:
[hdfs@sandbox-hdp ~]$
[hdfs@sandbox-hdp ~]$
[hdfs@sandbox-hdp ~]$
[hdfs@sandbox-hdp ~]$ hdfs oev
Usage: bin/hdfs oev [OPTIONS] -i INPUT_FILE -o OUTPUT_FILE
Offline edits viewer
Parse a Hadoop edits log file INPUT_FILE and save results
in OUTPUT_FILE.
Required command line arguments:
-i,--inputFile <arg>   edits file to process, xml (case
                       insensitive) extension means XML format,
                       any other filename means binary format.
                       XML/Binary format input file is not allowed
                       to be processed by the same type processor.
-o,--outputFile <arg>  Name of output file. If the specified
                       file exists, it will be overwritten,
                       format of the file is determined
                       by -p option

Optional command line arguments:
-p,--processor <arg>   Select which type of processor to apply
                       against image file, currently supported
                       processors are: binary (native binary format
                       that Hadoop uses), xml (default, XML
                       format), stats (prints statistics about
                       edits file)
-h,--help              Display usage information and exit
-f,--fix-txids         Renumber the transaction IDs in the input,
                       so that there are no gaps or invalid
                       transaction IDs.
-r,--recover           When reading binary edit logs, use recovery
                       mode.  This will give you the chance to skip
                       corrupt parts of the edit log.
-v,--verbose           More verbose output, prints the input and
                       output filenames, for processors that write
                       to a file, also output to screen. On large
                       image files this will dramatically increase
                       processing time (default is false).


Generic options supported are:
-conf <configuration file>        specify an application configuration file
-D <property=value>               define a value for a given property
-fs <file:///|hdfs://namenode:port> specify default filesystem URL to use, overrides 'fs.defaultFS' property from configurations.
-jt <local|resourcemanager:port>  specify a ResourceManager
-files <file1,...>                specify a comma-separated list of files to be copied to the map reduce cluster
-libjars <jar1,...>               specify a comma-separated list of jar files to be included in the classpath
-archives <archive1,...>          specify a comma-separated list of archives to be unarchived on the compute machines

The general command line syntax is:
command [genericOptions] [commandOptions]
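
Typical use is dumping an edits segment to XML, or printing transaction statistics from it (the segment name below is a placeholder; real segments live under the NameNode's dfs.namenode.name.dir/current):

hdfs oev -i edits_0000000000000000001-0000000000000000200 -o edits.xml -p xml
hdfs oev -i edits_0000000000000000001-0000000000000000200 -o edits.stats -p stats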



[hdfs@sandbox-hdp ~]$ hdfs oiv
Usage: bin/hdfs oiv [OPTIONS] -i INPUTFILE -o OUTPUTFILE
Offline Image Viewer
View a Hadoop fsimage INPUTFILE using the specified PROCESSOR,
saving the results in OUTPUTFILE.

The oiv utility will attempt to parse correctly formed image files
and will abort fail with mal-formed image files.

The tool works offline and does not require a running cluster in
order to process an image file.

The following image processors are available:
  * XML: This processor creates an XML document with all elements of
    the fsimage enumerated, suitable for further analysis by XML
    tools.
  * ReverseXML: This processor takes an XML file and creates a
    binary fsimage containing the same elements.
  * FileDistribution: This processor analyzes the file size
    distribution in the image.
    -maxSize specifies the range [0, maxSize] of file sizes to be
     analyzed (128GB by default).
    -step defines the granularity of the distribution. (2MB by default)
    -format formats the output result in a human-readable fashion
     rather than a number of bytes. (false by default)
  * Web: Run a viewer to expose read-only WebHDFS API.
    -addr specifies the address to listen. (localhost:5978 by default)
    It does not support secure mode nor HTTPS.
  * Delimited (experimental): Generate a text file with all of the elements common
    to both inodes and inodes-under-construction, separated by a
    delimiter. The default delimiter is \t, though this may be
    changed via the -delimiter argument.

Required command line arguments:
-i,--inputFile <arg>   FSImage or XML file to process.

Optional command line arguments:
-o,--outputFile <arg>  Name of output file. If the specified
                       file exists, it will be overwritten.
                       (output to stdout by default)
                       If the input file was an XML file, we
                       will also create an <outputFile>.md5 file.
-p,--processor <arg>   Select which type of processor to apply
                       against image file. (XML|FileDistribution|
                       ReverseXML|Web|Delimited)
                       The default is Web.
-delimiter <arg>       Delimiting string to use with Delimited processor.
-t,--temp <arg>        Use temporary dir to cache intermediate result to generate
                       Delimited outputs. If not set, Delimited processor constructs
                       the namespace in memory before outputting text.
-h,--help              Display usage information and exit
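
Typical use is converting an fsimage to XML or to a delimited listing (the fsimage name below is a placeholder; images live under the NameNode's dfs.namenode.name.dir/current):

hdfs oiv -i fsimage_0000000000000012345 -o fsimage.xml -p XML
hdfs oiv -i fsimage_0000000000000012345 -o fsimage.tsv -p Delimited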


[hdfs@sandbox-hdp ~]$ hdfs oiv_legacy
Usage: bin/hdfs oiv_legacy [OPTIONS] -i INPUTFILE -o OUTPUTFILE
Offline Image Viewer
View a Hadoop fsimage INPUTFILE using the specified PROCESSOR,
saving the results in OUTPUTFILE.

The oiv utility will attempt to parse correctly formed image files
and will abort fail with mal-formed image files.

The tool works offline and does not require a running cluster in
order to process an image file.

The following image processors are available:
  * Ls: The default image processor generates an lsr-style listing
    of the files in the namespace, with the same fields in the same
    order.  Note that in order to correctly determine file sizes,
    this formatter cannot skip blocks and will override the
    -skipBlocks option.
  * Indented: This processor enumerates over all of the elements in
    the fsimage file, using levels of indentation to delineate
    sections within the file.
  * Delimited: Generate a text file with all of the elements common
    to both inodes and inodes-under-construction, separated by a
    delimiter. The default delimiter is , though this may be
    changed via the -delimiter argument. This processor also overrides
    the -skipBlocks option for the same reason as the Ls processor
  * XML: This processor creates an XML document with all elements of
    the fsimage enumerated, suitable for further analysis by XML
    tools.
  * FileDistribution: This processor analyzes the file size
    distribution in the image.
    -maxSize specifies the range [0, maxSize] of file sizes to be
     analyzed (128GB by default).
    -step defines the granularity of the distribution. (2MB by default)
    -format formats the output result in a human-readable fashion
     rather than a number of bytes. (false by default)
  * NameDistribution: This processor analyzes the file names
    in the image and prints total number of file names and how frequently
    file names are reused.

Required command line arguments:
-i,--inputFile <arg>   FSImage file to process.
-o,--outputFile <arg>  Name of output file. If the specified
                       file exists, it will be overwritten.

Optional command line arguments:
-p,--processor <arg>   Select which type of processor to apply
                       against image file. (Ls|XML|Delimited|Indented|FileDistribution|NameDistribution).
-h,--help              Display usage information and exit
-printToScreen         For processors that write to a file, also
                       output to screen. On large image files this
                       will dramatically increase processing time.
-skipBlocks            Skip inodes' blocks information. May
                       significantly decrease output.
                       (default = false).
-delimiter <arg>       Delimiting string to use with Delimited processor


[hdfs@sandbox-hdp ~]$ hdfs storagepolicies
Usage: bin/hdfs storagepolicies [COMMAND]
          [-listPolicies]
          [-setStoragePolicy -path <path> -policy <policy>]
          [-getStoragePolicy -path <path>]
          [-unsetStoragePolicy -path <path>]
          [-help <command-name>]

Generic options supported are:
-conf <configuration file>        specify an application configuration file
-D <property=value>               define a value for a given property
-fs <file:///|hdfs://namenode:port> specify default filesystem URL to use, overrides 'fs.defaultFS' property from configurations.
-jt <local|resourcemanager:port>  specify a ResourceManager
-files <file1,...>                specify a comma-separated list of files to be copied to the map reduce cluster
-libjars <jar1,...>               specify a comma-separated list of jar files to be included in the classpath
-archives <archive1,...>          specify a comma-separated list of archives to be unarchived on the compute machines

The general command line syntax is:
command [genericOptions] [commandOptions]
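
For example (the directory is a placeholder; COLD is one of the built-in policies shown by -listPolicies):

hdfs storagepolicies -listPolicies
hdfs storagepolicies -setStoragePolicy -path /archive_data -policy COLD
hdfs storagepolicies -getStoragePolicy -path /archive_data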

[hdfs@sandbox-hdp ~]$

HDFS Commands



[hdfs@sandbox-hdp ~]$ hdfs
Usage: hdfs [OPTIONS] SUBCOMMAND [SUBCOMMAND OPTIONS]

  OPTIONS is none or any of:

--buildpaths                       attempt to add class files from build tree
--config dir                       Hadoop config directory
--daemon (start|status|stop)       operate on a daemon
--debug                            turn on shell script debug mode
--help                             usage information
--hostnames list[,of,host,names]   hosts to use in worker mode
--hosts filename                   list of hosts to use in worker mode
--loglevel level                   set the log4j level for this command
--workers                          turn on worker mode

  SUBCOMMAND is one of:


    Admin Commands:

cacheadmin           configure the HDFS cache
crypto               configure HDFS encryption zones
debug                run a Debug Admin to execute HDFS debug commands
dfsadmin             run a DFS admin client
dfsrouteradmin       manage Router-based federation
ec                   run a HDFS ErasureCoding CLI
fsck                 run a DFS filesystem checking utility
haadmin              run a DFS HA admin client
jmxget               get JMX exported values from NameNode or DataNode.
oev                  apply the offline edits viewer to an edits file
oiv                  apply the offline fsimage viewer to an fsimage
oiv_legacy           apply the offline fsimage viewer to a legacy fsimage
storagepolicies      list/get/set block storage policies

    Client Commands:

classpath            prints the class path needed to get the hadoop jar and the required libraries
dfs                  run a filesystem command on the file system
envvars              display computed Hadoop environment variables
fetchdt              fetch a delegation token from the NameNode
getconf              get config values from configuration
groups               get the groups which users belong to
lsSnapshottableDir   list all snapshottable dirs owned by the current user
snapshotDiff         diff two snapshots of a directory or diff the current directory contents with a snapshot
version              print the version

    Daemon Commands:

balancer             run a cluster balancing utility
datanode             run a DFS datanode
dfsrouter            run the DFS router
diskbalancer         Distributes data evenly among disks on a given node
httpfs               run HttpFS server, the HDFS HTTP Gateway
journalnode          run the DFS journalnode
mover                run a utility to move block replicas across storage types
namenode             run the DFS namenode
nfs3                 run an NFS version 3 gateway
portmap              run a portmap service
secondarynamenode    run the DFS secondary namenode
zkfc                 run the ZK Failover Controller daemon

SUBCOMMAND may print help when invoked w/o parameters or with -h.
[hdfs@sandbox-hdp ~]$
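
A few of the client subcommands in action (the config key and user name are just examples):

hdfs version
hdfs getconf -confKey dfs.blocksize
hdfs groups hdfs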




Ambari Commands








Ambari status:               /sbin/service ambari-server status
Start Ambari:                /sbin/service ambari-server start
Stop Ambari:                 /sbin/service ambari-server stop
Reset Ambari admin password: ambari-admin-password-reset
Ambari LDAP sync:            ambari-server sync-ldap --users users.txt --groups groups.txt
Ambari version:              ambari-server --version
Ambari server hash value:    ambari-server --hash
Back up Ambari settings:     ambari-server backup

Decommission a data node in HDP






From Ambari: in the Hosts page, open the DataNode's host and choose Decommission from the DataNode component's action menu.



Manually:
[root@namenode1 conf]# pwd
/etc/hadoop/conf
[root@namenode1 conf]# ll | grep dfs.exclude
-rw-r--r--. 1 hdfs hadoop     1 Jan 12 15:32 dfs.exclude
[root@namenode1 conf]# cat dfs.exclude

[root@namenode1 conf]# vi dfs.exclude
[root@namenode1 conf]# cat dfs.exclude
edgenode.hdp.cn
[root@namenode1 conf]# su - hdfs
Last login: Sun Jan 12 15:33:08 EST 2020
[hdfs@namenode1 ~]$ hdfs dfsadmin -refreshNodes
Refresh nodes successful

[hdfs@namenode1 ~]$
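
To confirm the node actually reaches the decommissioned state, the report can be filtered to decommissioning nodes (run as the hdfs user, as above):

hdfs dfsadmin -report -decommissioning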





Friday, February 2, 2018

Disable Resource Manager HA


  1. Stop the YARN and Zookeeper services from Ambari.
  2. From the Ambari server, export the current yarn-site configuration:

         /var/lib/ambari-server/resources/scripts/configs.py --user=admin --password=admin --host=edgenode.hdp.cn --cluster=hdpdev --action=get --config-type=yarn-site -f yarn-site.json

  3. In yarn-site.json, change the first property below to "false" and remove the remaining properties:

         a) "yarn.resourcemanager.ha.enabled": "false",
         b) "yarn.resourcemanager.ha.rm-ids": "rm1,rm2",
         c) "yarn.resourcemanager.hostname.rm1": "datanode1.hdp.cn",
         d) "yarn.resourcemanager.hostname.rm2": "edgenode.hdp.cn",
         e) "yarn.resourcemanager.webapp.address.rm1": "datanode1.hdp.cn:8088",
         f) "yarn.resourcemanager.webapp.address.rm2": "edgenode.hdp.cn:8088",
         g) "yarn.resourcemanager.webapp.https.address.rm1": "datanode1.hdp.cn:8090",
         h) "yarn.resourcemanager.webapp.https.address.rm2": "edgenode.hdp.cn:8090",
         i) "yarn.resourcemanager.cluster-id": "yarn-cluster",
         j) "yarn.resourcemanager.ha.automatic-failover.zk-base-path": "/yarn-leader-election",

  4. Set the below properties to point at the remaining Resource Manager:

         a) "yarn.resourcemanager.hostname":
         b) "yarn.resourcemanager.admin.address":
         c) "yarn.resourcemanager.webapp.address":
         d) "yarn.resourcemanager.resource-tracker.address":
         e) "yarn.resourcemanager.scheduler.address":
         f) "yarn.resourcemanager.webapp.https.address":
         g) "yarn.timeline-service.webapp.address":
         h) "yarn.timeline-service.webapp.https.address":
         i) "yarn.timeline-service.address":
         j) "yarn.log.server.url":

  5. Copy the edited yarn-site.json back to the Ambari server and run the below command to apply the changes (--action=set this time):

         /var/lib/ambari-server/resources/scripts/configs.py --user=admin --password=admin --host=edgenode.hdp.cn --cluster=hdpdev --action=set --config-type=yarn-site -f yarn-site.json

  6. Delete the Resource Manager host component that is being removed:

         curl --user admin:admin -i -H "X-Requested-By: ambari" -X DELETE http://edgenode.hdp.cn:8080/api/v1/clusters/hdpdev/hosts/edgenode.hdp.cn/host_components/RESOURCEMANAGER

  7. Start the Zookeeper service from Ambari.
  8. On the Zookeeper clients, run the below commands to change the znode permissions:

         /usr/hdp/current/zookeeper-client/bin/zkCli.sh getAcl /rmstore/ZKRMStateRoot
         /usr/hdp/current/zookeeper-client/bin/zkCli.sh setAcl /rmstore/ZKRMStateRoot world:anyone:rwcda

  9. From the Ambari UI, restart the Zookeeper and YARN services.
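
After the restart, a quick sanity check (hostname taken from the steps above) is to hit the remaining Resource Manager's REST API; it should respond and report itself as the active RM:

         curl http://datanode1.hdp.cn:8088/ws/v1/cluster/info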