You are on page 1of 22

HOW TO CREATE DATABASE

-----------------------------
CREATE DATABASE IF NOT EXISTS deepak;

CREATE DATABASE IF NOT EXISTS deepak


COMMENT 'My database name is deepak'
LOCATION '/hive/deepak'
WITH DBPROPERTIES ('key1' = 'value1', 'key2' = 'value2');

DESCRIBE DATABASE deepak;

DESCRIBE DATABASE EXTENDED deepak;

HOW TO DROP DATABASE


-----------------------------
DROP DATABASE IF EXISTS deepak;

DROP DATABASE IF EXISTS deepak CASCADE;

DROP DATABASE IF EXISTS deepak RESTRICT;

HOW TO USE DATABASE


-----------------------------
USE deepak;

HOW TO ALTER DATABASE


-----------------------------
ALTER DATABASE deepak SET DBPROPERTIES ('key1' = 'value11', 'key3' = 'value3');

ALTER DATABASE deepak SET OWNER USER hadoop;

ALTER DATABASE deepak SET OWNER ROLE hadoop;

HOW TO CREATE TABLE


-----------------------------
CREATE TABLE IF NOT EXISTS student1 ( name string, id int , course string, year int
);

CREATE TABLE IF NOT EXISTS student1 ( name string, id int , course string, year int
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\001'
LINES TERMINATED BY '\n'
STORED AS TEXTFILE;

CREATE TABLE IF NOT EXISTS student2 ( name string, id int , course string, year int
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
LINES TERMINATED BY '\n'
STORED AS TEXTFILE;
CREATE EXTERNAL TABLE IF NOT EXISTS student3 ( name string, id int , course string,
year int )
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
LINES TERMINATED BY '\n'
STORED AS TEXTFILE;

CREATE TEMPORARY TABLE IF NOT EXISTS student4 ( name string, id int , course
string, year int )
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
LINES TERMINATED BY '\n'
STORED AS TEXTFILE;

HOW TO KNOW INFORMATION ABOUT TABLE


-----------------------------
DESCRIBE tbl_name;

DESCRIBE EXTENDED tbl_name;

DESCRIBE FORMATTED tbl_name;

LOAD DATA LOCAL INPATH '/home/deepak/work/hive_inputs/student.txt' OVERWRITE INTO


TABLE student1;

LOAD DATA LOCAL INPATH '/home/deepak/work/hive_inputs/student.txt' OVERWRITE INTO


TABLE student2;

HOW TO LOAD DATA INTO TABLE


-----------------------------
LOAD DATA LOCAL INPATH '/home/deepak/work/hive_inputs/student.txt' OVERWRITE INTO
TABLE student2;

LOAD DATA LOCAL INPATH '/home/deepak/work/hive_inputs/student.txt' INTO TABLE


student2;

hadoop fs -mkdir -p /home/deepak/work/hive_inputs

hadoop fs -put /home/deepak/work/hive_inputs/student.txt


/home/deepak/work/hive_inputs/student.txt

LOAD DATA INPATH '/home/deepak/work/hive_inputs/student.txt' OVERWRITE INTO TABLE


student2;

hadoop fs -put /home/deepak/work/hive_inputs/student.txt


/home/deepak/work/hive_inputs/student.txt

LOAD DATA INPATH '/home/deepak/work/hive_inputs/student.txt' INTO TABLE student2;


HOW TO USE EXTERNAL TABLE
-----------------------------
hadoop fs -mkdir /hive/externaldata

hadoop fs -put /home/deepak/work/hive_inputs/student.txt


/hive/externaldata/student.txt

CREATE EXTERNAL TABLE IF NOT EXISTS studentext1


( name string, id int , course string, year int )
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
LINES TERMINATED BY '\n'
STORED AS TEXTFILE;

SELECT * FROM studentext1;

DESCRIBE FORMATTED studentext1;

CREATE EXTERNAL TABLE IF NOT EXISTS studentext2


( name string, id int , course string, year int )
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
LINES TERMINATED BY '\n'
STORED AS TEXTFILE
LOCATION '/hive/externaldata';

SELECT * FROM studentext2;

DESCRIBE FORMATTED studentext2;

CREATE TABLE IF NOT EXISTS student


(
name string comment 'student name',
id int comment 'student id',
course string comment 'student course',
year int comment 'student year'
)
COMMENT 'My table name is student'
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
LINES TERMINATED BY '\n'
STORED AS TEXTFILE
LOCATION '/hive/deepak/student'
TBLPROPERTIES ('key1'='value1', 'key2'='value2')
;

LOAD DATA LOCAL INPATH '/home/deepak/work/hive_inputs/student.txt' OVERWRITE INTO


TABLE student;

LOAD DATA LOCAL INPATH '${env:HOME}/work/hive_inputs/student.txt' OVERWRITE INTO


TABLE student;

INSERT INTO TABLE employee VALUES (1, 'mca', ‘1’), (2, 'mca', ‘1’), (3, 'mca',
‘2’), (4, 'mca', ‘2’);
set ;

set hive.cli.print.current.db=true;

set hive.cli.print.header=true;

INSERT INTO TABLE student VALUES ('arun', 1, 'mca', 1), ('anil', 2, 'mca', 1),
('sudheer', 3, 'mca', 2), ('santosh', 4, 'mca', 2);

UPDATE student SET year = 3, name = 'san' WHERE id = 4 ;

HOW TO VERIFY SCHEMA IN METASTORE DATABASE


------------------------------------------
USE hive_mysql_db;
SELECT * FROM DBS;
SELECT * FROM TBLS;

SELECT * FROM DBS join TBLS on DBS.DB_ID = TBLS.DB_ID;


SELECT DBS.NAME, TBLS.TBL_NAME FROM DBS join TBLS on DBS.DB_ID = TBLS.DB_ID;

COMPLEX DATA TYPES


------------------------------------------
array_type : ARRAY < data_type >
map_type : MAP < primitive_type, data_type >
struct_type : STRUCT < col_name : data_type [COMMENT col_comment], ...>
union_type : UNIONTYPE < data_type, data_type, ... >

CREATE TABLE IF NOT EXISTS studentdata


(
name string comment 'student name',
subjects array<string> comment 'student subjects',
marks map<string,int> comment 'student marks',
address struct< loc : string comment 'student location', pincode : int comment
'student pincode', city : string comment 'student city'>,
details uniontype<double, int, array<string>, struct<a:int,b:string>, boolean>
comment 'student details'
)
COMMENT 'My table name is studentdata'
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '#'
COLLECTION ITEMS TERMINATED BY ','
MAP KEYS TERMINATED BY ':'
LINES TERMINATED BY '\n'
STORED AS TEXTFILE
LOCATION '/hive/deepak/studentdata'
TBLPROPERTIES ('key1'='value1', 'key2'='value2')
;

sample data:
--------------------
venkat#math,phy,chem#math:40,phy:23,chem:60#kp,500004,pune#0,75.6
kumar#math,phy,chem#math:50,phy:20,chem:40#mundwa,500005,pune#1,78
rahul#math,phy,chem#math:30,phy:22,chem:30#hinjewadi,500003,pune#2,hi:hello
anil#math,phy,chem#math:13,phy:30,chem:60#tingre nagar,500002,pune#3,1:rank
arun#math,phy,chem#math:10,phy:20,chem:50#aundh,500001,pune#4,true

LOAD DATA LOCAL INPATH '${env:HOME}/work/hive_inputs/studentdata.txt' OVERWRITE


INTO TABLE studentdata;

SELECT * FROM studentdata;

SELECT name, subjects[0], marks['math'], address.pincode, details FROM


studentdata;

INSERT OVERWRITE LOCAL DIRECTORY '/home/deepak/work/hive_inputs/studentdata'


SELECT * FROM studentdata;

INSERT OVERWRITE LOCAL DIRECTORY '/home/deepak/work/hive_inputs/studentdata_1'


SELECT name, subjects[0], marks['math'], address.pincode, details FROM
studentdata;

INSERT OVERWRITE LOCAL DIRECTORY '/home/deepak/work/hive_inputs/studentdata_2'


ROW FORMAT DELIMITED
FIELDS TERMINATED BY '#'
COLLECTION ITEMS TERMINATED BY ','
MAP KEYS TERMINATED BY ':'
LINES TERMINATED BY '\n'
SELECT * FROM studentdata;

HOW TO CREATE PARTITION TABLE


------------------------------------------
CREATE TABLE IF NOT EXISTS student_partition ( name string , id int )
PARTITIONED BY ( course string , year int)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
LINES TERMINATED BY '\n'
STORED AS TEXTFILE
;

HOW TO WRITE HIVE STATIC PARTITION QUERIES


--------------------------------------------------------------
LOAD DATA LOCAL INPATH '${env:HOME}/work/hive_inputs/student_cse_1.txt' OVERWRITE
INTO TABLE student_partition PARTITION(course = 'cse', year = 1);

LOAD DATA LOCAL INPATH '${env:HOME}/work/hive_inputs/student_cse_2.txt' OVERWRITE


INTO TABLE student_partition PARTITION(course = 'cse', year = 2);

HOW TO WRITE PARTITION TABLE QUERIES


--------------------------------------------------------------
SHOW PARTITIONS student_partition;

SELECT * FROM student_partition;

SELECT * FROM student_partition WHERE year = 1;


SELECT * FROM student_partition WHERE course = 'cse';

SELECT * FROM student_partition WHERE course = 'mech' and year = 1;


SELECT * FROM student_partition WHERE course = 'mech' or year = 1;

HOW TO WRITE HIVE DYNAMIC PARTITION QUERIES


--------------------------------------------------------------
set hive.exec.dynamic.partition.mode=nonstrict;

INSERT OVERWRITE TABLE student_partition PARTITION(course, year) select * from


student;

LOAD DATA LOCAL INPATH '${env:HOME}/work/hive_inputs/student1.txt' INTO TABLE


student;

INSERT INTO TABLE student_partition PARTITION(course = 'it', year) SELECT name, id,
year FROM student WHERE course = 'it';

INSERT INTO TABLE student_partition PARTITION(course = 'ece', year = 3) SELECT


name, id FROM student WHERE course = 'it' AND year = 3;

INSERT OVERWRITE TABLE student_partition PARTITION(course = 'ece', year = 3) SELECT


name, id FROM student WHERE course = 'it' AND year = 3;

INSERT INTO TABLE student_partition PARTITION(course = 'mca', year = 1) VALUES


('arun', 1), ('anil', 2), ('sudheer', 3), ('santosh', 4);

BELOW HIVE QUERIES ARE NOT ALLOWED


--------------------------------------------------------------
Note:- Dynamic partition cannot be the parent of a static partition '3'

LOAD DATA LOCAL INPATH '${env:HOME}/work/hive_inputs/student.txt' OVERWRITE INTO


TABLE student_partition PARTITION(course = 'cse', year);

LOAD DATA LOCAL INPATH '${env:HOME}/work/hive_inputs/student.txt' OVERWRITE INTO


TABLE student_partition PARTITION(course, year = 1);

LOAD DATA LOCAL INPATH '${env:HOME}/work/hive_inputs/student.txt' OVERWRITE INTO


TABLE student_partition PARTITION(course, year);

INSERT INTO TABLE student_partition PARTITION(course , year = 3) SELECT name, id,


course FROM student WHERE year = 3;

DIFFERENT WAYS TO EXECUTE HIVE QUERIES


--------------------------------------------------------------
hive -e 'USE deepak; SELECT * FROM student;'
hive -S -e 'USE deepak; SELECT * FROM student;'

hadoop@hadoop:~$ hive -f '/home/deepak/work/hive_inputs/partition.hql';


hive> source /home/deepak/work/hive_inputs/partition.hql;

hadoop@hadoop:~$ hadoop fs -ls /


hive> dfs -ls / ;
hadoop@hadoop:~$ ls -l /home
hive> ! ls -l /home;

--------------------------------------------------------------
HOW TO WRITE BUCKET QUERIES IN HIVE
--------------------------------------------------------------

CREATE EXTERNAL TABLE IF NOT EXISTS users ( name string, id int)


ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
LINES TERMINATED BY '\n'
STORED AS TEXTFILE;

LOAD DATA LOCAL INPATH '${env:HOME}/work/hive_inputs/users.txt' OVERWRITE INTO


TABLE users;

SELECT * FROM users;

venkat 3
raj 2
appu 4
anil 8
anvith 9
sony 1
nani 6
rani 7
rajesh 10
lg 5

CREATE TABLE IF NOT EXISTS users1 ( name string, id int) CLUSTERED BY (id) INTO 4
BUCKETS;

CREATE TABLE IF NOT EXISTS users2 ( name string, id int) CLUSTERED BY (id) SORTED
BY (id DESC) INTO 5 BUCKETS;

set hive.enforce.bucketing=true;

INSERT OVERWRITE TABLE users1 SELECT * FROM users;


INSERT OVERWRITE TABLE users2 SELECT * FROM users;

or

FROM users
INSERT OVERWRITE TABLE users1 SELECT *
INSERT OVERWRITE TABLE users2 SELECT *;

--------------------------------------------------------------
HOW THE LOGIC OF BUCKET WORKS
--------------------------------------------------------------
x = hash_code(bucket_col) % no_of_buckets

bucket m out of n => if ( x + 1 == m) then print ( bucket_col )


bucket m out of n => if ( x == m - 1) then print ( bucket_col )

1 % 5 => 1
2 % 5 => 2
3 % 5 => 3
4 % 5 => 4
5 % 5 => 0
6 % 5 => 1
7 % 5 => 2
8 % 5 => 3
9 % 5 => 4
10 % 5 => 0

part_0 => 5,10 => bucket 1 out of 5


part_1 => 1,6 => bucket 2 out of 5
part_2 => 2,7 => bucket 3 out of 5
part_3 => 3,8 => bucket 4 out of 5
part_4 => 3,9 => bucket 5 out of 5

1 % 4 => 1
2 % 4 => 2
3 % 4 => 3
4 % 4 => 0
5 % 4 => 1
6 % 4 => 2
7 % 4 => 3
8 % 4 => 0
9 % 4 => 1
10 % 4 => 2

part_0 => 4,8 => bucket 1 out of 4


part_1 => 1,5,9 => bucket 2 out of 4
part_2 => 2,6,10 => bucket 3 out of 4
part_3 => 3,7 => bucket 4 out of 4

1 % 3 => 1
2 % 3 => 2
3 % 3 => 0
4 % 3 => 1
5 % 3 => 2
6 % 3 => 0
7 % 3 => 1
8 % 3 => 2
9 % 3 => 0
10 % 3 => 1

part_0 => 3,6,9 => bucket 1 out of 3


part_1 => 1,4,7,10 => bucket 2 out of 3
part_2 => 2,5,8 => bucket 3 out of 3

1 % 2 => 1
2 % 2 => 0
3 % 2 => 1
4 % 2 => 0
5 % 2 => 1
6 % 2 => 0
7 % 2 => 1
8 % 2 => 0
9 % 2 => 1
10 % 2 => 0

part_0 => 2,4,6,8,10 => bucket 1 out of 2


part_1 => 1,3,5,7,9 => bucket 2 out of 2
SELECT * FROM users1 TABLESAMPLE ( bucket 1 out of 4 on id );
anil 8
appu 4

SELECT * FROM users1 TABLESAMPLE ( bucket 1 out of 3 on id );


anvith 9
nani 6
venkat 3

SELECT * FROM users1 TABLESAMPLE ( bucket 1 out of 2 on id );


anil 8
appu 4
rajesh 10
nani 6
raj 2

SELECT * FROM users1 TABLESAMPLE ( bucket 1 out of 4 on rand() );

Buckets:
----------------------------------------
1. no. of buckets is fixed
2. we can't change the bucket number later
3. we will use only for static data

Partitions:
----------------------------------------
1. no.of partition columns are fixed
2. we can't change the partition columns later
3. we can use for static / dynamic data also
4. static data (static partition) / dynamic data (dynamic partition)

Hive => MapReduce


----------------------------------------
Partition => MultipleOutputs
Bucket => HashPartitioner

Note: Limitations of `HashPartitioner` is no.of partitions are fixed. we can solve


this problem using `MultipleOutputs`

--------------------------------------------------------------
HOW TO WRITE JOIN QUERIES IN HIVE
--------------------------------------------------------------

CREATE TABLE IF NOT EXISTS products ( name string, id int , price int )
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
LINES TERMINATED BY '\n'
STORED AS TEXTFILE;

CREATE TABLE IF NOT EXISTS sales ( name string, year int , percentage double )
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
LINES TERMINATED BY '\n'
STORED AS TEXTFILE;

LOAD DATA LOCAL INPATH '${env:HOME}/work/hive_inputs/products.txt' OVERWRITE INTO


TABLE products;

LOAD DATA LOCAL INPATH '${env:HOME}/work/hive_inputs/sales.txt' OVERWRITE INTO


TABLE sales;

select * from products;


hadoop 1 1000
hive 2 500
pig 3 750
hbase 4 600
spark 5 1500

select * from sales;


hadoop 2010 10.0
hadoop 2011 40.5
hadoop 2012 60.0
hive 2012 40.5
hive 2013 20.0
pig 2014 30.0
pig 2010 10.0
hbase 2014 70.5
mongodb 2013 50.5

--------------------------------------------------------------
REDUCE-SIDE-JOIN QUERIES IN HIVE
--------------------------------------------------------------

HOW TO WRITE INNER JOIN QUERY


--------------------------------------------------------------
SELECT products.* , sales.* from products JOIN sales ON products.name = sales.name;

hadoop 1 1000 hadoop 2010 10.0


hadoop 1 1000 hadoop 2011 40.5
hadoop 1 1000 hadoop 2012 60.0
hive 2 500 hive 2012 40.5
hive 2 500 hive 2013 20.0
pig 3 750 pig 2014 30.0
pig 3 750 pig 2010 10.0
hbase 4 600 hbase 2014 70.5

HOW TO WRITE LEFT OUTER JOIN QUERY


--------------------------------------------------------------
SELECT products.* , sales.* from products LEFT OUTER JOIN sales ON products.name =
sales.name;

hadoop 1 1000 hadoop 2010 10.0


hadoop 1 1000 hadoop 2011 40.5
hadoop 1 1000 hadoop 2012 60.0
hive 2 500 hive 2012 40.5
hive 2 500 hive 2013 20.0
pig 3 750 pig 2014 30.0
pig 3 750 pig 2010 10.0
hbase 4 600 hbase 2014 70.5
spark 5 1500 NULL NULL NULL

HOW TO WRITE RIGHT OUTER JOIN QUERY


--------------------------------------------------------------
SELECT products.* , sales.* from products RIGHT OUTER JOIN sales ON products.name =
sales.name;

hadoop 1 1000 hadoop 2010 10.0


hadoop 1 1000 hadoop 2011 40.5
hadoop 1 1000 hadoop 2012 60.0
hive 2 500 hive 2012 40.5
hive 2 500 hive 2013 20.0
pig 3 750 pig 2014 30.0
pig 3 750 pig 2010 10.0
hbase 4 600 hbase 2014 70.5
NULL NULL NULL mongodb 2013 50.5

HOW TO WRITE FULL OUTER JOIN QUERY


--------------------------------------------------------------
SELECT products.* , sales.* from products FULL OUTER JOIN sales ON products.name =
sales.name;

hadoop 1 1000 hadoop 2012 60.0


hadoop 1 1000 hadoop 2011 40.5
hadoop 1 1000 hadoop 2010 10.0
hbase 4 600 hbase 2014 70.5
hive 2 500 hive 2013 20.0
hive 2 500 hive 2012 40.5
NULL NULL NULL mongodb 2013 50.5
pig 3 750 pig 2010 10.0
pig 3 750 pig 2014 30.0
spark 5 1500 NULL NULL NULL

--------------------------------------------------------------
MAP-SIDE-JOIN QUERIES IN HIVE
--------------------------------------------------------------

WHICH MAP-SIDE-JOIN QUERIES ARE ALLOWED ?


--------------------------------------------------------------
SELECT /*+ MAPJOIN(sales) */ products.* , sales.* from products JOIN sales ON
products.name = sales.name;

SELECT /*+ MAPJOIN(products) */ products.* , sales.* from products JOIN sales ON


products.name = sales.name;

SELECT /*+ MAPJOIN(sales) */ products.* , sales.* from products LEFT OUTER JOIN
sales ON products.name = sales.name;

SELECT /*+ MAPJOIN(products) */ products.* , sales.* from products RIGHT OUTER JOIN
sales ON products.name = sales.name;

WHICH MAP-SIDE-JOIN QUERIES ARE NOT ALLOWED ?


--------------------------------------------------------------
SELECT /*+ MAPJOIN(products) */ products.* , sales.* from products LEFT OUTER JOIN
sales ON products.name = sales.name;

SELECT /*+ MAPJOIN(sales) */ products.* , sales.* from products RIGHT OUTER JOIN
sales ON products.name = sales.name;

SELECT /*+ MAPJOIN(products) */ products.* , sales.* from products FULL OUTER JOIN
sales ON products.name = sales.name;

SELECT /*+ MAPJOIN(sales) */ products.* , sales.* from products FULL OUTER JOIN
sales ON products.name = sales.name;

Note:
------------
inner join => mapjoin(left table) & mapjoin(right table) are allowed

left outer join => mapjoin(left table) not allowed & mapjoin(right table) allowed

right outer join => mapjoin(left table) allowed & mapjoin(right table) not allowed

full outer join => mapjoin(left table) & mapjoin(right table) are not allowed

--------------------------------------------------------------
SEMI-JOIN QUERIES IN HIVE
--------------------------------------------------------------
SELECT products.* from products LEFT SEMI JOIN sales ON products.name = sales.name;

hadoop 1 1000
hive 2 500
pig 3 750
hbase 4 600

--------------------------------------------------------------
HOW TO WRITE CUSTOM UDF QUERIES IN HIVE
--------------------------------------------------------------

SHOW FUNCTIONS;

DESCRIBE FUNCTION split;

DESCRIBE FUNCTION EXTENDED split;

--------------------------------------------------------------
WORDCOUNT SOLUTION IN HIVE USING HIVE FUNCTIONS
--------------------------------------------------------------
CREATE TABLE IF NOT EXISTS docs(line string);

LOAD DATA LOCAL INPATH '/home/deepak/work/hive_inputs/demoinput' OVERWRITE INTO


TABLE docs;

SELECT * FROM docs;

CREATE TABLE word_counts AS


SELECT word, count(1) as cnt FROM (SELECT explode(split(line,' ')) as word FROM
docs) w
GROUP BY word ORDER BY word;

SELECT * FROM word_counts;

--------------------------------------------------------------

ADD JAR /home/deepak/work/hive_inputs/hive-examples.jar;

LIST JARS;

DELETE JAR /home/deepak/work/hive_inputs/hive-examples.jar;

--------------------------------------------------------------

CREATE TEMPORARY FUNCTION md5sum AS


'com.deepak.deepak.hadoop.training.hive.udf.MD5SumUDF';

SELECT name, md5sum(name) FROM student;

DROP TEMPORARY FUNCTION IF EXISTS md5sum;

--------------------------------------------------------------

hadoop fs -put /home/deepak/work/hive_inputs/hive-examples.jar /hive/hive-


examples.jar

CREATE FUNCTION deepak.double_my_data AS


'com.deepak.deepak.hadoop.training.hive.udf.DoubleMyDataUDF' USING JAR
'hdfs:///hive/hive-examples.jar';

SELECT name, deepak.double_my_data(name, ":") FROM student;

DROP FUNCTION IF EXISTS deepak.double_my_data;

--------------------------------------------------------------
HOW TO WRITE SERDE QUERIES IN HIVE
--------------------------------------------------------------
What is a SerDe?
=> SerDe is a short name for "Serializer and Deserializer."
=> Hive uses SerDe (and FileFormat) to read and write table rows.
=> HDFS files --> InputFileFormat --> <key, value> --> Deserializer --> Row
object
=> Row object --> Serializer --> <key, value> --> OutputFileFormat --> HDFS
files

--------------------------------------------------------------

CREATE TABLE IF NOT EXISTS student_regex_serde_new


( name string, id int , course string, year int )
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe'
WITH SERDEPROPERTIES ("input.regex" = "([^\t]*)\t([^\t]*)\t([^\t]*)\t([^\t]*)")
;

LOAD DATA LOCAL INPATH '${env:HOME}/work/hive_inputs/student.txt'


OVERWRITE INTO TABLE student_regex_serde_new;

SELECT * FROM student_regex_serde_new WHERE year = 1;

--------------------------------------------------------------

CREATE TABLE IF NOT EXISTS student_regex_serde_old


( name string, id string , course string, year string )
ROW FORMAT SERDE 'org.apache.hadoop.hive.contrib.serde2.RegexSerDe'
WITH SERDEPROPERTIES ("input.regex" = "([^\t]*)\t([^\t]*)\t([^\t]*)\t([^\t]*)")
;

LOAD DATA LOCAL INPATH '${env:HOME}/work/hive_inputs/student.txt'


OVERWRITE INTO TABLE student_regex_serde_old;

SELECT * FROM student_regex_serde_old WHERE year = 1;

--------------------------------------------------------------
HOW TO WRITE CUSTOM SERDE QUERIES IN HIVE
--------------------------------------------------------------

ADD JAR /home/deepak/work/hive_inputs/hive-examples.jar;

DROP TABLE student_multiply_serde;

CREATE TABLE IF NOT EXISTS student_multiply_serde


( name string, id int , course string, year int )
ROW FORMAT SERDE 'com.deepak.deepak.hadoop.training.hive.SerDe.ColumnMultiplySerDe'
WITH SERDEPROPERTIES ("number.multiply" = "3", "text.multiply" = "2")
;

DESCRIBE FORMATTED student_multiply_serde;

INSERT OVERWRITE TABLE student_multiply_serde SELECT * FROM student WHERE year = 1;

SELECT * FROM student WHERE year = 1;

SELECT * FROM student_multiply_serde WHERE year = 1;

--------------------------------------------------------------

DROP TABLE student_map_serde;

CREATE TABLE IF NOT EXISTS student_map_serde


( name string, id string , course string, year string )
ROW FORMAT SERDE 'com.deepak.deepak.hadoop.training.hive.SerDe.ColumnarMapSerDe'
;

INSERT OVERWRITE TABLE student_map_serde SELECT * FROM student WHERE year = 1;

SELECT * FROM student_map_serde WHERE year = 1;

--------------------------------------------------------------
Regex Serde for apache log
--------------------------------------------------------------
CREATE TABLE apache_log (
host STRING,
identity STRING,
user STRING,
time STRING,
request STRING,
status STRING,
size STRING,
referer STRING,
agent STRING
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.contrib.serde2.RegexSerDe'
WITH SERDEPROPERTIES (
"input.regex" = "([^ ]*) ([^ ]*) ([^ ]*) (-|\\[[^\\]]*\\]) ([^ \"]*|\"[^\"]*\") (-|
[0-9]*) (-|[0-9]*)(?: ([^ \"]*|\"[^\"]*\") ([^ \"]*|\"[^\"]*\"))?",
"output.format.string" = "%1$s %2$s %3$s %4$s %5$s %6$s %7$s %8$s %9$s"
)
STORED AS TEXTFILE;

LOAD DATA LOCAL INPATH '${env:HOME}/work/hive_inputs/apache_clf.txt' OVERWRITE INTO


TABLE apache_log;

--------------------------------------------------------------
DIFFERENT WAYS TO STORE DATA USING HIVE QUERIES
--------------------------------------------------------------

CREATE TABLE IF NOT EXISTS student_text


( name string, id int , course string, year int ) STORED AS TEXTFILE;

or

CREATE TABLE IF NOT EXISTS student_text


( name string, id int , course string, year int )
STORED AS
INPUTFORMAT 'org.apache.hadoop.mapred.TextInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.IgnoreKeyTextOutputFormat';

--------------------------------------------------------------

CREATE TABLE IF NOT EXISTS student_seq


( name string, id int , course string, year int ) STORED AS SEQUENCEFILE;

or

CREATE TABLE IF NOT EXISTS student_seq


( name string, id int , course string, year int )
STORED AS
INPUTFORMAT 'org.apache.hadoop.mapred.SequenceFileInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat';

--------------------------------------------------------------

CREATE TABLE IF NOT EXISTS student_rc


( name string, id int , course string, year int ) STORED AS RCFILE;

or
CREATE TABLE IF NOT EXISTS student_rc
( name string, id int , course string, year int )
STORED AS
INPUTFORMAT 'org.apache.hadoop.hive.ql.io.RCFileInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.RCFileOutputFormat';

--------------------------------------------------------------

CREATE TABLE IF NOT EXISTS student_orc


( name string, id int , course string, year int ) STORED AS ORC;

or

CREATE TABLE IF NOT EXISTS student_orc


( name string, id int , course string, year int )
ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.orc.OrcSerde'
STORED AS
INPUTFORMAT 'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat';

--------------------------------------------------------------

CREATE TABLE IF NOT EXISTS student_avro


( name string, id int , course string, year int ) STORED AS AVRO;

or

CREATE TABLE IF NOT EXISTS student_avro


( name string, id int , course string, year int )
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.avro.AvroSerDe'
STORED AS
INPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat';

--------------------------------------------------------------

CREATE TABLE IF NOT EXISTS student_parquet


( name string, id int , course string, year int ) STORED AS PARQUET;

or

CREATE TABLE IF NOT EXISTS student_parquet


( name string, id int , course string, year int )
ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
STORED AS
INPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat';

--------------------------------------------------------------

FROM STUDENT
INSERT OVERWRITE TABLE student_text SELECT *
INSERT OVERWRITE TABLE student_seq SELECT *
INSERT OVERWRITE TABLE student_rc SELECT *
INSERT OVERWRITE TABLE student_orc SELECT *
INSERT OVERWRITE TABLE student_avro SELECT *
INSERT OVERWRITE TABLE student_parquet SELECT *
;
--------------------------------------------------------------
HOW TO WRITE HIVE QUERIES USING TRANSFORM
--------------------------------------------------------------
SELECT TRANSFORM(name, id, year) USING '/bin/cat' AS (name, id, year) FROM student;

SELECT TRANSFORM(name, id, course) USING '/bin/grep cse' AS (name, id, course) FROM
student; .//grep serching the speciffic word

SELECT TRANSFORM(name, id, year, course) USING 'python


/home/deepak/work/hive_inputs/student.py' AS (name, id, year, course) FROM student;

--------------------------------------------------------------
HOW TO WRITE ALTER QUERIES IN HIVE
--------------------------------------------------------------
CREATE TABLE IF NOT EXISTS test ( a string, b int , c int );

CREATE TABLE IF NOT EXISTS test1 LIKE test;

ALTER TABLE test1 RENAME TO mytest;


hive> describe mytest;
a string
b int
c int

ALTER TABLE mytest REPLACE COLUMNS ( x string, y double, a string, b int , c


float);
hive> describe mytest;
x string
y double
a string
b int
c float

ALTER TABLE mytest REPLACE COLUMNS (x string, y double);


hive> describe mytest;
x string
y double

ALTER TABLE mytest ADD COLUMNS (c1 INT COMMENT 'c1 column', c2 STRING COMMENT 'c2
column');
hive> describe mytest;
x string
y double
c1 int c1 column
c2 string c2 column

ALTER TABLE mytest CHANGE c1 c1 STRING COMMENT 'new c1 column';


hive> describe mytest;
x string
y double
c1 string new c1 column
c2 string c2 column

ALTER TABLE mytest CHANGE c1 c11 STRING COMMENT 'new c11 column';
hive> describe mytest;
x string
y double
c11 string new c11 column
c2 string c2 column

ALTER TABLE mytest CHANGE c2 c21 STRING COMMENT 'new c21 column' after x;
hive> describe mytest;
x string
c21 string new c21 column
y double
c11 string new c11 column

ALTER TABLE mytest CHANGE c21 c2 STRING COMMENT 'new c2 column' FIRST;
hive> describe mytest;
c2 string new c2 column
x string
y double
c11 string new c11 column

ALTER TABLE mytest SET TBLPROPERTIES ('comment' = 'add comment');

ALTER TABLE mytest SET TBLPROPERTIES ('comment' = 'new comment');

ALTER TABLE mytest SET TBLPROPERTIES ('key1' = 'value1', 'key2' = 'value2');

ALTER TABLE mytest SET SERDEPROPERTIES ('field.delim' = ',');

ALTER TABLE mytest SET SERDE 'org.apache.hadoop.hive.serde2.avro.AvroSerDe' WITH


SERDEPROPERTIES ('field.delim' = ':');

ALTER TABLE mytest ADD PARTITION (partCol = 'value1') location 'loc1';


ALTER TABLE mytest ADD PARTITION (partCol = 'value2') location 'loc2';

or

ALTER TABLE mytest ADD


PARTITION (partCol = 'value1') location 'loc1'
PARTITION (partCol = 'value2') location 'loc2';

--------------------------------------------------------------
DIFFERENT WAYS TO RUN HIVE QUERIES
--------------------------------------------------------------

Running Hive CLI


--------------------------------------------------------------
$HIVE_HOME/bin/hive

//diff beel and hive beeli use in cluster jdbc connectivity

Running HiveServer2 and Beeline


--------------------------------------------------------------
$HIVE_HOME/bin/hiveserver2
$HIVE_HOME/bin/beeline -u jdbc:hive2://localhost:10000/default -n deepak -d
org.apache.hive.jdbc.HiveDriver

Go for this more info:


https://cwiki.apache.org/confluence/display/Hive/HiveServer2+Clients

Running HCatalog
--------------------------------------------------------------
$HIVE_HOME/hcatalog/sbin/hcat_server.sh
$HIVE_HOME/hcatalog/bin/hcat

Running WebHCat
--------------------------------------------------------------
$HIVE_HOME/hcatalog/sbin/webhcat_server.sh start
$HIVE_HOME/hcatalog/sbin/webhcat_server.sh stop

http://localhost:50111/templeton/v1

http://localhost:50111/templeton/v1/ddl/database/default/table/student?
user.name=hadoop

Role Management Commands //authoriazation


--------------------------------------------------------------
CREATE ROLE deepak;

DROP ROLE deepak;

SHOW ROLES;

SHOW CURRENT ROLES;

--------------------------------------------------------------
ACID functionality in HIVE
--------------------------------------------------------------
Add the below properties in "hive-site.xml" file & restart hive server

<property>
<name>hive.in.test</name>
<value>false</value>
</property>
<property>
<name>hive.support.concurrency</name>
<value>true</value>
</property>
<property>
<name>hive.enforce.bucketing</name>
<value>true</value>
</property>
<property>
<name>hive.compactor.initiator.on</name>
<value>true</value>
</property>

<property>
<name>hive.exec.dynamic.partition.mode</name>
<value>nonstrict</value>
</property>
<property>
<name>hive.txn.manager</name>
<value>org.apache.hadoop.hive.ql.lockmgr.DbTxnManager</value>
</property>
<property>
<name>hive.compactor.worker.threads</name>
<value>2</value>
</property>

CREATE HIVE TABLE WITH CLUSTERED BY, ORC, TBLPROPERTIES


--------------------------------------------------------------
CREATE TABLE IF NOT EXISTS student_acid
( name string, id int, course string, year int )
CLUSTERED BY (name) INTO 4 BUCKETS
STORED AS ORC
LOCATION '/hive/deepak/student_acid'
TBLPROPERTIES ('transactional' = 'true');

INSERT INTO TABLE student_acid VALUES


('arun', 1, 'mca', 1),
('anil', 2, 'mca', 1),
('sudheer', 3, 'mca', 2),
('santosh', 4, 'mca', 2)
;

UPDATE student_acid
SET year = 3, course = 'mech'
WHERE id = 4 ;

DELETE FROM student_acid WHERE name = 'anil';

==================================================================
LOAD DATA LOCAL INPATH '${env:HOME}/work/hive_inputs/student1.txt' INTO TABLE
student_partition PARTITION(course, year);

CREATE TABLE IF NOT EXISTS student_partition_test ( name string , id int )


PARTITIONED BY ( course string , year int)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
LINES TERMINATED BY '\n'
STORED AS TEXTFILE
LOCATION '/hive/deepak/student_partition_test'
;

ALTER TABLE student_partition_test ADD PARTITION (course = 'cse' , year = 1)


location '/hive/deepak/student_partition_test/course=cse/year=1';

==================================================================

use deepak;

add jar /home/deepak/work/deepak-pdf-utils.jar;

drop table student_pdf;

CREATE TABLE IF NOT EXISTS student_pdf


( name string, id int , course string, year int )
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
WITH SERDEPROPERTIES ("field.delim" = " ")
STORED AS
INPUTFORMAT 'com.deepak.deepak.hadoop.training.pdf.mapred.deepakPdfInputFormat'
OUTPUTFORMAT
'com.deepak.deepak.hadoop.training.pdf.mapred.deepakHivePdfOutputFormat'
;

LOAD DATA LOCAL INPATH '/home/deepak/work/input/student.pdf' OVERWRITE INTO TABLE


student_pdf;

select * from student_pdf;

INSERT OVERWRITE LOCAL DIRECTORY '/home/deepak/work/output/hive-op'


ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
STORED AS
INPUTFORMAT 'com.deepak.deepak.hadoop.training.pdf.mapred.deepakPdfInputFormat'
OUTPUTFORMAT
'com.deepak.deepak.hadoop.training.pdf.mapred.deepakHivePdfOutputFormat'
SELECT * FROM student_pdf;

INSERT OVERWRITE LOCAL DIRECTORY '/home/deepak/work/output/hive-op-1'


ROW FORMAT DELIMITED
FIELDS TERMINATED BY ':'
SELECT * FROM student_pdf;

==================================================================

hadoop jar /home/deepak/work/deepak-pdf-utils.jar


com.deepak.deepak.hadoop.training.pdf.utils.deepakPdfWordCountJob
file:///home/deepak/work/input/student.pdf file:///home/deepak/work/output/mr-op

hadoop jar /home/deepak/work/deepak-pdf-utils.jar


com.deepak.deepak.hadoop.training.pdf.utils.deepakPdfWordCountJob
-Dmapred.pdfoutputformat.separator=*** file:///home/deepak/work/input/student.pdf
file:///home/deepak/work/output/mr-op-1

==================================================================

You might also like