
A. Create Database
------------------
create database retail;

B. Select Database
------------------
use retail;

C. Create table for storing transactional records
-------------------------------------------------
create table sales_data (TransID INT, TransDate STRING, Product STRING,
Price DOUBLE, PaymentType STRING, CustName STRING, City STRING, State
STRING, Country STRING) row format delimited fields terminated by ','
stored as textfile;


D. Describing metadata or schema of the table
---------------------------------------------
describe sales_data;

alter table sales_data SET TBLPROPERTIES('EXTERNAL'='TRUE');
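
Setting the EXTERNAL property turns sales_data into an external table, so
dropping it later leaves the underlying files in place. This can be verified
with a formatted describe, whose Table Type line should now read
EXTERNAL_TABLE:

describe formatted sales_data;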

E. Load the data into the table
-------------------------------
LOAD DATA INPATH '/user/hdpuser/pigoutput' INTO TABLE sales_data;
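
If the comma-delimited file were on the local filesystem instead of HDFS, the
same load works with the LOCAL keyword (the path below is only an
illustration, not a file used in this exercise):

load data local inpath '/home/hdpuser/sales.csv' into table sales_data;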

F. Counting the number of records
---------------------------------
select count(*) from sales_data;

G. Counting total spending by product
-------------------------------------
select product, sum(price) from sales_data group by product;

H. Total spending for 10 customers
----------------------------------
select custname, sum(price) from sales_data group by custname limit 10;
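
The query above returns an arbitrary 10 customers because LIMIT is applied
without any ordering. If the intent is the top 10 spenders, the aggregate
needs an ORDER BY, for example:

select custname, sum(price) as total from sales_data group by custname
order by total desc limit 10;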

I. Create partitioned table
---------------------------

create external table sales_by_city (TransID INT, TransDate STRING,
Product STRING, Price DOUBLE, PaymentType STRING, CustName STRING, State
STRING, Country STRING) partitioned by (City STRING) row format delimited
fields terminated by ',' stored as textfile;

describe formatted sales_by_city;

set hive.exec.dynamic.partition.mode=nonstrict;
set hive.exec.dynamic.partition=true;
set hive.enforce.bucketing=true;
set hive.vectorized.execution.enabled = true;
set hive.vectorized.execution.reduce.enabled = true;
from sales_data txn INSERT OVERWRITE TABLE sales_by_city PARTITION(city)
select txn.TransID, txn.TransDate, txn.Product, txn.Price, txn.PaymentType,
txn.CustName, txn.State, txn.Country, txn.city;
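
After the dynamic-partition insert completes, the partitions Hive created
(one per distinct city) can be listed with:

show partitions sales_by_city;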

create table txnrecsByCat(txnno INT, txndate STRING, custno INT, amount
DOUBLE, product STRING, city STRING, state STRING, spendby STRING)
partitioned by (category STRING) clustered by (state) INTO 10 buckets row
format delimited fields terminated by ',' stored as textfile;

describe formatted txnrecsByCat;

J. Configure Hive to allow partitions
-------------------------------------

However, a query across all partitions could trigger an enormous
MapReduce job if the table data and number of partitions are large. A
highly suggested safety measure is putting Hive into strict mode, which
prohibits queries of partitioned tables without a WHERE clause that
filters on partitions. You can set the mode to nonstrict, as in the
following session:

set hive.exec.dynamic.partition.mode=nonstrict;
set hive.exec.dynamic.partition=true;
set hive.enforce.bucketing=true;
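
Conversely, if strict mode is kept as the safety net described above, every
query on a partitioned table must prune partitions in its WHERE clause; the
partition value below is only an illustration:

set hive.mapred.mode=strict;
select * from sales_by_city where city = 'Bangalore' limit 10;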

K. Load data into the partitioned table
---------------------------------------
set hive.vectorized.execution.enabled = true;
set hive.vectorized.execution.reduce.enabled = true;

from txnrecords txn INSERT OVERWRITE TABLE txnrecsByCat
PARTITION(category) select txn.txnno, txn.txndate, txn.custno,
txn.amount, txn.product, txn.city, txn.state, txn.spendby, txn.category
DISTRIBUTE BY category;
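
As with sales_by_city, the partitions generated for each category can be
checked once the insert finishes:

show partitions txnrecsByCat;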

==========================
find sales based on age group
==========================

create table customer(custno string, firstname string, lastname string,
age int, profession string) row format delimited fields terminated by ',';

load data local inpath '/home/hdpuser/custs.txt' into table customer;

create table out1 (custno int, firstname string, age int, profession
string, amount double, product string) row format delimited fields
terminated by ',';

insert overwrite table out1 select
a.custno,a.firstname,a.age,a.profession,b.amount,b.product from customer
a JOIN txnrecords b ON a.custno = b.custno;

select * from out1 limit 100;

create table out2 (custno int, firstname string, age int, profession
string, amount double, product string, level string) row format delimited
fields terminated by ',';

insert overwrite table out2
select *, case
when age < 30 then 'young'
when age >= 30 and age < 50 then 'middle'
when age >= 50 then 'old'
else 'others'
end
from out1;

select * from out2 limit 100;

describe out2;

create table out3 (level string, amount double) row format delimited
fields terminated by ',';

insert overwrite table out3 select level, sum(amount) from out2 group by
level;

select * from out3 limit 100;
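
To see at a glance which age group spends the most, the same aggregate can be
ordered by the total (an illustrative variant of the query above):

select level, sum(amount) as total from out2 group by level order by total desc;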

==============
simple join
==============

create table employee(name string, salary float, city string) row format
delimited fields terminated by ',';

load data local inpath '/home/hdpuser/emp.txt' into table employee;

select * from employee where name='tarun';

create table mailid (name string, email string) row format delimited
fields terminated by ',';

load data local inpath '/home/hdpuser/email.txt' into table mailid;

select a.name,a.city,a.salary,b.email from employee a join mailid b on
a.name = b.name;

select a.name,a.city,a.salary,b.email from employee a left outer join
mailid b on a.name = b.name;

select a.name,a.city,a.salary,b.email from employee a right outer join
mailid b on a.name = b.name;

select a.name,a.city,a.salary,b.email from employee a full outer join
mailid b on a.name = b.name;

===============================================
Custom Mapper Code to manipulate unix timestamp
===============================================

CREATE TABLE u_data ( userid INT, movieid INT, rating INT, unixtime
STRING) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' STORED AS
TEXTFILE;

Load the data into the table that was just created:

LOAD DATA LOCAL INPATH '/home/hdpuser/u.data.txt' OVERWRITE INTO TABLE
u_data;

Count the number of rows in table u_data:

SELECT COUNT(*) FROM u_data;

****Create weekday_mapper.py:

import sys
import datetime

# Read tab-separated records from stdin and replace the unix timestamp
# with the ISO weekday (1 = Monday ... 7 = Sunday).
for line in sys.stdin:
    line = line.strip()
    userid, movieid, rating, unixtime = line.split('\t')
    weekday = datetime.datetime.fromtimestamp(float(unixtime)).isoweekday()
    print('\t'.join([userid, movieid, rating, str(weekday)]))

CREATE TABLE u_data_new (userid INT, movieid INT, rating INT, weekday
INT) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';

add FILE /home/hdpuser/weekday_mapper.py;

****Note that columns will be transformed to string and delimited
****by TAB before feeding to the user script, and the standard output
****of the user script will be treated as TAB-separated string columns.

****The following command uses the TRANSFORM clause to embed the mapper
****script.

INSERT OVERWRITE TABLE u_data_new SELECT TRANSFORM (userid, movieid,
rating, unixtime) USING 'python weekday_mapper.py' AS (userid, movieid,
rating, weekday) FROM u_data;

SELECT weekday, COUNT(*) FROM u_data_new GROUP BY weekday;

===========
UDF
===========

import java.util.Date;
import java.text.DateFormat;
import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.Text;

public class UnixtimeToDate extends UDF {
    // Convert a unix timestamp (in seconds, passed as text) to a formatted date string.
    public Text evaluate(Text text) {
        if (text == null) return null;
        long timestamp = Long.parseLong(text.toString());
        return new Text(toDate(timestamp));
    }

    private String toDate(long timestamp) {
        Date date = new Date(timestamp * 1000);
        return DateFormat.getInstance().format(date).toString();
    }
}

/usr/bin/javac -classpath /usr/local/hadoop-2.6.4/share/hadoop/common/hadoop-common-2.6.4.jar:/etc/hadoop/apache-hive-0.13.0-bin/apache-hive-0.13.0-bin/lib/hive-exec-0.13.0.jar UnixtimeToDate.java

****Pack this class file into a jar:

$ /usr/bin/jar -cvf convert.jar UnixtimeToDate.class

****Verify the jar using the command:

$ /usr/bin/jar -tvf convert.jar

****Add this jar at the hive prompt:

ADD JAR /home/hdpuser/convert.jar;
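
****The resource just added can be checked from the same prompt:

list jars;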

****Then you create your custom function as follows:

create temporary function userdate as 'UnixtimeToDate';
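
****To confirm the temporary function is registered in this session:

describe function userdate;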

****Sample input file loaded from /data/counter:
****one,1386023259550
****two,1389523259550
****three,1389523259550
****four,1389523259550

create table testing(id string, id_time string) row format delimited
fields terminated by ',';

load data inpath '/data/counter' into table testing;

hive> select * from testing;

****OK
****one 1386023259550
****two 1389523259550
****three 1389523259550
****four 1389523259550

****Then use the function 'userdate' in a SQL command:

select id,userdate(id_time) from testing;

****OK
****four 3/28/02 8:12 PM
****one 4/30/91 1:59 PM
****two 3/28/02 8:12 PM
****three 3/28/02 8:12 PM
