
A. Create Database
------------------
create database retail;

B. Select Database
------------------
use retail;

C. Create table for storing transactional records
-------------------------------------------------
create table sales_data (TransID INT, TransDate STRING, Product STRING,
Price DOUBLE, PaymentType STRING, CustName STRING, City STRING, State
STRING, Country STRING) row format delimited fields terminated by ','
stored as textfile;


D. Describing metadata or schema of the table
---------------------------------------------
describe sales_data;

alter table sales_data SET TBLPROPERTIES('EXTERNAL'='TRUE');
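
Setting the EXTERNAL property turns sales_data into an external table, so
dropping it later leaves the underlying files in place. This can be verified
with a formatted describe, whose Table Type line should now read
EXTERNAL_TABLE:

describe formatted sales_data;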

E. Load the data into the table
-------------------------------
LOAD DATA INPATH '/user/hdpuser/pigoutput' INTO TABLE sales_data;
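
If the comma-delimited file were on the local filesystem instead of HDFS, the
same load works with the LOCAL keyword (the path below is only an
illustration, not a file used in this exercise):

load data local inpath '/home/hdpuser/sales.csv' into table sales_data;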

F. Counting the number of records
---------------------------------
select count(*) from sales_data;

G. Counting total spending by product
-------------------------------------
select product, sum(price) from sales_data group by product;

H. Total spending for 10 customers
----------------------------------
select custname, sum(price) from sales_data group by custname limit 10;
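
The query above returns an arbitrary 10 customers because LIMIT is applied
without any ordering. If the intent is the top 10 spenders, the aggregate
needs an ORDER BY, for example:

select custname, sum(price) as total from sales_data group by custname
order by total desc limit 10;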

I. Create partitioned table
---------------------------

create external table sales_by_city (TransID INT, TransDate STRING,
Product STRING, Price DOUBLE, PaymentType STRING, CustName STRING, State
STRING, Country STRING) partitioned by (City STRING) row format delimited
fields terminated by ',' stored as textfile;

describe formatted sales_by_city;

set hive.exec.dynamic.partition.mode=nonstrict;
set hive.exec.dynamic.partition=true;
set hive.enforce.bucketing=true;
set hive.vectorized.execution.enabled = true;
set hive.vectorized.execution.reduce.enabled = true;
from sales_data txn INSERT OVERWRITE TABLE sales_by_city PARTITION(city)
select txn.TransID, txn.TransDate, txn.Product, txn.Price, txn.PaymentType,
txn.CustName, txn.State, txn.Country, txn.city;
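
After the dynamic-partition insert completes, the partitions Hive created
(one per distinct city) can be listed with:

show partitions sales_by_city;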

create table txnrecsByCat(txnno INT, txndate STRING, custno INT, amount
DOUBLE, product STRING, city STRING, state STRING, spendby STRING)
partitioned by (category STRING) clustered by (state) INTO 10 buckets row
format delimited fields terminated by ',' stored as textfile;

describe formatted txnrecsByCat;

J. Configure Hive to allow partitions
-------------------------------------

However, a query across all partitions could trigger an enormous
MapReduce job if the table data and number of partitions are large. A
highly suggested safety measure is putting Hive into strict mode, which
prohibits queries of partitioned tables without a WHERE clause that
filters on partitions. You can set the mode to nonstrict, as in the
following session:

set hive.exec.dynamic.partition.mode=nonstrict;
set hive.exec.dynamic.partition=true;
set hive.enforce.bucketing=true;
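
Conversely, if strict mode is kept as the safety net described above, every
query on a partitioned table must prune partitions in its WHERE clause; the
partition value below is only an illustration:

set hive.mapred.mode=strict;
select * from sales_by_city where city = 'Bangalore' limit 10;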

K. Load data into the partitioned table
---------------------------------------
set hive.vectorized.execution.enabled = true;
set hive.vectorized.execution.reduce.enabled = true;

from txnrecords txn INSERT OVERWRITE TABLE txnrecsByCat
PARTITION(category) select txn.txnno, txn.txndate, txn.custno,
txn.amount, txn.product, txn.city, txn.state, txn.spendby, txn.category
DISTRIBUTE BY category;
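
As with sales_by_city, the partitions generated for each category can be
checked once the insert finishes:

show partitions txnrecsByCat;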

==========================
find sales based on age group
==========================

create table customer(custno string, firstname string, lastname string,
age int, profession string) row format delimited fields terminated by ',';

load data local inpath '/home/hdpuser/custs.txt' into table customer;

create table out1 (custno int, firstname string, age int, profession
string, amount double, product string) row format delimited fields
terminated by ',';

insert overwrite table out1 select
a.custno,a.firstname,a.age,a.profession,b.amount,b.product from customer
a JOIN txnrecords b ON a.custno = b.custno;

select * from out1 limit 100;

create table out2 (custno int, firstname string, age int, profession
string, amount double, product string, level string) row format delimited
fields terminated by ',';

insert overwrite table out2
select *, case
when age < 30 then 'young'
when age >= 30 and age < 50 then 'middle'
when age >= 50 then 'old'
else 'others'
end
from out1;

select * from out2 limit 100;

describe out2;

create table out3 (level string, amount double) row format delimited
fields terminated by ',';

insert overwrite table out3 select level, sum(amount) from out2 group by
level;

select * from out3 limit 100;
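
To see at a glance which age group spends the most, the same aggregate can be
ordered by the total (an illustrative variant of the query above):

select level, sum(amount) as total from out2 group by level order by total desc;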

==============
simple join
==============

create table employee(name string, salary float, city string) row format
delimited fields terminated by ',';

load data local inpath '/home/hdpuser/emp.txt' into table employee;

select * from employee where name='tarun';

create table mailid (name string, email string) row format delimited
fields terminated by ',';

load data local inpath '/home/hdpuser/email.txt' into table mailid;

select a.name,a.city,a.salary,b.email from employee a join mailid b on
a.name = b.name;

select a.name,a.city,a.salary,b.email from employee a left outer join
mailid b on a.name = b.name;

select a.name,a.city,a.salary,b.email from employee a right outer join
mailid b on a.name = b.name;

select a.name,a.city,a.salary,b.email from employee a full outer join
mailid b on a.name = b.name;

===============================================
Custom Mapper Code to manipulate unix timestamp
===============================================

CREATE TABLE u_data ( userid INT, movieid INT, rating INT, unixtime
STRING) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' STORED AS
TEXTFILE;

Load the data into the table that was just created:

LOAD DATA LOCAL INPATH '/home/hdpuser/u.data.txt' OVERWRITE INTO TABLE
u_data;

Count the number of rows in table u_data:

SELECT COUNT(*) FROM u_data;

****Create weekday_mapper.py:

import sys
import datetime

# Read tab-separated records from stdin and replace the unix timestamp
# with the ISO weekday (1 = Monday ... 7 = Sunday).
for line in sys.stdin:
    line = line.strip()
    userid, movieid, rating, unixtime = line.split('\t')
    weekday = datetime.datetime.fromtimestamp(float(unixtime)).isoweekday()
    print('\t'.join([userid, movieid, rating, str(weekday)]))

CREATE TABLE u_data_new (userid INT, movieid INT, rating INT, weekday
INT) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';

add FILE /home/hdpuser/weekday_mapper.py;

****Note that columns will be transformed to string and delimited
****by TAB before feeding to the user script, and the standard output
****of the user script will be treated as TAB-separated string columns.

****The following command uses the TRANSFORM clause to embed the mapper
****script.

INSERT OVERWRITE TABLE u_data_new SELECT TRANSFORM (userid, movieid,
rating, unixtime) USING 'python weekday_mapper.py' AS (userid, movieid,
rating, weekday) FROM u_data;

SELECT weekday, COUNT(*) FROM u_data_new GROUP BY weekday;

===========
UDF
===========

import java.util.Date;
import java.text.DateFormat;
import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.Text;

public class UnixtimeToDate extends UDF {
    // Convert a unix timestamp (in seconds, passed as text) to a formatted date string.
    public Text evaluate(Text text) {
        if (text == null) return null;
        long timestamp = Long.parseLong(text.toString());
        return new Text(toDate(timestamp));
    }

    private String toDate(long timestamp) {
        Date date = new Date(timestamp * 1000);
        return DateFormat.getInstance().format(date).toString();
    }
}

/usr/bin/javac -classpath /usr/local/hadoop-2.6.4/share/hadoop/common/hadoop-common-2.6.4.jar:/etc/hadoop/apache-hive-0.13.0-bin/apache-hive-0.13.0-bin/lib/hive-exec-0.13.0.jar UnixtimeToDate.java

****Pack this class file into a jar:

$ /usr/bin/jar -cvf convert.jar UnixtimeToDate.class

****Verify the jar using the command:

$ /usr/bin/jar -tvf convert.jar

****Add this jar at the hive prompt:

ADD JAR /home/hdpuser/convert.jar;
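
****The resource just added can be checked from the same prompt:

list jars;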

****Then you create your custom function as follows:

create temporary function userdate as 'UnixtimeToDate';
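
****To confirm the temporary function is registered in this session:

describe function userdate;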

****Sample input file loaded from /data/counter:
****one,1386023259550
****two,1389523259550
****three,1389523259550
****four,1389523259550

create table testing(id string, id_time string) row format delimited
fields terminated by ',';

load data inpath '/data/counter' into table testing;

hive> select * from testing;

****OK
****one 1386023259550
****two 1389523259550
****three 1389523259550
****four 1389523259550

****Then use the function 'userdate' in a SQL command:

select id,userdate(id_time) from testing;

****OK
****four 3/28/02 8:12 PM
****one 4/30/91 1:59 PM
****two 3/28/02 8:12 PM
****three 3/28/02 8:12 PM
