You are on page 1 of 7

1.

NYSE:

-- Load the NYSE dividend records. The fields are typed explicitly so that
-- `dividend` is averaged and sorted as a number rather than as a raw bytearray
-- (bytearrays are ordered byte-wise, which is not numeric order).
divs = load '/D:/Downloads D/NYSE_dividends.txt'
       as (exchange:chararray, symbol:chararray, date:chararray, dividend:double);

-- RANK with no BY clause does not sort anything: it prepends a sequential
-- row number to every tuple, in a new first field named rank_divs.
ranked = rank divs;

-- Keep every row except the first one of the input
-- (presumably a header line -- confirm against the data file).
dividends = filter ranked by (rank_divs > 1);

-- Collect all dividend records that belong to the same stock symbol.
grouped = group dividends by symbol;

-- Average dividend per symbol.
avg = foreach grouped generate group, AVG(dividends.dividend);

dump avg;

store avg into 'average_dividend';

-- For each symbol, keep only its three largest dividend records.
top3 = foreach grouped {
    sorted = order dividends by dividend desc;
    top = limit sorted 3;
    generate group, flatten(top);
};

-- Print the contents of top3. (A stray comment opener used to swallow this
-- statement, so it never ran.)
dump top3;

NYSE Result
2. PIG Script:

-- Load the two input files into relations a and b. PigStorage(',') splits
-- each line of the text file on commas; both files are declared with three
-- integer fields.
a = load '/user/maria_dev/Data/Pig_Tutorial/A.int' using PigStorage(',') as (a1:int, a2:int, a3:int);

b = load '/user/maria_dev/Data/Pig_Tutorial/B.int' using PigStorage(',') as (b1:int, b2:int, b3:int);

DUMP a;

DUMP b;

-- UNION merges the tuples of the two relations; the merged relation is
-- stored in c.
c = UNION a, b;

DUMP c;

-- Route the tuples of c by the value of their first field (positional
-- notation $0): tuples with 0 go to d, tuples with 1 go to e. A tuple whose
-- first field is neither 0 nor 1 ends up in neither relation.
SPLIT c INTO d IF $0 == 0, e IF $0 == 1;

DUMP d;

DUMP e;

-- Keep only the tuples of c whose second field (positional notation $1)
-- is greater than 3; the result is stored in relation f.
f = FILTER c BY $1 > 3;

DUMP f;

-- Group the tuples of c by the value of their third field (positional
-- notation $2). Tuples that share the same value land in the same group.
g = GROUP c BY $2;

DUMP g;

-- DESCRIBE prints the schema of a relation, i.e. the name and data type of
-- each field.
DESCRIBE c;

DESCRIBE g;

-- Put every tuple of c into one single group.
h = GROUP c ALL;
DUMP h;

-- h holds one tuple: (group, c), where c is the bag of all grouped tuples.
-- Counting that bag gives the total number of tuples; the result is stored
-- in relation i.
i = FOREACH h
    GENERATE COUNT(c);
DUMP i;

-- COGROUP is like GROUP but works on several relations at once: for every
-- distinct value of the third field ($2) it emits one tuple holding the key,
-- a bag of the matching tuples from a, and a bag of the matching tuples from b.
j = COGROUP a BY $2, b BY $2;

DUMP j;

DESCRIBE j;

-- INNER on b drops every group whose bag from b is empty, so only keys that
-- occur in b remain (the bag from a may still be empty for such a key).
j = COGROUP a BY $2, b BY $2 INNER;

DUMP j;

-- INNER on both relations keeps only the keys that occur in a and in b.
j = COGROUP a BY $2 INNER, b BY $2 INNER;

DUMP j;

-- JOIN combines the two relations into one: each tuple of a is paired with
-- every tuple of b that has the same value in the third field ($2).
j = JOIN a BY $2, b BY $2;

DUMP j;

DESCRIBE j;

-- For each tuple of c, emit field a2 and the product of fields a2 and a3.
k = FOREACH c
    GENERATE a2, a2 * a3;
DUMP k;

-- For each group of g, emit the group key and a bag that keeps only the
-- a1 and a2 fields of the grouped tuples.
k = FOREACH g
    GENERATE group, c.(a1,a2);
DUMP k;

-- Number of tuples in each group of g.
cnt = FOREACH g
      GENERATE group, COUNT(c);
DUMP cnt;

-- Un-nest the grouped bag: one output tuple per member tuple, each prefixed
-- with its group key.
k = FOREACH g
    GENERATE group, FLATTEN(c);
DUMP k;
DESCRIBE k;
-- Load two comma-delimited datasets that share the same layout: an integer
-- id and a chararray type.
A = LOAD '/user/maria_dev/Data/Pig_Tutorial/A.txt' using PigStorage(',') AS (id :int,type:chararray);

B = LOAD '/user/maria_dev/Data/Pig_Tutorial/B.txt' using PigStorage(',') AS (id :int,type:chararray);

DUMP A;

DUMP B;

-- Inner join: only records whose id appears in both datasets.
INNER_JOIN = JOIN A BY id, B BY id;

DUMP INNER_JOIN;

-- Left outer join: every record of A, paired with the matching records of B;
-- the B fields are null where no match exists.
LEFT_JOIN = JOIN A BY id LEFT, B BY id;

DUMP LEFT_JOIN;

-- Right outer join: every record of B (the look-up table), paired with only
-- the matching records of A; the A fields are null where no match exists.
RIGHT_JOIN = JOIN A BY id RIGHT, B BY id;

DUMP RIGHT_JOIN;

-- Full outer join: all matched and unmatched records of both datasets.
FULL_JOIN = JOIN A BY id FULL, B BY id;

DUMP FULL_JOIN;

-- Read the comma-delimited student records with an explicit schema.
students = LOAD '/user/maria_dev/Data/Pig_Tutorial/students.txt'
    USING PigStorage(',')
    AS (
        id:int,
        firstname:chararray,
        lastname:chararray,
        age:int,
        phone:chararray,
        city:chararray
    );
DUMP students;

-- Bucket the students by age: one group per distinct age value.
age_group = GROUP students BY age;
DUMP age_group;

3. WORD COUNT:
-- Load the input text; the first field of every input line is read as a
-- chararray named line. The path must be wrapped in plain single quotes --
-- Pig does not accept typographic quotes.
-- NOTE(review): the path names a directory, not a single file -- confirm the
-- intended input.
lines = LOAD '/D:/Downloads D/' AS (line:chararray);

-- Split every line into tokens and un-nest them, producing the relation
-- words with one word per tuple.
words = FOREACH lines GENERATE FLATTEN(TOKENIZE(line)) as word;

-- Bring identical words together.
grouped = GROUP words BY word;

-- Count how many times each word occurred.
wordcount = FOREACH grouped GENERATE group, COUNT(words) as cnt;

-- Sort the words by their count, most frequent first. ORDER ... BY needs the
-- field to sort on (cnt); DESC alone is not a valid sort key.
topwords = ORDER wordcount BY cnt DESC;

-- Print the contents of topwords.
DUMP topwords;

Word count Result


4. TUTORIAL:
-- Load A.int from the local system. Each line holds three integers
-- (a1, a2, a3) separated by commas. The path is a single string wrapped in
-- plain single quotes.
a = load '/D:/Downloads D/A.int' using PigStorage(',') as (a1:int, a2:int, a3:int);

-- Load B.int from the local system. Each line holds three integers
-- (b1, b2, b3) separated by commas.
b = load '/D:/Downloads D/B.int' using PigStorage(',') as (b1:int, b2:int, b3:int);

DUMP a;

DUMP b;

-- Combine the tuples of relations a and b into the single relation c.
c = UNION a, b;

DUMP c;

Tutorial Result

You might also like