Professional Documents
Culture Documents
NYSE:
dump avg;
};
/*
NYSE Result
2.PIG Script:
-- The files required for analysis are loaded into relations a and b. The PigStorage() function is used to
-- structure and separate the text file using a delimiter; in this case a comma (,) is used. The data has 3 fields,
-- all of which are integers.
DUMP a;
DUMP b;
c = UNION a, b;
DUMP c;
-- The UNION operator merges the contents of two relations. The merged relation is stored in
-- c.
SPLIT c INTO d IF $0 == 0, e IF $0 == 1;
DUMP d;
DUMP e;
-- The value of the first field (positional notation - $0) of c is checked. If the value is 0, the tuple is stored in d.
-- If the value is 1, the tuple is stored in e.
f = FILTER c BY $1 > 3;
DUMP f;
-- The values in the second field (positional notation - $1) are filtered. If the value is greater than 3,
-- the tuple is stored in relation f.
g = GROUP c BY $2;
DUMP g;
-- In relation g, the tuples of c are grouped by the value of the third field (positional notation - $2).
-- Tuples that have the same value in that field are grouped together.
DESCRIBE c;
DESCRIBE g;
-- The DESCRIBE operator returns the schema of the relation, i.e. the field names and data type of each
-- field.
h = GROUP c ALL;
DUMP h;
-- The GENERATE COUNT() function returns the number of tuples in h; this result is stored in relation i.
DUMP j;
DESCRIBE j;
-- The COGROUP operator is similar to the GROUP operator, but can be used with multiple relations.
j = COGROUP a BY $2, b BY $2 INNER;
dump j;
-- This script returns the values that are common to both relations as well as the values found only in relation b,
-- grouped by the third field (positional notation - $2).
dump j;
-- This script returns only values which are common in both the relations, grouped by the third field.
dump j;
DESCRIBE j;
-- The JOIN operator merges the two relations into one, matching tuples on the common values of field 3 in both
-- relations.
DUMP k;
--
DUMP k;
DUMP cnt;
DUMP k;
DESCRIBE k;
-- Join examples: load relation A, then join it with relation B on the id field
-- using inner, left outer, right outer and full outer joins, dumping each result.
-- A is read from a comma-delimited text file with two fields: id (int) and type (chararray).
A = LOAD '/user/maria_dev/Data/Pig_Tutorial/A.txt' using PigStorage(',') AS (id :int,type:chararray);
DUMP A;
-- NOTE(review): relation B is used below but its LOAD statement is not visible in this section - confirm it is defined before this point.
DUMP B;
INNER_JOIN = JOIN A BY id, B BY id; -- Only records with the same id in both datasets
DUMP INNER_JOIN;
LEFT_JOIN = JOIN A BY id LEFT, B BY id; -- All records of A, with the matching records of B looked up where they exist
DUMP LEFT_JOIN;
RIGHT_JOIN = JOIN A BY id RIGHT, B BY id; -- All the data of the look-up table with only matching records of the left table
DUMP RIGHT_JOIN;
FULL_JOIN = JOIN A BY id FULL, B BY id; -- All the matched and unmatched records out of both datasets
DUMP FULL_JOIN;
dump students;
dump age_group;
3.WORD COUNT:
-- Word count: read the input as one chararray field per record (named line).
-- The path must be delimited with plain single quotes; typographic quotes are not valid string delimiters.
-- NOTE(review): the path text is kept as written; it contains spaces and a drive letter after a leading slash - confirm it resolves.
lines = LOAD '/ D:/Downloads D/' AS (line:chararray);
/* This statement will load the file containing the data */
-- NOTE(review): the statements that derive topwords from lines are not visible in this section.
DUMP topwords;
-- This statement will print the output of topwords
DUMP a;
DUMP b;
c = UNION a, b;
/* This statement will combine relations a and b */
DUMP c;
Tutorial Result