Professional Documents
Culture Documents
Cascading - Bala's Blog
Cascading - Bala's Blog
POSTED BY
BALACHANDAR
POSTED ON
LEAVE A COMMENT
In this post, I am going to show an example with the Cascading API to perform an inner join on two related
datasets.
Assume that you have a professor and college details available in a separate XML files and you want to
combine both these details and want to generate the consolidated data.
<college>
<id>COL-100</id>
<name>Ohio State University</name>
<location>Ohio</location>
</college>
<professor>
<pid>PROF-100</pid>
<pfirstname>John</pfirstname>
<plastname>Turner</plastname>
<college>
<id>COL-100</id>
<name>Ohio State University</name>
<location>Ohio</location>
</college>
</professor>
I have created three Sub assemblies here, ProfessorDataAssembly to extract out the Professor data,
CollegeDataAssembly to extract out the College data and finally ProfessorCollegeJoinAssembly to
combine both these data and generate the consolidated XML with ProfessorCollegeBuffer
Please refer to the code below to see how I have done it.
ProfessorCollegeDtailsJob.java
package com.cascading;
import cascading.flow.FlowDef;
import cascading.flow.local.LocalFlowConnector;
import cascading.pipe.Pipe;
import cascading.property.AppProps;
import cascading.scheme.local.TextLine;
import cascading.tap.SinkMode;
import cascading.tap.Tap;
import cascading.tap.local.FileTap;
import cascading.tuple.Fields;
import com.cascading.assembly.ProfessorDataAssembly;
import com.cascading.assembly.ProfessorCollegeJoinAssembly;
import com.cascading.assembly.CollegeDataAssembly;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.Properties;
/**
* This cascading job is used to read Professor and College details, perform an inner join, and
* create a consolidated XML which has professor and college information
*/
public class ProfessorCollegeDtailsJob {
// NOTE(review): the main(String[] args) method signature and the declarations of
// LOGGER, flowConnector, and flowDef appear to have been lost in extraction --
// the statements below clearly belong inside a main method; restore before compiling.
// NOTE(review): the usage string on the next two lines is truncated mid-literal.
if (args.length <= 0) {
LOGGER.info("Usage ProfessorCollegeDtailsJob <PROFESSOR XML FILE PATH>
"<COLLEGE XML FILE PATH> <OUTPUT FILE PATH>");
return;
}
//input paths & output path
String professorDataPath = args[0];
String collegeDataPath = args[1];
String outputPath = args[2];
LOGGER.info("professorDataPath:{}", professorDataPath);
// NOTE(review): label says "studentDataPath" but the value logged is the college
// path -- looks like a copy/paste slip; confirm against the original source.
LOGGER.info("studentDataPath:{}", collegeDataPath);
LOGGER.info("outputPath:{}", outputPath);
// Wire and run the flow (taps/pipes are configured in the missing portion above).
flowConnector.connect(flowDef).complete();
}
}
CollegeDataAssembly.java
package com.cascading.assembly;
import cascading.operation.xml.XPathParser;
import cascading.pipe.Each;
import cascading.pipe.Pipe;
import cascading.pipe.SubAssembly;
import cascading.tuple.Fields;
/**
* This class is used to extract out the college data information
*/
ProfessorDataAssembly.java
package com.cascading.assembly;
import cascading.operation.xml.XPathParser;
import cascading.pipe.Each;
import cascading.pipe.Pipe;
import cascading.pipe.SubAssembly;
import cascading.tuple.Fields;
/**
* This class is used to extract out the professor data information
*/
ProfessorCollegeJoinAssembly.java
package com.cascading.assembly;
import cascading.flow.FlowProcess;
import cascading.operation.BaseOperation;
import cascading.operation.Function;
import cascading.operation.FunctionCall;
import cascading.pipe.CoGroup;
import cascading.pipe.Each;
import cascading.pipe.Every;
import cascading.pipe.Pipe;
import cascading.pipe.SubAssembly;
import cascading.pipe.joiner.InnerJoin;
import cascading.tuple.Fields;
import cascading.tuple.TupleEntry;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.Iterator;
/**
* This will do the inner join on professor and college data and create a consolidated XML
*
*/
public class ProfessorCollegeJoinAssembly extends SubAssembly {
// NOTE(review): SubAssembly does not declare operate(); this @Override method is a
// Function implementation whose enclosing class header (and the CoGroup wiring the
// javadoc above describes) appears to have been lost in extraction.
@Override
public void operate(FlowProcess flowProcess, FunctionCall functionCall) {
// Pass each joined tuple straight through, logging its values along the way.
TupleEntry arguments = functionCall.getArguments();
if (arguments == null || arguments.getString(0) == null) {
return; // nothing to emit for null/empty rows
}
// Log every field of the joined tuple; assumes all values are Strings -- the
// cast will throw ClassCastException otherwise.
Iterator itr = arguments.getTuple().iterator();
while (itr.hasNext()) {
LOGGER.info((String) itr.next());
}
functionCall.getOutputCollector().add(arguments.getTuple());
}
}
// NOTE(review): stray closing brace -- likely left over from the extraction.
}
ProfessorCollegeBuffer.java
package com.cascading.assembly;
import cascading.flow.FlowProcess;
import cascading.operation.BaseOperation;
import cascading.operation.Buffer;
import cascading.operation.BufferCall;
import cascading.operation.OperationCall;
import cascading.tuple.Fields;
import cascading.tuple.Tuple;
import cascading.tuple.TupleEntry;
import java.util.Iterator;
/**
* Create the consolidated output xml
*/
/**
 * Lifecycle hook invoked once before any tuples are processed.
 * Asks Cascading to retain (deep-copy) argument values so they remain
 * valid after each iteration of the arguments iterator.
 */
@Override
public void prepare(FlowProcess flowProcess, OperationCall operationCall) {
    ((BufferCall) operationCall).setRetainValues(true);
}
/**
 * Builds one consolidated pseudo-XML tuple for the current grouping.
 * Each joined professor/college row is rendered as a sequence of tag and
 * value strings, all rows are collected into a single inner tuple, and that
 * inner tuple is emitted exactly once per group.
 */
@Override
public void operate(FlowProcess flowProcess, BufferCall bufferCall) {
    // Typed iterator: the raw Iterator in the original did not compile, because
    // next() returned Object where a TupleEntry was expected.
    Iterator<TupleEntry> iter = bufferCall.getArgumentsIterator();
    Tuple innerTuple = new Tuple();
    while (iter.hasNext()) {
        TupleEntry entry = iter.next();
        // One fragment per joined row; field names come from the upstream join
        // (pid/pfirstname/plastname plus the college* fields).
        Tuple output = new Tuple();
        output.add("<professor>");
        output.add(entry.getString("pid"));
        output.add(entry.getString("pfirstname"));
        output.add(entry.getString("plastname"));
        output.add("<college>");
        output.add(entry.getString("collegeid"));
        output.add(entry.getString("collegename"));
        output.add(entry.getString("collegelocation"));
        output.add("</college>");
        output.add("</professor>");
        innerTuple.add(output);
    }
    // Emit a single tuple wrapping all rows of this group.
    Tuple outputTuple = new Tuple();
    outputTuple.add(innerTuple);
    bufferCall.getOutputCollector().add(outputTuple);
}
}
POSTED BY
BALACHANDAR
POSTED ON
CASCADING, JAVA
COMMENTS
LEAVE A COMMENT
The below cascading job is used to count the empty tags in a XML file and exports the output to a text
file.
package com.cascading;
import cascading.flow.FlowDef;
import cascading.flow.FlowProcess;
import cascading.flow.hadoop.HadoopFlowConnector;
import cascading.operation.BaseOperation;
import cascading.operation.Function;
import cascading.operation.FunctionCall;
import cascading.pipe.Each;
import cascading.pipe.Pipe;
import cascading.property.AppProps;
import cascading.scheme.hadoop.TextDelimited;
import cascading.scheme.hadoop.TextLine;
import cascading.tap.SinkMode;
import cascading.tap.Tap;
import cascading.tap.hadoop.Hfs;
import cascading.tuple.Fields;
import cascading.tuple.Tuple;
import cascading.tuple.TupleEntry;
import org.apache.commons.lang3.StringUtils;
import org.jdom2.JDOMException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
//import cascading.scheme.local.TextDelimited;
//import cascading.scheme.local.TextLine;
/**
* Cascading job to count the number of empty tags found in a XML file.
*/
if (args.length <= 0) {
LOGGER.info("Usage CascadingEmptyTagCounter <INPUT> <OUTPUT> ");
return;
}
LOGGER.info("inputPath:{}", inputPath);
LOGGER.info("outputPath:{}", outputPath);
//Source and Sink Tap. Use Hfs. if you are testing in local, then use File
Tap inTap = new Hfs(new TextLine(new Fields("line")), inputPath);
Tap outTap = new Hfs(new TextDelimited(new Fields("line")), outputPath, Si
/**
* Cascading function to count the Empty tags
*/
public static class EmptyTagCounter extends BaseOperation implements Function
// NOTE(review): the opening brace and the tail of the tag list below were cut
// off in extraction; restore before compiling.
private static List tags = Arrays.asList("<sub />", "<sup />", "<b />", "<
@Override
public void operate(FlowProcess flowProcess, FunctionCall functionCall) {
TupleEntry arguments = functionCall.getArguments();
if (arguments == null || arguments.getString(0) == null) {
return; // skip null/empty input rows
}
Tuple tuple = new Tuple();
try {
// NOTE(review): line truncated -- presumably getString(0); verify against original.
Map tagCountMap = getEmptyTagCounts(arguments.getTuple().getString
Set tagsSet = tagCountMap.entrySet();
StringBuilder tagCounter = new StringBuilder();
// Render "tag::count" pairs into one string (append line truncated in extraction).
for (Map.Entry tagEntry : tagsSet) {
tagCounter.append(tagEntry.getKey() + "::" + tagEntry.getValue
}
tuple.add(tagCounter.toString());
} catch (JDOMException | IOException e) {
// Parsing problems are logged; an empty tuple is still emitted below.
LOGGER.error("XML parsing error", e);
}
functionCall.getOutputCollector().add(tuple);
}
POSTED BY
BALACHANDAR
POSTED ON
CASCADING, HADOOP
COMMENTS
LEAVE A COMMENT
import cascading.flow.FlowDef;
import cascading.flow.FlowProcess;
import cascading.flow.hadoop.HadoopFlowConnector;
import cascading.operation.BaseOperation;
import cascading.operation.Function;
import cascading.operation.FunctionCall;
import cascading.pipe.Each;
import cascading.pipe.Pipe;
import cascading.property.AppProps;
import cascading.scheme.hadoop.TextDelimited;
import cascading.scheme.hadoop.TextLine;
import cascading.tap.SinkMode;
import cascading.tap.Tap;
import cascading.tap.hadoop.Hfs;
import cascading.tuple.Fields;
import cascading.tuple.Tuple;
import cascading.tuple.TupleEntry;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.Arrays;
import java.util.List;
import java.util.Properties;
//import cascading.flow.local.LocalFlowConnector;
//import cascading.scheme.local.TextDelimited;
//import cascading.scheme.local.TextLine;
//import cascading.tap.local.FileTap;
/**
* Cascading job to replace the empty html tags found in a XML file
*/
if (args.length <= 0) {
LOGGER.info("Usage CascadingEmptyTagReplacer <INPUT> <OUTPUT>");
return;
}
//input path & output path
String inputPath = args[0];
String outputPath = args[1];
LOGGER.info("inputPath:{}", inputPath);
LOGGER.info("outputPath:{}", outputPath);
//Source and Sink Tap. Use Hfs. if you are testing in local, then use File
Tap inTap = new Hfs(new TextLine(new Fields("line")), inputPath);
Tap outTap = new Hfs(new TextDelimited(new Fields("line")), outputPath, Si
/**
* Custom Function to replace Empty tags in the XML content
*/
public static class EmptyTagReplacer extends BaseOperation implements Function
// NOTE(review): the opening brace appears truncated in extraction, and the
// 'tags' list used below is not declared in this snippet (see the tag list in
// the EmptyTagCounter example).
@Override
public void operate(FlowProcess flowProcess, FunctionCall functionCall) {
TupleEntry arguments = functionCall.getArguments();
if (arguments == null || arguments.getString(0) == null) {
return; // nothing to replace on null/empty rows
}
Tuple tuple = new Tuple();
String xmlData = arguments.getTuple().getString(0);
// Strip every known empty-tag occurrence from the line.
for (String tag : tags) {
xmlData = xmlData.replace(tag, "");
}
tuple.add(xmlData);
functionCall.getOutputCollector().add(tuple);
}
}
}
POSTED BY
BALACHANDAR
POSTED ON
CASCADING
COMMENTS
LEAVE A COMMENT
Cascading SubAssemblies are reusable pipe assemblies that are linked into larger pipe assemblies. Think
of them as subroutines in a programming language. This lets commonly used pipe assemblies be
packaged into libraries that can then be reused in complex flows.
Read a tsv[tab separated text file] file which contains the user name, age and dept details. Assume that we want to
remove the users whose age is more than or equals to 30. Then group by deptId and output the deptId and count in
a tsv file
This is a simple job for example purposes only. In a real scenario, we may run into different
problems.
Refer to the SubAssembly code below. Here we have separated out the process, so this will take care of
reading the content from the Tap, filtering it, grouping it, and finally writing the output to the sink. Please make
sure that we are not hardcoding anything, so we can use any kind of source as long as it provides a valid
format. The same applies to the Sink.
Creating this kind of SubAssemblies will help us to use this wherever we want.
package com.cascading;
import cascading.operation.aggregator.Count;
import cascading.operation.expression.ExpressionFilter;
import cascading.pipe.Each;
import cascading.pipe.Every;
import cascading.pipe.GroupBy;
import cascading.pipe.Pipe;
import cascading.pipe.SubAssembly;
import cascading.tuple.Fields;
// Default constructor: wires the assembly with the conventional input/output pipe names.
public UserSubAssembly() {
this(new Pipe(INPUT_PIPE_NAME), OUTPUT_PIPE_NAME);
}
// NOTE(review): the signature line of this second constructor (presumably
// UserSubAssembly(Pipe input, String tailName)) was lost in extraction.
super();
// Remove users whose age is >= 30, then count the remaining users per department.
Pipe pipe = new Each(input, new Fields("age"),
new ExpressionFilter("age >= 30", Integer.TYPE));
pipe = new GroupBy(pipe, new Fields("deptId"));
pipe = new Every(pipe, new Count());
// Rename the tail so downstream flows can bind to a stable pipe name.
pipe = new Pipe(tailName, pipe);
setTails(pipe);
}
}
import cascading.flow.Flow;
import cascading.flow.hadoop.HadoopFlowConnector;
import cascading.pipe.Pipe;
import cascading.property.AppProps;
import cascading.scheme.hadoop.TextDelimited;
import cascading.tap.SinkMode;
import cascading.tap.Tap;
import cascading.tap.hadoop.Hfs;
import cascading.tuple.Fields;
import java.util.HashMap;
import java.util.Map;
import java.util.Properties;
/**
* A Cascading example to read a tsv file file which contains user name, age and d
* and do group by deptId and output the deptId and count in a tsv file
*/
public class Main {
/**
* This examples uses SubAssembly. ExpressFilter and TapsMap
*
* @param args
*/
public static void main(String[] args) {
//SubAssembly
Pipe pipe = new UserSubAssembly();
POSTED BY
BALACHANDAR
POSTED ON
CASCADING
COMMENTS
LEAVE A COMMENT
A Cascading Job to read a text file which contains user name and age details and remove the users
whose age is more than or equals to 30 and also print the content in an output file with some predefined
expression which uses expression filter and Expression function
package com.cascading;
import cascading.flow.FlowDef;
import cascading.flow.hadoop.HadoopFlowConnector;
import cascading.operation.expression.ExpressionFilter;
import cascading.operation.expression.ExpressionFunction;
import cascading.pipe.Each;
import cascading.pipe.Pipe;
import cascading.property.AppProps;
import cascading.scheme.hadoop.TextDelimited;
import cascading.tap.SinkMode;
import cascading.tap.Tap;
import cascading.tap.hadoop.Hfs;
import cascading.tuple.Fields;
import java.util.Properties;
/**
* A Cascading example to read a text file which contains user name and age detail
* and also print the content in an output file with some predefined expression
*/
public class Main {
/**
* This examples uses ExpressionFilter and ExpressionFunction function
*
* @param args
*/
public static void main(String[] args) {
POSTED BY
BALACHANDAR
POSTED ON
CASCADING
COMMENTS
LEAVE A COMMENT
A cascading job which uses Custom filter and filter the data
package com.cascading;
import cascading.flow.FlowDef;
import cascading.flow.FlowProcess;
import cascading.flow.hadoop.HadoopFlowConnector;
import cascading.operation.BaseOperation;
import cascading.operation.Filter;
import cascading.operation.FilterCall;
import cascading.pipe.Each;
import cascading.pipe.Pipe;
import cascading.property.AppProps;
import cascading.scheme.hadoop.TextDelimited;
import cascading.tap.SinkMode;
import cascading.tap.Tap;
import cascading.tap.hadoop.Hfs;
import cascading.tuple.Fields;
import cascading.tuple.TupleEntry;
import java.util.Properties;
/**
* A Cascading example to read a text file which contains user name and age detail
*/
public class Main {
/**
* This custom filter will remove all the users whose age is more than or equa
*/
/**
 * Custom filter that removes input tuples whose second field (age) is 30 or more.
 * Assumes the age field holds a parseable integer; a non-numeric value throws
 * NumberFormatException, same as the original behavior.
 */
public static class CustomFilter extends BaseOperation implements Filter {
    private static final long serialVersionUID = 1L;

    /**
     * @return {@code true} to REMOVE the tuple (age &gt;= 30), {@code false} to keep it
     */
    @Override
    public boolean isRemove(FlowProcess flowProcess, FilterCall filterCall) {
        TupleEntry arguments = filterCall.getArguments();
        String age = arguments.getString(1).trim();
        // parseInt avoids the needless Integer boxing/unboxing of Integer.valueOf.
        return Integer.parseInt(age) >= 30;
    }
}
}
POSTED BY
BALACHANDAR
POSTED ON
CASCADING
COMMENTS
LEAVE A COMMENT
A cascading job which uses a custom Function to transform the data
package com.cascading;
import cascading.flow.Flow;
import cascading.flow.FlowProcess;
import cascading.flow.hadoop.HadoopFlowConnector;
import cascading.operation.BaseOperation;
import cascading.operation.Function;
import cascading.operation.FunctionCall;
import cascading.pipe.Each;
import cascading.pipe.Pipe;
import cascading.property.AppProps;
import cascading.scheme.hadoop.TextDelimited;
import cascading.tap.SinkMode;
import cascading.tap.Tap;
import cascading.tap.hadoop.Hfs;
import cascading.tuple.Fields;
import cascading.tuple.Tuple;
import cascading.tuple.TupleEntry;
import java.util.Properties;
/**
* A Cascading example to read a text file and convert the string to upper case let
*/
public class Main {
/**
 * Upper-cases the first argument field and emits it as a one-field tuple.
 * Null/missing input rows are skipped entirely (nothing is emitted for them).
 */
@Override
public void operate(FlowProcess flowProcess, FunctionCall functionCall) {
    TupleEntry arguments = functionCall.getArguments();
    if (arguments == null || arguments.getString(0) == null) {
        return;
    }
    String original = arguments.getString(0).trim();
    // Allocate the output tuple only after the null guard (the original built it
    // before the check, wasting work on skipped rows).
    Tuple tuple = new Tuple();
    // Locale.ROOT keeps the conversion locale-independent (avoids e.g. the
    // Turkish dotted/dotless-i surprise under a non-English default locale).
    tuple.add(original.toUpperCase(java.util.Locale.ROOT));
    functionCall.getOutputCollector().add(tuple);
}
}
}
Bala's Blog
Blog at WordPress.com.