You are on page 1 of 1

Example 8-16.

Comparing RDDs with order


/**
 * Asserts that `result` contains the same elements as `expected`, in the same order.
 *
 * The comparison itself is delegated to `compareRDDWithOrder`; the assertion
 * passes when that comparison reports no mismatch and fails otherwise.
 *
 * @param expected the RDD holding the expected elements
 * @param result   the RDD under test
 */
def assertRDDEqualsWithOrder[T: ClassTag](
    expected: RDD[T], result: RDD[T]): Unit = {
  val mismatch = compareRDDWithOrder(expected, result)
  assertTrue(mismatch.isEmpty)
}

/**
 * Compares two RDDs taking element order into account (e.g. [1,2,3] != [3,2,1]).
 *
 * When both RDDs have a known partitioner and the two partitioners are equal,
 * the comparison is delegated to `compareRDDWithOrderSamePartitioner`.
 * Otherwise every element is indexed by its position and the two RDDs are
 * cogrouped on that index, which requires multiple passes on the input.
 *
 * @param expected the RDD holding the expected elements
 * @param result   the RDD under test
 * @return None if the RDDs are equal, otherwise Some((expectedElem, resultElem))
 *         describing a mismatch. On the indexed path this is the mismatch with
 *         the lowest index. If the lengths are not equal, one of the two
 *         components may be None.
 */
def compareRDDWithOrder[T: ClassTag](
    expected: RDD[T], result: RDD[T]): Option[(Option[T], Option[T])] = {
  // Only zip when BOTH sides have a known partitioner and they are equal.
  // Comparing the Options directly avoids calling `.get` on
  // `expected.partitioner`, which would throw when only `result` is partitioned.
  val samePartitioner =
    result.partitioner.isDefined && result.partitioner == expected.partitioner

  if (samePartitioner) {
    compareRDDWithOrderSamePartitioner(expected, result)
  } else {
    // Otherwise key every element by its position in the RDD.
    // The type parameter is named U so it does not shadow the outer T.
    def indexRDD[U](rdd: RDD[U]): RDD[(Long, U)] =
      rdd.zipWithIndex().map { case (elem, idx) => (idx, elem) }

    val mismatches = indexRDD(expected).cogroup(indexRDD(result))
      .filter { case (_, (exp, res)) =>
        // A position present on only one side means the lengths differ.
        exp.isEmpty || res.isEmpty || exp.head != res.head
      }
      .map { case (idx, (exp, res)) => (idx, (exp.headOption, res.headOption)) }

    // cogroup does not preserve index order, so pick the smallest index
    // explicitly to report the first mismatch deterministically.
    mismatches
      .takeOrdered(1)(Ordering.by[(Long, (Option[T], Option[T])), Long](_._1))
      .headOption
      .map { case (_, mismatch) => mismatch }
  }
}

/**
* Compare two RDDs. If they are equal returns None, otherwise
* returns Some with the first mismatch. Assumes we have the same partitioner.
*/
def compareRDDWithOrderSamePartitioner[T: ClassTag](
expected: RDD[T], result: RDD[T]): Option[(Option[T], Option[T])] = {
// Handle mismatched lengths by converting into options and padding with Nones
expected.zipPartitions(result) {
(thisIter, otherIter) =>
new Iterator[(Option[T], Option[T])] {
def hasNext: Boolean = (thisIter.hasNext || otherIter.hasNext)

def next(): (Option[T], Option[T]) = {


(thisIter.hasNext, otherIter.hasNext) match {

212 | Chapter 8: Testing and Validation

You might also like