You are on page 1 of 21

/* Task dependencies */

/* #pragma omp task depend( in: <list> ) depend( out: <list> ) depend( inout: <list> ) */

#pragma omp parallel
{
  #pragma omp single
  {
    int x, y, z;
    /* Dependence DAG:
                 init              (writes x)
                /    \
          y = f(x)   z = g(x)      (read x; write y and z)
                \    /
           finalize( y, z )        (reads y and z)              */

    #pragma omp task depend( out: x )
    x = init();

    #pragma omp task depend( in: x ) depend( out: y )
    y = f(x);

    #pragma omp task depend( in: x ) depend( out: z )
    z = g(x);

    #pragma omp task depend( in: y, z )
    finalize( y, z );
  }
}
So take any DAG and design dependencies for that…

• A, B, C can execute in parallel


• A, D can execute in parallel
• E must execute after A, D

• depend( out: x )
  A()

• depend( out: y )
  B()

• depend( out: z )
  C()

• depend( in: y, z ) depend( out: w )
  D()

• depend( in: x, w )
  E()
Matrix Multiplication using tasks
We have already seen matrix multiplication: A x B = C.
Blocked algorithm (assume n is a multiple of b):
n x n matrices, b = block size.

int C[ n/b ][ n/b ];    /* C viewed as (n/b) x (n/b) blocks */

for ( i = 0 ; i < n/b ; i++ )
    for ( j = 0 ; j < n/b ; j++ )
        for ( k = 0 ; k < n/b ; k++ )
            #pragma omp task
            block-multiply the ( i, k )th block of A with the ( k, j )th block
            of B and update the ( i, j )th block of C

Here we are creating a task for every pair of blocks — ( i, k ) of A and
( k, j ) of B — each of which updates block ( i, j ) of C.
For k = 1, 2, 3, …, every one of those tasks updates the same ( i, j ) block.
• Tasks are created for every pair of blocks of A & B, not one per block of C
• So the same C block gets updated by multiple tasks

Hence a race condition.


To remove race condition:

int C [ n/b ] [ n/b ]


for( i = 0; i < n/b ; i++ )
for ( j=0 ; j < n/b ; j++ )
for ( k=0 ; k < n/b ; k++ )
#pragma omp task depend (out: C[ i ][ j ] )
block multiply ( i , k )th block of A with ( k , j )th block
of B and update ( i , j )th block of C
Locks
OpenMP Locks and Critical Section
/* Two different critical sections */
-----
int i, sum = 0, prod = 1;
double t1, t2;
----;
#pragma omp parallel default( shared )
{
    int psum = 0, pprod = 1;     /* partial sum & partial product
                                    (pprod must be declared too)   */
    #pragma omp for
    for ( i = 0 ; i < ARR_SIZE ; i++ )
    {
        psum += a[i];
        pprod *= a[i];
    }
    /* At the end each thread updates the global partial sum and partial
       product serially.  Two critical sections: with 4 threads that is
       4 * 2 = 8 serial executions.  Can one thread be updating sum while
       another updates prod?  Not with unnamed critical sections — they
       all serialize on the same (anonymous) lock.                       */
    #pragma omp critical
    sum += psum;

    #pragma omp critical
    prod *= pprod;               /* multiply — '+=' here was a typo:
                                    partial products combine with '*=' */
}
-----
/* Two named critical sections */
-----
#pragma omp parallel default( shared )
{
    ----
    #pragma omp for
    for ( i = 0 ; i < ARR_SIZE ; i++ ) {
        psum += a[i]; pprod *= a[i];
    }
    #pragma omp critical (section1)
    {
        printf( "In CS 1\n" );
        for ( j = 0 ; j < 100000000; j++ )   /* long loop: makes the CS slow */
            sum += psum;
        printf( "Out CS 1\n" );
    }
    #pragma omp critical (section2)
    {
        printf( "In CS 2\n" );
        for ( j = 0 ; j < 100000000; j++ )
            prod += pprod;
        printf( "Out CS 2\n" );
    }
}
/* Two named critical sections */
-----
#pragma omp parallel default( shared )
{
    ----
    #pragma omp for
    for ( i = 0 ; i < ARR_SIZE ; i++ ) {
        psum += a[i]; pprod *= a[i];
    }
    #pragma omp critical (section1)
    {
        printf( "In CS 1\n" );
        for ( j = 0 ; j < 100000000; j++ )
            sum += psum;
        printf( "Out CS 1\n" );
    }
    #pragma omp critical (section2)
    {
        printf( "In CS 2\n" );
        for ( j = 0 ; j < 100000000; j++ )
            prod += pprod;
        printf( "Out CS 2\n" );
    }
}

Naming the sections ensures that no two threads can be inside CS-1 at the
same time, and no two threads inside CS-2 at the same time — but one thread
in CS-1 while another is in CS-2 at the same time is OK.

• Here we have given names to the critical sections
• But what if the critical sections we need depend on the number of
  elements, or on the size of the data we have?
• That is, the set of critical sections is not static.

So the solution is locks.

• Let's take a document as input with 2^26 entries and a histogram with
  2^20 entries
• Huge document sets in which we are trying to count the occurrences of
  words
• The histogram is over the words; the input size is the document size
/* Histogram update */
#include <omp.h>
#include <stdio.h>

#define INP_SIZE  (1 << 26)
#define HIST_SIZE (1 << 20)

int hist[HIST_SIZE];
int inp[INP_SIZE];

int main( int argc, char *argv[] )   /* argc is an int, not int*      */
{
    int i, key, sum = 0;
    double t1, t2;
    ---- /* initialize inp to random values and hist entries to 0 */

    t1 = omp_get_wtime();
    #pragma omp parallel for private( key )
    for ( i = 0 ; i < INP_SIZE ; i++ )
    {
        key = inp[i];
        hist[key]++;
    }
    t2 = omp_get_wtime();
    ----- /* add up hist entries in sum */

    printf( "Sum=%d. Time=%f\n", sum, t2 - t1 );   /* %f: t2 - t1 is a double */

    -------
}
What are the issues?

• Threads look at every element, extract the key, and go update that
  particular histogram entry
• If two different threads read the same integer value from the input —
  perhaps at different locations — they end up with the same value of key
• Then they will update the same histogram entry together — hence a
  race condition
/* Histogram update */
#include <omp.h>
#include <stdio.h>

#define INP_SIZE  (1 << 26)
#define HIST_SIZE (1 << 20)

int hist[HIST_SIZE];
int inp[INP_SIZE];

int main( int argc, char *argv[] )
{
    int i, key, sum = 0;
    double t1, t2;
    ---- /* initialize inp to random values and hist entries to 0 */

    t1 = omp_get_wtime();
    #pragma omp parallel for private( key )
    for ( i = 0 ; i < INP_SIZE ; i++ )
    {
        key = inp[i];
        hist[key]++;
    }
    t2 = omp_get_wtime();
    ----- /* add up hist entries in sum */

    printf( "Sum=%d. Time=%f\n", sum, t2 - t1 );

    -------
}

Observed output:
    Sum=67108864 (= 2^26). Time=2.93   (1 thread)
    Sum=67104683. Time=…               (4 threads — race condition:
                                        some increments are lost)
/* Histogram update with critical */
-----
#define INP_SIZE  (1 << 26)
#define HIST_SIZE (1 << 20)

int hist[HIST_SIZE];
int inp[INP_SIZE];

int main( int argc, char *argv[] )
{
    int i, key, sum = 0;
    double t1, t2;
    /* initialize inp to random values and hist entries to 0 */

    t1 = omp_get_wtime();
    #pragma omp parallel for private( key )
    for ( i = 0 ; i < INP_SIZE ; i++ ) {
        key = inp[i];

        #pragma omp critical
        hist[key]++;
    }
    t2 = omp_get_wtime();

    /* add up hist entries in sum */

    printf( "Sum=%d. Time=%f\n", sum, t2 - t1 );

}
/* Histogram update with critical */
-----
#define INP_SIZE  (1 << 26)
#define HIST_SIZE (1 << 20)

int hist[HIST_SIZE];
int inp[INP_SIZE];

int main( int argc, char *argv[] )
{
    int i, key, sum = 0;
    double t1, t2;
    /* initialize inp to random values and hist entries to 0 */

    t1 = omp_get_wtime();
    #pragma omp parallel for private( key )
    for ( i = 0 ; i < INP_SIZE ; i++ ) {
        key = inp[i];

        #pragma omp critical
        hist[key]++;
    }
    t2 = omp_get_wtime();

    /* add up hist entries in sum */

    printf( "Sum=%d. Time=%f\n", sum, t2 - t1 );

}

Observed output:
    Sum=67108864. Time = 2.93      (sequential)
    Sum=67108864. Time = 21.6153   (4 threads)

The result is now correct, but the critical section has overheads — every
update serializes on the single unnamed CS.  Can we give names to the CS?
Only if the set of critical sections is known statically, which it is not
here (it would have to depend on the key).
/* Histogram update with locks */
-----
#define INP_SIZE  (1 << 26)
#define HIST_SIZE (1 << 20)

int hist[HIST_SIZE];
int inp[INP_SIZE];
omp_lock_t lock[HIST_SIZE];          /* <- one lock per histogram entry: 2^20 locks */

int main( int argc, char *argv[] )
{
    int i, key, sum = 0;
    double t1, t2;
    for ( i = 0 ; i < HIST_SIZE ; i++ )
        omp_init_lock( &(lock[i]) ); /* <- initialize the 2^20 locks */
    ---- /* initialize inp to random values and hist entries to 0 */

    t1 = omp_get_wtime();
    #pragma omp parallel for private( key )
    for ( i = 0 ; i < INP_SIZE ; i++ ) {
        key = inp[i];
        omp_set_lock( &(lock[key]) );
        hist[key]++;
        omp_unset_lock( &(lock[key]) );
    }
    t2 = omp_get_wtime();
    for ( i = 0 ; i < HIST_SIZE ; i++ )
        omp_destroy_lock( &(lock[i]) );  /* <- destroy the locks */
    ----- /* add up hist entries in sum */
    printf( "Sum=%d. Time=%f\n", sum, t2 - t1 );
    -------
}
/* Histogram update with locks */ — same code as the previous listing,
with lock[] (a lock variable array of size 2^20) initialized before the
parallel loop and destroyed after it, and each update bracketed by
omp_set_lock( &(lock[key]) ) / omp_unset_lock( &(lock[key]) ).

• What if one thread holds a lock and another thread tries to acquire the
  same lock?  The second thread has to wait until the lock is released by
  the first thread — omp_set_lock() is a blocking call.
• So for every histogram entry there is a separate lock; the lock is
  indexed by the key.
• Any particular lock can be acquired by only one thread at a time.
/* Histogram update with locks */ — same code as the previous listing.

Observed output:
    Sum=67108864. Time=3.425      (4 threads)
    Sum=67108864. Time=0.522617   (64 threads)
• Different threads are able to acquire their respective locks
  simultaneously, whereas the unnamed critical section did not allow two
  threads to enter that region of code at the same time
• Here different threads, as long as they are working on different
  elements, can enter the code concurrently
• So we get the advantage of fine-grained locking.

Disadvantage:

• Locks take a lot of memory.  How many locks do we have?
• As many as the number of histogram entries (2^20).
• So the cache cannot be used effectively — the lock array is as huge as
  the histogram itself.

Bucketed locks — advantage:

• Take a block of entries, called a bucket, and associate one lock with
  each bucket; the (much smaller) lock array can now fit in the cache.
  (Before, the locks numbered 2^20.)

Bucketed locks — disadvantage:

• If two threads try to update entries in the same bucket, one has to
  wait.
• They have to contend for the same lock.
• Hence reduced parallelism.
You might also like