You are on page 1 of 21

/* Task dependencies */

/* #pragma omp task depend( in: <list> ) depend( out: <list> ) depend( inout: <list> ) */

#pragma omp parallel
{
  #pragma omp single
  {
    int x, y, z;
    /* Dependence DAG:
                 init              (writes x)
                /    \
          y = f(x)   z = g(x)      (read x; write y and z)
                \    /
           finalize( y, z )        (reads y and z)              */

    #pragma omp task depend( out: x )
    x = init();

    #pragma omp task depend( in: x ) depend( out: y )
    y = f(x);

    #pragma omp task depend( in: x ) depend( out: z )
    z = g(x);

    #pragma omp task depend( in: y, z )
    finalize( y, z );
  }
}
So take any DAG and design dependencies for that…

• A, B, C can execute in parallel


• A, D can execute in parallel
• E must execute after A, D

• depend( out: x )
  A()

• depend( out: y )
  B()

• depend( out: z )
  C()

• depend( in: y, z ) depend( out: w )
  D()

• depend( in: x, w )
  E()
Matrix Multiplication using tasks
We have already seen matrix multiplication: A x B = C.
Blocked algorithm (assume n is a multiple of b):
n x n matrices, b = block size.

int C[ n/b ][ n/b ];    /* C viewed as (n/b) x (n/b) blocks */

for ( i = 0 ; i < n/b ; i++ )
    for ( j = 0 ; j < n/b ; j++ )
        for ( k = 0 ; k < n/b ; k++ )
            #pragma omp task
            block-multiply the ( i, k )th block of A with the ( k, j )th block
            of B and update the ( i, j )th block of C

Here we are creating a task for every pair of blocks — ( i, k ) of A and
( k, j ) of B — each of which updates block ( i, j ) of C.
For k = 1, 2, 3, …, every one of those tasks updates the same ( i, j ) block.
• Tasks are created for every pair of blocks of A & B, not one per block of C
• So the same C block gets updated by multiple tasks

Hence a race condition.


To remove race condition:

int C [ n/b ] [ n/b ]


for( i = 0; i < n/b ; i++ )
for ( j=0 ; j < n/b ; j++ )
for ( k=0 ; k < n/b ; k++ )
#pragma omp task depend (out: C[ i ][ j ] )
block multiply ( i , k )th block of A with ( k , j )th block
of B and update ( i , j )th block of C
Locks
OpenMP Locks and Critical Section
/* Two different critical sections */
-----
int i, sum = 0, prod = 1;
double t1, t2;
----;
#pragma omp parallel default( shared )
{
    int psum = 0, pprod = 1;     /* partial sum & partial product
                                    (pprod must be declared too)   */
    #pragma omp for
    for ( i = 0 ; i < ARR_SIZE ; i++ )
    {
        psum += a[i];
        pprod *= a[i];
    }
    /* At the end each thread updates the global partial sum and partial
       product serially.  Two critical sections: with 4 threads that is
       4 * 2 = 8 serial executions.  Can one thread be updating sum while
       another updates prod?  Not with unnamed critical sections — they
       all serialize on the same (anonymous) lock.                       */
    #pragma omp critical
    sum += psum;

    #pragma omp critical
    prod *= pprod;               /* multiply — '+=' here was a typo:
                                    partial products combine with '*=' */
}
-----
/* Two named critical sections */
-----
#pragma omp parallel default( shared )
{
    ----
    #pragma omp for
    for ( i = 0 ; i < ARR_SIZE ; i++ ) {
        psum += a[i]; pprod *= a[i];
    }
    #pragma omp critical (section1)
    {
        printf( "In CS 1\n" );
        for ( j = 0 ; j < 100000000; j++ )   /* long loop: makes the CS slow */
            sum += psum;
        printf( "Out CS 1\n" );
    }
    #pragma omp critical (section2)
    {
        printf( "In CS 2\n" );
        for ( j = 0 ; j < 100000000; j++ )
            prod += pprod;
        printf( "Out CS 2\n" );
    }
}
/* Two named critical sections */
-----
#pragma omp parallel default( shared )
{
    ----
    #pragma omp for
    for ( i = 0 ; i < ARR_SIZE ; i++ ) {
        psum += a[i]; pprod *= a[i];
    }
    #pragma omp critical (section1)
    {
        printf( "In CS 1\n" );
        for ( j = 0 ; j < 100000000; j++ )
            sum += psum;
        printf( "Out CS 1\n" );
    }
    #pragma omp critical (section2)
    {
        printf( "In CS 2\n" );
        for ( j = 0 ; j < 100000000; j++ )
            prod += pprod;
        printf( "Out CS 2\n" );
    }
}

Naming the sections ensures that no two threads can be inside CS-1 at the
same time, and no two threads inside CS-2 at the same time — but one thread
in CS-1 while another is in CS-2 at the same time is OK.

• Here we have given names to the critical sections
• But what if the critical sections we need depend on the number of
  elements, or on the size of the data we have?
• That is, the set of critical sections is not static.

So the solution is locks.

• Let's take a document as input with 2^26 entries and a histogram with
  2^20 entries
• Huge document sets in which we are trying to count the occurrences of
  words
• The histogram is over the words; the input size is the document size
/* Histogram update */
#include <omp.h>
#include <stdio.h>

#define INP_SIZE  (1 << 26)
#define HIST_SIZE (1 << 20)

int hist[HIST_SIZE];
int inp[INP_SIZE];

int main( int argc, char *argv[] )   /* argc is an int, not int*      */
{
    int i, key, sum = 0;
    double t1, t2;
    ---- /* initialize inp to random values and hist entries to 0 */

    t1 = omp_get_wtime();
    #pragma omp parallel for private( key )
    for ( i = 0 ; i < INP_SIZE ; i++ )
    {
        key = inp[i];
        hist[key]++;
    }
    t2 = omp_get_wtime();
    ----- /* add up hist entries in sum */

    printf( "Sum=%d. Time=%f\n", sum, t2 - t1 );   /* %f: t2 - t1 is a double */

    -------
}
What are the issues?

• Threads look at every element, extract the key, and go update that
  particular histogram entry
• If two different threads read the same integer value from the input —
  perhaps at different locations — they end up with the same value of key
• Then they will update the same histogram entry together — hence a
  race condition
/* Histogram update */
#include <omp.h>
#include <stdio.h>

#define INP_SIZE  (1 << 26)
#define HIST_SIZE (1 << 20)

int hist[HIST_SIZE];
int inp[INP_SIZE];

int main( int argc, char *argv[] )
{
    int i, key, sum = 0;
    double t1, t2;
    ---- /* initialize inp to random values and hist entries to 0 */

    t1 = omp_get_wtime();
    #pragma omp parallel for private( key )
    for ( i = 0 ; i < INP_SIZE ; i++ )
    {
        key = inp[i];
        hist[key]++;
    }
    t2 = omp_get_wtime();
    ----- /* add up hist entries in sum */

    printf( "Sum=%d. Time=%f\n", sum, t2 - t1 );

    -------
}

Observed output:
    Sum=67108864 (= 2^26). Time=2.93   (1 thread)
    Sum=67104683. Time=…               (4 threads — race condition:
                                        some increments are lost)
/* Histogram update with critical */
-----
#define INP_SIZE  (1 << 26)
#define HIST_SIZE (1 << 20)

int hist[HIST_SIZE];
int inp[INP_SIZE];

int main( int argc, char *argv[] )
{
    int i, key, sum = 0;
    double t1, t2;
    /* initialize inp to random values and hist entries to 0 */

    t1 = omp_get_wtime();
    #pragma omp parallel for private( key )
    for ( i = 0 ; i < INP_SIZE ; i++ ) {
        key = inp[i];

        #pragma omp critical
        hist[key]++;
    }
    t2 = omp_get_wtime();

    /* add up hist entries in sum */

    printf( "Sum=%d. Time=%f\n", sum, t2 - t1 );

}
/* Histogram update with critical */
-----
#define INP_SIZE  (1 << 26)
#define HIST_SIZE (1 << 20)

int hist[HIST_SIZE];
int inp[INP_SIZE];

int main( int argc, char *argv[] )
{
    int i, key, sum = 0;
    double t1, t2;
    /* initialize inp to random values and hist entries to 0 */

    t1 = omp_get_wtime();
    #pragma omp parallel for private( key )
    for ( i = 0 ; i < INP_SIZE ; i++ ) {
        key = inp[i];

        #pragma omp critical
        hist[key]++;
    }
    t2 = omp_get_wtime();

    /* add up hist entries in sum */

    printf( "Sum=%d. Time=%f\n", sum, t2 - t1 );

}

Observed output:
    Sum=67108864. Time = 2.93      (sequential)
    Sum=67108864. Time = 21.6153   (4 threads)

The result is now correct, but the critical section has overheads — every
update serializes on the single unnamed CS.  Can we give names to the CS?
Only if the set of critical sections is known statically, which it is not
here (it would have to depend on the key).
/* Histogram update with locks */
-----
#define INP_SIZE  (1 << 26)
#define HIST_SIZE (1 << 20)

int hist[HIST_SIZE];
int inp[INP_SIZE];
omp_lock_t lock[HIST_SIZE];          /* <- one lock per histogram entry: 2^20 locks */

int main( int argc, char *argv[] )
{
    int i, key, sum = 0;
    double t1, t2;
    for ( i = 0 ; i < HIST_SIZE ; i++ )
        omp_init_lock( &(lock[i]) ); /* <- initialize the 2^20 locks */
    ---- /* initialize inp to random values and hist entries to 0 */

    t1 = omp_get_wtime();
    #pragma omp parallel for private( key )
    for ( i = 0 ; i < INP_SIZE ; i++ ) {
        key = inp[i];
        omp_set_lock( &(lock[key]) );
        hist[key]++;
        omp_unset_lock( &(lock[key]) );
    }
    t2 = omp_get_wtime();
    for ( i = 0 ; i < HIST_SIZE ; i++ )
        omp_destroy_lock( &(lock[i]) );  /* <- destroy the locks */
    ----- /* add up hist entries in sum */
    printf( "Sum=%d. Time=%f\n", sum, t2 - t1 );
    -------
}
/* Histogram update with locks */ — same code as the previous listing,
with lock[] (a lock variable array of size 2^20) initialized before the
parallel loop and destroyed after it, and each update bracketed by
omp_set_lock( &(lock[key]) ) / omp_unset_lock( &(lock[key]) ).

• What if one thread holds a lock and another thread tries to acquire the
  same lock?  The second thread has to wait until the lock is released by
  the first thread — omp_set_lock() is a blocking call.
• So for every histogram entry there is a separate lock; the lock is
  indexed by the key.
• Any particular lock can be acquired by only one thread at a time.
/* Histogram update with locks */ — same code as the previous listing.

Observed output:
    Sum=67108864. Time=3.425      (4 threads)
    Sum=67108864. Time=0.522617   (64 threads)
• Different threads are able to acquire their respective locks
  simultaneously, whereas the unnamed critical section did not allow two
  threads to enter that region of code at the same time
• Here different threads, as long as they are working on different
  elements, can enter the code concurrently
• So we get the advantage of fine-grained locking.

Disadvantage:

• Locks take a lot of memory.  How many locks do we have?
• As many as the number of histogram entries (2^20).
• So the cache cannot be used effectively — the lock array is as huge as
  the histogram itself.

Bucketed locks — advantage:

• Take a block of entries, called a bucket, and associate one lock with
  each bucket; the (much smaller) lock array can now fit in the cache.
  (Before, the locks numbered 2^20.)

Bucketed locks — disadvantage:

• If two threads try to update entries in the same bucket, one has to
  wait.
• They have to contend for the same lock.
• Hence reduced parallelism.
You might also like