You are on page 1 of 9

Parallelization techniques

Skill 3: vectorization

                        single data    multiple data

multiple instructions        -             MIMD

single instruction         SISD            SIMD

2
SISD MIMD SIMD

simple complex simple


standard powerful powerful
more specialized

SISD MIMD SIMD


data of same type
consecutive data in memory
large data path
simple pipeline (no dependency)
low transistor cost

simple complex (unit duplication) simple

standard powerful powerful
more specialized

4
SISD MIMD SIMD
simple to add (special register bank)

lower transistor cost than MIMD


low energy consumption
moderate thermal envelope

simple complex simple


=> low on resources, energy
standard powerful powerful
and cooling
more specialized

a a1 a2 a3 a4

b b1 b2 b3 b4

+ 1 instruction + + + +

c c1 c2 c3 c4

6
Altivec :
PowerPC G4, PowerPC G5, Cell processor
Playstation 3, Wii, XBox 360, TV sets

ARM NEON :
ARM v5, ARM v6, Cortex
Mobile phones, tablets, HDTV sets

portability ?…
MMX, 3DNow!, SSE, …
x86 (Intel, AMD, Via, …)
Playstation 4, XBox One, PC,
tablets, mobile phones…
7

SSE2
XMM data format

__m128i integer format (8 to 64 b)

__m128 __m128d

four two
32 bits FP 64 bits FP

8
SSE2
generic instruction format
_mm_operator_format
format : {p, s}{s, d} => {packed, scalar}{single, double}
ex. : ps : packed single ; sd : scalar double
{ , e}{p, s}{i, u}{8, 16, 32, 64, 128} => {MMX,
XMM}, {packed, scalar}, {integer, unsigned} + size
ex. : pi8 : 8 MMX bytes ; epi32 : 4 32 bits XMM integers
note : si64 & si128 : entire register only

operator : name of the assembly instruction


9

logic instructions

_mm_and_si128, _mm_andnot_si128,
_mm_or_si128, _mm_xor_si128

_mm_andnot_si128:

not: a = a and not(b) (what the name suggests)

but: a = not(a) and b (what the instruction actually computes)

10
transverse 8 & 16 bits operations

_mm_sad_epu8, _mm_madd_epi16

- - - - - - - - - - - - - - - -

* * * * * * * *
|| || || || || || || || || || || || || || || ||

+ + + + + +
0 0 0 0 0 0 0 0 0 0 0 0

_mm_sad_epu8 _mm_madd_epi16
11

integer comparison instructions

_mm_cmp{eq, lt, gt}_epi{8, 16, 32}


creates a mask: ex.: 1…1 0…0 1…1 1…1 0…0 1…1 1…1 0…0

_mm_movemask_epi8 0 0 … … 16b mask of 0 or 1

there are many other SSE2,


SSE3, … instructions (see the .h files), and
now even AVX, AVX2, etc.
12
floating point instructions

{_mm_add, _mm_sub, _mm_mul, _mm_div,


_mm_sqrt, _mm_max, _mm_min}_{p, s}{s, d}

a*1/b ≈ a/b
{_mm_rcp, _mm_rsqrt}_{p, s}{s}
a*1/√a ≈ √a

{_mm_and, _mm_andnot, _mm_or, _mm_xor}


_p{s, d}

13

Our example:
#include <stdio.h>

// Define the data types once in a header file...
// Both typedefs rely on the GCC/Clang vector_size attribute: each type is a
// 16-byte vector with 16-byte alignment.


// 16 bytes of float lanes (four lanes when float is 4 bytes wide).
typedef float vectorFloat4 __attribute__((__vector_size__(16), __aligned__(16)));
// 16 bytes of int lanes (four lanes when int is 4 bytes wide).
typedef int vectorInt4 __attribute__((__vector_size__(16), __aligned__(16)));

// ... then use the data types in your own code


/*
 * Vector-extension demo: computes d = a*b + c lane by lane, prints the four
 * results, then compares every lane of d against 20 and prints the resulting
 * per-lane mask in hexadecimal.
 *
 * Returns 0.
 */
int main(void)
{
    vectorFloat4 a = {1.f, 2.f, 3.f, 4.f};
    vectorFloat4 b = {5.f, 6.f, 7.f, 8.f};
    vectorFloat4 c = {9.f, 10.f, 11.f, 12.f};
    /* The arithmetic operators act on all lanes at once. */
    vectorFloat4 d = a*b+c;

    printf ("%f, %f, %f, %f\n", d[0], d[1], d[2], d[3]);

    /* Lane-wise comparison: a lane of e is -1 (all bits set) where the
       test holds and 0 where it does not. */
    vectorInt4 e = d >= (vectorFloat4){20.f, 20.f, 20.f, 20.f};

    /* %x requires an unsigned int argument; the lanes are int and may hold
       -1, so convert explicitly instead of passing a mismatched type. */
    printf ("%x, %x, %x, %x\n",
            (unsigned)e[0], (unsigned)e[1], (unsigned)e[2], (unsigned)e[3]);

    return 0;
}

14
#include <stdio.h>
#include <emmintrin.h>

/*
 * SSE2 demo: multiplies eight pairs of signed 16-bit integers and rebuilds
 * the eight full 32-bit products from their low and high halves, then
 * prints them on one line.
 *
 * Returns 0.
 */
int main(void)
{
    /* Eight 16-bit lanes, viewed as an array or as one XMM register. */
    union
    {
        short lanes[8];
        __m128i reg;
    } lhs = {1, 2, 3, 4, 5, 6, 7, 8}, rhs = {9, -10, 11, -12, 13, -14, 15, -16};

    /* Four 32-bit lanes, used to read the widened products back. */
    union
    {
        __m128i reg;
        int lanes[4];
    } first_four, last_four;

    /* Low and high 16-bit halves of each 16x16 product. */
    const __m128i prod_lo = _mm_mullo_epi16(lhs.reg, rhs.reg);
    const __m128i prod_hi = _mm_mulhi_epi16(lhs.reg, rhs.reg);

    /* Interleave the halves: each 32-bit lane pairs one low half with the
       matching high half. */
    first_four.reg = _mm_unpacklo_epi16(prod_lo, prod_hi);
    last_four.reg = _mm_unpackhi_epi16(prod_lo, prod_hi);

    printf ("%d, %d, %d, %d, ",
            first_four.lanes[0], first_four.lanes[1],
            first_four.lanes[2], first_four.lanes[3]);
    printf ("%d, %d, %d, %d\n",
            last_four.lanes[0], last_four.lanes[1],
            last_four.lanes[2], last_four.lanes[3]);

    return 0;
}

15

_mm_mulhi_epi16 _mm_mullo_epi16
a 8 7 6 5 4 3 2 1 8 7 6 5 4 3 2 1 a
* * * * * * * * * * * * * * * *
b -16 15 -14 13 -12 11 -10 9 -16 15 -14 13 -12 11 -10 9 b
F h h h h h h h h l l l l l l l l f

c h l h l h l h l h l h l h l h l d
_mm_unpacklo_epi16 _mm_unpackhi_epi16
very similar

c -48 33 -20 9 -128 105 -84 65 d


[3] [2] [1] [0] [3] [2] [1] [0]

16
...
static const char Zero[16] = {0};

/*
 * vstrlen - length of a NUL-terminated string, scanned 16 bytes at a time
 * with SSE2.
 *
 * string16: start of the string. It is read with aligned 16-byte loads, so
 *           it must be 16-byte aligned. The whole 16-byte chunk holding the
 *           terminator is read, including the bytes after the '\0'.
 *
 * Returns the number of bytes that precede the first '\0'.
 */
int vstrlen(const char *string16)
{
    /* Zero is a plain char array with no guaranteed 16-byte alignment, so
       it is fetched with an unaligned load. */
    const __m128i zero = _mm_loadu_si128((const __m128i *)Zero);
    int size = 0;
    int mask;

    for (;;)
    {
        const __m128i chunk = _mm_load_si128((const __m128i *)(string16 + size));
        /* Bytes equal to 0 become 0xFF; movemask then packs the top bit of
           each byte into one bit of mask. */
        mask = _mm_movemask_epi8(_mm_cmpeq_epi8(chunk, zero));
        if (mask != 0)
        {
            break;
        }
        size += 16;
    }

    /* Bit i of mask is set when byte i of the last chunk is '\0', so the
       lowest set bit marks the terminator. mask is non-zero here, which
       guarantees that this loop stops. */
    while ((mask & 1) == 0)
    {
        mask >>= 1;
        size++;
    }
    return size;
}

17

...
static const char Zero[16] = {0};

int vstrlen(const char *string16)


{
int size = 0;
int mask;
for(;;)
{
__m128i temp = _mm_cmpeq_epi8(*(__m128i*)(string16+size), *(__m128i*)Zero);
mask = _mm_movemask_epi8(temp);
if (mask != 0)
break;
// overlay, bytes at string16+size: ? ? ? 0 ? ? ? ? ? ? 0 ? ? ? ? ?
size += 16;
}
// We want the real size !!! (and not a multiple of 16)
// overlay, Zero: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
// so we must modify 'size' according to the content of mask...

_mm_cmpeq_epi8
#error write your code here and remove this line
return size;
}
0 0 0 FF 0 0 0 0 0 0 FF 0 0 0 0 0
_mm_movemask_epi8
32 b mask 0…0 0001 0000 0010 0000

extract the position
18

You might also like