Parallelization Techniques: Skill 3: Vectorization

Parallelization techniques
Skill 3: vectorization
(columns: single data | multiple data)
multiple instructions: - | MIMD
single instruction: SISD | SIMD
2
SISD MIMD SIMD
simple complex simple

standard powerful powerful
more specialized
SISD MIMD SIMD

data of same type
consecutive data in memory
large data path
simple pipeline (no dependency)
low transistor cost
simple complex (unit duplication) simple
standard powerful powerful
more specialized
4
SISD MIMD SIMD
simple to add (special register bank)
lower transistor cost than MIMD

low energy consumption
moderate thermal envelope
simple complex simple

=> low on resources, energy
standard powerful powerful
and cooling
more specialized
a a1 a2 a3 a4
b b1 b2 b3 b4
+ 1 instruction + + + +
c c1 c2 c3 c4
6
Altivec :
PowerPC G4, PowerPC G5, Cell processor
Playstation 3, Wii, XBox 360, TV sets
ARM NEON :
ARM v5, ARM v6, Cortex
Mobile phones, tablets, HDTV sets
portability ?…
MMX, 3DNow!, SSE, …
x86 (Intel, AMD, Via, …)
Playstation 4, XBox One, PC,
tablets, mobile phones…
7
SSE2
XMM data format
__m128i integer format (8 to 64 b)
__m128 __m128d
four two
32 bits FP 64 bits FP
8
SSE2
generic instruction format
_mm_operator_format
format : {p, s}{s, d} => {packed, scalar}{single, double}
ex. : ps : packed single ; sd : scalar double
{ , e}{p, s}{i, u}{8, 16, 32, 64, 128} => {MMX,
XMM}, {packed, scalar}, {integer, unsigned} + size
ex. : pi8 : 8 MMX bytes ; epi32 : 4 32 bits XMM integers
note : si64 & si128 : entire register only
operator : name of the assembly instruction

9
logic instructions
_mm_and_si128, _mm_andnot_si128,
_mm_or_si128, _mm_xor_si128
_mm_andnot_si128(a, b):
it is NOT: a = a and not(b)
it IS: a = not(a) and b
10
transverse 8 & 16 bits operations
_mm_sad_epu8, _mm_madd_epi16
- - - - - - - - - - - - - - - -
* * * * * * * *
|| || || || || || || || || || || || || || || ||
+ + + + + +
0 0 0 0 0 0 0 0 0 0 0 0
_mm_sad_epu8 _mm_madd_epi16
11
integer comparison instructions
_mm_cmp{eq, lt, gt}_epi{8, 16, 32}

creates a mask: ex.: 1…1 0…0 1…1 1…1 0…0 1…1 1…1 0…0
_mm_movemask_epi8 0 0 … … 16b mask of 0 or 1
there are many other SSE2,

SSE3, … instructions (see the .h files), and
now even AVX, AVX2, etc.
12
floating point instructions
{_mm_add, _mm_sub, _mm_mul, _mm_div,

_mm_sqrt, _mm_max, _mm_min}_{p, s}{s, d}
a*1/b ≈ a/b
{_mm_rcp, _mm_rsqrt}_{p, s}{s}
a*1/√a ≈ √a
{_mm_and, _mm_andnot, _mm_or, _mm_xor}

_p{s, d}
13
Our example:
#include <stdio.h>
// Define the data types once in a header file...

/* 16-byte, 16-byte-aligned vector of float lanes (GCC/Clang vector extension). */
typedef float vectorFloat4 __attribute__((__vector_size__(16), __aligned__(16)));
/* 16-byte, 16-byte-aligned vector of int lanes; receives comparison results below. */
typedef int vectorInt4 __attribute__((__vector_size__(16), __aligned__(16)));
// ... then use the data types in your own code

/*
 * Computes d = a*b + c lane by lane, prints the four results, then compares
 * each lane of d against 20 and prints the resulting per-lane mask in hex.
 */
int main(void)
{
    vectorFloat4 a = {1.f, 2.f, 3.f, 4.f};
    vectorFloat4 b = {5.f, 6.f, 7.f, 8.f};
    vectorFloat4 c = {9.f, 10.f, 11.f, 12.f};

    /* The arithmetic operators apply element-wise to vector types. */
    vectorFloat4 d = a*b+c;
    printf ("%f, %f, %f, %f\n", d[0], d[1], d[2], d[3]);

    /* Element-wise comparison: each lane of e holds the outcome for one lane of d. */
    vectorInt4 e = d >= (vectorFloat4){20.f, 20.f, 20.f, 20.f};

    /* %x expects unsigned int; the lanes are signed int, so convert explicitly. */
    printf ("%x, %x, %x, %x\n",
            (unsigned)e[0], (unsigned)e[1], (unsigned)e[2], (unsigned)e[3]);
    return 0;
}
14
#include <stdio.h>
#include <emmintrin.h>
/*
 * Multiplies eight signed 16-bit lanes pairwise with SSE2 and prints the
 * eight products as full-width integers.
 *
 * _mm_mullo_epi16 / _mm_mulhi_epi16 give the low and high 16-bit halves of
 * each product; interleaving the two results with the unpack instructions
 * rebuilds the complete products, four per register.
 */
int main(void)
{
    const short lhs[8] = {1, 2, 3, 4, 5, 6, 7, 8};
    const short rhs[8] = {9, -10, 11, -12, 13, -14, 15, -16};
    int first_four[4];
    int last_four[4];

    /* Unaligned loads: the plain arrays carry no 16-byte alignment guarantee. */
    const __m128i va = _mm_loadu_si128((const __m128i *)lhs);
    const __m128i vb = _mm_loadu_si128((const __m128i *)rhs);

    const __m128i high_halves = _mm_mulhi_epi16(va, vb);
    const __m128i low_halves = _mm_mullo_epi16(va, vb);

    /* Pair each low half with its high half to form the wide products. */
    _mm_storeu_si128((__m128i *)first_four,
                     _mm_unpacklo_epi16(low_halves, high_halves));
    _mm_storeu_si128((__m128i *)last_four,
                     _mm_unpackhi_epi16(low_halves, high_halves));

    printf ("%d, %d, %d, %d, ", first_four[0], first_four[1], first_four[2], first_four[3]);
    printf ("%d, %d, %d, %d\n", last_four[0], last_four[1], last_four[2], last_four[3]);
    return 0;
}
15
_mm_mulhi_epi16 _mm_mullo_epi16
a 8 7 6 5 4 3 2 1 8 7 6 5 4 3 2 1 a
* * * * * * * * * * * * * * * *
b -16 15 -14 13 -12 11 -10 9 -16 15 -14 13 -12 11 -10 9 b
F h h h h h h h h l l l l l l l l f
c h l h l h l h l h l h l h l h l d
_mm_unpacklo_epi16 _mm_unpackhi_epi16
very similar
c -48 33 -20 9 -128 105 -84 65 d

[3] [2] [1] [0] [3] [2] [1] [0]
16
...
static const char Zero[16] = {0};

/*
 * Length of a NUL-terminated string, scanned 16 bytes at a time with SSE2.
 *
 * string16 must be 16-byte aligned and readable in whole 16-byte blocks up
 * to and including the block that holds the terminator: every block is
 * fetched with an aligned 128-bit load.
 *
 * Returns the number of bytes that precede the first '\0'.
 */
int vstrlen(const char *string16)
{
    /* Zero has no alignment guarantee, so fetch it with an unaligned load. */
    const __m128i zero = _mm_loadu_si128((const __m128i *)Zero);
    int size = 0;
    int mask;

    for (;;) {
        const __m128i block = _mm_load_si128((const __m128i *)(string16 + size));

        /* Bit i of mask is set when byte i of the block equals zero. */
        mask = _mm_movemask_epi8(_mm_cmpeq_epi8(block, zero));
        if (mask != 0) {
            break;
        }
        size += 16;
    }

    /*
     * We want the real size (and not a multiple of 16): add the index of
     * the lowest set bit of mask, i.e. the offset of the first '\0' inside
     * the last block. mask is non-zero here, so the loop terminates.
     */
    while ((mask & 1) == 0) {
        mask >>= 1;
        ++size;
    }
    return size;
}
17
...
static const char Zero[16] = {0};

/*
 * Length of a NUL-terminated string, scanned 16 bytes at a time with SSE2.
 *
 * string16 must be 16-byte aligned and readable in whole 16-byte blocks up
 * to and including the block that holds the terminator: every block is
 * fetched with an aligned 128-bit load.
 *
 * Returns the number of bytes that precede the first '\0'.
 *
 * Illustration of one loop iteration (operands of the comparison):
 *
 *   string16+size : ? ? ? 0 ? ? ? ? ? ? 0 ? ? ? ? ?
 *   Zero          : 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 *                   _mm_cmpeq_epi8
 */
int vstrlen(const char *string16)
{
    /* Zero has no alignment guarantee, so fetch it with an unaligned load. */
    const __m128i zero = _mm_loadu_si128((const __m128i *)Zero);
    int size = 0;
    int mask;

    for (;;) {
        const __m128i block = _mm_load_si128((const __m128i *)(string16 + size));

        /* Bit i of mask is set when byte i of the block equals zero. */
        mask = _mm_movemask_epi8(_mm_cmpeq_epi8(block, zero));
        if (mask != 0) {
            break;
        }
        size += 16;
    }

    /*
     * We want the real size (and not a multiple of 16): add the index of
     * the lowest set bit of mask, i.e. the offset of the first '\0' inside
     * the last block. mask is non-zero here, so the loop terminates.
     */
    while ((mask & 1) == 0) {
        mask >>= 1;
        ++size;
    }
    return size;
}
0 0 0 FF 0 0 0 0 0 0 FF 0 0 0 0 0
_mm_movemask_epi8
32 b mask 0…0 0001 0000 0010 0000
extract the position
18

Parallelization Techniques: Skill 3: Vectorization

Uploaded by

Document Information

Original Title

Copyright

Available Formats

Share this document

Share or Embed Document

Sharing Options

Did you find this document useful?

Is this content inappropriate?

Copyright:

Available Formats

Parallelization Techniques: Skill 3: Vectorization

Uploaded by

Copyright:

Available Formats

Parallelization techniques

single data multiple data

single instruction SISD SIMD

simple complex simple

SISD MIMD SIMD

lower transistor cost than MIMD

simple complex simple

__m128i integer format (8 to 64 b)

operator : name of the assembly instruction

integer comparison instructions

{_mm_cmpeq-lt-gt}_epi{8, 16, 32}

_mm_movemask_epi8 0 0 … … 16b mask of 0 or 1

there are many other SSE2,

{_mm_add, _mm_sub, _mm_mul, _mm_div,

{_mm_and, _mm_andnot, _mm_or, _mm_xor}

// Define the data types once in a header file...

// ... then use the data types in your own code

printf ("%f, %f, %f, %f\n", d[0], d[1], d[2], d[3]);

vectorInt4 e = d >= (vectorFloat4){20.f, 20.f, 20.f, 20.f};

printf ("%x, %x, %x, %x\n", e[0], e[1], e[2], e[3]);

__m128i F = _mm_mulhi_epi16(a.xmm, b.xmm);

c -48 33 -20 9 -128 105 -84 65 d

int vstrlen(const char *string16)

int vstrlen(const char *string16)

extract the position 18

You might also like