Professional Documents
Culture Documents
Parallelization Techniques: Skill 3: Vectorization
Parallelization Techniques: Skill 3: Vectorization
Skill 3: vectorization
multiple
- MIMD
instructions
2
SISD MIMD SIMD
4
SISD MIMD SIMD
SIMD: simple to add (special register bank)
a a1 a2 a3 a4
b b1 b2 b3 b4
+ 1 instruction + + + +
c c1 c2 c3 c4
6
Altivec :
PowerPC G4, PowerPC G5, Cell processor
Playstation 3, Wii, XBox 360, TV sets
ARM NEON :
ARM v5, ARM v6, Cortex
Mobile phones, tablets, HDTV sets
MMX, 3DNow!, SSE, … :
x86 (Intel, AMD, Via, …)
Playstation 4, XBox One, PC, tablets, mobile phones…
portability ?…
7
SSE2
XMM data format
__m128 : four 32-bit FP values
__m128d : two 64-bit FP values
8
SSE2
generic instruction format
_mm_operator_format
format : {p, s}{s, d} => {packed, scalar}{single, double}
ex. : ps : packed single ; sd : scalar double
{ , e}{p, s}{i, u}{8, 16, 32, 64, 128} => {MMX, XMM}, {packed, scalar}, {signed integer, unsigned integer} + element size in bits
ex. : pi8 : 8 bytes in an MMX register ; epi32 : 4 32-bit integers in an XMM register
note : si64 & si128 : entire register only
logic instructions
_mm_and_si128, _mm_andnot_si128,
_mm_or_si128, _mm_xor_si128
_mm_andnot_si128 (a, b):
computes not(a) and b
(it does NOT compute a and not(b): the FIRST operand is the one that is inverted)
10
transverse 8 & 16 bits operations
_mm_sad_epu8, _mm_madd_epi16
- - - - - - - - - - - - - - - -
* * * * * * * *
|| || || || || || || || || || || || || || || ||
+ + + + + +
0 0 0 0 0 0 0 0 0 0 0 0
_mm_sad_epu8 _mm_madd_epi16
11
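a * (1/b) ≈ a/b (approximate reciprocal)
{_mm_rcp, _mm_rsqrt}_{p, s}s (i.e. _mm_rcp_ps, _mm_rcp_ss, _mm_rsqrt_ps, _mm_rsqrt_ss)
a * (1/√a) ≈ √a (approximate reciprocal square root)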
13
Our example:
#include <stdio.h>
return 0;
}
14
#include <stdio.h>
#include <emmintrin.h>
/*
 * Multiplies two vectors of eight signed 16-bit integers with SSE2 and
 * prints the eight full 32-bit products.
 *
 * SSE2 has no single 16 x 16 -> 32 bit multiply that keeps every product,
 * so the result is assembled in two steps:
 *   1. _mm_mulhi_epi16 / _mm_mullo_epi16 give the upper and lower 16 bits
 *      of each product;
 *   2. _mm_unpacklo_epi16 / _mm_unpackhi_epi16 interleave those halves so
 *      that each (low, high) pair forms one 32-bit integer.
 *
 * Expected output: 9, -20, 33, -48, 65, -84, 105, -128
 *
 * Returns 0.
 */
int main(void)
{
    /* The unions give element-wise access to the contents of an XMM value. */
    union
    {
        short entier[8];
        __m128i xmm;
    } a = {{1, 2, 3, 4, 5, 6, 7, 8}}, b = {{9, -10, 11, -12, 13, -14, 15, -16}};
    union
    {
        __m128i xmm;
        int entier[4];
    } c, d;

    /* Upper and lower 16-bit halves of the eight signed products a[i] * b[i]. */
    const __m128i hi = _mm_mulhi_epi16(a.xmm, b.xmm);
    const __m128i lo = _mm_mullo_epi16(a.xmm, b.xmm);

    /*
     * Interleave low half first, then high half, so each pair is laid out as
     * one 32-bit integer: c receives products 0..3, d receives products 4..7.
     */
    c.xmm = _mm_unpacklo_epi16(lo, hi);
    d.xmm = _mm_unpackhi_epi16(lo, hi);

    printf ("%d, %d, %d, %d, ", c.entier[0], c.entier[1], c.entier[2], c.entier[3]);
    printf ("%d, %d, %d, %d\n", d.entier[0], d.entier[1], d.entier[2], d.entier[3]);
    return 0;
}
15
_mm_mulhi_epi16 _mm_mullo_epi16
a 8 7 6 5 4 3 2 1 8 7 6 5 4 3 2 1 a
* * * * * * * * * * * * * * * *
b -16 15 -14 13 -12 11 -10 9 -16 15 -14 13 -12 11 -10 9 b
F h h h h h h h h l l l l l l l l f
c h l h l h l h l h l h l h l h l d
_mm_unpacklo_epi16 _mm_unpackhi_epi16
very similar
16
...
static const char Zero[16] = {0};
17
...
static const char Zero[16] = {0};
_mm_cmpeq_epi8
#error write your code here and remove this line
return size;
}
0 0 0 FF 0 0 0 0 0 0 FF 0 0 0 0 0
_mm_movemask_epi8
32 b mask 0…0 0001 0000 0010 0000