You are on page 1 of 42

Thrust

Jared Hoberock and Nathan Bell NVIDIA Research

Diving In
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/sort.h>

int main(void)
{
    // generate 16M random numbers on the host
    thrust::host_vector<int> h_vec(1 << 24);
    thrust::generate(h_vec.begin(), h_vec.end(), rand);

    // transfer data to the device
    thrust::device_vector<int> d_vec = h_vec;

    // sort data on the device
    thrust::sort(d_vec.begin(), d_vec.end());

    // transfer data back to host
    thrust::copy(d_vec.begin(), d_vec.end(), h_vec.begin());

    return 0;
}
2008 NVIDIA Corporation

Objectives
Programmer productivity
Rapidly develop complex applications
Leverage parallel primitives

Encourage generic programming


Don't reinvent the wheel
E.g. one reduction to rule them all

High performance
With minimal programmer effort

Interoperability
Integrates with CUDA C/C++ code

2008 NVIDIA Corporation

What is Thrust?
C++ template library for CUDA
Mimics Standard Template Library (STL)

Containers
thrust::host_vector<T> thrust::device_vector<T>

Algorithms
thrust::sort() thrust::reduce() thrust::inclusive_scan() Etc.

2008 NVIDIA Corporation

Containers
Make common operations concise and readable
Hides cudaMalloc, cudaMemcpy and cudaFree
// allocate host vector with two elements
thrust::host_vector<int> h_vec(2);

// copy host vector to device
thrust::device_vector<int> d_vec = h_vec;

// manipulate device values from the host
d_vec[0] = 13;
d_vec[1] = 27;

std::cout << "sum: " << d_vec[0] + d_vec[1] << std::endl;

// vector memory automatically released w/ free() or cudaFree()

2008 NVIDIA Corporation

Containers
Compatible with STL containers
Eases integration
vector, list, map, ...
// list container on host
std::list<int> h_list;
h_list.push_back(13);
h_list.push_back(27);

// copy list to device vector
thrust::device_vector<int> d_vec(h_list.size());
thrust::copy(h_list.begin(), h_list.end(), d_vec.begin());

// alternative method
thrust::device_vector<int> d_vec(h_list.begin(), h_list.end());

2008 NVIDIA Corporation

Iterators
Sequences defined by pair of iterators
// allocate device vector
thrust::device_vector<int> d_vec(4);

d_vec.begin(); // returns iterator at first element of d_vec
d_vec.end()    // returns iterator one past the last element of d_vec

// [begin, end) pair defines a sequence of 4 elements

d_vec.begin()

d_vec.end()

2008 NVIDIA Corporation

Iterators
Iterators act like pointers
// allocate device vector
thrust::device_vector<int> d_vec(4);

thrust::device_vector<int>::iterator begin = d_vec.begin();
thrust::device_vector<int>::iterator end   = d_vec.end();

int length = end - begin; // compute size of sequence [begin, end)

end = d_vec.begin() + 3;  // define a sequence of 3 elements

begin

end

2008 NVIDIA Corporation

Iterators
Use iterators like pointers
// allocate device vector
thrust::device_vector<int> d_vec(4);

thrust::device_vector<int>::iterator begin = d_vec.begin();

*begin = 13;        // same as d_vec[0] = 13;
int temp = *begin;  // same as temp = d_vec[0];

begin++;            // advance iterator one position

*begin = 25;        // same as d_vec[1] = 25;

2008 NVIDIA Corporation

Iterators
Track memory space (host/device)
Guides algorithm dispatch
// initialize random values on host
thrust::host_vector<int> h_vec(1000);
thrust::generate(h_vec.begin(), h_vec.end(), rand);

// copy values to device
thrust::device_vector<int> d_vec = h_vec;

// compute sum on host
int h_sum = thrust::reduce(h_vec.begin(), h_vec.end());

// compute sum on device
int d_sum = thrust::reduce(d_vec.begin(), d_vec.end());

2008 NVIDIA Corporation

10

Iterators
Convertible to raw pointers

// allocate device vector
thrust::device_vector<int> d_vec(4);

// obtain raw pointer to device vector's memory
int * ptr = thrust::raw_pointer_cast(&d_vec[0]);

// use ptr in a CUDA C kernel
my_kernel<<<N / 256, 256>>>(N, ptr);

// Note: ptr cannot be dereferenced on the host!

2008 NVIDIA Corporation

11

Iterators
Wrap raw pointers with device_ptr
int N = 10;

// raw pointer to device memory
int * raw_ptr;
cudaMalloc((void **) &raw_ptr, N * sizeof(int));

// wrap raw pointer with a device_ptr
thrust::device_ptr<int> dev_ptr(raw_ptr);

// use device_ptr in thrust algorithms
thrust::fill(dev_ptr, dev_ptr + N, (int) 0);

// access device memory through device_ptr
dev_ptr[0] = 1;

// free memory
cudaFree(raw_ptr);
12

2008 NVIDIA Corporation

Namespaces
C++ supports namespaces
Thrust uses thrust namespace
thrust::device_vector thrust::copy

STL uses std namespace


std::vector std::list

Avoids collisions
thrust::sort() std::sort()

For brevity
using namespace thrust;
13

2008 NVIDIA Corporation

Recap
Containers
Manage host & device memory
Automatic allocation and deallocation
Simplify data transfers

Iterators
Behave like pointers
Keep track of memory spaces
Convertible to raw pointers

Namespaces
Avoids collisions

2008 NVIDIA Corporation

14

C++ Background
Function templates
// function template to add numbers (type of T is variable)
template< typename T >
T add(T a, T b)
{
    return a + b;
}

// add integers
int x = 10; int y = 20; int z;
z = add<int>(x,y);   // type of T explicitly specified
z = add(x,y);        // type of T determined automatically

// add floats
float x = 10.0f; float y = 20.0f; float z;
z = add<float>(x,y); // type of T explicitly specified
z = add(x,y);        // type of T determined automatically

2008 NVIDIA Corporation

15

C++ Background
Function objects (Functors)
// templated functor to add numbers
template< typename T >
class add
{
    public:
    T operator()(T a, T b)
    {
        return a + b;
    }
};

int x = 10; int y = 20; int z;
add<int> func;   // create an add functor for T=int
z = func(x,y);   // invoke functor on x and y

float x = 10; float y = 20; float z;
add<float> func; // create an add functor for T=float
z = func(x,y);   // invoke functor on x and y
2008 NVIDIA Corporation

16

C++ Background
Generic Algorithms
// apply function f to sequences x, y and store result in z
template <typename T, typename Function>
void transform(int N, T * x, T * y, T * z, Function f)
{
    for (int i = 0; i < N; i++)
        z[i] = f(x[i], y[i]);
}

int N = 100;
int x[N]; int y[N]; int z[N];

add<int> func;                     // add functor for T=int

transform(N, x, y, z, func);       // compute z[i] = x[i] + y[i]

transform(N, x, y, z, add<int>()); // equivalent

2008 NVIDIA Corporation

17

Algorithms
Thrust provides many standard algorithms
Transformations
Reductions
Prefix Sums
Sorting

Generic definitions
General Types
Built-in types (int, float, ...)
User-defined structures

General Operators
reduce with plus operator
scan with maximum operator
2008 NVIDIA Corporation

18

Algorithms
General types and operators
#include <thrust/reduce.h>

// declare storage
device_vector<int>   i_vec = ...
device_vector<float> f_vec = ...

// sum of integers (equivalent calls)
reduce(i_vec.begin(), i_vec.end());
reduce(i_vec.begin(), i_vec.end(), 0, plus<int>());

// sum of floats (equivalent calls)
reduce(f_vec.begin(), f_vec.end());
reduce(f_vec.begin(), f_vec.end(), 0.0f, plus<float>());

// maximum of integers
reduce(i_vec.begin(), i_vec.end(), 0, maximum<int>());

2008 NVIDIA Corporation

19

Algorithms
General types and operators
struct negate_float2
{
    __host__ __device__
    float2 operator()(float2 a)
    {
        return make_float2(-a.x, -a.y);
    }
};

// declare storage
device_vector<float2> input = ...
device_vector<float2> output = ...

// create functor
negate_float2 func;

// negate vectors
transform(input.begin(), input.end(), output.begin(), func);
2008 NVIDIA Corporation

20

Algorithms
General types and operators
// compare x component of two float2 structures
struct compare_float2
{
    __host__ __device__
    bool operator()(float2 a, float2 b)
    {
        return a.x < b.x;
    }
};

// declare storage
device_vector<float2> vec = ...

// create comparison functor
compare_float2 comp;

// sort elements by x component
sort(vec.begin(), vec.end(), comp);
2008 NVIDIA Corporation

21

Algorithms
Operators with State
// compare x component of two float2 structures
struct is_greater_than
{
    int threshold;

    is_greater_than(int t) { threshold = t; }

    __host__ __device__
    bool operator()(int x) { return x > threshold; }
};

device_vector<int> vec = ...

// create predicate functor (returns true for x > 10)
is_greater_than pred(10);

// count number of values > 10
int result = count_if(vec.begin(), vec.end(), pred);
2008 NVIDIA Corporation

22

Recap
Algorithms
Generic
Support general types and operators

Statically dispatched based on iterator type


Memory space is known at compile time

Have default arguments


reduce(begin, end) reduce(begin, end, init, binary_op)

2008 NVIDIA Corporation

23

Fancy Iterators
Behave like "normal" iterators
Algorithms don't know the difference

Examples
constant_iterator counting_iterator transform_iterator permutation_iterator zip_iterator

2008 NVIDIA Corporation

24

Fancy Iterators
constant_iterator
Mimics an infinite array filled with a constant value
// create iterators
constant_iterator<int> begin(10);
constant_iterator<int> end = begin + 3;

begin[0]   // returns 10
begin[1]   // returns 10
begin[100] // returns 10

// sum of [begin, end)
reduce(begin, end); // returns 30 (i.e. 3 * 10)

A
2008 NVIDIA Corporation

A
25

Fancy Iterators
counting_iterator
Mimics an infinite array with sequential values
// create iterators
counting_iterator<int> begin(10);
counting_iterator<int> end = begin + 3;

begin[0]   // returns 10
begin[1]   // returns 11
begin[100] // returns 110

// sum of [begin, end)
reduce(begin, end); // returns 33 (i.e. 10 + 11 + 12)

0
2008 NVIDIA Corporation

3
26

Fancy Iterators
transform_iterator
Yields a transformed sequence
Facilitates kernel fusion

F( x )
X Y Z

F(

) F(

) F(

2008 NVIDIA Corporation

27

Fancy Iterators
transform_iterator
Conserves memory capacity and bandwidth
// initialize vector
device_vector<int> vec(3);
vec[0] = 10; vec[1] = 20; vec[2] = 30;

// create iterator (type omitted)
begin = make_transform_iterator(vec.begin(), negate<int>());
end   = make_transform_iterator(vec.end(),   negate<int>());

begin[0] // returns -10
begin[1] // returns -20
begin[2] // returns -30

// sum of [begin, end)
reduce(begin, end); // returns -60 (i.e. -10 + -20 + -30)

2008 NVIDIA Corporation

28

Fancy Iterators
zip_iterator
Looks like an array of structs (AoS)
Stored in structure of arrays (SoA)

C A X B Y C Z

2008 NVIDIA Corporation

29

Fancy Iterators
zip_iterator
// initialize vectors
device_vector<int>  A(3);
device_vector<char> B(3);
A[0] = 10;  A[1] = 20;  A[2] = 30;
B[0] = 'x'; B[1] = 'y'; B[2] = 'z';

// create iterator (type omitted)
begin = make_zip_iterator(make_tuple(A.begin(), B.begin()));
end   = make_zip_iterator(make_tuple(A.end(),   B.end()));

begin[0] // returns tuple(10, 'x')
begin[1] // returns tuple(20, 'y')
begin[2] // returns tuple(30, 'z')

// maximum of [begin, end)
maximum< tuple<int,char> > binary_op;
reduce(begin, end, begin[0], binary_op); // returns tuple(30, 'z')

2008 NVIDIA Corporation

30

Best Practices
Fusion
Combine related operations together

Structure of Arrays
Ensure memory coalescing

Implicit Sequences
Eliminate memory accesses

2008 NVIDIA Corporation

31

Fusion
Combine related operations together
Conserves memory bandwidth

Example: SNRM2
Square each element
Compute sum of squares and take sqrt()

2008 NVIDIA Corporation

32

Fusion
Unoptimized implementation
// define transformation f(x) -> x^2
struct square
{
    __host__ __device__
    float operator()(float x)
    {
        return x * x;
    }
};

float snrm2_slow(device_vector<float>& x)
{
    // without fusion
    device_vector<float> temp(x.size());
    transform(x.begin(), x.end(), temp.begin(), square());

    return sqrt( reduce(temp.begin(), temp.end()) );
}
2008 NVIDIA Corporation

33

6usion
,ptimiAed implementation 1B$Cx %aster2
// define transformation f(3) 85 3;' struct s?uare { __host__ __device__ $loat o'erator()($loat :) { return : 5 :" * *" $loat snrm _$ast(device_vector<$loat>8 :) { // with fusion return s?rt( trans$orm_reduce(:.begin()# :.end()# s?uare()# ).)$# 'lus<$loat>())" *

2008 NVIDIA Corporation

34

Structure of Arrays (SoA)


Array of Structures (AoS)
Often does not obey coalescing rules
device_vector<float3>

Structure of Arrays (SoA)


Obeys coalescing rules
Components stored in separate arrays
device_vector<float> x, y, z;

Example: Rotate 3d vectors


SoA is 2.8x faster

2008 NVIDIA Corporation

35

Structure of Arrays (SoA)


struct rotate_float3
{
    __host__ __device__
    float3 operator()(float3 v)
    {
        float x = v.x;
        float y = v.y;
        float z = v.z;

        float rx =  0.36f*x +  0.48f*y + -0.80f*z;
        float ry = -0.80f*x +  0.60f*y +  0.00f*z;
        float rz =  0.48f*x +  0.64f*y +  0.60f*z;

        return make_float3(rx, ry, rz);
    }
};

...

device_vector<float3> vec(N);

transform(vec.begin(), vec.end(), vec.begin(), rotate_float3());

2008 NVIDIA Corporation

36

Structure of Arrays (SoA)


struct rotate_tuple
{
    __host__ __device__
    tuple<float,float,float> operator()(tuple<float,float,float> v)
    {
        float x = get<0>(v);
        float y = get<1>(v);
        float z = get<2>(v);

        float rx =  0.36f*x +  0.48f*y + -0.80f*z;
        float ry = -0.80f*x +  0.60f*y +  0.00f*z;
        float rz =  0.48f*x +  0.64f*y +  0.60f*z;

        return make_tuple(rx, ry, rz);
    }
};

...

device_vector<float> x(N), y(N), z(N);

transform(make_zip_iterator(make_tuple(x.begin(), y.begin(), z.begin())),
          make_zip_iterator(make_tuple(x.end(), y.end(), z.end())),
          make_zip_iterator(make_tuple(x.begin(), y.begin(), z.begin())),
          rotate_tuple());

2008 NVIDIA Corporation

37

Implicit Sequences
Avoid storing sequences explicitly
Constant sequences
[1, 1, 1, 1, ... ]

Incrementing sequences
[0, 1, 2, 3, ... ]

Implicit sequences require no storage


constant_iterator counting_iterator

Example
Index of the smallest element
2008 NVIDIA Corporation

38

Implicit Sequences
// return the smaller of two tuples
struct smaller_tuple
{
    tuple<float,int> operator()(tuple<float,int> a, tuple<float,int> b)
    {
        if (a < b)
            return a;
        else
            return b;
    }
};

int min_index(device_vector<float>& vec)
{
    // create explicit index sequence [0, 1, 2, ... )
    device_vector<int> indices(vec.size());
    sequence(indices.begin(), indices.end());

    tuple<float,int> init(vec[0],0);
    tuple<float,int> smallest;

    smallest = reduce(make_zip_iterator(make_tuple(vec.begin(), indices.begin())),
                      make_zip_iterator(make_tuple(vec.end(), indices.end())),
                      init,
                      smaller_tuple());

    return get<1>(smallest);
}
2008 NVIDIA Corporation

39

Implicit Sequences
// return the smaller of two tuples
struct smaller_tuple
{
    tuple<float,int> operator()(tuple<float,int> a, tuple<float,int> b)
    {
        if (a < b)
            return a;
        else
            return b;
    }
};

int min_index(device_vector<float>& vec)
{
    // create implicit index sequence [0, 1, 2, ... )
    counting_iterator<int> begin(0);
    counting_iterator<int> end(vec.size());

    tuple<float,int> init(vec[0],0);
    tuple<float,int> smallest;

    smallest = reduce(make_zip_iterator(make_tuple(vec.begin(), begin)),
                      make_zip_iterator(make_tuple(vec.end(), end)),
                      init,
                      smaller_tuple());

    return get<1>(smallest);
}
2008 NVIDIA Corporation

40

Recap
Best Practices
Fusion
3.8x faster

Structure of Arrays
2.8x faster

Implicit Sequences
3.4x faster

2008 NVIDIA Corporation

41

Additional Resources
Thrust
Homepage
Quick Start Guide
Documentation
Examples
MegaNewtons (blog)
thrust-users (mailing list)

Other
NVIDIA Research
CUDA

2008 NVIDIA Corporation

42

You might also like