Professional Documents
Culture Documents
An Introduction To Thrust
An Introduction To Thrust
An Introduction To Thrust
Diving In
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/sort.h>

int main(void)
{
    // generate 16M random numbers on the host
    thrust::host_vector<int> h_vec(1 << 24);
    thrust::generate(h_vec.begin(), h_vec.end(), rand);

    // transfer data to the device
    thrust::device_vector<int> d_vec = h_vec;

    // sort data on the device
    thrust::sort(d_vec.begin(), d_vec.end());

    // transfer data back to host
    thrust::copy(d_vec.begin(), d_vec.end(), h_vec.begin());

    return 0;
}
2008 NVIDIA Corporation
Objectives
Programmer productivity
Rapidly develop complex applications
Leverage parallel primitives
High performance
With minimal programmer effort
Interoperability
Integrates with CUDA C/C++ code
What is Thrust?
C++ template library for CUDA
Mimics Standard Template Library (STL)
Containers
thrust::host_vector<T> thrust::device_vector<T>
Algorithms
thrust::sort() thrust::reduce() thrust::inclusive_scan() Etc.
Containers
Make common operations concise and readable
Hides cudaMalloc, cudaMemcpy and cudaFree
// allocate host vector with two elements
thrust::host_vector<int> h_vec(2);

// copy host vector to device
thrust::device_vector<int> d_vec = h_vec;

// manipulate device values from the host
d_vec[0] = 13;
d_vec[1] = 27;

std::cout << "sum: " << d_vec[0] + d_vec[1] << std::endl;

// vector memory automatically released w/ free() or cudaFree()
Containers
Compatible with STL containers
Eases integration: vector, list, map, ...
// list container on host
std::list<int> h_list;
h_list.push_back(13);
h_list.push_back(27);

// copy list to device vector
thrust::device_vector<int> d_vec(h_list.size());
thrust::copy(h_list.begin(), h_list.end(), d_vec.begin());

// alternative method
thrust::device_vector<int> d_vec(h_list.begin(), h_list.end());
Iterators
Sequences defined by pair of iterators
// allocate device vector
thrust::device_vector<int> d_vec(4);

d_vec.begin(); // returns iterator at first element of d_vec
d_vec.end();   // returns iterator one past the last element of d_vec

// [begin, end) pair defines a sequence of 4 elements
d_vec.begin()
d_vec.end()
Iterators
Iterators act like pointers
// allocate device vector
thrust::device_vector<int> d_vec(4);

thrust::device_vector<int>::iterator begin = d_vec.begin();
thrust::device_vector<int>::iterator end = d_vec.end();

int length = end - begin; // compute size of sequence [begin, end)

end = d_vec.begin() + 3;  // define a sequence of 3 elements
begin
end
Iterators
Use iterators like pointers
// allocate device vector
thrust::device_vector<int> d_vec(4);

thrust::device_vector<int>::iterator begin = d_vec.begin();

*begin = 13;       // same as d_vec[0] = 13;
int temp = *begin; // same as temp = d_vec[0];

begin++;           // advance iterator one position

*begin = 25;       // same as d_vec[1] = 25;
Iterators
Track memory space (host/device)
Guides algorithm dispatch
// initialize random values on host
thrust::host_vector<int> h_vec(1000);
thrust::generate(h_vec.begin(), h_vec.end(), rand);

// copy values to device
thrust::device_vector<int> d_vec = h_vec;

// compute sum on host
int h_sum = thrust::reduce(h_vec.begin(), h_vec.end());

// compute sum on device
int d_sum = thrust::reduce(d_vec.begin(), d_vec.end());
10
Iterators
Convertible to raw pointers
// allocate device vector
thrust::device_vector<int> d_vec(4);

// obtain raw pointer to device vector's memory
int * ptr = thrust::raw_pointer_cast(&d_vec[0]);

// use ptr in a CUDA C kernel
my_kernel<<<N / 256, 256>>>(N, ptr);

// Note: ptr cannot be dereferenced on the host!
11
Iterators
Wrap raw pointers with device_ptr
int N = 10;

// raw pointer to device memory
int * raw_ptr;
cudaMalloc((void **) &raw_ptr, N * sizeof(int));

// wrap raw pointer with a device_ptr
thrust::device_ptr<int> dev_ptr(raw_ptr);

// use device_ptr in thrust algorithms
thrust::fill(dev_ptr, dev_ptr + N, (int) 0);

// access device memory through device_ptr
dev_ptr[0] = 1;

// free memory
cudaFree(raw_ptr);
12
Namespaces
C++ supports namespaces
Thrust uses thrust namespace
thrust::device_vector thrust::copy
Avoids collisions
thrust::sort() std::sort()
For brevity
using namespace thrust;
13
Recap
Containers
Manage host & device memory
Automatic allocation and deallocation
Simplify data transfers
Iterators
Behave like pointers
Keep track of memory spaces
Convertible to raw pointers
Namespaces
Avoids collisions
14
C++ Background
Function templates
// function template to add numbers (type of T is variable)
template< typename T >
T add(T a, T b)
{
    return a + b;
}

// add integers
int x = 10; int y = 20; int z;
z = add<int>(x,y);   // type of T explicitly specified
z = add(x,y);        // type of T determined automatically

// add floats
float x = 10.0f; float y = 20.0f; float z;
z = add<float>(x,y); // type of T explicitly specified
z = add(x,y);        // type of T determined automatically
15
C++ Background
Function objects (Functors)
// templated functor to add numbers
template< typename T >
class add
{
public:
    T operator()(T a, T b)
    {
        return a + b;
    }
};

int x = 10; int y = 20; int z;
add<int> func;   // create an add functor for T=int
z = func(x,y);   // invoke functor on x and y

float x = 10; float y = 20; float z;
add<float> func; // create an add functor for T=float
z = func(x,y);   // invoke functor on x and y
2008 NVIDIA Corporation
16
C++ Background
Generic Algorithms
// apply function f to sequences x, y and store result in z
template <typename T, typename Function>
void transform(int N, T * x, T * y, T * z, Function f)
{
    for (int i = 0; i < N; i++)
        z[i] = f(x[i], y[i]);
}

int N = 100;
int x[N]; int y[N]; int z[N];

add<int> func;               // add functor for T=int

transform(N, x, y, z, func); // compute z[i] = x[i] + y[i]
17
Algorithms
Thrust provides many standard algorithms
Transformations, Reductions, Prefix Sums, Sorting
Generic definitions
General Types
Built-in types (int, float, …)
User-defined structures
General Operators
reduce with plus operator
scan with maximum operator
2008 NVIDIA Corporation
18
Algorithms
General types and operators
#include <thrust/reduce.h>

// declare storage
device_vector<int> i_vec = ...
device_vector<float> f_vec = ...

// sum of integers (equivalent calls)
reduce(i_vec.begin(), i_vec.end());
reduce(i_vec.begin(), i_vec.end(), 0, plus<int>());

// sum of floats (equivalent calls)
reduce(f_vec.begin(), f_vec.end());
reduce(f_vec.begin(), f_vec.end(), 0.0f, plus<float>());

// maximum of integers
reduce(i_vec.begin(), i_vec.end(), 0, maximum<int>());
19
Algorithms
General types and operators
struct negate_float2
{
    __host__ __device__
    float2 operator()(float2 a)
    {
        return make_float2(-a.x, -a.y);
    }
};

// declare storage
device_vector<float2> input = ...
device_vector<float2> output = ...

// create functor
negate_float2 func;

// negate vectors
transform(input.begin(), input.end(), output.begin(), func);
2008 NVIDIA Corporation
20
Algorithms
General types and operators
// compare x component of two float2 structures
struct compare_float2
{
    __host__ __device__
    bool operator()(float2 a, float2 b)
    {
        return a.x < b.x;
    }
};

// declare storage
device_vector<float2> vec = ...

// create comparison functor
compare_float2 comp;

// sort elements by x component
sort(vec.begin(), vec.end(), comp);
2008 NVIDIA Corporation
21
Algorithms
Operators with State
// predicate functor: returns true when x exceeds a stored threshold
struct is_greater_than
{
    int threshold;

    is_greater_than(int t) { threshold = t; }

    __host__ __device__
    bool operator()(int x) { return x > threshold; }
};

device_vector<int> vec = ...

// create predicate functor (returns true for x > 10)
is_greater_than pred(10);

// count number of values > 10
int result = count_if(vec.begin(), vec.end(), pred);
2008 NVIDIA Corporation
22
Recap
Algorithms
Generic
Support general types and operators
23
Fancy Iterators
Behave like "normal" iterators
Algorithms don't know the difference
Examples
constant_iterator counting_iterator transform_iterator permutation_iterator zip_iterator
24
Fancy Iterators
constant_iterator
Mimics an infinite array filled with a constant value
// create iterators
constant_iterator<int> begin(10);
constant_iterator<int> end = begin + 3;

begin[0]   // returns 10
begin[1]   // returns 10
begin[100] // returns 10

// sum of [begin, end)
reduce(begin, end); // returns 30 (i.e. 3 * 10)
A
2008 NVIDIA Corporation
A
25
Fancy Iterators
counting_iterator
Mimics an infinite array with sequential values
// create iterators
counting_iterator<int> begin(10);
counting_iterator<int> end = begin + 3;

begin[0]   // returns 10
begin[1]   // returns 11
begin[100] // returns 110

// sum of [begin, end)
reduce(begin, end); // returns 33 (i.e. 10 + 11 + 12)
0
2008 NVIDIA Corporation
3
26
Fancy Iterators
transform_iterator
Yields a transformed sequence
Facilitates kernel fusion
F( x )
X Y Z
F(
) F(
) F(
27
Fancy Iterators
transform_iterator
Conserves memory capacity and bandwidth
// initialize vector
device_vector<int> vec(3);
vec[0] = 10; vec[1] = 20; vec[2] = 30;

// create iterator (type omitted)
begin = make_transform_iterator(vec.begin(), negate<int>());
end = make_transform_iterator(vec.end(), negate<int>());

begin[0] // returns -10
begin[1] // returns -20
begin[2] // returns -30

// sum of [begin, end)
reduce(begin, end); // returns -60 (i.e. -10 + -20 + -30)
28
Fancy Iterators
zip_iterator
Looks like an array of structs (AoS)
Stored in structure of arrays (SoA)
C A X B Y C Z
29
Fancy Iterators
zip_iterator
// initialize vectors
device_vector<int> A(3);
device_vector<char> B(3);
A[0] = 10; A[1] = 20; A[2] = 30;
B[0] = 'x'; B[1] = 'y'; B[2] = 'z';

// create iterator (type omitted)
begin = make_zip_iterator(make_tuple(A.begin(), B.begin()));
end = make_zip_iterator(make_tuple(A.end(), B.end()));

begin[0] // returns tuple(10, 'x')
begin[1] // returns tuple(20, 'y')
begin[2] // returns tuple(30, 'z')

// maximum of [begin, end)
maximum< tuple<int,char> > binary_op;
reduce(begin, end, begin[0], binary_op); // returns tuple(30, 'z')
30
Best Practices
Fusion
Combine related operations together
Structure of Arrays
Ensure memory coalescing
Implicit Sequences
Eliminate memory accesses
31
Fusion
Combine related operations together
Conserves memory bandwidth
Example: SNRM2
Square each element
Compute sum of squares and take sqrt()
32
Fusion
Unoptimized implementation
// define transformation f(x) -> x^2
struct square
{
    __host__ __device__
    float operator()(float x)
    {
        return x * x;
    }
};

float snrm2_slow(device_vector<float>& x)
{
    // without fusion
    device_vector<float> temp(x.size());
    transform(x.begin(), x.end(), temp.begin(), square());

    return sqrt( reduce(temp.begin(), temp.end()) );
}
2008 NVIDIA Corporation
33
Fusion
Optimized implementation (3.8x faster)
// define transformation f(x) -> x^2
struct square
{
    __host__ __device__
    float operator()(float x)
    {
        return x * x;
    }
};

float snrm2_fast(device_vector<float>& x)
{
    // with fusion
    return sqrt( transform_reduce(x.begin(), x.end(), square(), 0.0f, plus<float>()) );
}
34
35
36
37
Implicit Sequences
Avoid storing sequences explicitly
Constant sequences
[1, 1, 1, 1, … ]
Incrementing sequences
[0, 1, 2, 3, … ]
Example
Index of the smallest element
2008 NVIDIA Corporation
38
Implicit Sequences
// return the smaller of two tuples
struct smaller_tuple
{
    tuple<float,int> operator()(tuple<float,int> a, tuple<float,int> b)
    {
        if (a < b)
            return a;
        else
            return b;
    }
};

int min_index(device_vector<float>& vec)
{
    // create explicit index sequence [0, 1, 2, ... )
    device_vector<int> indices(vec.size());
    sequence(indices.begin(), indices.end());

    tuple<float,int> init(vec[0],0);
    tuple<float,int> smallest;

    smallest = reduce(make_zip_iterator(make_tuple(vec.begin(), indices.begin())),
                      make_zip_iterator(make_tuple(vec.end(), indices.end())),
                      init,
                      smaller_tuple());

    return get<1>(smallest);
}
2008 NVIDIA Corporation
39
Implicit Sequences
// return the smaller of two tuples
struct smaller_tuple
{
    tuple<float,int> operator()(tuple<float,int> a, tuple<float,int> b)
    {
        if (a < b)
            return a;
        else
            return b;
    }
};

int min_index(device_vector<float>& vec)
{
    // create implicit index sequence [0, 1, 2, ... )
    counting_iterator<int> begin(0);
    counting_iterator<int> end(vec.size());

    tuple<float,int> init(vec[0],0);
    tuple<float,int> smallest;

    smallest = reduce(make_zip_iterator(make_tuple(vec.begin(), begin)),
                      make_zip_iterator(make_tuple(vec.end(), end)),
                      init,
                      smaller_tuple());

    return get<1>(smallest);
}
2008 NVIDIA Corporation
40
Recap
Best Practices
Fusion
3.8x faster
Structure of Arrays
2.8x faster
Implicit Sequences
3.4x faster
41
Additional Resources
Thrust
Homepage, Quick Start Guide, Documentation, Examples, MegaNewtons (blog), thrust-users (mailing list)
Other
NVIDIA Research, CUDA
42