ISO C++ (parallel algorithms):

std::transform(par, x, x+n, y, y,
               [=](float x, float y) { return y + a*x; });
...
matrix_product(par, mA, mB, mC);

ISO Fortran (do concurrent):

do concurrent (i = 1:n)
  y(i) = y(i) + a*x(i)
enddo
...
C = matmul(A, B)

Python (cuNumeric):

import cunumeric as np

def saxpy(a, x, y):
    y[:] += a*x
...
c = np.matmul(a, b)

CUDA C++:

__global__
void saxpy(int n, float a, float *x, float *y) {
  int i = blockIdx.x*blockDim.x + threadIdx.x;
  if (i < n) y[i] += a*x[i];
}

int main(void) {
  ...
  cudaMemcpy(d_x, x, ...);
  cudaMemcpy(d_y, y, ...);
  saxpy<<<(N+255)/256,256>>>(...);
}
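To make the ISO C++ column concrete, here is a minimal self-contained sketch of the same saxpy; the problem size and values are illustrative assumptions, and with NVHPC's nvc++ -stdpar=gpu the identical code offloads to a GPU:

#include <algorithm>
#include <execution>
#include <vector>
#include <cstdio>

int main() {
  const int n = 1 << 20;   // illustrative assumption
  const float a = 2.0f;
  std::vector<float> x(n, 1.0f), y(n, 3.0f);

  // saxpy: y = a*x + y, dispatched to a vendor-optimized parallel backend
  std::transform(std::execution::par, x.begin(), x.end(), y.begin(), y.begin(),
                 [a](float xi, float yi) { return yi + a * xi; });

  std::printf("y[0] = %f\n", y[0]);  // prints 5.0
}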
ACCELERATION LIBRARIES
Core | Math | Communication | Data Analytics | AI | Quantum
Common Algorithms that Dispatch to Vendor-Optimized Parallel Libraries
Tools to Write Your Own Parallel Algorithms that Run Anywhere
Mechanisms for Composing Parallel Invocations into Task Graphs

sender auto algorithm (sender auto s) {
  return s | bulk(N,
    [] (auto data) {
      // ...
    });
}

sender auto algorithm (sender auto s) {
  return s | bulk(
      [] (auto data) {
        // ...
      })
    | bulk(
      [] (auto data) {
        // ...
      });
}
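A hedged, runnable sketch of the sender/bulk pattern above, using stdexec (NVIDIA's reference implementation of the std::execution proposal, P2300); the thread-pool size and data are illustrative assumptions:

#include <stdexec/execution.hpp>
#include <exec/static_thread_pool.hpp>
#include <vector>
#include <cstdio>

int main() {
  exec::static_thread_pool pool(4);        // CPU scheduler backend
  auto sched = pool.get_scheduler();
  std::vector<float> data(1000, 1.0f);

  // bulk launches one invocation per index; each handles one element.
  auto work = stdexec::schedule(sched)
            | stdexec::bulk(data.size(), [&](std::size_t i) {
                data[i] *= 2.0f;
              });

  stdexec::sync_wait(std::move(work));     // run the task graph to completion
  std::printf("data[0] = %f\n", data[0]);  // prints 2.0
}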
M-AIA WITH C++17 PARALLEL ALGORITHMS
Multi-physics simulation framework from RWTH Aachen University
§ Hierarchical grids, complex moving geometries
§ Adaptive meshing, load balancing
§ Numerical methods: FV, DG, LBM, FEM, Level-Set, ...
§ Physics: aeroacoustics, combustion, biomedical, ...
§ Developed by ~20 PhDs (Mech. Eng.), ~500k LOC++
§ Programming model: MPI + ISO C++ parallelism

#pragma omp parallel                                      // OpenMP parallel region
{
  #pragma omp for                                         // OpenMP for loop
  for (MInt i = 0; i < noCells; i++) {                    // Loop over all cells
    if (timeStep % ipow2[maxLevel_ - clevel[i * distLevel]] == 0) {  // Multi-grid loop
      const MInt distStartId = i * nDist;                 // Local offsets for 1D accesses
      const MInt distNeighStartId = i * distNeighbors;
      const MFloat* const distributionsStart = &distributions[distStartId];
      for (MInt j = 0; j < nDist - 1; j += 2) {           // Loop over distributions, unrolled by a factor of 2
        if (neighborId[distNeighStartId + j] > -1) {      // First unrolled iteration
          const MInt n1StartId = neighborId[distNeighStartId + j] * nDist;
          oldDistributions[n1StartId + j] = distributionsStart[j];  // 1D access, AoS format
        }
        if (neighborId[distNeighStartId + j + 1] > -1) {  // Second unrolled iteration
          const MInt n2StartId = neighborId[distNeighStartId + j + 1] * nDist;
          oldDistributions[n2StartId + j + 1] = distributionsStart[j + 1];
        }
      }
      oldDistributions[distStartId + lastId] = distributionsStart[lastId];  // Zero-th distribution
    }
  }
}
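For comparison with the OpenMP version above, a hedged sketch of the same propagation loop written with the ISO C++17 parallel algorithms named in the slide title; the type aliases, function signature, and raw-pointer captures are assumptions standing in for M-AIA's real code:

#include <algorithm>
#include <execution>
#include <numeric>
#include <vector>

using MInt = int;       // assumption: M-AIA integer type
using MFloat = double;  // assumption: M-AIA floating-point type

void propagationStep(std::vector<MFloat>& oldDistributions,
                     const std::vector<MFloat>& distributions,
                     const std::vector<MInt>& neighborId,
                     MInt noCells, MInt nDist, MInt distNeighbors, MInt lastId) {
  std::vector<MInt> cells(noCells);
  std::iota(cells.begin(), cells.end(), 0);          // cell indices 0..noCells-1

  // Capture raw pointers by value: with nvc++ -stdpar=gpu the heap data is in
  // unified memory, so the same loop runs on CPU threads or on the GPU.
  MFloat* const oldDist = oldDistributions.data();
  const MFloat* const dist = distributions.data();
  const MInt* const nbr = neighborId.data();

  std::for_each(std::execution::par, cells.begin(), cells.end(), [=](MInt i) {
    const MInt distStartId = i * nDist;
    const MInt distNeighStartId = i * distNeighbors;
    for (MInt j = 0; j < nDist - 1; ++j) {           // streaming step, no manual unrolling
      if (nbr[distNeighStartId + j] > -1) {
        const MInt nStartId = nbr[distNeighStartId + j] * nDist;
        oldDist[nStartId + j] = dist[distStartId + j];
      }
    }
    oldDist[distStartId + lastId] = dist[distStartId + lastId];
  });
}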
[Chart: relative speed-up for decaying isotropic turbulence with 400k fully-resolved particles. OpenMP (2x EPYC 7742) = 1, ISO C++ (2x EPYC 7742) = 1.025, ISO C++ (A100) substantially higher; y-axis spans 0-8.]
PARALLELISM IN C++ ROADMAP
Common Algorithms that Dispatch to Vendor-Optimized Parallel Libraries
Tools to Write Your Own Parallel Algorithms that Run Anywhere
Mechanisms for Composing Parallel Invocations into Task Graphs
Fragment of the Maxwell solver's sender pipeline (the enclosing call is elided in the source):

    n_outer_iterations,
    repeat_n(
      n_inner_iterations,
      schedule(scheduler)
        | bulk(grid.cells, update_h(accessor))
        | transfer(writer)
        | then(dump_results(report_step, accessor)))
  );
}
ELECTROMAGNETISM
Raw performance & % of peak
The same solver runs on each backend simply by passing a different scheduler:

std::sync_wait(maxwell(inline_scheduler, inline_scheduler));
std::sync_wait(maxwell(openmp_scheduler, inline_scheduler));
std::sync_wait(maxwell(cuda, inline_scheduler));
[Charts: Maxwell solver raw performance (left) and percentage of peak (right) for the OpenMP 128, OpenMP 256, CUDA (1 A100), and CUDA (2 A100) schedulers.]
[Chart: speedup (left axis, 0-35) and a secondary 0-1 axis versus number of GPUs (0-1200).]
NVIDIA SUPERPOD
§ 140x NVIDIA DGX A100 640GB systems
§ 1,120x NVIDIA A100-SXM4-80GB GPUs
Strong Scaling
[Chart: speed-up (0-16) versus number of A100 GPUs (32-512).]
Co-Arrays
Partitioned Global Address Space arrays, teams of processes (images), collectives & synchronization.
Awaiting F18.
MINIWEATHER
Standard Language Parallelism in Climate/Weather Applications
MiniWeather
Mini-app written in C++ and Fortran that simulates weather-like fluid flows using finite volume and Runge-Kutta methods.
Existing parallelization in MPI, OpenMP, OpenACC, …
Included in the SPEChpc benchmark suite*
Open source and commonly used in training events.
https://github.com/mrnorman/miniWeather/
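MiniWeather's do concurrent loops have a direct ISO C++ counterpart in its C++ variant. A minimal hedged sketch of the pattern over a 2D grid follows; the array names and the flux expression are illustrative assumptions, not MiniWeather's actual code:

#include <algorithm>
#include <execution>
#include <numeric>
#include <vector>

int main() {
  const int NX = 2000, NZ = 1000;  // grid extents from the benchmark footnote below
  std::vector<double> state(NX * NZ, 1.0);
  std::vector<double> flux(state.size());
  std::vector<int> idx(state.size());
  std::iota(idx.begin(), idx.end(), 0);

  double* const f = flux.data();
  const double* const s = state.data();
  // Equivalent of Fortran: do concurrent (k = 1:NZ, i = 1:NX) ... enddo
  std::for_each(std::execution::par, idx.begin(), idx.end(), [=](int n) {
    const int i = n % NX, k = n / NX;  // recover the 2D indices
    f[n] = 0.5 * s[n] * (i + k);       // stand-in for the real flux kernel
  });
}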
Source: HPC SDK 22.1, AMD EPYC 7742, NVIDIA A100. MiniWeather: NX=2000, NZ=1000, SIM_TIME=5. The OpenACC version uses the -gpu=managed option.
*SPEChpc is a trademark of the Standard Performance Evaluation Corporation.
POT3D: DO CONCURRENT
POT3D
POT3D is a Fortran application for approximating solar coronal magnetic fields.
Included in the SPEChpc benchmark suite*
Existing parallelization in MPI & OpenACC
Optimized the DO CONCURRENT version by using OpenACC solely for data motion and atomics.
https://github.com/predsci/POT3D
MICROSCOPY WITH RICHARDSON-LUCY DECONVOLUTION
# Richardson-Lucy iteration (as in skimage.restoration.richardson_lucy);
# np is numpy or cunumeric, convolve is scipy's (or cupy's) convolve.
for _ in range(num_iter):
    conv = convolve(im_deconv, psf, mode='same')
    if filter_epsilon:
        with np.errstate(invalid='ignore'):
            relative_blur = np.where(conv < filter_epsilon, 0,
                                     image / conv)
    else:
        relative_blur = image / conv
    im_deconv *= convolve(relative_blur, psf_mirror, mode='same')
if clip:
    im_deconv[im_deconv > 1] = 1
    im_deconv[im_deconv < -1] = -1
return im_deconv
COMPUTATIONAL FLUID DYNAMICS

for _ in range(iter):
    un = u.copy()
    vn = v.copy()
    …

[Chart: time in seconds (0-100) versus 1, 2, 4, ..., 1024.]

Extracted from the "CFD Python" course at https://github.com/barbagroup/CFDPython
Barba, Lorena A., and Forsyth, Gilbert F. (2018). CFD Python: the 12 steps to Navier-Stokes equations. Journal of Open Source Education, 1(9), 21. https://doi.org/10.21105/jose.00021
MICRO-JOIN
[Chart: performance comparison ("vs."); one implementation is ~1700 LOC; x-axis 8-128, y-axis 0-0.5.]
Recently Introduced
§ FP64 Tensor Core accelerated BLAS3
§ Improved heuristics for GEMV
§ Batched GEMV
§ Helper functions for improved error management

[Chart: speedup over CTK 11.1 (0-7) for SYRK, SYMM, TRMM (double real) and SYRK, SYMM, TRMM, HEMM, HERK (double complex).]
* A100 80GB @ 1095 MHz: CTK 11.1 vs. CTK 11.6U1
Releasing cuTENSOR v1.5
• Improvements to Tensor Contractions, up to 33x
• Supporting more than 28 modes

[Chart: TFLOPS (larger is better, 0-120) for v1.2.2 vs. v1.5 across 300 random cases. DGX A100 80GB.]
Multi-Node Multi-GPU FFT in cuFFT
Coming to HPC SDK 22.3
§ Helper functions: Pencils <-> Slabs

[Charts: performance versus number of GPUs (8-4096) for problem sizes (cubed) 2048-16384, and versus problem size (cubed) 1024-8192; individual data labels omitted.]
* Selene: A100 80GB @ 1410 MHz
MATH LIBRARIES DEVICE EXTENSIONS

Released in MathDx 22.02
§ Available on DevZone
§ Supports Volta+ architectures
§ FFT 1D sizes up to 32k

Future Releases
§ cuBLASDx/cuSOLVERDx
§ 2D/3D FFTs
§ Windows support

[Chart: cuFFTDx performance compared with cuFFT across 1D FFT sizes 2-32768; TFLOPS (larger is better), y-axis 0-25.]
* A100 80GB @ 1410 MHz
AGENDA
Accelerated Computing with Standard Languages
Fault-Tolerant QC Era: 1000:1-10000:1 redundancy for error-corrected logical qubits [Fowler 2012][Reiher 2016]. Exponential speedups on a limited set of applications with hundreds to thousands of logical qubits (millions of physical qubits). Active research: what are the best error correction algorithms?

Noisy Intermediate Scale Quantum (NISQ) Era: quantum gates are noisy, errors accumulate, and qubits lose coherence. QC hardware will mitigate errors by using tens to hundreds of redundant physical qubits per logical qubit. Active research: will NISQs have quantum advantage on useful workloads?

Quantum Supremacy Threshold: experimental confirmation of quantum speedup on a well-defined (not necessarily useful) problem. Below this threshold, qubits and quantum gates are very noisy and the hardware is not very usable. Active research: can this be simulated efficiently on GPU supercomputers?

[Chart: physical qubits (1 to 10,000,000, log scale) versus year (2010-2040), with the quantum supremacy and fault-tolerant thresholds marked.]
GPU-BASED SUPERCOMPUTING IN THE QC ECOSYSTEM
Researching the Quantum Computers of Tomorrow with the Supercomputers of Today
cuQuantum
DGX Quantum Appliance Available Now
MULTI-GPU CONTAINER WITH CIRQ/QSIM/CUQUANTUM
cuQuantum
ACCELERATING THE QUANTUM COMPUTING ECOSYSTEM
Leading groups across academia and industry are jump-starting their Quantum Computing Research with cuQuantum
Industry Partners
Supercomputing Partners
CUQUANTUM PARTNER HIGHLIGHTS
cuQuantum-accelerated research in quantum chemistry, climate modeling, quantum supremacy, and more
NVQ++
§ Introducing NVQ++: a state-of-the-art quantum-classical C++ compiler (invoked as nvq++)
§ Interoperable with existing parallel programming models
§ Implements a kernel-based approach that can be compiled to the Quantum Intermediate Representation (QIR)
§ Native support for cuQuantum backend emulation, extensible to vendor quantum computers
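To illustrate the kernel-based approach, a minimal sketch in the style of the CUDA Quantum (cudaq) API that nvq++ compiles, adapted from public cudaq examples; it is an assumption about the programming model, not code shown in this talk:

#include <cudaq.h>

// Quantum kernel preparing a 2-qubit Bell state.
struct bell {
  void operator()() __qpu__ {
    cudaq::qvector q(2);
    h(q[0]);                     // put qubit 0 in superposition
    x<cudaq::ctrl>(q[0], q[1]);  // entangle qubit 1 with qubit 0
    mz(q);                       // measure both qubits
  }
};

int main() {
  // Sampling runs on the cuQuantum-accelerated simulator by default.
  auto counts = cudaq::sample(bell{});
  counts.dump();                 // expect roughly 50/50 "00" and "11"
}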