template<typename Creator, typename Deleter>
cudaGraphBase class
class to create a CUDA graph managed by C++ smart pointer
| Template parameters | |
|---|---|
| Creator | functor to create the CUDA graph (used in constructor) |
| Deleter | functor to delete the CUDA graph (used in destructor) |
This class wraps a cudaGraph_t handle with std::unique_ptr so that the graph is released automatically through Deleter.
Public types
-
using base_type = std::unique_ptr<std::remove_pointer_t<cudaGraph_t>, Deleter>
- base std::unique_ptr type
Constructors, destructors, conversion operators
-
template<typename... ArgsT>cudaGraphBase(ArgsT && ... args) explicit
- constructs a cudaGraph object by passing the given arguments to the executable CUDA graph creator
- cudaGraphBase(cudaGraphBase&&) defaulted
- constructs a cudaGraph from the given rhs using move semantics
Public functions
- auto operator=(cudaGraphBase&&) -> cudaGraphBase& defaulted
- assigns the rhs to *this using move semantics
- auto num_nodes() const -> size_t
- queries the number of nodes in a native CUDA graph
- auto num_edges() const -> size_t
- queries the number of edges in a native CUDA graph
- auto empty() const -> bool
- queries if the graph is empty
-
void dump(std::ostream& os)
- dumps the CUDA graph to a DOT format through the given output stream
- auto noop() -> cudaTask
- creates a no-operation task
-
template<typename C>auto host(C&& callable, void* user_data) -> cudaTask
- creates a host task that runs a callable on the host
-
template<typename F, typename... ArgsT>auto kernel(dim3 g, dim3 b, size_t s, F f, ArgsT... args) -> cudaTask
- creates a kernel task
- auto memset(void* dst, int v, size_t count) -> cudaTask
- creates a memset task that fills untyped data with a byte value
- auto memcpy(void* tgt, const void* src, size_t bytes) -> cudaTask
- creates a memcpy task that copies untyped data in bytes
-
template<typename T, std::enable_if_t<is_pod_v<T> && (sizeof(T)==1||sizeof(T)==2||sizeof(T)==4), void>* = nullptr>auto zero(T* dst, size_t count) -> cudaTask
- creates a memset task that sets a typed memory block to zero
-
template<typename T, std::enable_if_t<is_pod_v<T> && (sizeof(T)==1||sizeof(T)==2||sizeof(T)==4), void>* = nullptr>auto fill(T* dst, T value, size_t count) -> cudaTask
- creates a memset task that fills a typed memory block with a value
-
template<typename T, std::enable_if_t<!std::is_same_v<T, void>, void>* = nullptr>auto copy(T* tgt, const T* src, size_t num) -> cudaTask
- creates a memcopy task that copies typed data
-
template<typename C>auto single_task(C c) -> cudaTask
- runs a callable with only a single kernel thread
-
template<typename I, typename C, typename E = cudaDefaultExecutionPolicy>auto for_each(I first, I last, C callable) -> cudaTask
- applies a callable to each dereferenced element of the data array
-
template<typename I, typename C, typename E = cudaDefaultExecutionPolicy>auto for_each_index(I first, I last, I step, C callable) -> cudaTask
- applies a callable to each index in the range with the step size
-
template<typename I, typename O, typename C, typename E = cudaDefaultExecutionPolicy>auto transform(I first, I last, O output, C op) -> cudaTask
- applies a callable to a source range and stores the result in a target range
-
template<typename I1, typename I2, typename O, typename C, typename E = cudaDefaultExecutionPolicy>auto transform(I1 first1, I1 last1, I2 first2, O output, C op) -> cudaTask
- creates a task to perform parallel transforms over two ranges of items
Function documentation
template<typename Creator, typename Deleter>
template<typename... ArgsT>
tf:: cudaGraphBase<Creator, Deleter>:: cudaGraphBase(ArgsT && ... args) explicit
constructs a cudaGraph object by passing the given arguments to the executable CUDA graph creator
| Parameters | |
|---|---|
| args | arguments to pass to the executable CUDA graph creator |
Constructs a cudaGraph object by passing the given arguments to the executable CUDA graph creator
template<typename Creator, typename Deleter>
void tf:: cudaGraphBase<Creator, Deleter>:: dump(std:: ostream& os)
dumps the CUDA graph to a DOT format through the given output stream
| Parameters | |
|---|---|
| os | target output stream |
template<typename Creator, typename Deleter>
cudaTask tf:: cudaGraphBase<Creator, Deleter>:: noop()
creates a no-operation task
| Returns | a tf::cudaTask handle |
|---|
An empty node performs no operation during execution, but can be used for transitive ordering. For example, a phased execution graph with 2 groups of n nodes with a barrier between them can be represented using an empty node and 2*n dependency edges, rather than no empty node and n^2 dependency edges.
template<typename Creator, typename Deleter>
template<typename C>
cudaTask tf:: cudaGraphBase<Creator, Deleter>:: host(C&& callable,
void* user_data)
creates a host task that runs a callable on the host
| Template parameters | |
|---|---|
| C | callable type |
| Parameters | |
| callable | a callable object with neither arguments nor return (i.e., constructible from std::function<void()>) |
| user_data | a pointer to the user data |
| Returns | a tf::cudaTask handle |
A host task can only execute CPU-specific functions and cannot do any CUDA calls (e.g., cudaMalloc).
template<typename Creator, typename Deleter>
template<typename F, typename... ArgsT>
cudaTask tf:: cudaGraphBase<Creator, Deleter>:: kernel(dim3 g,
dim3 b,
size_t s,
F f,
ArgsT... args)
creates a kernel task
| Template parameters | |
|---|---|
| F | kernel function type |
| ArgsT | kernel function parameters type |
| Parameters | |
| g | configured grid |
| b | configured block |
| s | configured shared memory size in bytes |
| f | kernel function |
| args | arguments to forward to the kernel function by copy |
| Returns | a tf::cudaTask handle |
template<typename Creator, typename Deleter>
cudaTask tf:: cudaGraphBase<Creator, Deleter>:: memset(void* dst,
int v,
size_t count)
creates a memset task that fills untyped data with a byte value
| Parameters | |
|---|---|
| dst | pointer to the destination device memory area |
| v | value to set for each byte of specified memory |
| count | size in bytes to set |
| Returns | a tf::cudaTask handle |
A memset task fills the first count bytes of device memory area pointed by dst with the byte value v.
template<typename Creator, typename Deleter>
cudaTask tf:: cudaGraphBase<Creator, Deleter>:: memcpy(void* tgt,
const void* src,
size_t bytes)
creates a memcpy task that copies untyped data in bytes
| Parameters | |
|---|---|
| tgt | pointer to the target memory block |
| src | pointer to the source memory block |
| bytes | bytes to copy |
| Returns | a tf::cudaTask handle |
A memcpy task transfers bytes of data from a source location to a target location. Direction can be arbitrary among CPUs and GPUs.
template<typename Creator, typename Deleter>
template<typename T, std::enable_if_t<is_pod_v<T> && (sizeof(T)==1||sizeof(T)==2||sizeof(T)==4), void>* = nullptr>
cudaTask tf:: cudaGraphBase<Creator, Deleter>:: zero(T* dst,
size_t count)
creates a memset task that sets a typed memory block to zero
| Template parameters | |
|---|---|
| T | element type (size of T must be either 1, 2, or 4) |
| Parameters | |
| dst | pointer to the destination device memory area |
| count | number of elements |
| Returns | a tf::cudaTask handle |
A zero task zeroes the first count elements of type T in a device memory area pointed by dst.
template<typename Creator, typename Deleter>
template<typename T, std::enable_if_t<is_pod_v<T> && (sizeof(T)==1||sizeof(T)==2||sizeof(T)==4), void>* = nullptr>
cudaTask tf:: cudaGraphBase<Creator, Deleter>:: fill(T* dst,
T value,
size_t count)
creates a memset task that fills a typed memory block with a value
| Template parameters | |
|---|---|
| T | element type (size of T must be either 1, 2, or 4) |
| Parameters | |
| dst | pointer to the destination device memory area |
| value | value to fill for each element of type T |
| count | number of elements |
| Returns | a tf::cudaTask handle |
A fill task fills the first count elements of type T with value in a device memory area pointed by dst. The value to fill is interpreted in type T rather than byte.
template<typename Creator, typename Deleter>
template<typename T, std::enable_if_t<!std::is_same_v<T, void>, void>* = nullptr>
cudaTask tf:: cudaGraphBase<Creator, Deleter>:: copy(T* tgt,
const T* src,
size_t num)
creates a memcopy task that copies typed data
| Template parameters | |
|---|---|
| T | element type (non-void) |
| Parameters | |
| tgt | pointer to the target memory block |
| src | pointer to the source memory block |
| num | number of elements to copy |
| Returns | a tf::cudaTask handle |
A copy task transfers num*sizeof(T) bytes of data from a source location to a target location. Direction can be arbitrary among CPUs and GPUs.
template<typename Creator, typename Deleter>
template<typename C>
cudaTask tf:: cudaGraphBase<Creator, Deleter>:: single_task(C c)
runs a callable with only a single kernel thread
| Template parameters | |
|---|---|
| C | callable type |
| Parameters | |
| c | callable to run by a single kernel thread |
| Returns | a tf::cudaTask handle |
template<typename Creator, typename Deleter>
template<typename I, typename C, typename E = cudaDefaultExecutionPolicy>
cudaTask tf:: cudaGraphBase<Creator, Deleter>:: for_each(I first,
I last,
C callable)
applies a callable to each dereferenced element of the data array
| Template parameters | |
|---|---|
| I | iterator type |
| C | callable type |
| E | execution policy (default tf::cudaDefaultExecutionPolicy) |
| Parameters | |
| first | iterator to the beginning (inclusive) |
| last | iterator to the end (exclusive) |
| callable | a callable object to apply to the dereferenced iterator |
| Returns | a tf::cudaTask handle |
This method is equivalent to the parallel execution of the following loop on a GPU:
for(auto itr = first; itr != last; itr++) { callable(*itr); }
template<typename Creator, typename Deleter>
template<typename I, typename C, typename E = cudaDefaultExecutionPolicy>
cudaTask tf:: cudaGraphBase<Creator, Deleter>:: for_each_index(I first,
I last,
I step,
C callable)
applies a callable to each index in the range with the step size
| Template parameters | |
|---|---|
| I | index type |
| C | callable type |
| E | execution policy (default tf::cudaDefaultExecutionPolicy) |
| Parameters | |
| first | beginning index |
| last | last index |
| step | step size |
| callable | the callable to apply to each index in the range |
| Returns | a tf::cudaTask handle |
This method is equivalent to the parallel execution of the following loop on a GPU:
// step is positive [first, last)
for(auto i=first; i<last; i+=step) { callable(i); }
// step is negative [first, last)
for(auto i=first; i>last; i+=step) { callable(i); }
template<typename Creator, typename Deleter>
template<typename I, typename O, typename C, typename E = cudaDefaultExecutionPolicy>
cudaTask tf:: cudaGraphBase<Creator, Deleter>:: transform(I first,
I last,
O output,
C op)
applies a callable to a source range and stores the result in a target range
| Template parameters | |
|---|---|
| I | input iterator type |
| O | output iterator type |
| C | unary operator type |
| E | execution policy (default tf::cudaDefaultExecutionPolicy) |
| Parameters | |
| first | iterator to the beginning of the input range |
| last | iterator to the end of the input range |
| output | iterator to the beginning of the output range |
| op | the operator to apply to transform each element in the range |
| Returns | a tf::cudaTask handle |
This method is equivalent to the parallel execution of the following loop on a GPU:
while (first != last) { *output++ = op(*first++); }
template<typename Creator, typename Deleter>
template<typename I1, typename I2, typename O, typename C, typename E = cudaDefaultExecutionPolicy>
cudaTask tf:: cudaGraphBase<Creator, Deleter>:: transform(I1 first1,
I1 last1,
I2 first2,
O output,
C op)
creates a task to perform parallel transforms over two ranges of items
| Template parameters | |
|---|---|
| I1 | first input iterator type |
| I2 | second input iterator type |
| O | output iterator type |
| C | binary operator type |
| E | execution policy (default tf::cudaDefaultExecutionPolicy) |
| Parameters | |
| first1 | iterator to the beginning of the input range |
| last1 | iterator to the end of the input range |
| first2 | iterator to the beginning of the second input range |
| output | iterator to the beginning of the output range |
| op | binary operator to apply to transform each pair of items in the two input ranges |
| Returns | cudaTask handle |
This method is equivalent to the parallel execution of the following loop on a GPU:
while (first1 != last1) { *output++ = op(*first1++, *first2++); }