22 #include <unordered_map>
26 #include <driver_types.h>
31 using Duration = std::chrono::high_resolution_clock::duration;
43 static std::shared_ptr<CudaRTCFunction> Compile(
44 const std::string& name,
45 const std::string& source);
49 const std::array<size_t, 3>& grid,
50 const std::array<size_t, 3>& block,
51 unsigned int shared_mem,
56 std::vector<int> params,
57 std::vector<void*> outputs,
58 std::vector<const void*> inputs,
59 bool profile = false) const;
Duration Launch(const std::array< size_t, 3 > &grid, const std::array< size_t, 3 > &block, unsigned int shared_mem, cudaStream_t stream, std::vector< int > params, std::vector< void * > outputs, std::vector< const void * > inputs, bool profile=false) const
std::vector< char > nvrtc_ptx
Definition: rtc.h:67
std::string specializedName
Definition: rtc.h:66
std::unordered_map< size_t, CUmodule > perGpuModule_
Definition: rtc.h:64
std::chrono::high_resolution_clock::duration Duration
Definition: rtc.h:31
std::unordered_map< size_t, CUfunction > perGpuKernel_
Definition: rtc.h:65
static std::shared_ptr< CudaRTCFunction > Compile(const std::string &name, const std::string &source)
bool cleared_
Definition: rtc.h:68