TensorRT Engine Deployment Guide
Overview
Building and using a TensorRT engine for inference involves two distinct phases: build time (optimization) and runtime (execution).
- Build time (optimization and compilation)
  - Load the model, typically from a format such as ONNX.
  - Parse the model to extract the network structure and weights.
  - Optimize the model: layer and tensor fusion, precision calibration, kernel auto-tuning.
  - Serialize the optimized network into a single, executable engine file (a minimal build sketch follows this list).
- Runtime (execution)
  - Create a Runtime instance.
  - Deserialize the engine into an ICudaEngine object.
  - Create an IExecutionContext from the engine; it manages a single inference session.
  - Prepare buffers for input and output.
  - Execute inference.
  - Retrieve results.
  - Free resources.
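The build phase is normally done once, offline. It can be driven by the trtexec command-line tool (for example, trtexec --onnx=model.onnx --saveEngine=model.engine) or programmatically through the builder API. The following is a minimal, illustrative sketch of the programmatic path, assuming a TensorRT 8.x-style API, an ONNX model named model.onnx, and a logger implementing nvinfer1::ILogger (such as the one defined in the full example later in this guide); error checking is omitted for brevity.
#include <fstream>
#include "NvInfer.h"
#include "NvOnnxParser.h"

// Illustrative build sketch (assumes TensorRT 8.x and an ONNX model "model.onnx")
nvinfer1::IHostMemory* buildEngine(nvinfer1::ILogger& logger) {
    nvinfer1::IBuilder* builder = nvinfer1::createInferBuilder(logger);

    // ONNX models require an explicit-batch network definition
    const auto flags = 1U << static_cast<uint32_t>(
        nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
    nvinfer1::INetworkDefinition* network = builder->createNetworkV2(flags);

    // Parse the ONNX file into the TensorRT network definition
    nvonnxparser::IParser* parser = nvonnxparser::createParser(*network, logger);
    parser->parseFromFile("model.onnx",
        static_cast<int>(nvinfer1::ILogger::Severity::kWARNING));

    // Builder configuration: precision flags, workspace limits, calibration, etc.
    nvinfer1::IBuilderConfig* config = builder->createBuilderConfig();
    config->setFlag(nvinfer1::BuilderFlag::kFP16); // optional, if the GPU supports FP16

    // Optimize the network and serialize it into an engine blob
    nvinfer1::IHostMemory* serialized = builder->buildSerializedNetwork(*network, *config);

    // Write the blob to disk so the runtime steps below can load it
    std::ofstream out("model.engine", std::ios::binary);
    out.write(static_cast<const char*>(serialized->data()), serialized->size());
    return serialized;
}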
Runtime Deployment Steps
1. Create a Runtime Object
Create an instance of the TensorRT runtime. This object is the entry point for all inference operations and is responsible for managing the execution of engines. It's a key part of the TensorRT API and is a lightweight object designed to be quickly instantiated.
nvinfer1::IRuntime* runtime = nvinfer1::createInferRuntime(gLogger);
Or, in Python:
import tensorrt as trt
runtime = trt.Runtime(trt.Logger())
2. Deserialize the Engine
The engine file (.engine or .plan) is a serialized binary file. The runtime's job is to read the file and deserialize it back into an ICudaEngine
object. This process loads the pre-optimized network graph and weights into the GPU's memory.
std::ifstream file("my_model.engine", std::ios::binary);
file.seekg(0, std::ios::end);
size_t size = file.tellg();
file.seekg(0, std::ios::beg);
std::vector<char> engine_data(size);
file.read(engine_data.data(), size);
nvinfer1::ICudaEngine* engine = runtime->deserializeCudaEngine(engine_data.data(), size);
with open("my_model.engine", "rb") as f:
    engine = runtime.deserialize_cuda_engine(f.read())
3. Create an Execution Context
The engine holds the model's core structure and weights, but to actually run inference you need an execution context. The context is a lightweight object that holds the state for a single inference, including temporary GPU memory for intermediate tensors (activations). An engine can create multiple execution contexts, which is useful for running parallel inferences on the same model (see the sketch after the code below).
nvinfer1::IExecutionContext* context = engine->createExecutionContext();
context = engine.create_execution_context()
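As a rough illustration of the multiple-context point above, the hedged sketch below runs two inferences concurrently on the same engine. The helper name runTwoContexts and the binding arrays are hypothetical; each context needs its own device buffers and its own CUDA stream, and the asynchronous enqueueV2 call (discussed in step 5) is used instead of executeV2.
#include <cuda_runtime.h>
#include "NvInfer.h"

// Hypothetical helper: run the same engine concurrently in two contexts.
// Each context gets its own bindings array (device buffers) and its own stream.
void runTwoContexts(nvinfer1::ICudaEngine* engine, void** bindingsA, void** bindingsB) {
    nvinfer1::IExecutionContext* ctxA = engine->createExecutionContext();
    nvinfer1::IExecutionContext* ctxB = engine->createExecutionContext();

    cudaStream_t streamA, streamB;
    cudaStreamCreate(&streamA);
    cudaStreamCreate(&streamB);

    // enqueueV2 is asynchronous, so both inferences are in flight at the same time
    ctxA->enqueueV2(bindingsA, streamA, nullptr);
    ctxB->enqueueV2(bindingsB, streamB, nullptr);

    // Wait for both to finish before reading the results
    cudaStreamSynchronize(streamA);
    cudaStreamSynchronize(streamB);

    cudaStreamDestroy(streamA);
    cudaStreamDestroy(streamB);
    ctxA->destroy();
    ctxB->destroy();
}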
4. Prepare Input and Output Buffers
Before running the model, you must allocate GPU memory for the input and output tensors, then copy the input data from host (CPU) memory to device (GPU) memory.
void* buffers[2]; // Pointers for input and output
cudaMalloc(&buffers[0], input_size);
cudaMalloc(&buffers[1], output_size);
cudaMemcpy(buffers[0], host_input_data, input_size, cudaMemcpyHostToDevice);
import pycuda.driver as cuda
import pycuda.autoinit  # initializes the CUDA context

d_input = cuda.mem_alloc(input_data.nbytes)
d_output = cuda.mem_alloc(output_data.nbytes)
cuda.memcpy_htod(d_input, input_data)
5. Run the Inference
With the context and buffers ready, you can execute the model. The executeV2 method of the execution context runs the optimized kernels synchronously and returns once the computation is complete, while enqueueV2 and enqueueV3 launch the kernels asynchronously on a CUDA stream and return immediately, without waiting for the computation to finish. A sketch of the asynchronous path follows the snippets below.
context->executeV2(buffers);
context.execute_v2(bindings=[int(d_input), int(d_output)])
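For the asynchronous path mentioned above, a hedged sketch of an enqueueV2 pipeline on a CUDA stream might look like this, reusing the buffers, sizes, and host arrays from step 4:
cudaStream_t stream;
cudaStreamCreate(&stream);

// Copy input, launch inference, and copy output, all asynchronously on one stream
cudaMemcpyAsync(buffers[0], host_input_data, input_size, cudaMemcpyHostToDevice, stream);
context->enqueueV2(buffers, stream, nullptr);
cudaMemcpyAsync(host_output_data, buffers[1], output_size, cudaMemcpyDeviceToHost, stream);

// Block until the whole pipeline has finished; only then is the output safe to read
cudaStreamSynchronize(stream);
cudaStreamDestroy(stream);
For the copies to overlap truly asynchronously, the host buffers should be allocated as pinned memory (for example with cudaMallocHost); with pageable memory the calls still produce correct results but may synchronize internally.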
6. Retrieve the Results
After the inference is complete, copy the results from GPU memory back to CPU memory for post-processing. If you used the asynchronous enqueue path, use CUDA synchronization such as cudaStreamSynchronize to ensure the GPU computation has finished before reading the data.
cudaMemcpy(host_output_data, buffers[1], output_size, cudaMemcpyDeviceToHost);
cuda.memcpy_dtoh(output_data, d_output)
7. Free Allocated Memory, Destroy Objects, Contexts
Finally, free the allocated GPU buffers and destroy the context, engine, and runtime objects. This ensures GPU memory and resources are properly released, preventing memory leaks.
cudaFree(buffers[0]);
cudaFree(buffers[1]);

context->destroy(); // in newer TensorRT releases destroy() is deprecated; delete the objects instead
engine->destroy();
runtime->destroy();
d_input.free()
d_output.free()

del context
del engine
del runtime
Example Code
Below is a complete C++ example that loads a pre-built TensorRT engine and runs inference.
#include <iostream>
#include <fstream>
#include <vector>
#include <string>
#include <cstdlib>
#include <cuda_runtime.h>

#include "NvInfer.h"

// Custom TensorRT logger to catch warnings and errors
class Logger : public nvinfer1::ILogger {
public:
    void log(Severity severity, const char* msg) noexcept override {
        if (severity <= Severity::kINFO) { // Adjust severity level as needed
            std::cout << msg << std::endl;
        }
    }
};

// --- Helper Functions for CUDA Memory Management ---
void checkCudaErrors(cudaError_t err) {
    if (err != cudaSuccess) {
        std::cerr << "CUDA error: " << cudaGetErrorString(err) << std::endl;
        exit(EXIT_FAILURE);
    }
}

void* safeCudaMalloc(size_t memSize) {
    void* deviceMem = nullptr;
    checkCudaErrors(cudaMalloc(&deviceMem, memSize));
    return deviceMem;
}

// Number of elements in a tensor with the given dimensions
size_t volume(const nvinfer1::Dims& dims) {
    size_t v = 1;
    for (int i = 0; i < dims.nbDims; ++i) {
        v *= dims.d[i];
    }
    return v;
}

// --- Main Function for Inference ---
int main() {
    Logger logger;

    // 1. Create a TensorRT Runtime object
    nvinfer1::IRuntime* runtime = nvinfer1::createInferRuntime(logger);
    if (!runtime) {
        std::cerr << "Failed to create IRuntime." << std::endl;
        return -1;
    }

    // 2. Load and deserialize the engine file
    std::string engine_path = "model.engine";
    std::ifstream file(engine_path, std::ios::binary);
    if (!file.good()) {
        std::cerr << "Error opening engine file: " << engine_path << std::endl;
        return -1;
    }

    file.seekg(0, std::ios::end);
    size_t size = file.tellg();
    file.seekg(0, std::ios::beg);
    std::vector<char> engine_data(size);
    file.read(engine_data.data(), size);

    nvinfer1::ICudaEngine* engine = runtime->deserializeCudaEngine(engine_data.data(), size);
    if (!engine) {
        std::cerr << "Failed to deserialize engine." << std::endl;
        return -1;
    }

    // 3. Create an Execution Context
    nvinfer1::IExecutionContext* context = engine->createExecutionContext();
    if (!context) {
        std::cerr << "Failed to create IExecutionContext." << std::endl;
        return -1;
    }

    // 4. Prepare host and device memory buffers
    // Assume a single-input, single-output FP32 network with known (static) dimensions
    const int input_idx = engine->getBindingIndex("input_tensor");
    const int output_idx = engine->getBindingIndex("output_tensor");

    nvinfer1::Dims input_dims = engine->getBindingDimensions(input_idx);
    nvinfer1::Dims output_dims = engine->getBindingDimensions(output_idx);
    size_t input_size = volume(input_dims) * sizeof(float);   // assumes FP32 bindings
    size_t output_size = volume(output_dims) * sizeof(float);

    std::vector<float> host_input_data(input_size / sizeof(float));
    std::vector<float> host_output_data(output_size / sizeof(float));

    // Fill host_input_data with your actual input (e.g., a pre-processed image)
    // For demonstration, we'll use a dummy input
    std::cout << "Preparing dummy input..." << std::endl;
    for (size_t i = 0; i < host_input_data.size(); ++i) {
        host_input_data[i] = static_cast<float>(rand()) / static_cast<float>(RAND_MAX);
    }

    // Allocate GPU memory
    void* device_buffers[2];
    device_buffers[input_idx] = safeCudaMalloc(input_size);
    device_buffers[output_idx] = safeCudaMalloc(output_size);

    // Copy input data from host to device
    checkCudaErrors(cudaMemcpy(device_buffers[input_idx], host_input_data.data(), input_size, cudaMemcpyHostToDevice));

    // 5. Run the inference (executeV2 is synchronous)
    std::cout << "Running inference..." << std::endl;
    bool status = context->executeV2(device_buffers);
    if (!status) {
        std::cerr << "Inference execution failed." << std::endl;
        return -1;
    }

    // 6. Retrieve the results
    checkCudaErrors(cudaMemcpy(host_output_data.data(), device_buffers[output_idx], output_size, cudaMemcpyDeviceToHost));

    // Print a few output values for verification
    std::cout << "Inference successful. First 5 output values:" << std::endl;
    for (int i = 0; i < 5; ++i) {
        std::cout << host_output_data[i] << " ";
    }
    std::cout << std::endl;

    // 7. Clean up and free resources
    checkCudaErrors(cudaFree(device_buffers[input_idx]));
    checkCudaErrors(cudaFree(device_buffers[output_idx]));

    context->destroy(); // in newer TensorRT releases destroy() is deprecated; delete the objects instead
    engine->destroy();
    runtime->destroy();

    std::cout << "Cleaned up resources successfully." << std::endl;

    return 0;
}
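To compile the example, link against the TensorRT and CUDA runtime libraries. On a typical Linux installation this looks something like g++ -std=c++14 trt_inference.cpp -o trt_inference -lnvinfer -lcudart, where the source file name is a placeholder and you may need to add -I/-L flags pointing at your CUDA and TensorRT installation directories.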