TensorRT Engine Deployment Guide

Overview

Building and using a TensorRT engine for inference involves two distinct phases: build time (optimization) and runtime (execution).

  1. Build time (optimization and compilation); a sketch of this phase follows the list below.
    1. Load the model, typically from a format such as ONNX.
    2. Parse the model to understand its network structure and weights.
    3. Optimize the model: layer and tensor fusion, precision calibration, and kernel auto-tuning.
    4. Serialize the optimized network into a single, executable engine file.
  2. Runtime (Execution)
    1. Create Runtime instance.
    2. Deserialize the engine into an ICudaEngine object.
    3. Create an IExecutionContext from the engine; it manages a single inference session.
    4. Prepare buffers for input and output.
    5. Execute inference.
    6. Retrieve results.
    7. Free resources.
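
The build phase is usually run once, offline, and the resulting engine file is what the runtime steps below consume. As a minimal sketch of that phase (assuming TensorRT 8.x, the ONNX parser, and illustrative file names model.onnx and model.engine; the FP16 flag is an optional choice, not a requirement), the C++ builder API looks roughly like this:

// Build-phase sketch (assumes TensorRT 8.x and an ONNX model "model.onnx").
#include <fstream>
#include "NvInfer.h"
#include "NvOnnxParser.h"

void buildEngine(nvinfer1::ILogger& logger) {
    // Create the builder and an explicit-batch network definition.
    auto builder = nvinfer1::createInferBuilder(logger);
    auto network = builder->createNetworkV2(
        1U << static_cast<uint32_t>(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH));

    // Parse the ONNX model into the network definition.
    auto parser = nvonnxparser::createParser(*network, logger);
    parser->parseFromFile("model.onnx",
                          static_cast<int>(nvinfer1::ILogger::Severity::kWARNING));

    // Configure the optimization; FP16 here is an illustrative precision choice.
    auto config = builder->createBuilderConfig();
    config->setFlag(nvinfer1::BuilderFlag::kFP16);

    // Build and serialize the optimized engine, then write the plan to disk.
    nvinfer1::IHostMemory* plan = builder->buildSerializedNetwork(*network, *config);
    std::ofstream out("model.engine", std::ios::binary);
    out.write(static_cast<const char*>(plan->data()), plan->size());
}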

Runtime Deployment Steps

1. Create a Runtime Object

Create an instance of the TensorRT runtime. This object is the entry point for all inference operations and is responsible for managing the execution of engines. It's a key part of the TensorRT API and is a lightweight object designed to be quickly instantiated.

nvinfer1::IRuntime* runtime = nvinfer1::createInferRuntime(gLogger);

Or, in Python:

runtime = trt.Runtime(trt.Logger())

2. Deserialize the Engine

The engine file (.engine or .plan) is a serialized binary file. The runtime's job is to read the file and deserialize it back into an ICudaEngine object. This process loads the pre-optimized network graph and weights into the GPU's memory.

// Read the serialized engine file into memory.
std::ifstream file("my_model.engine", std::ios::binary);
file.seekg(0, std::ios::end);
size_t size = file.tellg();
file.seekg(0, std::ios::beg);
std::vector<char> engine_data(size);
file.read(engine_data.data(), size);

// Deserialize it into an ICudaEngine.
nvinfer1::ICudaEngine* engine = runtime->deserializeCudaEngine(engine_data.data(), size);

with open("my_model.engine", "rb") as f:
    engine = runtime.deserialize_cuda_engine(f.read())

3. Create an Execution Context

The engine holds the model's core structure and weights, but to actually perform an inference run you need an execution context. The context is a lightweight object that holds the state for a single inference, including temporary GPU memory for intermediate tensors (activations). An engine can create multiple contexts, which is useful for running parallel inferences on the same model, as the sketch after the snippets below shows.

nvinfer1::IExecutionContext* context = engine->createExecutionContext();

context = engine.create_execution_context()
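
As a minimal sketch of the multi-context case (assuming static shapes, or one optimization profile per context when shapes are dynamic), each context is typically paired with its own CUDA stream and its own buffers so inferences can run concurrently:

// Sketch: two execution contexts created from the same engine, each intended to
// run on its own CUDA stream. Assumes static shapes or one optimization profile
// per context; each context also needs its own input/output buffers.
nvinfer1::IExecutionContext* contextA = engine->createExecutionContext();
nvinfer1::IExecutionContext* contextB = engine->createExecutionContext();

cudaStream_t streamA, streamB;
cudaStreamCreate(&streamA);
cudaStreamCreate(&streamB);
// Each context then enqueues its work on its own stream.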

4. Prepare Input and Output Buffers

Before running the model, you must allocate GPU memory for the inputs and outputs, then copy the input data from host (CPU) memory to device (GPU) memory.

void* buffers[2]; // Pointers for input and output
cudaMalloc(&buffers[0], input_size);
cudaMalloc(&buffers[1], output_size);
cudaMemcpy(buffers[0], host_input_data, input_size, cudaMemcpyHostToDevice);

d_input = cuda.mem_alloc(input_data.nbytes)
d_output = cuda.mem_alloc(output_data.nbytes)
cuda.memcpy_htod(d_input, input_data)

5. Run the Inference

With the context and buffers ready, you can execute the model. The execution context's executeV2 method launches the optimized kernels on the GPU and blocks until they finish. Alternatively, enqueueV2 or enqueueV3 submits the work to a CUDA stream asynchronously: the call returns immediately without waiting for the computation to complete. A sketch of the asynchronous variant follows the snippets below.

context->executeV2(buffers);

context.execute_v2(bindings=[int(d_input), int(d_output)])
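
As a minimal sketch of the asynchronous path (assuming TensorRT 8.5 or later, where tensor addresses are registered by name; the names "input_tensor" and "output_tensor" are illustrative):

// Asynchronous execution sketch (assumes TensorRT 8.5+; tensor names are illustrative).
cudaStream_t stream;
cudaStreamCreate(&stream);

// Register the device buffer addresses by tensor name, then enqueue the inference.
context->setTensorAddress("input_tensor", buffers[0]);
context->setTensorAddress("output_tensor", buffers[1]);
context->enqueueV3(stream);      // returns immediately

// Wait for the GPU work to finish before reading the output.
cudaStreamSynchronize(stream);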

6. Retrieve the Results

After the inference is complete, copy the results from GPU memory back to CPU memory for post-processing. If you used an asynchronous enqueue call, synchronize first (for example with cudaStreamSynchronize) to ensure the GPU computation has finished before reading the data.

cudaMemcpy(host_output_data, buffers[1], output_size, cudaMemcpyDeviceToHost);

cuda.memcpy_dtoh(output_data, d_output)
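
For the asynchronous path, a sketch that reuses the hypothetical stream from the previous step copies the output back with cudaMemcpyAsync and synchronizes before touching the host buffer:

// Asynchronous retrieval sketch (reuses the stream from the enqueueV3 sketch above).
cudaMemcpyAsync(host_output_data, buffers[1], output_size,
                cudaMemcpyDeviceToHost, stream);
cudaStreamSynchronize(stream);   // host_output_data is valid only after this returns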

7. Free Allocated Memory, Destroy Contexts and Objects

Free the GPU memory allocated for the input and output buffers, then destroy the context, engine, and runtime objects. This ensures GPU memory and other resources are properly released, preventing memory leaks.

cudaFree(buffers[0]);
cudaFree(buffers[1]);

context->destroy();
engine->destroy();
runtime->destroy();

d_input.free()
d_output.free()

# Release the Python objects; the bindings free the underlying resources.
del context
del engine
del runtime
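
Note that in TensorRT 8.x and later the destroy() methods are deprecated; the equivalent C++ cleanup uses plain delete (or smart pointers), as sketched here:

// In TensorRT 8.x+, destroy() is deprecated; the objects are released with delete.
delete context;
delete engine;
delete runtime;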

Example Code

Below is a complete C++ example that loads a pre-built TensorRT engine and runs inference.

#include <iostream>
#include <fstream>
#include <vector>
#include <string>
#include <cstdlib>
#include <cuda_runtime.h>

#include "NvInfer.h"

// Custom TensorRT logger to catch warnings and errors
class Logger : public nvinfer1::ILogger {
public:
    void log(Severity severity, const char* msg) noexcept override {
        if (severity <= Severity::kINFO) { // Adjust severity level as needed
            std::cout << msg << std::endl;
        }
    }
};

// --- Helper Functions for CUDA Memory Management ---
void* safeCudaMalloc(size_t memSize) {
    void* deviceMem = nullptr;
    cudaError_t err = cudaMalloc(&deviceMem, memSize);
    if (err != cudaSuccess || deviceMem == nullptr) {
        std::cerr << "CUDA memory allocation failed: " << cudaGetErrorString(err) << std::endl;
        exit(EXIT_FAILURE);
    }
    return deviceMem;
}

void checkCudaErrors(cudaError_t err) {
    if (err != cudaSuccess) {
        std::cerr << "CUDA error: " << cudaGetErrorString(err) << std::endl;
        exit(EXIT_FAILURE);
    }
}

// --- Main Function for Inference ---
int main() {
    Logger logger;

    // 1. Create a TensorRT Runtime object
    nvinfer1::IRuntime* runtime = nvinfer1::createInferRuntime(logger);
    if (!runtime) {
        std::cerr << "Failed to create IRuntime." << std::endl;
        return -1;
    }

    // 2. Load and deserialize the engine file
    std::string engine_path = "model.engine";
    std::ifstream file(engine_path, std::ios::binary);
    if (!file.good()) {
        std::cerr << "Error opening engine file: " << engine_path << std::endl;
        return -1;
    }

    file.seekg(0, std::ios::end);
    size_t size = file.tellg();
    file.seekg(0, std::ios::beg);
    std::vector<char> engine_data(size);
    file.read(engine_data.data(), size);

    nvinfer1::ICudaEngine* engine = runtime->deserializeCudaEngine(engine_data.data(), size);
    if (!engine) {
        std::cerr << "Failed to deserialize engine." << std::endl;
        return -1;
    }

    // 3. Create an Execution Context
    nvinfer1::IExecutionContext* context = engine->createExecutionContext();
    if (!context) {
        std::cerr << "Failed to create IExecutionContext." << std::endl;
        return -1;
    }

    // 4. Prepare host and device memory buffers
    // Assume a single-input, single-output float32 network with known dimensions
    const int input_idx = engine->getBindingIndex("input_tensor");
    const int output_idx = engine->getBindingIndex("output_tensor");

    nvinfer1::Dims input_dims = engine->getBindingDimensions(input_idx);
    size_t input_count = 1;
    for (int i = 0; i < input_dims.nbDims; ++i) {
        input_count *= input_dims.d[i];
    }
    size_t input_size = input_count * sizeof(float); // assumes a kFLOAT input binding

    nvinfer1::Dims output_dims = engine->getBindingDimensions(output_idx);
    size_t output_count = 1;
    for (int i = 0; i < output_dims.nbDims; ++i) {
        output_count *= output_dims.d[i];
    }
    size_t output_size = output_count * sizeof(float); // assumes a kFLOAT output binding

    std::vector<float> host_input_data(input_count);
    std::vector<float> host_output_data(output_count);

    // Fill host_input_data with your actual input (e.g., pre-processed image)
    // For demonstration, we'll use a dummy input
    std::cout << "Preparing dummy input..." << std::endl;
    for (size_t i = 0; i < host_input_data.size(); ++i) {
        host_input_data[i] = static_cast<float>(rand()) / static_cast<float>(RAND_MAX);
    }

    // Allocate GPU memory
    void* device_buffers[2];
    device_buffers[input_idx] = safeCudaMalloc(input_size);
    device_buffers[output_idx] = safeCudaMalloc(output_size);

    // Copy input data from host to device
    checkCudaErrors(cudaMemcpy(device_buffers[input_idx], host_input_data.data(), input_size, cudaMemcpyHostToDevice));

    // 5. Run the inference
    std::cout << "Running inference..." << std::endl;
    bool status = context->executeV2(device_buffers);
    if (!status) {
        std::cerr << "Inference execution failed." << std::endl;
        return -1;
    }

    // 6. Retrieve the results
    checkCudaErrors(cudaMemcpy(host_output_data.data(), device_buffers[output_idx], output_size, cudaMemcpyDeviceToHost));

    // Print a few output values for verification
    std::cout << "Inference successful. First 5 output values:" << std::endl;
    for (int i = 0; i < 5; ++i) {
        std::cout << host_output_data[i] << " ";
    }
    std::cout << std::endl;

    // 7. Clean up and free resources
    checkCudaErrors(cudaFree(device_buffers[input_idx]));
    checkCudaErrors(cudaFree(device_buffers[output_idx]));

    context->destroy();
    engine->destroy();
    runtime->destroy();

    std::cout << "Cleaned up resources successfully." << std::endl;

    return 0;
}