第18章 SIMD向量化与AI推理引擎¶
📚 章节概述¶
本章深入讲解SIMD指令集(SSE/AVX/NEON)编程和TensorRT C++ API推理引擎开发。掌握底层向量化优化和工业级模型部署的核心技术。
学习时间:5-7天 难度等级:⭐⭐⭐⭐⭐ 前置知识:C++基础、模板编程、计算机体系结构
📎 交叉引用: - C++基础 → 01-C++基础语法 - 现代C++ → 14-现代C++(17-23)新特性 - Effective C++ → 11-Effective C++基础
18.1 SIMD基础概念¶
18.1.1 什么是SIMD¶
SISD (传统标量):
指令: ADD a[0], b[0] → ADD a[1], b[1] → ADD a[2], b[2] → ...
每条指令处理1个元素,4个元素需要4条指令
SIMD (单指令多数据):
指令: VADD [a[0],a[1],a[2],a[3]], [b[0],b[1],b[2],b[3]]
一条指令同时处理4个(或更多)元素
SIMD = Single Instruction, Multiple Data
18.1.2 x86 SIMD指令集演进¶
| 指令集 | 年份 | 寄存器宽度 | 关键特性 |
|---|---|---|---|
| SSE | 1999 | 128-bit (XMM) | 4×float |
| SSE2 | 2001 | 128-bit | 2×double, 整数运算 |
| SSE4.1/4.2 | 2007 | 128-bit | blend, dot product |
| AVX | 2011 | 256-bit (YMM) | 8×float |
| AVX2 | 2013 | 256-bit | FMA, 整数256位 |
| AVX-512 | 2017 | 512-bit (ZMM) | 16×float, mask操作 |
ARM SIMD: | 指令集 | 寄存器 | 特性 | |--------|--------|------| | NEON | 128-bit | 移动端标配(ARM v7/v8) | | SVE/SVE2 | 可变宽(128-2048) | 服务器ARM(Graviton3) |
📝 面试考点:AVX2和AVX-512的区别?FMA(Fused Multiply-Add)的计算优势?
18.2 SSE/AVX Intrinsics编程¶
18.2.1 数据类型与命名规则¶
// SIMD数据类型
__m128 // 128-bit, 4×float
__m128d // 128-bit, 2×double
__m128i // 128-bit, 整数(8/16/32/64)
__m256 // 256-bit, 8×float
__m256d // 256-bit, 4×double
__m256i // 256-bit, 整数
__m512 // 512-bit, 16×float (AVX-512)
// 命名规则: _mm<width>_<operation>_<type>
// width: 空(128), 256, 512
// type: ps(packed single), pd(packed double),
// epi32(packed int32), si128(128-bit integer)
// 示例
_mm256_add_ps // 256-bit float加法
_mm256_mul_ps // 256-bit float乘法
_mm256_fmadd_ps // 256-bit fused multiply-add: a*b+c
18.2.2 内存对齐与数据加载¶
#include <immintrin.h>
#include <cstdlib>
#include <cstdio>
int main() {
    // ===== Aligned allocation =====
    // AVX wants 32-byte alignment; AVX-512 wants 64-byte alignment.
    constexpr int kLanes = 8;
    float* in1 = static_cast<float*>(aligned_alloc(32, kLanes * sizeof(float)));
    float* in2 = static_cast<float*>(aligned_alloc(32, kLanes * sizeof(float)));
    float* out = static_cast<float*>(aligned_alloc(32, kLanes * sizeof(float)));
    for (int lane = 0; lane < kLanes; ++lane) {
        in1[lane] = static_cast<float>(lane);
        in2[lane] = static_cast<float>(lane * 2);
    }
    // ===== Aligned vs unaligned loads =====
    __m256 v1 = _mm256_load_ps(in1);  // aligned load (pointer must be 32-byte aligned)
    __m256 v2 = _mm256_load_ps(in2);  // fastest path
    // __m256 vu = _mm256_loadu_ps(ptr); // unaligned load (safe, possibly slower)
    // ===== Basic arithmetic =====
    __m256 vsum  = _mm256_add_ps(v1, v2);          // c = a + b
    __m256 vprod = _mm256_mul_ps(v1, v2);          // d = a * b
    __m256 vfma  = _mm256_fmadd_ps(v1, v2, vsum);  // e = a*b + c (FMA)
    // ===== Store the sum and print it =====
    _mm256_store_ps(out, vsum);
    for (int lane = 0; lane < kLanes; ++lane) {
        printf("c[%d] = %.1f\n", lane, out[lane]);
    }
    free(in1); free(in2); free(out);
    return 0;
}
编译命令:
# GCC/Clang
g++ -mavx2 -mfma -O2 simd_basic.cpp -o simd_basic
# MSVC
cl /arch:AVX2 /O2 simd_basic.cpp
18.2.3 SIMD向量化的向量点积¶
#include <immintrin.h>
// Scalar baseline: one multiply-add per iteration.
float dot_product_scalar(const float* a, const float* b, int n) {
    float acc = 0.0f;
    int pos = 0;
    while (pos < n) {
        acc += a[pos] * b[pos];
        ++pos;
    }
    return acc;
}
// AVX2 SIMD version: processes 8 floats per iteration, then reduces
// the 8 partial sums to a single scalar and finishes the tail in scalar code.
float dot_product_avx2(const float* a, const float* b, int n) {
__m256 sum_vec = _mm256_setzero_ps(); // accumulator of 8 partial float sums
int i = 0;
// Main loop: 8 floats per iteration
for (; i + 7 < n; i += 8) {
__m256 va = _mm256_loadu_ps(a + i);
__m256 vb = _mm256_loadu_ps(b + i);
sum_vec = _mm256_fmadd_ps(va, vb, sum_vec); // sum += a*b
}
// Horizontal sum: reduce the 8 lanes to 1 value
// [s0,s1,s2,s3,s4,s5,s6,s7]
__m128 hi = _mm256_extractf128_ps(sum_vec, 1); // [s4,s5,s6,s7]
__m128 lo = _mm256_castps256_ps128(sum_vec); // [s0,s1,s2,s3]
__m128 sum128 = _mm_add_ps(lo, hi); // [s0+s4,s1+s5,s2+s6,s3+s7]
sum128 = _mm_hadd_ps(sum128, sum128); // horizontal add
sum128 = _mm_hadd_ps(sum128, sum128);
float result = _mm_cvtss_f32(sum128);
// Tail: the remaining n % 8 elements, in scalar code
for (; i < n; i++) {
result += a[i] * b[i];
}
return result;
}
📝 面试考点:SIMD水平求和(horizontal sum)是什么?为什么它是SIMD的性能瓶颈?
18.3 SIMD矩阵乘法优化¶
18.3.1 Naive GEMM vs SIMD GEMM¶
#include <immintrin.h>
#include <algorithm>
#include <chrono>
#include <cstdio>
#include <cstdlib>
#include <cstring>
// ===== Naive GEMM: C = A × B (row-major, triple loop, no vectorization) =====
void gemm_naive(const float* A, const float* B, float* C,
                int M, int N, int K) {
    for (int row = 0; row < M; ++row) {
        for (int col = 0; col < N; ++col) {
            float acc = 0.0f;
            for (int kk = 0; kk < K; ++kk) {
                acc += A[row * K + kk] * B[kk * N + col];
            }
            C[row * N + col] = acc;
        }
    }
}
// ===== AVX2 GEMM: vectorized inner loop =====
// Broadcasts one A element and streams 8 consecutive B columns at a time.
// NOTE(review): assumes N is a multiple of 8 — the j loop has no tail
// handling, so loadu/storeu would run past the row end otherwise; the
// benchmark below uses N=512, which satisfies this. Confirm for other callers.
void gemm_avx2(const float* A, const float* B, float* C,
int M, int N, int K) {
for (int i = 0; i < M; i++) {
for (int j = 0; j < N; j += 8) { // 8 columns per iteration
__m256 c_vec = _mm256_setzero_ps();
for (int k = 0; k < K; k++) {
// broadcast A[i][k] into all 8 lanes
__m256 a_broadcast = _mm256_set1_ps(A[i * K + k]);
// load B[k][j:j+8]
__m256 b_vec = _mm256_loadu_ps(&B[k * N + j]);
// FMA: c += a * b
c_vec = _mm256_fmadd_ps(a_broadcast, b_vec, c_vec);
}
_mm256_storeu_ps(&C[i * N + j], c_vec);
}
}
}
// ===== Tiling + AVX2 GEMM =====
// Blocks the i/k/j loops so each tile stays cache-resident, and keeps two
// 8-lane accumulators (16 columns of C) live across the k loop.
// NOTE(review): assumes N is a multiple of 16 — c0/c1 always touch columns
// j0..j0+15; j_end is computed but never used to clip those accesses.
void gemm_tiled_avx2(const float* A, const float* B, float* C,
int M, int N, int K) {
constexpr int TILE_M = 4;
constexpr int TILE_N = 16; // two AVX registers wide
constexpr int TILE_K = 64;
memset(C, 0, M * N * sizeof(float)); // C accumulates across k-tiles
for (int i0 = 0; i0 < M; i0 += TILE_M) {
for (int k0 = 0; k0 < K; k0 += TILE_K) {
for (int j0 = 0; j0 < N; j0 += TILE_N) {
int i_end = std::min(i0 + TILE_M, M);
int k_end = std::min(k0 + TILE_K, K);
int j_end = std::min(j0 + TILE_N, N);
for (int i = i0; i < i_end; i++) {
// load the current C block into registers
__m256 c0 = _mm256_loadu_ps(&C[i * N + j0]);
__m256 c1 = _mm256_loadu_ps(&C[i * N + j0 + 8]);
for (int k = k0; k < k_end; k++) {
__m256 a_val = _mm256_set1_ps(A[i * K + k]);
__m256 b0 = _mm256_loadu_ps(&B[k * N + j0]);
__m256 b1 = _mm256_loadu_ps(&B[k * N + j0 + 8]);
c0 = _mm256_fmadd_ps(a_val, b0, c0);
c1 = _mm256_fmadd_ps(a_val, b1, c1);
}
_mm256_storeu_ps(&C[i * N + j0], c0);
_mm256_storeu_ps(&C[i * N + j0 + 8], c1);
}
}
}
}
}
// ===== Performance comparison of the three GEMM variants =====
int main() {
    constexpr int M = 512, N = 512, K = 512;
    // All buffers 32-byte aligned for AVX loads/stores.
    auto alloc_mat = [](size_t count) {
        return static_cast<float*>(aligned_alloc(32, count * sizeof(float)));
    };
    float* A = alloc_mat(M * K);
    float* B = alloc_mat(K * N);
    float* C_naive = alloc_mat(M * N);
    float* C_avx = alloc_mat(M * N);
    float* C_tiled = alloc_mat(M * N);
    // Deterministic initialization in [0, 1)
    for (int idx = 0; idx < M * K; ++idx) A[idx] = (float)(idx % 100) / 100.0f;
    for (int idx = 0; idx < K * N; ++idx) B[idx] = (float)(idx % 100) / 100.0f;
    // Time one kernel invocation and report ms + GFLOPS (2*M*N*K flops).
    auto run_case = [](auto kernel, const float* lhs, const float* rhs,
                       float* dst, int m, int n, int k, const char* label) {
        const auto t0 = std::chrono::high_resolution_clock::now();
        kernel(lhs, rhs, dst, m, n, k);
        const auto t1 = std::chrono::high_resolution_clock::now();
        const double ms = std::chrono::duration<double, std::milli>(t1 - t0).count();
        const double gflops = 2.0 * m * n * k / (ms * 1e6);
        printf("%s: %.2f ms, %.2f GFLOPS\n", label, ms, gflops);
    };
    run_case(gemm_naive, A, B, C_naive, M, N, K, "Naive");
    run_case(gemm_avx2, A, B, C_avx, M, N, K, "AVX2");
    run_case(gemm_tiled_avx2, A, B, C_tiled, M, N, K, "Tiled+AVX2");
    free(A); free(B); free(C_naive); free(C_avx); free(C_tiled);
    return 0;
}
📝 面试考点:GEMM中Tiling(分块)优化的原理?为什么Cache局部性对GEMM性能影响很大?
18.4 编译器自动向量化¶
18.4.1 自动向量化条件¶
// ✅ A loop the compiler can auto-vectorize
void add_arrays(float* c, const float* a, const float* b, int n) {
    int pos = 0;
    while (pos < n) {
        c[pos] = a[pos] + b[pos]; // simple element-wise operation
        ++pos;
    }
}
// ❌ Cases the compiler cannot auto-vectorize
// 1. Loop-carried dependency
void cumsum(float* a, int n) {
    // Prefix sum: element i needs the already-updated element i-1,
    // so iterations cannot be spread across SIMD lanes.
    for (int idx = 1; idx < n; ++idx) {
        a[idx] = a[idx] + a[idx - 1];
    }
}
// 2. Conditional branch
void conditional(float* c, const float* a, int n) {
    for (int pos = 0; pos < n; ++pos) {
        // Absolute-value-style select; this pattern can be vectorized
        // by hand with _mm256_blendv_ps.
        c[pos] = (a[pos] > 0) ? a[pos] : -a[pos];
    }
}
// 3. Function call
// A call the compiler cannot inline forces scalar code.
void with_call(float* c, const float* a, int n) {
for (int i = 0; i < n; i++) {
c[i] = my_func(a[i]); // non-inlined call (my_func is declared elsewhere)
}
}
18.4.2 辅助编译器向量化¶
// 使用#pragma提示编译器
#pragma GCC ivdep // 告诉GCC忽略依赖(程序员保证无依赖)
#pragma clang loop vectorize(enable)
#pragma omp simd // OpenMP SIMD指令
// __restrict__ promises the compiler that the three arrays never alias,
// removing the aliasing check that would otherwise block vectorization.
void add_restrict(float* __restrict__ c,
                  const float* __restrict__ a,
                  const float* __restrict__ b, int n) {
    for (int idx = 0; idx < n; ++idx) {
        const float lhs = a[idx];
        const float rhs = b[idx];
        c[idx] = lhs + rhs;
    }
}
# 查看编译器向量化报告
g++ -O2 -mavx2 -fopt-info-vec-all code.cpp # GCC
clang++ -O2 -mavx2 -Rpass=loop-vectorize code.cpp # Clang
📝 面试考点:编译器自动向量化的限制条件有哪些?__restrict__关键字的作用?
18.5 ARM NEON编程¶
18.5.1 NEON基础¶
#include <arm_neon.h>
// NEON数据类型
// float32x4_t : 4个float32
// int32x4_t : 4个int32
// uint8x16_t : 16个uint8
// float16x8_t : 8个float16 (ARMv8.2-A)
// NEON dot product: 4 floats per iteration, horizontal reduce, scalar tail.
float dot_product_neon(const float* a, const float* b, int n) {
float32x4_t sum_vec = vdupq_n_f32(0.0f); // 4 partial sums
int i = 0;
for (; i + 3 < n; i += 4) {
float32x4_t va = vld1q_f32(a + i);
float32x4_t vb = vld1q_f32(b + i);
sum_vec = vfmaq_f32(sum_vec, va, vb); // FMA
}
// Horizontal sum: NEON has a dedicated instruction for this
float result = vaddvq_f32(sum_vec); // ARMv8.1: direct horizontal add
// Scalar tail for the remaining n % 4 elements
for (; i < n; i++) {
result += a[i] * b[i];
}
return result;
}
18.5.2 NEON量化计算(INT8推理)¶
#include <arm_neon.h>
// INT8 matrix multiply (the core of quantized inference)
// C[M×N] = A[M×K] × B[K×N]; results are int32 to avoid int8 overflow
// Algorithm: broadcast one A element, vectorize along B's columns (N axis)
// NOTE(review): the j loop has no tail handling — assumes N is a multiple of 8.
void gemm_int8_neon(const int8_t* A, const int8_t* B, int32_t* C,
int M, int N, int K) {
for (int i = 0; i < M; i++) {
for (int j = 0; j < N; j += 8) {
// 8-wide int32 accumulators for C[i][j:j+8]
int32x4_t sum0 = vdupq_n_s32(0); // C[i][j+0..j+3]
int32x4_t sum1 = vdupq_n_s32(0); // C[i][j+4..j+7]
for (int k = 0; k < K; k++) {
// broadcast A[i][k] as an int16 vector (scalar → 8 copies)
int16x8_t va = vdupq_n_s16((int16_t)A[i * K + k]);
// load B[k][j:j+8] (row k, columns j..j+7)
int8x8_t vb = vld1_s8(B + k * N + j);
// widen int8 → int16 (prevents multiply overflow)
int16x8_t vb_wide = vmovl_s8(vb);
// int16 × int16 → int32 multiply-accumulate
// sum0 += A[i][k] * B[k][j+0..j+3]
sum0 = vmlal_s16(sum0, vget_low_s16(va), vget_low_s16(vb_wide));
// sum1 += A[i][k] * B[k][j+4..j+7]
sum1 = vmlal_s16(sum1, vget_high_s16(va), vget_high_s16(vb_wide));
}
// Invariant: sum0[x] = Σ_k A[i][k]·B[k][j+x] = C[i][j+x]
vst1q_s32(C + i * N + j, sum0);
vst1q_s32(C + i * N + j + 4, sum1);
}
}
}
⚠️ 性能优化提示:上面的实现为教学版本,每次循环仅处理1个k值。 生产环境可展开k循环(k+=4),或使用ARMv8.2-A的
vdotq_s32点积指令一次计算4个int8的点积,吞吐量提升约4倍。📝 面试考点:NEON和AVX的主要区别?移动端为什么INT8量化特别重要?
18.6 TensorRT C++ API 基础¶
18.6.1 TensorRT推理流程¶
TensorRT推理Pipeline:
PyTorch模型
↓ torch.onnx.export()
ONNX模型
↓ TensorRT Parser
Network Definition → Builder → Optimization
↓ ├── Layer Fusion
↓ ├── Kernel Selection
↓ ├── Precision Calibration
↓ └── Memory Optimization
Serialized Engine (.engine / .plan)
↓ Deserialize
Runtime Engine
↓ createExecutionContext()
Execution Context → enqueueV3() → 推理结果
18.6.2 核心API类¶
TensorRT核心类:
├── ILogger 日志接口(必须实现)
├── IBuilder 构建器(创建Network和Engine)
│ ├── createNetworkV2()
│ └── buildSerializedNetwork()
├── INetworkDefinition 网络定义
│ ├── addInput()
│ ├── addConvolutionNd()
│ ├── addActivation()
│ └── markOutput()
├── IBuilderConfig 构建配置
│ ├── setMemoryPoolLimit()
│ ├── setFlag(BuilderFlag::kFP16)
│ └── setFlag(BuilderFlag::kINT8)
├── IRuntime 运行时
│ └── deserializeCudaEngine()
├── ICudaEngine 序列化引擎
│ └── createExecutionContext()
└── IExecutionContext 执行上下文
└── enqueueV3()
18.6.3 完整TensorRT C++ 推理代码¶
#include <NvInfer.h>
#include <NvOnnxParser.h>
#include <cuda_runtime.h>
#include <fstream>
#include <iostream>
#include <vector>
#include <memory>
using namespace nvinfer1;
// ===== 1. Logger (TensorRT requires an ILogger implementation) =====
class Logger : public ILogger {
public:
    void log(Severity severity, const char* msg) noexcept override {
        // Drop info/verbose messages; surface warnings and errors only.
        if (severity > Severity::kWARNING) {
            return;
        }
        std::cout << "[TRT] " << msg << std::endl;
    }
};
// ===== Deleter so TensorRT objects can live in std::unique_ptr =====
struct TRTDeleter {
    template <typename T>
    void operator()(T* ptr) const {
        delete ptr;
    }
};
// Alias: owning pointer to any TensorRT object.
template <typename T>
using TRTUniquePtr = std::unique_ptr<T, TRTDeleter>;
// ===== 2. Build a TensorRT engine from an ONNX model =====
// Parses onnx_path, optionally enables FP16, and returns the serialized
// engine bytes; returns {} on parse/build failure.
std::vector<char> buildEngine(const std::string& onnx_path,
Logger& logger,
bool use_fp16 = true) {
// Create the builder
auto builder = TRTUniquePtr<IBuilder>(createInferBuilder(logger));
// Create the network (explicit batch)
const auto explicitBatch = 1U <<
static_cast<uint32_t>(NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
auto network = TRTUniquePtr<INetworkDefinition>(
builder->createNetworkV2(explicitBatch));
// Create the ONNX parser
auto parser = TRTUniquePtr<nvonnxparser::IParser>(
nvonnxparser::createParser(*network, logger));
// Parse the ONNX model
if (!parser->parseFromFile(onnx_path.c_str(),
static_cast<int>(ILogger::Severity::kWARNING))) {
std::cerr << "Failed to parse ONNX file: " << onnx_path << std::endl;
return {};
}
// Builder configuration
auto config = TRTUniquePtr<IBuilderConfig>(builder->createBuilderConfig());
config->setMemoryPoolLimit(MemoryPoolType::kWORKSPACE, 1ULL << 30); // 1GB
if (use_fp16 && builder->platformHasFastFp16()) {
config->setFlag(BuilderFlag::kFP16);
std::cout << "FP16 mode enabled" << std::endl;
}
// Build the serialized engine
auto serialized = TRTUniquePtr<IHostMemory>(
builder->buildSerializedNetwork(*network, *config));
if (!serialized) {
std::cerr << "Failed to build engine" << std::endl;
return {};
}
// Copy the bytes out before the IHostMemory is released
std::vector<char> engine_data(
static_cast<char*>(serialized->data()),
static_cast<char*>(serialized->data()) + serialized->size());
return engine_data;
}
// ===== 3. Persist a serialized engine to disk =====
void saveEngine(const std::vector<char>& engine_data,
                const std::string& path) {
    std::ofstream out(path, std::ios::binary);
    out.write(engine_data.data(),
              static_cast<std::streamsize>(engine_data.size()));
}
// Load a serialized engine from disk.
// Returns the raw engine bytes, or an empty vector if the file cannot be
// opened or is empty. (Previously a missing file left tellg() == -1 and
// the code tried to construct a vector of size (size_t)-1.)
std::vector<char> loadEngine(const std::string& path) {
    std::ifstream file(path, std::ios::binary | std::ios::ate);
    if (!file.good()) {
        return {};
    }
    const std::streamsize size = file.tellg();
    if (size <= 0) {
        return {};
    }
    file.seekg(0, std::ios::beg);
    std::vector<char> data(static_cast<size_t>(size));
    file.read(data.data(), size);
    return data;
}
// ===== 4. Inference wrapper =====
// Owns the deserialized engine, one execution context, a CUDA stream, and
// one device buffer per I/O tensor.
// NOTE(review): assumes exactly one input and one output tensor, both float
// (see the sizeof(float) simplification in allocateBuffers) — confirm for
// the deployed model. CUDA/TensorRT return codes are not checked.
class TRTInference {
public:
TRTInference(const std::vector<char>& engine_data, Logger& logger) {
runtime_.reset(createInferRuntime(logger));
engine_.reset(runtime_->deserializeCudaEngine(
engine_data.data(), engine_data.size()));
context_.reset(engine_->createExecutionContext());
cudaStreamCreate(&stream_);
// Allocate one GPU buffer per I/O tensor
allocateBuffers();
}
~TRTInference() {
for (auto& buf : device_buffers_) {
cudaFree(buf);
}
cudaStreamDestroy(stream_);
}
// Run one inference: H2D copy → enqueue → D2H copy → stream sync.
std::vector<float> infer(const std::vector<float>& input) {
// Copy input to the GPU
cudaMemcpyAsync(device_buffers_[input_idx_], input.data(),
input.size() * sizeof(float),
cudaMemcpyHostToDevice, stream_);
// Bind every I/O tensor to its device buffer
for (int i = 0; i < engine_->getNbIOTensors(); i++) {
const char* name = engine_->getIOTensorName(i);
context_->setTensorAddress(name, device_buffers_[i]);
}
// Launch inference on the stream
context_->enqueueV3(stream_);
// Copy the output back to the host
std::vector<float> output(output_size_);
cudaMemcpyAsync(output.data(), device_buffers_[output_idx_],
output_size_ * sizeof(float),
cudaMemcpyDeviceToHost, stream_);
cudaStreamSynchronize(stream_);
return output;
}
private:
// Allocate device memory for every I/O tensor; record the input/output
// indices and the output element count.
void allocateBuffers() {
int nb_tensors = engine_->getNbIOTensors();
device_buffers_.resize(nb_tensors);
for (int i = 0; i < nb_tensors; i++) {
const char* name = engine_->getIOTensorName(i);
auto dims = engine_->getTensorShape(name);
auto dtype = engine_->getTensorDataType(name);
size_t vol = 1;
for (int d = 0; d < dims.nbDims; d++) vol *= dims.d[d];
size_t bytes = vol * sizeof(float); // simplification: assumes float tensors
cudaMalloc(&device_buffers_[i], bytes);
if (engine_->getTensorIOMode(name) == TensorIOMode::kINPUT) {
input_idx_ = i;
} else {
output_idx_ = i;
output_size_ = vol;
}
}
}
TRTUniquePtr<IRuntime> runtime_;
TRTUniquePtr<ICudaEngine> engine_;
TRTUniquePtr<IExecutionContext> context_;
cudaStream_t stream_;
std::vector<void*> device_buffers_; // one device allocation per I/O tensor
int input_idx_ = 0, output_idx_ = 1;
size_t output_size_ = 0; // elements (not bytes) in the output tensor
};
// ===== 5. End-to-end usage =====
// Builds the engine on first run (caching it to disk), runs one inference,
// and prints the arg-max class index.
// NOTE(review): uses std::distance/std::max_element — confirm <algorithm>
// is included in the final translation unit.
int main() {
Logger logger;
// Build the engine on first run; reuse the cached file afterwards
std::string engine_path = "model.engine";
std::vector<char> engine_data;
std::ifstream check(engine_path);
if (check.good()) {
std::cout << "Loading cached engine..." << std::endl;
engine_data = loadEngine(engine_path);
} else {
std::cout << "Building engine from ONNX..." << std::endl;
engine_data = buildEngine("model.onnx", logger, true);
saveEngine(engine_data, engine_path);
}
// Create the inference wrapper
TRTInference infer(engine_data, logger);
// Prepare the input (assumed shape [1, 3, 224, 224])
std::vector<float> input(1 * 3 * 224 * 224, 1.0f);
// Run inference
auto output = infer.infer(input);
std::cout << "Output size: " << output.size() << std::endl;
std::cout << "Top-1 class: "
<< std::distance(output.begin(),
std::max_element(output.begin(), output.end()))
<< std::endl;
return 0;
}
编译命令:
g++ -std=c++17 -O2 \
-I/usr/include/x86_64-linux-gnu \
-L/usr/lib/x86_64-linux-gnu \
trt_infer.cpp -o trt_infer \
-lnvinfer -lnvonnxparser -lcudart
📝 面试考点:TensorRT构建Engine的流程?为什么Engine是设备相关的(不可跨GPU使用)?
18.7 TensorRT INT8量化校准¶
18.7.1 量化原理¶
量化映射:
FP32 value → INT8 value
公式: q = clamp(round(x / scale), -128, 127)
反量化: x ≈ q × scale
对称量化 vs 非对称量化:
对称: scale = max(|x|) / 127, zero_point = 0
非对称: scale = (max-min) / 255, zero_point = round(-min/scale)
校准(Calibration): 用代表性数据确定每一层的量化参数(scale)
MinMax: scale = max(|activation|) / 127
Entropy: 最小化KL散度选scale
Percentile: 去掉异常值(如99.99%)
18.7.2 INT8校准器实现¶
#include <NvInfer.h>
#include <vector>
#include <string>
#include <fstream>
// INT8 entropy calibrator: feeds batches of preprocessed images to TensorRT
// so it can choose per-tensor quantization scales, and caches the result so
// subsequent builds skip calibration.
// NOTE(review): uses cudaMalloc/cudaMemcpy and rand() — confirm
// <cuda_runtime.h> and <cstdlib> are included in the final translation unit.
class Int8Calibrator : public nvinfer1::IInt8EntropyCalibrator2 {
public:
Int8Calibrator(int batch_size, int input_h, int input_w,
const std::vector<std::string>& image_files,
const std::string& cache_file)
: batch_size_(batch_size),
input_h_(input_h), input_w_(input_w),
image_files_(image_files),
cache_file_(cache_file),
current_batch_(0) {
input_count_ = batch_size * 3 * input_h * input_w;
cudaMalloc(&device_input_, input_count_ * sizeof(float));
}
~Int8Calibrator() {
cudaFree(device_input_);
}
int getBatchSize() const noexcept override {
return batch_size_;
}
// Fill bindings[0] with the next calibration batch; false when exhausted.
bool getBatch(void* bindings[], const char* names[],
int nbBindings) noexcept override {
if (current_batch_ * batch_size_ >= (int)image_files_.size()) {
return false; // calibration finished
}
// Load and preprocess one batch of images
std::vector<float> host_data(input_count_);
for (int i = 0; i < batch_size_; i++) {
int idx = current_batch_ * batch_size_ + i;
if (idx < (int)image_files_.size()) {
loadAndPreprocess(image_files_[idx],
host_data.data() + i * 3 * input_h_ * input_w_,
input_h_, input_w_);
}
}
cudaMemcpy(device_input_, host_data.data(),
input_count_ * sizeof(float), cudaMemcpyHostToDevice);
bindings[0] = device_input_;
current_batch_++;
return true;
}
// Read/write the calibration cache so repeated builds skip calibration
const void* readCalibrationCache(size_t& length) noexcept override {
cache_.clear();
std::ifstream input(cache_file_, std::ios::binary);
if (input.good()) {
input.seekg(0, std::ios::end);
length = input.tellg();
input.seekg(0, std::ios::beg);
cache_.resize(length);
input.read(cache_.data(), length);
return cache_.data();
}
return nullptr;
}
void writeCalibrationCache(const void* cache, size_t length) noexcept override {
std::ofstream output(cache_file_, std::ios::binary);
output.write(static_cast<const char*>(cache), length);
}
private:
// Load one image as 3*h*w floats into buffer.
// A real project would use OpenCV to read/resize/normalize;
// simplified here to random data.
void loadAndPreprocess(const std::string& path, float* buffer,
int h, int w) {
for (int i = 0; i < 3 * h * w; i++) {
buffer[i] = static_cast<float>(rand()) / RAND_MAX;
}
}
int batch_size_;
int input_h_, input_w_;
int input_count_; // floats per calibration batch
std::vector<std::string> image_files_;
std::string cache_file_;
int current_batch_; // index of the next batch to hand out
void* device_input_; // device-side staging buffer
std::vector<char> cache_; // calibration cache bytes
};
// Build an INT8-calibrated engine from an ONNX model.
// Returns the serialized engine bytes, or {} on parse/build failure.
// (The original version dereferenced a null IHostMemory when
// buildSerializedNetwork failed, and ignored parser errors — now
// consistent with buildEngine.)
std::vector<char> buildInt8Engine(const std::string& onnx_path,
                                  Logger& logger) {
    auto builder = TRTUniquePtr<IBuilder>(createInferBuilder(logger));
    auto network = TRTUniquePtr<INetworkDefinition>(
        builder->createNetworkV2(1U << static_cast<uint32_t>(
            NetworkDefinitionCreationFlag::kEXPLICIT_BATCH)));
    auto parser = TRTUniquePtr<nvonnxparser::IParser>(
        nvonnxparser::createParser(*network, logger));
    // Abort early if the ONNX model cannot be parsed.
    if (!parser->parseFromFile(onnx_path.c_str(),
                               static_cast<int>(ILogger::Severity::kWARNING))) {
        return {};
    }
    auto config = TRTUniquePtr<IBuilderConfig>(builder->createBuilderConfig());
    config->setMemoryPoolLimit(MemoryPoolType::kWORKSPACE, 1ULL << 30); // 1GB
    config->setFlag(BuilderFlag::kINT8);
    // Calibration image list: representative data drives the per-layer scales.
    std::vector<std::string> calib_images;
    for (int i = 0; i < 500; i++) {
        calib_images.push_back("calib_data/img_" + std::to_string(i) + ".jpg");
    }
    // The calibrator must stay alive until buildSerializedNetwork returns;
    // config only stores a raw pointer to it.
    auto calibrator = std::make_unique<Int8Calibrator>(
        8, 224, 224, calib_images, "calibration.cache");
    config->setInt8Calibrator(calibrator.get());
    auto serialized = TRTUniquePtr<IHostMemory>(
        builder->buildSerializedNetwork(*network, *config));
    if (!serialized) {
        return {};
    }
    return std::vector<char>(
        static_cast<char*>(serialized->data()),
        static_cast<char*>(serialized->data()) + serialized->size());
}
📝 面试考点:INT8量化的校准(Calibration)是什么?Entropy校准和MinMax校准的区别?
18.8 TensorRT Dynamic Shape¶
18.8.1 动态尺寸配置¶
// Dynamic shape: one engine serves a range of input sizes.
// Builds with an optimization profile describing [min, opt, max] dims.
// NOTE(review): unlike buildEngine, the results of parseFromFile and
// buildSerializedNetwork are not checked — a failure here dereferences
// a null IHostMemory; confirm error handling before production use.
std::vector<char> buildDynamicEngine(const std::string& onnx_path,
Logger& logger) {
auto builder = TRTUniquePtr<IBuilder>(createInferBuilder(logger));
auto network = TRTUniquePtr<INetworkDefinition>(
builder->createNetworkV2(1U << static_cast<uint32_t>(
NetworkDefinitionCreationFlag::kEXPLICIT_BATCH)));
auto parser = TRTUniquePtr<nvonnxparser::IParser>(
nvonnxparser::createParser(*network, logger));
parser->parseFromFile(onnx_path.c_str(),
static_cast<int>(ILogger::Severity::kWARNING));
auto config = TRTUniquePtr<IBuilderConfig>(builder->createBuilderConfig());
config->setFlag(BuilderFlag::kFP16);
// Create the optimization profile
auto profile = builder->createOptimizationProfile();
// Dynamic range for the input tensor: [min, opt, max]
// Input tensor is named "input", NCHW layout
profile->setDimensions("input", OptProfileSelector::kMIN,
Dims4{1, 3, 224, 224});
profile->setDimensions("input", OptProfileSelector::kOPT,
Dims4{8, 3, 224, 224}); // batch size the kernels are tuned for
profile->setDimensions("input", OptProfileSelector::kMAX,
Dims4{32, 3, 224, 224});
config->addOptimizationProfile(profile);
auto serialized = TRTUniquePtr<IHostMemory>(
builder->buildSerializedNetwork(*network, *config));
return std::vector<char>(
static_cast<char*>(serialized->data()),
static_cast<char*>(serialized->data()) + serialized->size());
}
// At inference time, pin the concrete shape before enqueueing.
void inferDynamic(IExecutionContext* context, int actual_batch) {
// Set the actual input shape for this call
context->setInputShape("input", Dims4{actual_batch, 3, 224, 224});
// Allocate matching buffers and run inference
// ...
}
📝 面试考点:TensorRT Dynamic Shape的工作原理?为什么需要设置min/opt/max三个维度?
18.9 TensorRT Plugin开发¶
18.9.1 自定义Plugin¶
#include <NvInfer.h>
#include <NvInferPlugin.h>
#include <vector>
#include <string>
#include <cstring>
using namespace nvinfer1;
// Custom GELU plugin: element-wise GELU for a network op TensorRT
// has no built-in layer for. FP32 / linear format only.
class GELUPlugin : public IPluginV2DynamicExt {
public:
GELUPlugin() = default;
// ===== Required interface =====
// Plugin type name (used to match the creator at deserialization time)
const char* getPluginType() const noexcept override {
return "CustomGELU";
}
const char* getPluginVersion() const noexcept override {
return "1";
}
int getNbOutputs() const noexcept override {
return 1;
}
// Output data type: same as the input's
DataType getOutputDataType(int index, const DataType* inputTypes,
int nbInputs) const noexcept override {
return inputTypes[0];
}
// Output dimensions: element-wise op, shape unchanged
DimsExprs getOutputDimensions(int outputIndex, const DimsExprs* inputs,
int nbInputs,
IExprBuilder& exprBuilder) noexcept override {
return inputs[0]; // GELU does not change the shape
}
// Scratch-space requirement: none
size_t getWorkspaceSize(const PluginTensorDesc* inputs, int nbInputs,
const PluginTensorDesc* outputs,
int nbOutputs) const noexcept override {
return 0;
}
// Core: launch the CUDA kernel on the provided stream
int enqueue(const PluginTensorDesc* inputDesc,
const PluginTensorDesc* outputDesc,
const void* const* inputs, void* const* outputs,
void* workspace, cudaStream_t stream) noexcept override {
// Total element count = product of all input dims
int n = 1;
for (int i = 0; i < inputDesc[0].dims.nbDims; i++) {
n *= inputDesc[0].dims.d[i];
}
// Invoke the CUDA kernel launcher
geluKernel(static_cast<const float*>(inputs[0]),
static_cast<float*>(outputs[0]),
n, stream);
return 0;
}
// Serialization: this plugin has no weights/state, nothing to store
size_t getSerializationSize() const noexcept override { return 0; }
void serialize(void* buffer) const noexcept override {}
// Clone
// NOTE(review): namespace_ is not copied into the clone — confirm the
// runtime re-applies setPluginNamespace on cloned instances.
IPluginV2DynamicExt* clone() const noexcept override {
return new GELUPlugin();
}
// Supported formats: FP32 + linear layout only
bool supportsFormatCombination(int pos, const PluginTensorDesc* inOut,
int nbInputs,
int nbOutputs) noexcept override {
return inOut[pos].type == DataType::kFLOAT &&
inOut[pos].format == TensorFormat::kLINEAR;
}
void configurePlugin(const DynamicPluginTensorDesc* in, int nbInputs,
const DynamicPluginTensorDesc* out,
int nbOutputs) noexcept override {}
int initialize() noexcept override { return 0; }
void terminate() noexcept override {}
void destroy() noexcept override { delete this; }
void setPluginNamespace(const char* ns) noexcept override { namespace_ = ns; }
const char* getPluginNamespace() const noexcept override { return namespace_.c_str(); }
private:
std::string namespace_;
// CUDA kernel launcher declaration (defined in the .cu file)
void geluKernel(const float* input, float* output, int n, cudaStream_t stream);
};
18.9.2 GELU CUDA Kernel¶
// gelu_kernel.cu
#include <cuda_runtime.h>
#include <cmath>
// One thread per element; tanh-approximation form of GELU.
__global__ void gelu_kernel(const float* input, float* output, int n) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < n) { // guard: the grid may overshoot n
float x = input[idx];
// GELU(x) = x * 0.5 * (1 + tanh(sqrt(2/π) * (x + 0.044715 * x³)))
float cdf = 0.5f * (1.0f + tanhf(0.7978845608f * (x + 0.044715f * x * x * x)));
output[idx] = x * cdf;
}
}
// Host-side launcher: 256 threads per block, enough blocks to cover n.
void GELUPlugin::geluKernel(const float* input, float* output,
int n, cudaStream_t stream) {
int block = 256;
int grid = (n + block - 1) / block; // ceil(n / block)
gelu_kernel<<<grid, block, 0, stream>>>(input, output, n);
}
📝 面试考点:什么情况下需要开发TensorRT Plugin?Plugin的生命周期是怎样的?
18.10 实战:ResNet50 PyTorch → ONNX → TensorRT¶
18.10.1 PyTorch导出ONNX¶
import torch
import torchvision
# Load pretrained ResNet50
# NOTE(review): `pretrained=True` is deprecated in torchvision >= 0.13 in
# favor of weights=torchvision.models.ResNet50_Weights.DEFAULT — confirm
# the torchvision version in use.
model = torchvision.models.resnet50(pretrained=True)
model.eval()
# Export to ONNX with a dynamic batch dimension (axis 0 on input/output)
dummy_input = torch.randn(1, 3, 224, 224)
torch.onnx.export(
model, dummy_input, "resnet50.onnx",
input_names=["input"],
output_names=["output"],
dynamic_axes={
"input": {0: "batch_size"},
"output": {0: "batch_size"}
},
opset_version=13
)
print("Exported resnet50.onnx")
18.10.2 C++ TensorRT推理¶
// resnet50_trt.cpp
#include "trt_inference.h" // 复用前面的TRTInference类
#include <opencv2/opencv.hpp>
// Image preprocessing: resize → scale to [0,1] → ImageNet normalize → CHW.
// NOTE(review): cv::imread returns BGR, but the mean/std constants below
// are the ImageNet values in RGB order — a cvtColor(BGR2RGB) step appears
// to be missing; verify against the model's training pipeline.
std::vector<float> preprocess(const std::string& image_path,
int h = 224, int w = 224) {
cv::Mat img = cv::imread(image_path);
cv::resize(img, img, cv::Size(w, h));
img.convertTo(img, CV_32F, 1.0 / 255.0);
// ImageNet normalization
cv::Scalar mean(0.485, 0.456, 0.406);
cv::Scalar std(0.229, 0.224, 0.225);
cv::subtract(img, mean, img);
cv::divide(img, std, img);
// HWC → CHW (planar layout expected by the network)
std::vector<cv::Mat> channels(3);
cv::split(img, channels);
std::vector<float> result(3 * h * w);
for (int c = 0; c < 3; c++) {
memcpy(result.data() + c * h * w,
channels[c].data, h * w * sizeof(float));
}
return result;
}
// Entry point: build the engine, classify one image, print Top-5 classes.
// NOTE(review): rebuilds the engine from ONNX on every run — consider
// loading the cached .engine first, as in section 18.6. std::sort needs
// <algorithm>; confirm it is included via trt_inference.h.
int main(int argc, char* argv[]) {
Logger logger;
// Build the engine (FP16) and cache it to disk
auto engine_data = buildEngine("resnet50.onnx", logger, true);
saveEngine(engine_data, "resnet50_fp16.engine");
// Create the inference wrapper
TRTInference infer(engine_data, logger);
// Preprocess and run one image
auto input = preprocess("test.jpg");
auto output = infer.infer(input);
// Print Top-5 predictions
std::vector<std::pair<float, int>> scores;
for (int i = 0; i < (int)output.size(); i++) {
scores.emplace_back(output[i], i);
}
std::sort(scores.rbegin(), scores.rend()); // descending by score
std::cout << "\nTop-5 predictions:" << std::endl;
for (int i = 0; i < 5; i++) {
printf(" Class %d: %.4f\n", scores[i].second, scores[i].first);
}
return 0;
}
18.10.3 性能对比¶
ResNet50推理性能 (GPU: A100, batch=1):
| 模式 | 延迟(ms) | 吞吐(fps) | 显存(MB) |
|--------------|---------|----------|---------|
| PyTorch FP32 | 5.2 | 192 | 420 |
| PyTorch FP16 | 3.1 | 322 | 280 |
| TRT FP32 | 1.8 | 555 | 180 |
| TRT FP16 | 0.9 | 1111 | 120 |
| TRT INT8 | 0.5 | 2000 | 85 |
加速比:
TRT FP16 vs PyTorch FP32: 5.8x
TRT INT8 vs PyTorch FP32: 10.4x
📝 面试考点:TensorRT相比原生PyTorch推理为什么能加速这么多?主要优化了哪些方面?
18.11 面试高频题¶
Q1: SIMD的FMA(Fused Multiply-Add)指令有什么优势?¶
答:FMA将乘法和加法融合为一条指令 d = a*b + c,优势有三:(1)性能:一个时钟周期完成原需两条指令的操作,理论吞吐翻倍;(2)精度:中间结果不截断,只在最终做一次舍入,减少浮点误差累积;(3)功耗:减少指令发射和寄存器读写。FMA是矩阵乘法、卷积、点积等AI计算的核心指令。
Q2: 为什么GEMM中的Tiling(分块)如此重要?¶
答:GEMM的朴素实现对B矩阵的访问是按列跳跃的,Cache miss率极高。分块将大矩阵切成小块,每个小块能放入L1/L2 Cache,在块内完成计算后再处理下一个块。这样数据复用率从O(1)提升到O(T)(T为块大小),Cache miss率降低几十倍。结合SIMD(块内向量化计算),可以达到接近硬件峰值的FLOPS。
Q3: TensorRT为什么比PyTorch推理快很多?¶
答:TensorRT做了多层优化:(1)层融合(Layer Fusion):将Conv+BN+ReLU融合为一个kernel,减少内存读写和kernel启动开销;(2)精度校准:支持FP16/INT8自动降精度,吞吐翻倍;(3)Kernel自动调优(AutoTuning):对每层尝试多种CUDA kernel实现,选择最快的;(4)内存优化:张量复用、显存池化;(5)图优化:消除冗余层、常量折叠。
Q4: TensorRT Engine为什么不能跨GPU使用?¶
答:TensorRT在构建Engine时会根据具体GPU的计算能力(SM数量、核心频率)、可用显存、CUDA Compute Capability等硬件参数做kernel选择和内存规划。相同架构不同型号(如A100 40G vs 80G)可能选择不同的kernel。因此Engine是设备特定的,需要在目标GPU上重新构建。通常做法是在部署机器上第一次构建并缓存Engine文件。
Q5: INT8量化校准中Entropy和MinMax方法的区别?¶
答:MinMax简单直接,用 scale = max(|x|) / 127,保留全部数据范围但对异常值敏感。Entropy基于KL散度,寻找一个threshold使量化后的分布与原始分布最接近,能更好地处理有长尾分布的激活值。实践中Entropy通常精度更好(尤其是激活值分布不均匀时),但耗时更长。TensorRT默认用IInt8EntropyCalibrator2。
Q6: NEON和AVX的核心区别?¶
答:(1)寄存器宽度:NEON 128-bit(4×float),AVX 256-bit(8×float),AVX-512 512-bit(16×float);(2)架构:NEON是ARM指令集的一部分,AVX是x86扩展;(3)浮点行为:NEON的FP16运算原生支持(ARMv8.2),x86需要AVX-512 FP16;(4)生态:桌面/服务器用AVX,移动端/嵌入式用NEON。两者intrinsics API风格也不同。
Q7: 编译器自动向量化有哪些限制?¶
答:以下情况编译器通常无法自动向量化:(1)循环携带依赖:如累加a[i] += a[i-1];(2)复杂控制流:循环体内有复杂if-else分支;(3)函数调用:调用未内联的函数;(4)指针别名:编译器无法确定指针不重叠(用__restrict__解决);(5)非连续内存访问:Gather/Scatter模式。因此关键代码路径通常需要手写SIMD intrinsics。
Q8: TensorRT Dynamic Shape的min/opt/max分别代表什么?¶
答:min是支持的最小输入尺寸,max是最大输入尺寸,opt是TensorRT做kernel autotuning时的目标尺寸。Engine会为opt尺寸选择最快的kernel,其他尺寸虽可运行但可能不是最优。如果推理时实际batch size波动大,可以创建多个Optimization Profile,每个针对不同的常用batch size优化。
18.12 实践练习¶
练习1: SIMD优化¶
- 用AVX2实现向量的L2范数计算
- 用AVX2实现ReLU激活函数(
max(0, x)) - 对比标量版本和SIMD版本的性能
练习2: TensorRT部署¶
- 将YOLOv8导出为ONNX并用TensorRT加速
- 实现INT8校准并比较FP32/FP16/INT8的精度和速度
- 用Dynamic Shape支持不同分辨率输入
18.13 本章小结¶
核心知识点¶
| 概念 | 要点 |
|---|---|
| SIMD | 一条指令处理多个数据,SSE(128)/AVX(256)/AVX-512(512) |
| FMA | 融合乘加,性能翻倍+精度提升 |
| GEMM优化 | Tiling(Cache局部性) + SIMD(向量化) + FMA |
| TensorRT | ONNX→Engine→推理,层融合+精度校准+kernel调优 |
| INT8量化 | 校准器确定scale,Entropy > MinMax精度 |
| Dynamic Shape | min/opt/max三组,opt用于kernel选择 |
| Plugin | 自定义op的CUDA实现,集成到TensorRT图优化 |
技术选型¶
推理场景决策:
├── 服务器GPU → TensorRT (性能最强)
├── 服务器CPU → OpenVINO / ONNX Runtime
├── 移动端Android → NCNN(NEON优化) / TFLite
├── 移动端iOS → Core ML / NCNN
└── 边缘设备 → TensorRT(Jetson) / ONNX Runtime
恭喜完成第18章! 🎉