04-并行计算原理¶

重要性：⭐⭐⭐⭐⭐ 实用度：⭐⭐⭐⭐⭐ 学习时间：2天 必须掌握：是

为什么学这一章？¶

并行计算是GPU发挥性能的核心。理解并行算法设计、同步机制和性能分析模型，才能写出真正高效的GPU程序，而不仅仅是"能跑"的代码。

学完这一章，你将能够： - ✅ 理解Amdahl定律和并行加速比 - ✅ 掌握归约、扫描等经典并行算法模式 - ✅ 理解GPU同步机制（Block内同步、原子操作） - ✅ 编写高效的并行归约CUDA程序

📖 核心概念¶

1. 并行计算基础定律¶

Text Only

┌─────────────────────────────────────────────────────────────┐
│              Amdahl定律                                      │
├─────────────────────────────────────────────────────────────┤
│                                                             │
│  加速比 S(N) = 1 / ((1-P) + P/N)                           │
│                                                             │
│  其中：P = 可并行化的比例                                   │
│        N = 并行处理器数量                                   │
│        1-P = 串行部分比例                                   │
│                                                             │
│  示例：程序90%可并行（P=0.9），1000核CPU                    │
│  S(1000) = 1 / (0.1 + 0.9/1000) ≈ 9.91x                  │
│  即使有1000个核，最多只能加速约10倍！                       │
│                                                             │
│  并行度P     N=4     N=64    N=1024    N=∞                │
│  50%         1.6x    2.0x    2.0x      2.0x                │
│  75%         2.3x    3.8x    4.0x      4.0x                │
│  90%         3.1x    8.8x    9.9x      10.0x               │
│  95%         3.5x    15.4x   19.6x     20.0x               │
│  99%         3.9x    39.3x   91.2x     100.0x              │
│                                                             │
│  关键启示：串行瓶颈决定了并行加速的上限                     │
│                                                             │
└─────────────────────────────────────────────────────────────┘

2. 并行算法模式¶

Text Only

┌─────────────────────────────────────────────────────────────┐
│              常见并行算法模式                                 │
├─────────────────────────────────────────────────────────────┤
│                                                             │
│  1. Map（映射）：每个元素独立变换                            │
│     f(x) = x * 2  → 完美并行，无依赖                       │
│                                                             │
│  2. Reduce（归约）：将数组归约为单个值                      │
│     sum([1,2,3,4]) → 需要多轮合并                          │
│     第1轮：(1+2), (3+4) → [3, 7]                          │
│     第2轮：(3+7)        → [10]                             │
│                                                             │
│  3. Scan（扫描/前缀和）：计算所有前缀的归约                 │
│     prefix_sum([1,2,3,4]) → [1,3,6,10]                    │
│                                                             │
│  4. Scatter/Gather：分散写/聚集读                           │
│  5. Stencil：邻域模式（卷积、滤波）                         │
│  6. Sort：并行排序（位排序、归并排序）                       │
│                                                             │
└─────────────────────────────────────────────────────────────┘

3. 并行归约（Reduce）¶

这是GPU编程中最经典的并行算法之一：

CUDA

// parallel_reduce.cu - 并行归约求和
#include <stdio.h>
#include <cuda_runtime.h>

#define BLOCK_SIZE 256

// 版本1：交错寻址（有分支分歧）
__global__ void reduce_v1(float *input, float *output, int n) {
    __shared__ float sdata[BLOCK_SIZE];
    int tid = threadIdx.x;
    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    // 加载到共享内存
    sdata[tid] = (idx < n) ? input[idx] : 0.0f;
    __syncthreads();

    // 交错归约
    for (int stride = 1; stride < blockDim.x; stride *= 2) {
        if (tid % (2 * stride) == 0) {       // 分支分歧！
            sdata[tid] += sdata[tid + stride];
        }
        __syncthreads();
    }

    if (tid == 0) output[blockIdx.x] = sdata[0];
}

// 版本2：连续线程工作（减少分支分歧）
__global__ void reduce_v2(float *input, float *output, int n) {
    __shared__ float sdata[BLOCK_SIZE];
    int tid = threadIdx.x;
    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    sdata[tid] = (idx < n) ? input[idx] : 0.0f;
    __syncthreads();

    // 让低编号线程工作，连续的线程在同一Warp
    for (int stride = blockDim.x / 2; stride > 0; stride >>= 1) {
        if (tid < stride) {
            sdata[tid] += sdata[tid + stride];
        }
        __syncthreads();
    }

    if (tid == 0) output[blockIdx.x] = sdata[0];
}

// 版本3：首次加载时做一次归约（减少线程空闲）
__global__ void reduce_v3(float *input, float *output, int n) {
    __shared__ float sdata[BLOCK_SIZE];
    int tid = threadIdx.x;
    int idx = blockIdx.x * (blockDim.x * 2) + threadIdx.x;

    // 每个线程加载两个元素并相加
    float sum = 0.0f;
    if (idx < n) sum += input[idx];
    if (idx + blockDim.x < n) sum += input[idx + blockDim.x];
    sdata[tid] = sum;
    __syncthreads();

    for (int stride = blockDim.x / 2; stride > 0; stride >>= 1) {
        if (tid < stride) {
            sdata[tid] += sdata[tid + stride];
        }
        __syncthreads();
    }

    if (tid == 0) output[blockIdx.x] = sdata[0];
}

// CPU归约（验证用）
float cpu_reduce(float *data, int n) {
    float sum = 0.0f;
    for (int i = 0; i < n; i++) sum += data[i];
    return sum;
}

int main() {
    int n = 1 << 20;  // 1M 元素
    size_t size = n * sizeof(float);
    printf("元素数量: %d\n", n);

    // 分配内存
    float *h_input = (float *)malloc(size);
    for (int i = 0; i < n; i++) h_input[i] = 1.0f;  // 全1，结果应为n

    float *d_input, *d_output;
    int gridSize = (n + BLOCK_SIZE - 1) / BLOCK_SIZE;
    cudaMalloc(&d_input, size);
    cudaMalloc(&d_output, gridSize * sizeof(float));
    cudaMemcpy(d_input, h_input, size, cudaMemcpyHostToDevice);

    // CPU结果
    float cpu_result = cpu_reduce(h_input, n);
    printf("CPU结果: %.0f\n", cpu_result);

    // GPU版本2
    reduce_v2<<<gridSize, BLOCK_SIZE>>>(d_input, d_output, n);

    // 将Block结果拷回CPU做最终归约
    float *h_output = (float *)malloc(gridSize * sizeof(float));
    cudaMemcpy(h_output, d_output, gridSize * sizeof(float),
               cudaMemcpyDeviceToHost);
    float gpu_result = 0.0f;
    for (int i = 0; i < gridSize; i++) gpu_result += h_output[i];
    printf("GPU结果: %.0f\n", gpu_result);

    // 清理
    free(h_input); free(h_output);
    cudaFree(d_input); cudaFree(d_output);
    return 0;
}

Bash

nvcc parallel_reduce.cu -o parallel_reduce && ./parallel_reduce

4. Block内同步¶

CUDA

// synchronization.cu - 同步机制演示
#include <stdio.h>
#include <cuda_runtime.h>

// __syncthreads() - Block内所有线程的栅栏同步
__global__ void sync_demo(int *data) {
    __shared__ int shared_buf[256];
    int tid = threadIdx.x;

    // 阶段1：每个线程写入共享内存
    shared_buf[tid] = tid * 2;

    // 必须同步！确保所有线程都已写入
    __syncthreads();

    // 阶段2：读取其他线程写入的值
    // 如果没有__syncthreads，可能读到未写入的旧值
    int neighbor = (tid + 1) % blockDim.x;
    data[tid] = shared_buf[neighbor];
}

// 原子操作 - 全局内存的线程安全操作
__global__ void histogram(int *input, int *hist, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n) {
        int bin = input[idx] % 16;  // 假设16个桶
        atomicAdd(&hist[bin], 1);   // 原子加，避免竞争
    }
}

// Warp级同步（CUDA 9.0+）
__global__ void warp_sync_demo(int *data) {
    int tid = threadIdx.x;
    int lane = tid % 32;  // Warp内的线程号

    int val = data[tid];

    // Warp内广播：lane 0的值传给所有线程
    val = __shfl_sync(0xffffffff, val, 0);

    // Warp内归约示例
    for (int offset = 16; offset > 0; offset >>= 1) {
        val += __shfl_down_sync(0xffffffff, val, offset);
    }

    if (lane == 0) {
        data[tid / 32] = val;  // Warp的归约结果
    }
}

int main() {
    printf("同步机制演示\n");
    printf("1. __syncthreads(): Block内栅栏同步\n");
    printf("2. atomicAdd(): 原子操作\n");
    printf("3. __shfl_sync(): Warp内通信\n");

    int *d_data;
    cudaMalloc(&d_data, 256 * sizeof(int));
    sync_demo<<<1, 256>>>(d_data);
    cudaDeviceSynchronize();
    printf("完成\n");
    cudaFree(d_data);
    return 0;
}

5. 并行前缀和（Scan）¶

CUDA

// prefix_sum.cu - Hillis-Steele并行前缀和
#include <stdio.h>
#include <cuda_runtime.h>

#define N 16

// Hillis-Steele算法（inclusive scan）
__global__ void hillis_steele_scan(int *input, int *output, int n) {
    __shared__ int temp[2][N];
    int tid = threadIdx.x;

    // 加载到共享内存
    temp[0][tid] = (tid < n) ? input[tid] : 0;
    __syncthreads();

    int parity = 0;
    for (int stride = 1; stride < n; stride *= 2) {
        if (tid >= stride) {
            temp[1 - parity][tid] = temp[parity][tid] + temp[parity][tid - stride];
        } else {
            temp[1 - parity][tid] = temp[parity][tid];
        }
        parity = 1 - parity;
        __syncthreads();
    }

    output[tid] = temp[parity][tid];
}

int main() {
    int h_input[N], h_output[N];
    for (int i = 0; i < N; i++) h_input[i] = 1;  // 全1

    int *d_input, *d_output;
    cudaMalloc(&d_input, N * sizeof(int));
    cudaMalloc(&d_output, N * sizeof(int));
    cudaMemcpy(d_input, h_input, N * sizeof(int), cudaMemcpyHostToDevice);

    hillis_steele_scan<<<1, N>>>(d_input, d_output, N);
    cudaMemcpy(h_output, d_output, N * sizeof(int), cudaMemcpyDeviceToHost);

    printf("输入:       ");
    for (int i = 0; i < N; i++) printf("%2d ", h_input[i]);
    printf("\n前缀和结果: ");
    for (int i = 0; i < N; i++) printf("%2d ", h_output[i]);
    printf("\n");
    // 期望输出：1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16

    cudaFree(d_input); cudaFree(d_output);
    return 0;
}

6. 并行矩阵乘法¶

CUDA

// matmul.cu - 朴素并行矩阵乘法
#include <stdio.h>
#include <cuda_runtime.h>

#define TILE_SIZE 16

// 朴素版本：每个线程计算结果矩阵的一个元素
__global__ void matmul_naive(float *A, float *B, float *C,
                              int M, int N, int K) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    if (row < M && col < N) {
        float sum = 0.0f;
        for (int k = 0; k < K; k++) {
            sum += A[row * K + k] * B[k * N + col];
        }
        C[row * N + col] = sum;
    }
}

// 分块版本（Tiled）：使用共享内存减少全局内存访问
__global__ void matmul_tiled(float *A, float *B, float *C,
                              int M, int N, int K) {
    __shared__ float tileA[TILE_SIZE][TILE_SIZE];
    __shared__ float tileB[TILE_SIZE][TILE_SIZE];

    int row = blockIdx.y * TILE_SIZE + threadIdx.y;
    int col = blockIdx.x * TILE_SIZE + threadIdx.x;
    float sum = 0.0f;

    // 分块迭代
    for (int t = 0; t < (K + TILE_SIZE - 1) / TILE_SIZE; t++) {
        // 协作加载tile到共享内存
        if (row < M && t * TILE_SIZE + threadIdx.x < K)
            tileA[threadIdx.y][threadIdx.x] = A[row * K + t * TILE_SIZE + threadIdx.x];
        else
            tileA[threadIdx.y][threadIdx.x] = 0.0f;

        if (t * TILE_SIZE + threadIdx.y < K && col < N)
            tileB[threadIdx.y][threadIdx.x] = B[(t * TILE_SIZE + threadIdx.y) * N + col];
        else
            tileB[threadIdx.y][threadIdx.x] = 0.0f;

        __syncthreads();

        // 计算部分和
        for (int k = 0; k < TILE_SIZE; k++) {
            sum += tileA[threadIdx.y][k] * tileB[k][threadIdx.x];
        }
        __syncthreads();
    }

    if (row < M && col < N) {
        C[row * N + col] = sum;
    }
}

int main() {
    int M = 512, N = 512, K = 512;
    size_t sizeA = M * K * sizeof(float);
    size_t sizeB = K * N * sizeof(float);
    size_t sizeC = M * N * sizeof(float);

    float *h_A = (float *)malloc(sizeA);
    float *h_B = (float *)malloc(sizeB);
    float *h_C = (float *)malloc(sizeC);

    for (int i = 0; i < M * K; i++) h_A[i] = 1.0f;
    for (int i = 0; i < K * N; i++) h_B[i] = 1.0f;

    float *d_A, *d_B, *d_C;
    cudaMalloc(&d_A, sizeA);
    cudaMalloc(&d_B, sizeB);
    cudaMalloc(&d_C, sizeC);
    cudaMemcpy(d_A, h_A, sizeA, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, sizeB, cudaMemcpyHostToDevice);

    dim3 block(TILE_SIZE, TILE_SIZE);
    dim3 grid((N + TILE_SIZE - 1) / TILE_SIZE,
              (M + TILE_SIZE - 1) / TILE_SIZE);

    matmul_tiled<<<grid, block>>>(d_A, d_B, d_C, M, N, K);
    cudaMemcpy(h_C, d_C, sizeC, cudaMemcpyDeviceToHost);

    printf("C[0][0] = %.0f (期望 %d)\n", h_C[0], K);
    printf("矩阵乘法完成: %dx%d × %dx%d = %dx%d\n", M, K, K, N, M, N);

    free(h_A); free(h_B); free(h_C);
    cudaFree(d_A); cudaFree(d_B); cudaFree(d_C);
    return 0;
}

Bash

nvcc matmul.cu -o matmul && ./matmul  # &&前一个成功才执行后一个；||前一个失败才执行

💡 面试常见问题¶

Q1：什么是Amdahl定律？它对GPU编程有什么启示？¶

答：Amdahl定律指出并行加速比受限于程序中串行部分的比例：S(N)=1/((1-P)+P/N)。对GPU编程的启示：①数据传输（Host↔Device）是串行瓶颈，要尽量减少；②Kernel启动开销也是串行部分；③尽量让更多计算在GPU上完成，减少CPU-GPU交互。

Q2：并行归约的优化思路有哪些？¶

答：①避免分支分歧（让连续线程处理连续数据）；②首次加载时做初步归约（减少空闲线程）；③使用Warp Shuffle指令替代共享内存（最后32个元素时）；④展开循环减少同步开销；⑤使用协作组（Cooperative Groups）进行灵活同步。

Q3：__syncthreads()的作用和限制？¶

答：__syncthreads()是Block内的栅栏同步，确保同一Block的所有线程都到达该点后才继续。限制：①只能同步Block内线程，不能跨Block同步；②所有线程必须都能到达（不能放在条件分支中只让部分线程执行）；③使用不当会导致死锁。

Q4：为什么分块（Tiled）矩阵乘法比朴素版本快？¶

答：朴素版本每个线程独立访问全局内存读取A的一行和B的一列，全局内存延迟大。分块版本让Block内线程协作将数据块加载到共享内存（延迟低~100倍），然后从共享内存读取计算。数据复用：每个元素从全局内存读一次，被Block内多个线程共享使用。

Q5：原子操作的性能影响？如何减少原子操作的竞争？¶

答：原子操作保证线程安全但会序列化冲突访问，高竞争时严重降低性能。减少竞争的方法：①先在共享内存中做局部归约，再用原子操作更新全局内存；②使用私有直方图后合并；③利用Warp级原语（如__shfl_down_sync）在Warp内先归约。

📝 本章小结¶

Text Only

┌─────────────────────────────────────────────┐
│              本章核心知识点                    │
├─────────────────────────────────────────────┤
│                                             │
│  1. Amdahl定律：串行比例决定加速上限        │
│  2. Map/Reduce/Scan是核心并行模式          │
│  3. 并行归约需要多轮合并                    │
│  4. __syncthreads()同步Block内线程          │
│  5. atomicAdd等原子操作保证线程安全         │
│  6. 分块算法利用共享内存提升性能            │
│                                             │
└─────────────────────────────────────────────┘

下一章：05-内存模型与数据传输 - GPU内存层次与优化