1D-Convolution


最基本写法

#include <cuda_runtime.h>
 
__global__ void convolution_1d_kernel(const float* input, const float* kernel, float* output,
                                      int input_size, int kernel_size) {
    int tidx = blockIdx.x * blockDim.x + threadIdx.x;
    if (tidx > input_size - kernel_size) return;
    float temp = 0;
    #pragma unroll
    for (int i = 0; i < kernel_size; ++i) {
        temp += input[tidx + i] * kernel[i];
    }
    output[tidx] = temp;
}
 
extern "C" void solve(const float* input, const float* kernel, float* output, int input_size, int kernel_size) {
    int output_size = input_size - kernel_size + 1;
    int threadsPerBlock = 256;
    int blocksPerGrid = (output_size + threadsPerBlock - 1) / threadsPerBlock;
 
    convolution_1d_kernel<<<blocksPerGrid, threadsPerBlock>>>(input, kernel, output, input_size, kernel_size);
    cudaDeviceSynchronize();
}

向量化

#include <cuda_runtime.h>
 
__global__ void convolution_1d_kernel(const float* input, const float* kernel, float* output,
                                      int input_size, int kernel_size) {
    int tidx = blockDim.x * blockIdx.x + threadIdx.x;
    if (tidx > input_size - kernel_size) return;
    float temp = 0.0f;
    #pragma unroll
    for (int i = 0; i < kernel_size / 4; i++) {
        int p = i * 4;
        float4 tempI = make_float4(input[tidx + p], input[tidx + p + 1], input[tidx + p + 2], input[tidx + p + 3]);
        float4 tempK = make_float4(kernel[p], kernel[p + 1], kernel[p + 2], kernel[p + 3]);
        temp += tempI.x * tempK.x + tempI.y * tempK.y + tempI.z * tempK.z + tempI.w * tempK.w;
    }
    // kernel_size & ~3 = (kernel_size / 4) * 4
    for (int i = kernel_size & ~3; i < kernel_size; i++) {
        float tempI = input[tidx + i];
        float tempK = kernel[i];
        temp += tempI * tempK;
    }
    output[tidx] = temp;
}
 
extern "C" void solve(const float* input, const float* kernel, float* output, int input_size, int kernel_size) {
    if (kernel_size <= 0 || kernel_size > input_size) return;
    int output_size = input_size - kernel_size + 1;
    int threadsPerBlock = 1024;
    int blocksPerGrid = (output_size + threadsPerBlock - 1) / threadsPerBlock;
    convolution_1d_kernel<<<blocksPerGrid, threadsPerBlock>>>(input, kernel, output, input_size, kernel_size);
    cudaDeviceSynchronize();
}

Color-Inversion


基础写法

#include <cuda_runtime.h>
 
__global__ void invert_kernel(unsigned char *image, int width, int height) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx >= width * height)
        return;
    image[idx * 4] = 255 - image[idx * 4];
    image[idx * 4 + 1] = 255 - image[idx * 4 + 1];
    image[idx * 4 + 2] = 255 - image[idx * 4 + 2];
}
 
extern "C" void solve(unsigned char *image, int width, int height) {
    int threadsPerBlock = 256;
    int blocksPerGrid = (width * height + threadsPerBlock - 1) / threadsPerBlock;
    invert_kernel<<<blocksPerGrid, threadsPerBlock>>>(image, width, height);
    cudaDeviceSynchronize();
}

位运算

#include <cuda_runtime.h>
 
__global__ void invert_kernel(unsigned char* image, int width, int height) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx >= width * height) return;
    // uint8 255 - x = x ^ 0xFF
    image[idx] ^= 0x00FFFFFFu;
}
 
extern "C" void solve(unsigned char* image, int width, int height) {
    int threadsPerBlock = 256;
    int blocksPerGrid = (width * height + threadsPerBlock - 1) / threadsPerBlock;
    invert_kernel<<<blocksPerGrid, threadsPerBlock>>>(image, width, height);
}

Leaky-ReLU


基础写法

#include <cuda_runtime.h>
 
__global__ void leaky_relu_kernel(float * x, int n, const float alpha) {
    int tid = blockDim.x * blockIdx.x + threadIdx.x;
    if (tid < n) {
        float v = x[tid];
        x[tid] = v > 0.0f ? v : alpha * v;
    }
}
 
// x is a device pointer
extern "C" void solve(float* x, int n) {
    const int threadsPerBlock = 256;
    const int blocksPerGrid = (n + threadsPerBlock - 1) / threadsPerBlock;
    const float alpha = 0.01f;
    leaky_relu_kernel<<<blocksPerGrid, threadsPerBlock>>>(x, n, alpha);
    cudaDeviceSynchronize();
}

向量化

#include <cuda_runtime.h>
 
__global__ void leaky_relu_kernel(float * x, int n, const float alpha) {
    int tid = blockDim.x * blockIdx.x + threadIdx.x;
    int idx = tid * 4;
    if (idx + 3 < n) {
        float4 v = reinterpret_cast<float4*>(x)[tid];
        reinterpret_cast<float4*>(x)[tid] = make_float4(
            v.x > 0 ? v.x : alpha * v.x,
            v.y > 0 ? v.y : alpha * v.y,
            v.z > 0 ? v.z : alpha * v.z,
            v.w > 0 ? v.w : alpha * v.w
        );
    } else if (idx < n) {
        for (int i = idx; i < n; ++i) {
            x[i] = x[i] > 0 ? x[i] : alpha * x[i];
        }
    }
}
 
// x is a device pointer
extern "C" void solve(float* x, int n) {
    const int threadsPerBlock = 256;
    const int blocksPerGrid = ((n + 3) / 4 + threadsPerBlock - 1) / threadsPerBlock;
    const float alpha = 0.01f;
    leaky_relu_kernel<<<blocksPerGrid, threadsPerBlock>>>(x, n, alpha);
    cudaDeviceSynchronize();
}

Matrix-Addition


基础写法

#include <cuda_runtime.h>
 
__global__ void matrix_add(const float* __restrict A, const float* __restrict B, float* C, int N) {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid < N * N) {
        C[tid] = A[tid] + B[tid];
    }
}
 
// A, B, C are device pointers (i.e. pointers to memory on the GPU)
extern "C" void solve(const float* A, const float* B, float* C, int N) {
    int threadsPerBlock = 256;
    int blocksPerGrid = (N * N + threadsPerBlock - 1) / threadsPerBlock;
    matrix_add<<<blocksPerGrid, threadsPerBlock>>>(A, B, C, N);
    cudaDeviceSynchronize();
}

向量化

#include <cuda_runtime.h>
 
__global__ void matrix_addition(const float *__restrict A, const float *__restrict B, float *C, int N) {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid * 4 + 3 < N * N) {
        const float4 a4 = reinterpret_cast<const float4*>(A)[tid];
        const float4 b4 = reinterpret_cast<const float4*>(B)[tid];
        reinterpret_cast<float4*>(C)[tid] = make_float4(a4.x + b4.x, a4.y + b4.y, a4.z + b4.z, a4.w + b4.w);
    } else if (tid * 4 < N * N) {
        #pragma unroll
        for (int i = tid * 4; i < N * N; ++i) {
            C[i] = A[i] + B[i];
        }
    }
}
 
// A, B, C are device pointers (i.e. pointers to memory on the GPU)
extern "C" void solve(const float* A, const float* B, float* C, int N) {
    int total_elements = N * N;
    int threadsPerBlock = 256;
    int blocksPerGrid = (((total_elements + 3) / 4) + threadsPerBlock - 1) / threadsPerBlock;
    matrix_addition<<<blocksPerGrid, threadsPerBlock>>>(A, B, C, N);
    cudaDeviceSynchronize();
}

Matrix-Copy


基础写法

#include <cuda_runtime.h>
 
__global__ void copy_matrix_kernel(const float* A, float* B, int N) {
    int tidx = blockIdx.x * blockDim.x + threadIdx.x;
    int total = N * N;
    if (tidx < total) {
        B[tidx] = A[tidx];
    }
}
 
extern "C" void solve(const float* A, float* B, int N) {
    int total = N * N;
    int threadsPerBlock = 256;
    int blocksPerGrid = (total + threadsPerBlock - 1) / threadsPerBlock;
 
    copy_matrix_kernel<<<blocksPerGrid, threadsPerBlock>>>(A, B, N);
    cudaDeviceSynchronize();
}

官方写法

#include <cuda_runtime.h>
 
extern "C" void solve(const float* A, float* B, const int N) {
    cudaMemcpy(B, A, N * N * sizeof(float), cudaMemcpyDeviceToDevice);
}

Matrix-Multiplication


最基本写法
问题:GM多些次数过多

#include <cuda_runtime.h>
 
__global__ void matrix_multiplication_kernel(const float* A, const float* B, float* C, int M, int N,
                                             int K) {
    int tidx = blockIdx.x * blockDim.x + threadIdx.x;
    int tidy = blockIdx.y * blockDim.y + threadIdx.y;
    // MN * NK = MK
    if (tidy < M && tidx < K) {
        float sum = 0;
        for (int i = 0; i < N; ++i) {
            sum += A[tidy * N + i] * B[i * K + tidx];
        }
        C[tidy * K + tidx] = sum;
    }
}
 
// A, B, C are device pointers (i.e. pointers to memory on the GPU)
extern "C" void solve(const float* A, const float* B, float* C, int M, int N, int K) {
    dim3 threadsPerBlock(16, 16);
    dim3 blocksPerGrid((K + threadsPerBlock.x - 1) / threadsPerBlock.x,
                       (M + threadsPerBlock.y - 1) / threadsPerBlock.y);
 
    matrix_multiplication_kernel<<<blocksPerGrid, threadsPerBlock>>>(A, B, C, M, N, K);
    cudaDeviceSynchronize();
}

Tile块

#include <cuda_runtime.h>
 
template <int TILE_SIZE>
__global__ void matrix_multiplication_kernel(const float *__restrict__ A,
    const float *__restrict__ B, float *__restrict__ C,
    int M, int N, int K) {
    extern __shared__ float shared[];
    float *tileA = shared;
    float *tileB = shared + TILE_SIZE * TILE_SIZE;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    float sum = 0.0f;
    // MN * NK
    for (int t = 0; t < (N + TILE_SIZE - 1) / TILE_SIZE; ++t) {
        const int aCol = t * TILE_SIZE + threadIdx.x;
        const int bRow = t * TILE_SIZE + threadIdx.y;
        const int localIndex = threadIdx.y * TILE_SIZE + threadIdx.x;
        // 一个block所有thread都去搬数据, 搬满一个tile, 自己负责自己的小格
        tileA[localIndex] = (row < M && aCol < N) ? A[row * N + aCol] : 0.0f;
        tileB[localIndex] = (col < K && bRow < N) ? B[bRow * K + col] : 0.0f;
        // 必须有完整的tile数据才行, 所以block中thread同步
        __syncthreads();
        #pragma unroll
        // 每个thread利用tile中的数据, 计算自己当前小块的单步结果, 并累加
        for (int i = 0; i < TILE_SIZE; ++i) {
            sum += tileA[threadIdx.y * TILE_SIZE + i] *
                   tileB[i * TILE_SIZE + threadIdx.x];
        }
        __syncthreads();
    }
 
    if (row < M && col < K) {
        C[row * K + col] = sum;
    }
}
 
void solve(const float *A, const float *B, float *C, int M, int N, int K)
{
    constexpr int kTileSize = 16;
    dim3 threadsPerBlock(kTileSize, kTileSize);
    dim3 blocksPerGrid(
        (K + threadsPerBlock.x - 1) / threadsPerBlock.x,
        (M + threadsPerBlock.y - 1) / threadsPerBlock.y);
 
    const size_t sharedMemoryBytes =
        2 * kTileSize * kTileSize * sizeof(float);
 
    matrix_multiplication_kernel<kTileSize>
        <<<blocksPerGrid, threadsPerBlock, sharedMemoryBytes>>>(A, B, C, M, N, K);
    cudaDeviceSynchronize();
}

Matrix-Transpose


数学公式:不只是 block 之间的位置互换,block 内部的数据访问方向也发生了转置。

输入矩阵:

A  B
C  D

输出矩阵:

A^T  C^T
B^T  D^T

基础版

#include <cuda_runtime.h>
 
#define BLOCK_SIZE 32
 
__global__ void matrix_transpose_kernel(const float* input, float* output, int rows, int cols) {
    int tidx = blockIdx.x * blockDim.x + threadIdx.x;
    int tidy = blockIdx.y * blockDim.y + threadIdx.y;
    if (tidy < rows && tidx < cols) {
        output[tidx * rows + tidy] = input[tidy * cols + tidx];
    }
}
 
// input, output are device pointers (i.e. pointers to memory on the GPU)
extern "C" void solve(const float* input, float* output, int rows, int cols) {
    dim3 threadsPerBlock(BLOCK_SIZE, BLOCK_SIZE);
    dim3 blocksPerGrid((cols + BLOCK_SIZE - 1) / BLOCK_SIZE,
                       (rows + BLOCK_SIZE - 1) / BLOCK_SIZE);
 
    matrix_transpose_kernel<<<blocksPerGrid, threadsPerBlock>>>(input, output, rows, cols);
    cudaDeviceSynchronize();
}

共享内存 + 访存合并

#include <cuda_runtime.h>
 
#define BLOCK_SIZE 16
 
__global__ void matrix_transpose_kernel(const float* input, float* output, int rows, int cols) {
    __shared__ float tile[BLOCK_SIZE][BLOCK_SIZE];
 
    int tx = threadIdx.x;
    int ty = threadIdx.y;
 
    int tidy = blockIdx.y * blockDim.y + ty;
    int tidx = blockIdx.x * blockDim.x + tx;
 
    if (tidy < rows && tidx < cols) {
        tile[ty][tx] = input[tidy * cols + tidx];
    }
 
    __syncthreads();
 
    int out_tidy = blockIdx.x * blockDim.x + ty;
    int out_tidx = blockIdx.y * blockDim.y + tx;
 
    if (out_tidy < cols && out_tidx < rows) {
        output[out_tidy * rows + out_tidx] = tile[tx][ty];
    }
}
 
extern "C" void solve(const float* input, float* output, int rows, int cols) {
    dim3 threadsPerBlock(BLOCK_SIZE, BLOCK_SIZE);
    dim3 blocksPerGrid((cols + threadsPerBlock.x - 1) / threadsPerBlock.x,
                       (rows + threadsPerBlock.y - 1) / threadsPerBlock.y);
 
    matrix_transpose_kernel<<<blocksPerGrid, threadsPerBlock>>>(input, output, rows, cols);
    cudaDeviceSynchronize();
}

共享内存 + 访存合并 + Bank Conflict 处理

#include <cuda_runtime.h>
 
#define BLOCK_SIZE 16
 
__global__ void matrix_transpose_kernel(const float* input, float* output, int rows, int cols) {
    // +1 padding avoids bank conflicts when reading the tile in transposed order.
    __shared__ float tile[BLOCK_SIZE][BLOCK_SIZE + 1];
 
    int tx = threadIdx.x;
    int ty = threadIdx.y;
 
    int tidy = blockIdx.y * blockDim.y + ty;
    int tidx = blockIdx.x * blockDim.x + tx;
 
    if (tidy < rows && tidx < cols) {
        tile[ty][tx] = input[tidy * cols + tidx];
    }
 
    __syncthreads();
 
    int out_tidy = blockIdx.x * blockDim.x + ty;
    int out_tidx = blockIdx.y * blockDim.y + tx;
 
    if (out_tidy < cols && out_tidx < rows) {
        output[out_tidy * rows + out_tidx] = tile[tx][ty];
    }
}
 
extern "C" void solve(const float* input, float* output, int rows, int cols) {
    dim3 threadsPerBlock(BLOCK_SIZE, BLOCK_SIZE);
    dim3 blocksPerGrid((cols + threadsPerBlock.x - 1) / threadsPerBlock.x,
                       (rows + threadsPerBlock.y - 1) / threadsPerBlock.y);
 
    matrix_transpose_kernel<<<blocksPerGrid, threadsPerBlock>>>(input, output, rows, cols);
    cudaDeviceSynchronize();
}

Rainbow-Table


基础版

#include <cuda_runtime.h>
 
__device__ __forceinline__ unsigned int fnv1a_hash(unsigned int x) {
    // 如果题面已经给了 fnv1a_hash,直接删掉这个实现,保留调用即可
    const unsigned int FNV_OFFSET_BASIS = 2166136261u;
    const unsigned int FNV_PRIME = 16777619u;
    unsigned int h = FNV_OFFSET_BASIS;
    h ^= (x & 0xFFu);
    h *= FNV_PRIME;
 
    h ^= ((x >> 8) & 0xFFu);
    h *= FNV_PRIME;
 
    h ^= ((x >> 16) & 0xFFu);
    h *= FNV_PRIME;
 
    h ^= ((x >> 24) & 0xFFu);
    h *= FNV_PRIME;
 
    return h;
}
 
__global__ void rainbow_table_kernel(
    const int *__restrict__ input,
    unsigned int *__restrict__ output,
    int N,
    int R) {
    int gid = blockIdx.x * blockDim.x + threadIdx.x;
    if (gid >= N)
        return;
    unsigned int value = static_cast<unsigned int>(input[gid]);
    while (R--) {
        value = fnv1a_hash(value);
    }
    output[gid] = value;
}
 
extern "C" void solve(const int* input, unsigned int* output, int N, int R) {
    constexpr int THREADS = 256;
    int blocks = (N + THREADS - 1) / THREADS;
 
    rainbow_table_kernel<<<blocks, THREADS>>>(input, output, N, R);
    cudaDeviceSynchronize();
}

ReLU


基础写法

#include <cuda_runtime.h>
 
// ReLU(x) = max(0, x)
__global__ void relu_kernel(float* x, int n) {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid < n) {
        x[tid] = fmaxf(x[tid], 0.0f);
    }
}
 
// x is a device pointer
extern "C" void solve(float* x, int n) {
    int threadsPerBlock = 256;
    int blocksPerGrid = (n + threadsPerBlock - 1) / threadsPerBlock;
    relu_kernel<<<blocksPerGrid, threadsPerBlock>>>(x, n);
    cudaDeviceSynchronize();
}

向量化加载

#include <cuda_runtime.h>
 
// ReLU(x) = max(0, x)
__global__ void relu_kernel(float* x, int n) {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    int idx = tid * 4;
    if (idx + 3 < n) {
        float4* x4 = reinterpret_cast<float4*>(x);
        float4 v = x4[tid];
        v.x = fmaxf(v.x, 0.0f);
        v.y = fmaxf(v.y, 0.0f);
        v.z = fmaxf(v.z, 0.0f);
        v.w = fmaxf(v.w, 0.0f);
        x4[tid] = v;
    } else if (idx < n) {
        for (int i = idx; i < n; ++i) {
            x[i] = x[i] > 0.0f ? x[i] : 0.0f;
        }
    }
}
 
// x is a device pointer
extern "C" void solve(float* x, int n) {
    int threadsPerBlock = 256;
    int blocksPerGrid = (((n + 3) / 4) + threadsPerBlock - 1) / threadsPerBlock;
    relu_kernel<<<blocksPerGrid, threadsPerBlock>>>(x, n);
    cudaDeviceSynchronize();
}

softmax


最基础写法

#include <cuda_runtime.h>
#include <float.h>
#include <math.h>
 
// Row-wise softmax
// output[row, col] = exp(input[row, col] - row_max) / sum_j exp(input[row, j] - row_max)
__global__ void softmax_kernel_basic(const float* input,
                                     float* output,
                                     int num_rows,
                                     int num_cols) {
    int row = blockIdx.x;
    int col = threadIdx.x;
 
    if (row >= num_rows || col >= num_cols) return;
 
    // Step 1: find max value of this row
    float row_max = -FLT_MAX;
    for (int j = 0; j < num_cols; ++j) {
        row_max = fmaxf(row_max, input[row * num_cols + j]);
    }
 
    // Step 2: compute denominator
    float row_sum = 0.0f;
    for (int j = 0; j < num_cols; ++j) {
        row_sum += expf(input[row * num_cols + j] - row_max);
    }
 
    // Step 3: normalize
    output[row * num_cols + col] =
        expf(input[row * num_cols + col] - row_max) / row_sum;
}
 
extern "C" void solve(const float* input, float* output, int num_rows, int num_cols) {
    dim3 block_dim(num_cols);
    dim3 grid_dim(num_rows);
    softmax_kernel_basic<<<grid_dim, block_dim>>>(input, output, num_rows, num_cols);
    cudaDeviceSynchronize();
}
 
 
#include <cuda_runtime.h>
#include <float.h>
#include <math.h>
 
#ifndef THREADS_PER_BLOCK
#define THREADS_PER_BLOCK 256
#endif
 
#define FULL_MASK 0xffffffffu
 
__inline__ __device__ float warp_reduce_max(float value) {
    value = fmaxf(value, __shfl_down_sync(FULL_MASK, value, 16));
    value = fmaxf(value, __shfl_down_sync(FULL_MASK, value, 8));
    value = fmaxf(value, __shfl_down_sync(FULL_MASK, value, 4));
    value = fmaxf(value, __shfl_down_sync(FULL_MASK, value, 2));
    value = fmaxf(value, __shfl_down_sync(FULL_MASK, value, 1));
    return value;
}
 
__inline__ __device__ float warp_reduce_sum(float value) {
    // 从同一个 warp 里“更高编号 lane”的线程那里取一个寄存器值过来
    // warp 里的每个 thread 都会执行这几行代码 lane 0 的结果一定是整个 warp 的总和
    // 为什么不会有thread2先执行在thread0 因为wrap中是同步的 thread2和thread0同步执行
    // 可以理解成两步同时发生:tmp = lane(i+16) 在这一行开始前的 value value = 自己原来的 value + tmp
    value += __shfl_down_sync(FULL_MASK, value, 16);
    value += __shfl_down_sync(FULL_MASK, value, 8);
    value += __shfl_down_sync(FULL_MASK, value, 4);
    value += __shfl_down_sync(FULL_MASK, value, 2);
    value += __shfl_down_sync(FULL_MASK, value, 1);
    return value;
}
 
__global__ void softmax_kernel(const float* __restrict__ input,
                               float* __restrict__ output,
                               int num_rows,
                               int num_cols) {
    // 一个block一行 一行 THREADS_PER_BLOCK 个threads
    const int row_idx = blockIdx.x;
    const int thread_idx = threadIdx.x;
    const int lane_idx = thread_idx & 31;
    const int warp_idx = thread_idx >> 5;
    const int num_warps = THREADS_PER_BLOCK / 32;
 
    if (row_idx >= num_rows) return;
 
    __shared__ float warp_max_buffer[THREADS_PER_BLOCK / 32];
    __shared__ float warp_sum_buffer[THREADS_PER_BLOCK / 32];
 
    // Step 1: compute row maximum
    float thread_max = -FLT_MAX;
    // 寻找当前thread的最大值
    for (int col_idx = thread_idx; col_idx < num_cols; col_idx += THREADS_PER_BLOCK) {
        thread_max = fmaxf(thread_max, input[row_idx * num_cols + col_idx]);
    }
    // 针对当前wrap的最大值
    thread_max = warp_reduce_max(thread_max);
    if (lane_idx == 0) {
        warp_max_buffer[warp_idx] = thread_max;
    }
    __syncthreads();
 
    float row_max = -FLT_MAX;
    // 只有第 0 个 warp 的线程参与这一步归约。
    if (warp_idx == 0) {
        // 第 0 个 warp 的前 num_warps 个线程,各自读取一个 warp 的局部最大值
        row_max = (thread_idx < num_warps) ? warp_max_buffer[lane_idx] : -FLT_MAX;
        //  在第 0 个 warp 内继续做一次 warp 级最大值归约
        row_max = warp_reduce_max(row_max);
        // 让 block 中的第 0 号线程把最终结果写回共享内存 warp_max_buffer[0]
        if (thread_idx == 0) {
            warp_max_buffer[0] = row_max;
        }
    }
    __syncthreads();
    // 每个thread读取自己行的最大值
    row_max = warp_max_buffer[0];
 
    // Step 2: compute denominator
    // 每个线程先初始化自己的局部和。
    float thread_sum = 0.0f;
    // 当前线程以 blockDim.x 为步长,处理这一行中属于自己的若干列。
    for (int col_idx = thread_idx; col_idx < num_cols; col_idx += THREADS_PER_BLOCK) {
        thread_sum += expf(input[row_idx * num_cols + col_idx] - row_max);
    }
    // 先在每个 warp 内部做一次求和归约。
    thread_sum = warp_reduce_sum(thread_sum);
    //  每个 warp 只让 lane 0 把本 warp 的局部和写入共享内存
    if (lane_idx == 0) {
        warp_sum_buffer[warp_idx] = thread_sum;
    }
    __syncthreads();
    // 先把当前行的总和初始化为 0
    float row_sum = 0.0f;
    if (warp_idx == 0) {
        // 只有第 0 个 warp 参与这一步
        // 前面已经算出了“每个 warp 的局部和”,并把它们写到了 warp_sum_buffer[warp_idx] 中
        row_sum = (thread_idx < num_warps) ? warp_sum_buffer[lane_idx] : 0.0f;
        // 在第 0 个 warp 内部继续做一次 warp 级求和归约
        row_sum = warp_reduce_sum(row_sum);
        if (thread_idx == 0) {
            // 让 block 中的第 0 号线程把最终结果写回共享内存 warp_sum_buffer[0]
            warp_sum_buffer[0] = row_sum;
        }
    }
    __syncthreads();
    // 所有线程都从共享内存的 warp_sum_buffer[0] 中读取同一个最终结果
    row_sum = warp_sum_buffer[0];
 
    // Step 3: normalize
    for (int col_idx = thread_idx; col_idx < num_cols; col_idx += THREADS_PER_BLOCK) {
        // 每个线程继续按“跨步访问”的方式处理自己负责的若干
        // exp(input - row_max) / row_sum
        output[row_idx * num_cols + col_idx] =
            expf(input[row_idx * num_cols + col_idx] - row_max) / row_sum;
    }
}
 
extern "C" void solve(const float* input, float* output, int num_rows, int num_cols) {
    dim3 block_dim(THREADS_PER_BLOCK);
    dim3 grid_dim(num_rows);
    softmax_kernel<<<grid_dim, block_dim>>>(input, output, num_rows, num_cols);
    cudaDeviceSynchronize();
}

Reduction


#include <cuda_runtime.h>
 
template<int BS>
__global__ void reduction_kernel(const float* __restrict__ input, float* __restrict__ output, int N) {
    __shared__ float smem[BS];
    // 一个block中一次处理 2 * BS个元素 总共开了sm * 8个block
    const int blockStride = gridDim.x * (BS * 2);
    int tidx = blockIdx.x * (BS * 2) + threadIdx.x;
    int threadId = threadIdx.x;
    
    float threadSum = 0;
    for (int i = tidx; i < N; i += blockStride) {
        threadSum += input[i];
        if (i + BS < N) threadSum += input[i + BS];
    }
    smem[threadId] = threadSum;
    __syncthreads();
    #pragma unroll
    for (int stride = BS / 2; stride > 32; stride /= 2) {
        if (threadId < stride) {
            smem[threadId] += smem[threadId + stride];
        }
        __syncthreads();
    }
    if (threadId < 32) {
        float warpSum = smem[threadId];
        unsigned mask = 0xffffffffu;
        // 让当前线程去“拿同一个 warp 里、编号比自己大 offset 的那个线程的 x
        warpSum += __shfl_down_sync(mask, warpSum, 16);
        warpSum += __shfl_down_sync(mask, warpSum, 8);
        warpSum += __shfl_down_sync(mask, warpSum, 4);
        warpSum += __shfl_down_sync(mask, warpSum, 2);
        warpSum += __shfl_down_sync(mask, warpSum, 1);
        if (threadId == 0) atomicAdd(output, warpSum);
    }
}
 
extern "C" void solve(const float *input, float *output, int N) {
    int dev = 0, sm = 0;
    cudaGetDevice(&dev);
    cudaDeviceGetAttribute(&sm, cudaDevAttrMultiProcessorCount, dev);
 
    constexpr int BS = 16;
    int naturalBlocks = (N + (BS * 2) - 1) / (BS * 2);
    if (naturalBlocks > sm * 8) naturalBlocks = sm * 8; 
    dim3 blocksPerGrid(naturalBlocks);
    dim3 threadsPerBlock(BS);
    cudaMemset(output, 0, sizeof(float));
    reduction_kernel<BS><<<blocksPerGrid, threadsPerBlock>>>(input, output, N);
}
 

共享内存 + __shfl_down

#include <cuda_runtime.h>
 
#define BLOCK_SIZE 1024
 
__global__ void reduction(const float *input, float *output, int N) {
    int idx = blockDim.x * blockIdx.x + threadIdx.x;
    __shared__ float sharedMem[BLOCK_SIZE];
 
    // 拷到共享内存(越界用 0),不能提前 return,下面有 __syncthreads
    float v = (idx < N) ? input[idx] : 0.0f;
    sharedMem[threadIdx.x] = v;
    __syncthreads();
 
    // 树形二分归约。还剩一个 warp(32 个线程)时停下。
    for (int stride = blockDim.x / 2; stride >= warpSize; stride /= 2) {
        if (threadIdx.x < stride) {
            sharedMem[threadIdx.x] += sharedMem[threadIdx.x + stride];
        }
        __syncthreads();
    }
 
    // warp 内归约
    if (threadIdx.x < warpSize) {
        float sum = sharedMem[threadIdx.x];
        sum += __shfl_down_sync(0xffffffff, sum, 16);
        sum += __shfl_down_sync(0x0000ffff, sum, 8);
        sum += __shfl_down_sync(0x000000ff, sum, 4);
        sum += __shfl_down_sync(0x0000000f, sum, 2);
        sum += __shfl_down_sync(0x00000003, sum, 1);
        if (threadIdx.x == 0) {
            atomicAdd(output, sum);
        }
    }
}
 
// input, output are device pointers
extern "C" void solve(const float *input, float *output, int N) {
    reduction<<<(N + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>(input, output, N);
}

基础写法(会有确定性问题,不要用!!!)

#include <cuda_runtime.h>
 
#define BLOCK_SIZE 1024
 
__global__ void reduction(const float* input, float* output, int N) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < N) {
        atomicAdd(output, input[idx]);
    }
}
 
extern "C" void solve(const float* input, float* output, int N) {
    reduction<<<(N + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>(input, output, N);
}

Reverse-Array


基础写法

#include <cuda_runtime.h>
 
__global__ void reverse_array_inplace(float* a, int n) {
    int tid = blockDim.x * blockIdx.x + threadIdx.x;
    if (tid < n / 2) {
        auto temp = a[tid];
        a[tid] = a[n - tid - 1];
        a[n - tid - 1] = temp;
    }
}
 
extern "C" void solve(float* a, int n) {
    int threadsPerBlock = 256;
    int blocksPerGrid = (n / 2 + threadsPerBlock - 1) / threadsPerBlock;
    reverse_array_inplace<<<blocksPerGrid, threadsPerBlock>>>(a, n);
    cudaDeviceSynchronize();
}

Vector-Addition


最基本写法

#include <cuda_runtime.h>
 
__global__ void vector_add(const float* A, const float* B, float* C, int N) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < N) {
        C[i] = A[i] + B[i];
    }
}
 
// A, B, C are device pointers (i.e. pointers to memory on the GPU)
extern "C" void solve(const float* A, const float* B, float* C, int N) {
    int threadsPerBlock = 256;
    int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
 
    vector_add<<<blocksPerGrid, threadsPerBlock>>>(A, B, C, N);
    cudaDeviceSynchronize();
}

向量化加载

#include <cuda_runtime.h>
 
__global__ void vector_add_vec4(const float* A, const float* B, float* C, int N) {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
 
    int vecN = N / 4;
    const float4* A4 = reinterpret_cast<const float4*>(A);
    const float4* B4 = reinterpret_cast<const float4*>(B);
    float4* C4 = reinterpret_cast<float4*>(C);
 
    if (tid < vecN) {
        float4 a = A4[tid];
        float4 b = B4[tid];
        float4 c;
        c.x = a.x + b.x;
        c.y = a.y + b.y;
        c.z = a.z + b.z;
        c.w = a.w + b.w;
        C4[tid] = c;
    }
 
    int idx = vecN * 4 + tid;
    if (idx < N) {
        C[idx] = A[idx] + B[idx];
    }
}
 
// A, B, C are device pointers
extern "C" void solve(const float* A, const float* B, float* C, int N) {
    int threadsPerBlock = 256;
    int vecN = (N + 3) / 4;
    int blocksPerGrid = (vecN + threadsPerBlock - 1) / threadsPerBlock;
    vector_add_vec4<<<blocksPerGrid, threadsPerBlock>>>(A, B, C, N);
    cudaDeviceSynchronize();
}