配置
uv python install 3.10
uv venv --python 3.10
source .venv/bin/activate
# 1. 安装依赖
uv pip install --pre --index-url https://download.pytorch.org/whl/nightly pytorch-triton
# 2. 安装主程序
uv pip install --pre --index-url https://download.pytorch.org/whl/nightly/cu124 torch torchvision triton
验证脚本
import torch
import triton
import triton.language as tl


@triton.jit
def add_kernel(x_ptr, y_ptr, output_ptr, n_elements, BLOCK_SIZE: tl.constexpr):
    """Element-wise vector addition: output[i] = x[i] + y[i]."""
    # Index of this program instance within the 1-D launch grid.
    pid = tl.program_id(axis=0)
    # Offsets of the BLOCK_SIZE elements this instance handles.
    block_start = pid * BLOCK_SIZE
    offsets = block_start + tl.arange(0, BLOCK_SIZE)
    # Mask out lanes that fall past the end of the tensors
    # (the last block may be only partially filled).
    mask = offsets < n_elements
    x = tl.load(x_ptr + offsets, mask=mask)
    y = tl.load(y_ptr + offsets, mask=mask)
    # Compute and write the result back, honoring the same mask.
    tl.store(output_ptr + offsets, x + y, mask=mask)


def add(x: torch.Tensor, y: torch.Tensor):
    """Add two same-shaped CUDA tensors via the Triton kernel.

    Returns a new tensor holding x + y.
    """
    output = torch.empty_like(x)
    n_elements = output.numel()
    # Launch one program instance per BLOCK_SIZE-wide slice of the input.
    grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),)
    add_kernel[grid](x, y, output, n_elements, BLOCK_SIZE=1024)
    return output


if __name__ == "__main__":
    if not torch.cuda.is_available():
        print("Error: CUDA not found.")
    else:
        torch.manual_seed(0)
        size = 98432
        x = torch.rand(size, device='cuda')
        y = torch.rand(size, device='cuda')
        # Run the Triton kernel.
        output_triton = add(x, y)
        # Run the PyTorch reference addition.
        output_torch = x + y
        # Compare the two results.
        if torch.allclose(output_triton, output_torch):
            print("✅ Success! Triton configuration works perfectly.")
        else:
            print("❌ Error: Results do not match.")