配置
uv python install 3.10
uv venv --python 3.10
source .venv/bin/activate
# 1. 安装依赖
uv pip install --pre --index-url https://download.pytorch.org/whl/nightly pytorch-triton
# 2. 安装主程序
uv pip install --pre --index-url https://download.pytorch.org/whl/nightly/cu124 torch torchvision triton
验证脚本
import torch
import triton
import triton.language as tl


@triton.jit
def add_kernel(x_ptr, y_ptr, output_ptr, n_elements, BLOCK_SIZE: tl.constexpr):
    """Element-wise vector addition: output[i] = x[i] + y[i]."""
    # Index of this program instance within the 1-D launch grid.
    pid = tl.program_id(axis=0)
    # Offsets of the BLOCK_SIZE elements this instance handles.
    block_start = pid * BLOCK_SIZE
    offsets = block_start + tl.arange(0, BLOCK_SIZE)
    # Mask out lanes that fall past the end of the tensors
    # (the last block may be only partially filled).
    mask = offsets < n_elements
    x = tl.load(x_ptr + offsets, mask=mask)
    y = tl.load(y_ptr + offsets, mask=mask)
    # Compute and write the result back, honoring the same mask.
    tl.store(output_ptr + offsets, x + y, mask=mask)


def add(x: torch.Tensor, y: torch.Tensor):
    """Add two same-shaped CUDA tensors via the Triton kernel.

    Returns a new tensor holding x + y.
    """
    output = torch.empty_like(x)
    n_elements = output.numel()
    # Launch one program instance per BLOCK_SIZE-wide slice of the input.
    grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),)
    add_kernel[grid](x, y, output, n_elements, BLOCK_SIZE=1024)
    return output


if __name__ == "__main__":
    if not torch.cuda.is_available():
        print("Error: CUDA not found.")
    else:
        torch.manual_seed(0)
        size = 98432
        x = torch.rand(size, device='cuda')
        y = torch.rand(size, device='cuda')
        # Run the Triton kernel.
        output_triton = add(x, y)
        # Run the PyTorch reference addition.
        output_torch = x + y
        # Compare the two results.
        if torch.allclose(output_triton, output_torch):
            print("✅ Success! Triton configuration works perfectly.")
        else:
            print("❌ Error: Results do not match.")