[ ] https://github.com/Deep-Learning-Profiling-Tools/triton-viz
Debug in interpreter mode: run `TRITON_INTERPRET=1 python test.py` and add a `breakpoint()` call where you want to stop.
Custom Triton kernels + torch.compile() (example test from the PyTorch repo, link and code below):
https://github.com/pytorch/pytorch/blob/0c8bb6f70c65b0a68fcb282cc1605c79ca5dabce/test/dynamo/test_triton_kernels.py#L628-L661
def test_triton_kernel_autotune(self, grad, dynamic, backend, grid_type):
    """Compare a direct Triton kernel launch with the same launch under torch.compile.

    ``call_triton`` launches ``add_kernel_autotuned`` (defined outside this
    snippet) once directly and once through ``torch.compile``; the two
    results are compared with ``self.assertEqual``.

    Args:
        grad: Passed as ``requires_grad`` for the inputs and output buffers.
        dynamic: Forwarded to ``torch.compile(dynamic=...)``.
        backend: Forwarded to ``torch.compile(backend=...)``.
        grid_type: Selects how the launch grid is expressed:
            1 -> a plain tuple, 2 -> a lambda taking ``meta``,
            3 -> a named nested function taking ``meta``.

    NOTE(review): the argument values are presumably supplied by test
    parametrization that is not part of this excerpt -- confirm upstream.
    """

    def call_triton(x: torch.Tensor, y: torch.Tensor, output: torch.Tensor):
        n_elements = output.numel()

        def grid_fn(meta):
            return (triton.cdiv(n_elements, meta["BLOCK_SIZE"]),)

        # Three spellings of the launch grid. Any other grid_type leaves
        # `grid` unassigned, so the launch line below would raise
        # UnboundLocalError.
        if grid_type == 1:
            grid = (n_elements,)
        elif grid_type == 2:
            grid = lambda meta: (triton.cdiv(n_elements, meta["BLOCK_SIZE"]),)
        elif grid_type == 3:
            grid = grid_fn

        add_kernel_autotuned[grid](x, y, output, n_elements)
        return output

    t1 = torch.rand(256, device="cuda", requires_grad=grad)
    t2 = torch.rand(256, device="cuda", requires_grad=grad)
    output = torch.zeros_like(t1, requires_grad=grad)

    # Reference result: direct (uncompiled) call.
    torch_add = call_triton(t1, t2, output)

    # fullgraph=True is requested, so the whole function must compile as one graph.
    compiled_func = torch.compile(
        call_triton, backend=backend, fullgraph=True, dynamic=dynamic
    )

    # Fresh output buffer so the compiled call does not reuse `output`.
    output2 = torch.zeros_like(t1, requires_grad=grad)
    self.assertEqual(compiled_func(t1, t2, output2), torch_add)
[ ] https://github.com/unslothai/unsloth
[ ] (link missing: the page reference was lost in export; TODO restore)
[ ] https://github.com/stanford-futuredata/megablocks
[ ] Bounty
[ ] https://twitter.com/pommedeterre33/status/1681935636129873920
[ ]