diff --git a/crates/burn-jit/src/kernel/matmul/tiling2d_cube/load_shared_memory.rs b/crates/burn-jit/src/kernel/matmul/tiling2d_cube/load_shared_memory.rs index 7ae3e49fa..2fb2dacbc 100644 --- a/crates/burn-jit/src/kernel/matmul/tiling2d_cube/load_shared_memory.rs +++ b/crates/burn-jit/src/kernel/matmul/tiling2d_cube/load_shared_memory.rs @@ -193,7 +193,9 @@ fn write_tile_transposed( } else { for i in range(0u32, Comptime::get(tile_size), unroll) { let mut transposed = F::vectorized(0., Comptime::get(tile_size)); - for j in range(0u32, Comptime::get(tile_size), unroll) { + + // Unrolling this one makes the difference + for j in range(0u32, Comptime::get(tile_size), Comptime::new(true)) { transposed[j] = tile[j][i]; }