debugging

This commit is contained in:
louisfd 2024-07-15 15:55:45 -04:00
parent 1675f5c297
commit 3f0f1f23fd
2 changed files with 27 additions and 15 deletions

View File

@ -54,6 +54,7 @@ pub(crate) fn compute_loop<F: Float, FC: Float>(
.rhs
.slice(shared_rhs_pos, shared_rhs_pos + num_tile_elems);
// cmma_computation(lhs_slice, rhs_slice, accumulate_slice);
cmma_row_major_mimic(lhs_slice, rhs_slice, accumulate_slice);
}
}
@ -422,13 +423,15 @@ pub mod tests {
3034496.0, 3042552.0, 3050608.0, 3058664.0, 3066720.0, 3074776.0, 3082832.0, 3090888.0,
3098944.0, 3107000.0, 3115056.0, 3123112.0, 3131168.0, 3139224.0, 3147280.0, 3155336.0,
];
assert_equals::<R>(results, expected, device);
assert_equals_range::<R>(results, expected, 768..1024, device);
}
/// Exported test
pub fn compute_loop_k_test<R: JitRuntime>(device: &R::Device) {
let lhs = range_tensor::<R>(16, 32, device);
let rhs = range_tensor::<R>(32, 16, device);
type FC1 = f32;
type FC2 = F32;
let lhs = range_tensor_generic::<FC1, R>(16, 32, device);
let rhs = range_tensor_generic::<FC1, R>(32, 16, device);
let results = create_empty::<R>(16, 16, device);
let cube_dim = CubeDim::new(1, 32, 1);
let cube_count = CubeCount::Static(1, 1, 1);
@ -447,16 +450,16 @@ pub mod tests {
unroll: false,
};
compute_loop_test::launch::<F32, F32, R>(
compute_loop_test::launch::<F32, FC2, R>(
lhs.client.clone(),
cube_count,
cube_dim,
TensorArg::new(&lhs.handle, &lhs.strides, &lhs.shape.dims),
TensorArg::new(&rhs.handle, &rhs.strides, &rhs.shape.dims),
ArrayArg::new(&results, 256),
UInt::new(16),
UInt::new(64),
UInt::new(32),
UInt::new(16),
UInt::new(64),
config,
);
@ -499,8 +502,11 @@ pub mod tests {
/// Exported test
pub fn compute_loop_warp_test<R: JitRuntime>(device: &R::Device) {
let lhs = range_tensor::<R>(16, 32, device);
let rhs = range_tensor::<R>(32, 32, device);
type FC1 = f32;
type FC2 = F32;
let lhs = range_tensor_generic::<FC1, R>(16, 32, device);
let rhs = range_tensor_generic::<FC1, R>(32, 32, device);
let results = create_empty::<R>(16, 32, device);
let cube_dim = CubeDim::new(1, 32, 1);
let cube_count = CubeCount::Static(1, 1, 1);
@ -519,16 +525,16 @@ pub mod tests {
unroll: false,
};
compute_loop_test::launch::<F32, F32, R>(
compute_loop_test::launch::<F32, FC2, R>(
lhs.client.clone(),
cube_count,
cube_dim,
TensorArg::new(&lhs.handle, &lhs.strides, &lhs.shape.dims),
TensorArg::new(&rhs.handle, &rhs.strides, &rhs.shape.dims),
ArrayArg::new(&results, 512),
UInt::new(16),
UInt::new(32),
UInt::new(64),
UInt::new(32),
UInt::new(64),
config,
);

View File

@ -105,6 +105,11 @@ pub mod tests {
k: UInt::new(0),
};
let out_vec = Comptime::vectorization(out);
for i in range(0u32, (k * n)/Comptime::runtime(out_vec), Comptime::new(false)) {
out[i] = F::vectorized(0., Comptime::get(out_vec));
}
let mut accumulate = SharedMemory::<F>::new(4096);
for i in range(0u32, 4096u32, Comptime::new(false)) {
accumulate[i] = acc_sm_arr[i];
@ -123,7 +128,8 @@ pub mod tests {
pub fn cmma_write_output_unit_test<R: JitRuntime>(device: &R::Device) {
let k = 16;
let n = 32;
let out = zeros_tensor::<R>(k, n, device);
// TODO: this should be zeros_tensor rather than a range tensor that is then reset to 0, but zeros_tensor fails on CUDA
let out = range_tensor::<R>(k, n, device);
let acc_sm = range_tensor::<R>(64, 64, device);
let cube_dim = CubeDim::new(1, 1, 1);
let cube_count: CubeCount<R::Server> = CubeCount::Static(1, 1, 1);
@ -193,7 +199,7 @@ pub mod tests {
pub fn cmma_write_output_warp_test<R: JitRuntime>(device: &R::Device) {
let k = 16;
let n = 32;
let out = zeros_tensor::<R>(k, n, device);
let out = range_tensor::<R>(k, n, device);
let acc_sm = range_tensor::<R>(64, 64, device);
let cube_dim = CubeDim::new(1, 32, 1);
let cube_count: CubeCount<R::Server> = CubeCount::Static(1, 1, 1);
@ -274,7 +280,7 @@ pub mod tests {
pub fn cmma_write_output_second_warp_test<R: JitRuntime>(device: &R::Device) {
let k = 16;
let n = 64;
let out = zeros_tensor::<R>(k, n, device);
let out = range_tensor::<R>(k, n, device);
let acc_sm = range_tensor::<R>(64, 64, device);
let cube_dim = CubeDim::new(2, 32, 1);
let cube_count: CubeCount<R::Server> = CubeCount::Static(1, 1, 1);
@ -398,7 +404,7 @@ pub mod tests {
pub fn cmma_write_output_third_fourth_warps_test<R: JitRuntime>(device: &R::Device) {
let k = 32;
let n = 64;
let out = zeros_tensor::<R>(k, n, device);
let out = range_tensor::<R>(k, n, device);
let acc_sm = range_tensor::<R>(64, 64, device);
let cube_dim = CubeDim::new(4, 32, 1);
let cube_count: CubeCount<R::Server> = CubeCount::Static(1, 1, 1);