mirror of https://github.com/tracel-ai/burn.git
Perf/wgpu/reduce dim (#943)
* new reduce half working
* surprisingly working
* good on elongated matrix, bad on balanced ones
* working and clean
* autotune not tested, tests fail at non contiguous
* fixed
* autotune tested
* mean dim
* some fixes
* clippy
This commit is contained in:
parent f53ab06efc
commit 831335ac2e
@@ -57,3 +57,7 @@ serial_test = "2.0.0"
[[bench]]
name = "matmul"
harness = false

[[bench]]
name = "reduction"
harness = false
@@ -0,0 +1,108 @@
use burn_common::benchmark::{run_benchmark, Benchmark};
use burn_tensor::backend::Backend;
use burn_tensor::{Distribution, Shape, Tensor};
use burn_wgpu::kernel::reduce::{init_reduce_output, sum_dim, sum_dim_shared_memory};
use burn_wgpu::WgpuDevice;
use burn_wgpu::{AutoGraphicsApi, Wgpu};
use derive_new::new;
use std::marker::PhantomData;

use burn_wgpu::GraphicsApi;

type WTensor<G, const D: usize> = Tensor<Wgpu<G, f32, i32>, D>;

#[derive(new)]
struct ReduceBenchmark<B: Backend, F, const D: usize> {
    shape: Shape<D>,
    dim: usize,
    num_repeats: usize,
    device: B::Device,
    reduce: PhantomData<F>,
}

trait ReduceFunction<G: GraphicsApi, const D: usize> {
    fn run(input: WTensor<G, D>, dim: usize) -> WTensor<G, D>;
}

impl<F, const D: usize, G> Benchmark for ReduceBenchmark<Wgpu<G, f32, i32>, F, D>
where
    F: ReduceFunction<G, D>,
    G: GraphicsApi,
{
    type Args = WTensor<G, D>;

    fn name(&self) -> String {
        format!(
            "{:?} {:?} dim={:?}",
            std::any::type_name::<F>(),
            self.shape.dims,
            self.dim
        )
    }

    fn num_samples(&self) -> usize {
        10
    }

    fn execute(&self, input: Self::Args) {
        for _ in 0..self.num_repeats {
            F::run(input.clone(), self.dim);
        }
    }

    fn prepare(&self) -> Self::Args {
        WTensor::random_device(self.shape.clone(), Distribution::Default, &self.device)
    }

    fn sync(&self) {
        Wgpu::<G, f32, i32>::sync(&self.device)
    }
}

macro_rules! bench_reduce {
    ($benchmark:ident, $reduce_name:ident, $func:expr) => {
        struct $reduce_name {}
        impl<G: GraphicsApi, const D: usize> ReduceFunction<G, D> for $reduce_name {
            fn run(input: WTensor<G, D>, dim: usize) -> WTensor<G, D> {
                let input = input.into_primitive();
                let output = init_reduce_output(&input, dim);
                Tensor::from_primitive($func(input, output, dim))
            }
        }
        type $benchmark<const D: usize> =
            ReduceBenchmark<Wgpu<AutoGraphicsApi, f32, i32>, $reduce_name, D>;
    };
}

bench_reduce!(SumDimBenchmark, SumDim, sum_dim);
bench_reduce!(
    SumDimSharedMemoryBenchmark,
    SumDimSharedMemory,
    sum_dim_shared_memory
);

#[allow(dead_code)]
/// Runs the benchmarks for wgpu reduce dim implementations
pub fn bench(device: &WgpuDevice) {
    let num_repeats = 3;
    let shape = Shape::new([50, 8000, 50]);
    let dim = 1;

    macro_rules! run_reduce_benchmark {
        ($benchmark:ident) => {
            run_benchmark($benchmark::new(
                shape.clone(),
                dim,
                num_repeats,
                device.clone(),
            ));
        };
    }

    run_reduce_benchmark!(SumDimSharedMemoryBenchmark);
    run_reduce_benchmark!(SumDimBenchmark);
}

fn main() {
    bench(&WgpuDevice::BestAvailable)
}
@@ -2,19 +2,25 @@ use std::fmt::Display;
use burn_compute::tune::AutotuneKey;

use crate::kernel::matmul::MatmulAutotuneKey;
use crate::kernel::{matmul::MatmulAutotuneKey, reduce::ReduceAutotuneKey};

#[derive(Hash, Eq, PartialEq, Debug, Clone)]
/// Key for all autotune-enabled operations
pub enum WgpuAutotuneKey {
    /// Key for matmul operation
    Matmul(MatmulAutotuneKey),
    /// Key for sum_dim operations
    SumDim(ReduceAutotuneKey),
    /// Key for mean_dim operations
    MeanDim(ReduceAutotuneKey),
}

impl Display for WgpuAutotuneKey {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            WgpuAutotuneKey::Matmul(matmul_key) => std::fmt::Display::fmt(&matmul_key, f),
            WgpuAutotuneKey::SumDim(reduce_key) => std::fmt::Display::fmt(&reduce_key, f),
            WgpuAutotuneKey::MeanDim(reduce_key) => std::fmt::Display::fmt(&reduce_key, f),
        }
    }
}
@@ -6,7 +6,6 @@ mod clamp;
mod comparison;
mod index;
mod mask;
mod reduction;
mod source;
mod unary;
mod unary_scalar;
@@ -26,10 +25,11 @@ pub mod matmul;
pub mod pool;
/// Pseudo-random number generator kernels
pub mod prng;
/// Reduction algorithms
pub mod reduce;

pub(crate) use cat::*;
pub(crate) use clamp::*;
pub(crate) use comparison::*;
pub(crate) use index::*;
pub(crate) use mask::*;
pub(crate) use reduction::*;
@@ -0,0 +1,22 @@
use crate::{element::WgpuElement, tensor::WgpuTensor};

/// Creates an empty output tensor with reduce output shape
pub fn init_reduce_output<E: WgpuElement, const D: usize>(
    input: &WgpuTensor<E, D>,
    reduce_dim: usize,
) -> WgpuTensor<E, D> {
    let mut shape_out = input.shape.clone();
    shape_out.dims[reduce_dim] = 1;

    // Create output handle
    let num_elems_output = shape_out.num_elements();
    let handle = input
        .client
        .empty(num_elems_output * core::mem::size_of::<E>());
    WgpuTensor::new(
        input.client.clone(),
        input.device.clone(),
        shape_out.clone(),
        handle,
    )
}
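A minimal sketch of the call pattern this helper enables (illustration only, not part of the diff; the import paths are the ones used inside the burn-wgpu crate): the caller allocates the reduce output once with init_reduce_output, then hands it to whichever reduce kernel is chosen, which is what both the plain and shared-memory paths further down expect.

    // Sketch only: allocate the reduce output, then run a reduce kernel into it.
    use crate::kernel::reduce::{init_reduce_output, sum_dim};
    use crate::{element::WgpuElement, tensor::WgpuTensor};

    fn sum_along<E: WgpuElement, const D: usize>(
        input: WgpuTensor<E, D>,
        dim: usize,
    ) -> WgpuTensor<E, D> {
        let output = init_reduce_output(&input, dim);
        sum_dim(input, output, dim)
    }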
@@ -0,0 +1,9 @@
mod base;
mod reduction;
mod reduction_shared_memory;
mod tune;

pub use base::*;
pub use reduction::*;
pub use reduction_shared_memory::*;
pub use tune::*;
@@ -1,18 +1,26 @@
use super::{build_info, KernelSettings, SourceTemplate, StaticKernelSource, WORKGROUP_DEFAULT};
use crate::{
    compute::StaticKernel, element::WgpuElement, kernel::elemwise_workgroup, kernel_wgsl,
    compute::StaticKernel,
    element::WgpuElement,
    kernel::{
        build_info, elemwise_workgroup, KernelSettings, SourceTemplate, StaticKernelSource,
        WORKGROUP_DEFAULT,
    },
    kernel_wgsl,
    tensor::WgpuTensor,
};
use burn_tensor::Shape;

kernel_wgsl!(RecursiveSumRaw, "../template/reduction/recursive_sum.wgsl");
kernel_wgsl!(ReductionDimRaw, "../template/reduction/reduce_dim.wgsl");
kernel_wgsl!(ReductionArgsRaw, "../template/reduction/args.wgsl");
kernel_wgsl!(
    RecursiveSumRaw,
    "../../template/reduction/recursive_sum.wgsl"
);
kernel_wgsl!(ReductionDimRaw, "../../template/reduction/reduce_dim.wgsl");
kernel_wgsl!(ReductionArgsRaw, "../../template/reduction/args.wgsl");

pub struct ArgsMax;
pub struct ArgsMin;
pub struct SumDim;
pub struct MeanDim;
pub(crate) struct ArgsMax;
pub(crate) struct ArgsMin;
pub(crate) struct SumDim;
pub(crate) struct MeanDim;

impl StaticKernelSource for SumDim {
    fn source() -> SourceTemplate {
@@ -79,37 +87,29 @@ pub fn sum<E: WgpuElement, const D: usize>(input: WgpuTensor<E, D>) -> WgpuTenso
/// Execute the sum dim kernel.
pub fn sum_dim<E: WgpuElement, const D: usize>(
    input: WgpuTensor<E, D>,
    output: WgpuTensor<E, D>,
    dim: usize,
) -> WgpuTensor<E, D> {
    reduction_dim::<SumDim, E, D>(input, dim)
    reduction_dim::<SumDim, E, D>(input, output, dim)
}

/// Execute the mean dim kernel.
pub fn mean_dim<E: WgpuElement, const D: usize>(
    input: WgpuTensor<E, D>,
    output: WgpuTensor<E, D>,
    dim: usize,
) -> WgpuTensor<E, D> {
    reduction_dim::<MeanDim, E, D>(input, dim)
    reduction_dim::<MeanDim, E, D>(input, output, dim)
}

fn reduction_dim<K: StaticKernelSource, E: WgpuElement, const D: usize>(
    input: WgpuTensor<E, D>,
    output: WgpuTensor<E, D>,
    dim: usize,
) -> WgpuTensor<E, D> {
    let mut shape_out = input.shape.clone();
    shape_out.dims[dim] = 1;
    let num_elems = shape_out.num_elements();
    let handle = input.client.empty(num_elems * core::mem::size_of::<E>());
    let output = WgpuTensor::new(
        input.client.clone(),
        input.device.clone(),
        shape_out,
        handle,
    );

    let kernel =
        StaticKernel::<KernelSettings<K, E, i32, WORKGROUP_DEFAULT, WORKGROUP_DEFAULT, 1>>::new(
            elemwise_workgroup(num_elems, WORKGROUP_DEFAULT),
            elemwise_workgroup(output.shape.num_elements(), WORKGROUP_DEFAULT),
        );

    let mut info = build_info(&[&input, &output]);
@@ -174,7 +174,10 @@ fn reduction_args_dim<K: StaticKernelSource, E: WgpuElement, I: WgpuElement, con
#[cfg(test)]
mod tests {
    use super::*;
    use crate::tests::{ReferenceBackend, TestBackend};
    use crate::{
        kernel::reduce::init_reduce_output,
        tests::{ReferenceBackend, TestBackend},
    };
    use burn_tensor::{Distribution, Int, Tensor};

    #[test]
@@ -192,10 +195,13 @@ mod tests {
    fn reduction_sum_dim_should_work_with_multiple_invocations() {
        let tensor = Tensor::<TestBackend, 2>::random([6, 1024], Distribution::Default);
        let tensor_ref = Tensor::<ReferenceBackend, 2>::from_data(tensor.to_data());
        let reduce_dim = 1;
        let output = init_reduce_output(&tensor.clone().into_primitive(), reduce_dim);

        let val = Tensor::<TestBackend, 2>::from_primitive(reduction_dim::<SumDim, f32, 2>(
            tensor.into_primitive(),
            1,
            output,
            reduce_dim,
        ));
        let val_ref = tensor_ref.sum_dim(1);
@@ -0,0 +1,170 @@
use crate::{
    compute::{StaticKernel, WorkGroup},
    element::WgpuElement,
    kernel::{build_info, KernelSettings, SourceTemplate, StaticKernelSource, WORKGROUP_DEFAULT},
    kernel_wgsl,
    tensor::WgpuTensor,
};

kernel_wgsl!(
    ReductionDimSharedMemoryRaw,
    "../../template/reduction/reduce_dim_shared_memory.wgsl"
);

pub(crate) struct SumDimSharedMemory;
pub(crate) struct MeanDimSharedMemory;

impl StaticKernelSource for SumDimSharedMemory {
    fn source() -> SourceTemplate {
        ReductionDimSharedMemoryRaw::source()
            .register(
                "shared_size",
                (WORKGROUP_DEFAULT * WORKGROUP_DEFAULT).to_string(),
            )
            .register("initial", 0.0.to_string())
            .register("update", "shared_memory[local_id] += value; ")
            .register("assign", "output[output_position] = final_value; ")
    }
}

impl StaticKernelSource for MeanDimSharedMemory {
    fn source() -> SourceTemplate {
        ReductionDimSharedMemoryRaw::source()
            .register(
                "shared_size",
                (WORKGROUP_DEFAULT * WORKGROUP_DEFAULT).to_string(),
            )
            .register("initial", 0.0.to_string())
            .register("update", "shared_memory[local_id] += value; ")
            .add_template(
                "fn mean_dim(sum: {{ elem }}, dim: u32) -> {{ elem }} {
                    return sum / {{ elem }}(dim);
                }",
            )
            .register(
                "assign",
                "output[output_position] = mean_dim(final_value, shape_input_dim_reduce);",
            )
    }
}

/// Execute the sum dim kernel leveraging shared memory
/// Probably more efficient on tensors where the dimension to be reduced
/// is much larger than the others
pub fn sum_dim_shared_memory<E: WgpuElement, const D: usize>(
    input: WgpuTensor<E, D>,
    output: WgpuTensor<E, D>,
    dim: usize,
) -> WgpuTensor<E, D> {
    reduction_dim_shared_memory::<SumDimSharedMemory, E, D>(input, output, dim)
}

/// Execute the mean dim kernel leveraging shared memory
/// Probably more efficient on tensors where the dimension to be reduced
/// is much larger than the others
pub fn mean_dim_shared_memory<E: WgpuElement, const D: usize>(
    input: WgpuTensor<E, D>,
    output: WgpuTensor<E, D>,
    dim: usize,
) -> WgpuTensor<E, D> {
    reduction_dim_shared_memory::<MeanDimSharedMemory, E, D>(input, output, dim)
}

fn reduction_dim_shared_memory<K: StaticKernelSource, E: WgpuElement, const D: usize>(
    input: WgpuTensor<E, D>,
    output: WgpuTensor<E, D>,
    reduce_dim: usize,
) -> WgpuTensor<E, D> {
    let num_elems_output = output.shape.num_elements();
    let n_workgroups_x = f32::ceil(f32::sqrt(num_elems_output as f32));
    let n_workgroups_y = f32::ceil(num_elems_output as f32 / n_workgroups_x);
    let grid = WorkGroup::new(n_workgroups_x as u32, n_workgroups_y as u32, 1);

    let kernel =
        StaticKernel::<KernelSettings<K, E, i32, WORKGROUP_DEFAULT, WORKGROUP_DEFAULT, 1>>::new(
            grid,
        );

    // Build info
    let mut info = build_info(&[&input, &output]);

    // Reduce groups are elements that are aligned along the reduce dim
    let reduce_group_size = input.shape.dims[reduce_dim];
    let n_invocation_per_workgroup = WORKGROUP_DEFAULT * WORKGROUP_DEFAULT;
    let n_reduce_elements_per_thread =
        f32::ceil(reduce_group_size as f32 / n_invocation_per_workgroup as f32) as u32;

    // Add dimension of reduction and how many reduce elements are treated per thread
    info.push(reduce_dim as u32);
    info.push(n_reduce_elements_per_thread);

    let info_handle = input.client.create(bytemuck::cast_slice(&info));

    input.client.execute(
        Box::new(kernel),
        &[&input.handle, &output.handle, &info_handle],
    );

    output
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::{
        kernel::reduce::init_reduce_output,
        tests::{ReferenceBackend, TestBackend},
    };
    use burn_tensor::{Distribution, Tensor};

    #[test]
    fn reduction_sum_dim_shared_memory_small() {
        let tensor = Tensor::<TestBackend, 1>::random([700], Distribution::Default);
        let tensor_ref = Tensor::<ReferenceBackend, 1>::from_data(tensor.to_data());
        let reduce_dim = 0;
        let output = init_reduce_output(&tensor.clone().into_primitive(), reduce_dim);

        let val = Tensor::<TestBackend, 1>::from_primitive(sum_dim_shared_memory(
            tensor.into_primitive(),
            output,
            reduce_dim,
        ));
        let val_ref = tensor_ref.sum_dim(reduce_dim);

        val_ref.into_data().assert_approx_eq(&val.into_data(), 3);
    }

    #[test]
    fn reduction_sum_dim_shared_memory_medium() {
        let tensor = Tensor::<TestBackend, 2>::random([6, 1024], Distribution::Default);
        let tensor_ref = Tensor::<ReferenceBackend, 2>::from_data(tensor.to_data());
        let reduce_dim = 1;
        let output = init_reduce_output(&tensor.clone().into_primitive(), reduce_dim);

        let val = Tensor::<TestBackend, 2>::from_primitive(sum_dim_shared_memory(
            tensor.into_primitive(),
            output,
            reduce_dim,
        ));
        let val_ref = tensor_ref.sum_dim(reduce_dim);

        val_ref.into_data().assert_approx_eq(&val.into_data(), 3);
    }

    #[test]
    fn reduction_sum_dim_shared_memory_large() {
        let tensor = Tensor::<TestBackend, 3>::random([4, 1024, 50], Distribution::Default);
        let tensor_ref = Tensor::<ReferenceBackend, 3>::from_data(tensor.to_data());
        let reduce_dim = 2;
        let output = init_reduce_output(&tensor.clone().into_primitive(), reduce_dim);

        let val = Tensor::<TestBackend, 3>::from_primitive(sum_dim_shared_memory(
            tensor.into_primitive(),
            output,
            reduce_dim,
        ));
        let val_ref = tensor_ref.sum_dim(reduce_dim);

        val_ref.into_data().assert_approx_eq(&val.into_data(), 3);
    }
}
@@ -0,0 +1,27 @@
#[macro_export]
/// Generate an autotune operation for a reduce kernel
macro_rules! reduce_tune_ops {
    ($name:ident, $func:expr) => {
        #[derive(new)]
        pub(crate) struct $name<E: WgpuElement, const D: usize> {
            input: WgpuTensor<E, D>,
            output: WgpuTensor<E, D>,
            reduce_dim: usize,
        }

        impl<E: WgpuElement, const D: usize> AutotuneOperation for $name<E, D> {
            fn execute(self: Box<Self>) {
                #[allow(clippy::redundant_closure_call)]
                $func(self.input, self.output, self.reduce_dim);
            }

            fn clone(&self) -> Box<dyn AutotuneOperation> {
                Box::new(Self {
                    input: self.input.clone(),
                    output: self.output.clone(),
                    reduce_dim: self.reduce_dim.clone(),
                })
            }
        }
    };
}
@@ -0,0 +1,49 @@
use std::{cmp::min, fmt::Display};

use burn_tensor::Shape;

#[derive(Hash, Eq, PartialEq, Debug, Clone)]
/// Autotune key representative of reduce versions
pub struct ReduceAutotuneKey {
    reduce_dim_length: usize,
    others_product: usize,
}

impl Display for ReduceAutotuneKey {
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        f.write_str(
            format!(
                "Reduce - reduce_dim_length: {:?} others_product: {:?}",
                self.reduce_dim_length, self.others_product
            )
            .as_str(),
        )
    }
}

impl ReduceAutotuneKey {
    /// Create a reduce autotune key from the input shape and reduce dim
    pub fn new<const D: usize>(shape: &Shape<D>, reduce_dim: usize) -> Self {
        let reduce_dim_length = shape.dims[reduce_dim];
        let mut others_product = 1;
        for d in 0..D {
            if d != reduce_dim {
                others_product *= shape.dims[d]
            }
        }
        Self {
            reduce_dim_length: anchor(reduce_dim_length, None),
            others_product: anchor(others_product, None),
        }
    }
}

fn anchor(x: usize, max: Option<usize>) -> usize {
    let exp = f32::ceil(f32::log2(x as f32)) as u32;
    let power_of_2 = 2_u32.pow(exp) as usize;
    if let Some(max) = max {
        min(power_of_2, max)
    } else {
        power_of_2
    }
}
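As a worked example of the anchoring above (illustrative only; this test is not shipped in the PR and assumes it lives inside the tune module so that ReduceAutotuneKey is in scope): a [6, 1024] tensor reduced over dim 1 keeps 1024, which is already a power of two, and rounds the product of the other dims (6) up to 8.

    #[cfg(test)]
    mod key_example {
        use super::ReduceAutotuneKey;
        use burn_tensor::Shape;

        #[test]
        fn anchors_to_next_power_of_two() {
            // [6, 1024] reduced over dim 1: 1024 stays 1024, 6 rounds up to 8.
            let key = ReduceAutotuneKey::new(&Shape::new([6, 1024]), 1);
            assert_eq!(
                format!("{}", key),
                "Reduce - reduce_dim_length: 1024 others_product: 8"
            );
        }
    }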
@@ -0,0 +1,112 @@
use burn_compute::tune::{AutotuneOperation, AutotuneOperationSet};
use burn_tensor::{Element, ElementConversion};

use crate::{
    compute::WgpuAutotuneKey,
    element::WgpuElement,
    kernel::{
        prng::random_like_uniform,
        reduce::{init_reduce_output, mean_dim, mean_dim_shared_memory},
    },
    ops::numeric::empty_device,
    reduce_tune_ops,
    tensor::WgpuTensor,
};

use super::ReduceAutotuneKey;

/// Set of mean_dim implementations available for autotune
/// Autotune key is given by the closest upper power of 2 of the
/// dim to reduce, and of the product of the others
pub struct MeanDimAutotuneOperationSet<E: WgpuElement, const D: usize> {
    key: WgpuAutotuneKey,
    input: WgpuTensor<E, D>,
    output: WgpuTensor<E, D>,
    reduce_dim: usize,
}
impl<E: WgpuElement, const D: usize> MeanDimAutotuneOperationSet<E, D> {
    fn new(input: WgpuTensor<E, D>, output: WgpuTensor<E, D>, reduce_dim: usize) -> Self {
        Self {
            key: WgpuAutotuneKey::MeanDim(ReduceAutotuneKey::new(&input.shape, reduce_dim)),
            input,
            output,
            reduce_dim,
        }
    }
}

impl<E: WgpuElement + Element, const D: usize> AutotuneOperationSet<WgpuAutotuneKey>
    for MeanDimAutotuneOperationSet<E, D>
{
    fn key(&self) -> WgpuAutotuneKey {
        self.key.clone()
    }

    fn autotunables(&self) -> Vec<Box<dyn AutotuneOperation>> {
        let random_bounds: (E, E) = ((-10.0).elem::<E>(), (10.0).elem::<E>());
        let input = random_like_uniform(&self.input, random_bounds.0, random_bounds.1);

        let output = empty_device(
            self.output.client.clone(),
            self.output.device.clone(),
            self.output.shape.clone(),
        );

        vec![
            Box::new(MeanDimAutotune::<E, D>::new(
                input.clone(),
                output.clone(),
                self.reduce_dim,
            )),
            Box::new(MeanDimSharedMemoryAutotune::<E, D>::new(
                input.clone(),
                output.clone(),
                self.reduce_dim,
            )),
        ]
    }

    fn fastest(self: Box<Self>, fastest_index: usize) -> Box<dyn AutotuneOperation> {
        // Warning: since this AutotuneOperationSet shares its key with SumDimAutotuneOperationSet,
        // we must make sure the order here is correlated with SumDim
        match fastest_index {
            0 => Box::new(MeanDimAutotune::<E, D>::new(
                self.input,
                self.output,
                self.reduce_dim,
            )),
            1 => Box::new(MeanDimSharedMemoryAutotune::<E, D>::new(
                self.input,
                self.output,
                self.reduce_dim,
            )),
            _ => panic!("Fastest index is out of bound"),
        }
    }
}

/// Executes autotune on mean_dim operation
pub fn mean_dim_autotune<E: WgpuElement + Element, const D: usize>(
    input: WgpuTensor<E, D>,
    reduce_dim: usize,
) -> WgpuTensor<E, D> {
    let client = input.client.clone();

    let output = init_reduce_output(&input, reduce_dim);

    let operation_set = Box::new(MeanDimAutotuneOperationSet::<E, D>::new(
        input,
        output.clone(),
        reduce_dim,
    ));

    client.execute_autotune(operation_set);

    output
}

// Probably better on balanced tensor shapes
reduce_tune_ops!(MeanDimAutotune, mean_dim);

// Probably better on tensors large along reduce dim
reduce_tune_ops!(MeanDimSharedMemoryAutotune, mean_dim_shared_memory);
@@ -0,0 +1,9 @@
mod base;
mod key;
mod mean_dim;
mod sum_dim;

pub use base::*;
pub use key::*;
pub use mean_dim::*;
pub use sum_dim::*;
@@ -0,0 +1,112 @@
use burn_compute::tune::{AutotuneOperation, AutotuneOperationSet};
use burn_tensor::{Element, ElementConversion};

use crate::{
    compute::WgpuAutotuneKey,
    element::WgpuElement,
    kernel::{
        prng::random_like_uniform,
        reduce::{init_reduce_output, sum_dim, sum_dim_shared_memory},
    },
    ops::numeric::empty_device,
    reduce_tune_ops,
    tensor::WgpuTensor,
};

use super::ReduceAutotuneKey;

/// Set of sum_dim implementations available for autotune
/// Autotune key is given by the closest upper power of 2 of the
/// dim to reduce, and of the product of the others
pub struct SumDimAutotuneOperationSet<E: WgpuElement, const D: usize> {
    key: WgpuAutotuneKey,
    input: WgpuTensor<E, D>,
    output: WgpuTensor<E, D>,
    reduce_dim: usize,
}
impl<E: WgpuElement, const D: usize> SumDimAutotuneOperationSet<E, D> {
    fn new(input: WgpuTensor<E, D>, output: WgpuTensor<E, D>, reduce_dim: usize) -> Self {
        Self {
            key: WgpuAutotuneKey::SumDim(ReduceAutotuneKey::new(&input.shape, reduce_dim)),
            input,
            output,
            reduce_dim,
        }
    }
}

impl<E: WgpuElement + Element, const D: usize> AutotuneOperationSet<WgpuAutotuneKey>
    for SumDimAutotuneOperationSet<E, D>
{
    fn key(&self) -> WgpuAutotuneKey {
        self.key.clone()
    }

    fn autotunables(&self) -> Vec<Box<dyn AutotuneOperation>> {
        let random_bounds: (E, E) = ((-10.0).elem::<E>(), (10.0).elem::<E>());
        let input = random_like_uniform(&self.input, random_bounds.0, random_bounds.1);

        let output = empty_device(
            self.output.client.clone(),
            self.output.device.clone(),
            self.output.shape.clone(),
        );

        vec![
            Box::new(SumDimAutotune::<E, D>::new(
                input.clone(),
                output.clone(),
                self.reduce_dim,
            )),
            Box::new(SumDimSharedMemoryAutotune::<E, D>::new(
                input.clone(),
                output.clone(),
                self.reduce_dim,
            )),
        ]
    }

    fn fastest(self: Box<Self>, fastest_index: usize) -> Box<dyn AutotuneOperation> {
        // Warning: since this AutotuneOperationSet shares its key with MeanDimAutotuneOperationSet,
        // we must make sure the order here is correlated with MeanDim
        match fastest_index {
            0 => Box::new(SumDimAutotune::<E, D>::new(
                self.input,
                self.output,
                self.reduce_dim,
            )),
            1 => Box::new(SumDimSharedMemoryAutotune::<E, D>::new(
                self.input,
                self.output,
                self.reduce_dim,
            )),
            _ => panic!("Fastest index is out of bound"),
        }
    }
}

/// Executes autotune on sum_dim operation
pub fn sum_dim_autotune<E: WgpuElement + Element, const D: usize>(
    input: WgpuTensor<E, D>,
    reduce_dim: usize,
) -> WgpuTensor<E, D> {
    let client = input.client.clone();

    let output = init_reduce_output(&input, reduce_dim);

    let operation_set = Box::new(SumDimAutotuneOperationSet::<E, D>::new(
        input,
        output.clone(),
        reduce_dim,
    ));

    client.execute_autotune(operation_set);

    output
}

// Probably better on balanced tensor shapes
reduce_tune_ops!(SumDimAutotune, sum_dim);

// Probably better on tensors large along reduce dim
reduce_tune_ops!(SumDimSharedMemoryAutotune, sum_dim_shared_memory);
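For context, a sketch of how this autotuned path is reached from the public tensor API (assumption: the crate's "autotune" feature is enabled; the type parameters and shape mirror the benchmark file above, and the dispatch itself is in the float ops changes below):

    // Sketch only: a plain sum_dim() on the wgpu backend now routes through
    // sum_dim_autotune, which times SumDimAutotune against SumDimSharedMemoryAutotune
    // once per ReduceAutotuneKey and reuses the fastest one afterwards.
    use burn_tensor::{Distribution, Tensor};
    use burn_wgpu::{AutoGraphicsApi, Wgpu, WgpuDevice};

    fn main() {
        let device = WgpuDevice::BestAvailable;
        let tensor = Tensor::<Wgpu<AutoGraphicsApi, f32, i32>, 3>::random_device(
            [50, 8000, 50],
            Distribution::Default,
            &device,
        );
        let _summed = tensor.sum_dim(1);
    }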
@@ -6,8 +6,11 @@ use crate::kernel::matmul::matmul_autotune;
#[cfg(not(feature = "autotune"))]
use crate::kernel::matmul::vec4::matmul_tiling_2d_vec4;
use crate::kernel::prng::{random_bernoulli, random_normal, random_uniform};
#[cfg(not(feature = "autotune"))]
use crate::kernel::reduce::init_reduce_output;
use crate::kernel::{
    self, unary_default, unary_inplace_default, unary_scalar_default, unary_scalar_inplace_default,
    self, reduce, unary_default, unary_inplace_default, unary_scalar_default,
    unary_scalar_inplace_default,
};
use crate::{unary, unary_inplace, unary_scalar, FloatElement, GraphicsApi, IntElement, Wgpu};
use crate::{unary_scalar_inplace, WgpuDevice};
@@ -311,15 +314,33 @@ where
    }

    fn sum<const D: usize>(tensor: FloatTensor<Self, D>) -> FloatTensor<Self, 1> {
        kernel::sum(tensor)
        reduce::sum(tensor)
    }

    fn sum_dim<const D: usize>(tensor: FloatTensor<Self, D>, dim: usize) -> FloatTensor<Self, D> {
        kernel::sum_dim(tensor, dim)
        #[cfg(feature = "autotune")]
        {
            reduce::sum_dim_autotune(tensor, dim)
        }

        #[cfg(not(feature = "autotune"))]
        {
            let output = init_reduce_output(&tensor, dim);
            reduce::sum_dim(tensor, output, dim)
        }
    }

    fn mean_dim<const D: usize>(tensor: FloatTensor<Self, D>, dim: usize) -> FloatTensor<Self, D> {
        kernel::mean_dim(tensor, dim)
        #[cfg(feature = "autotune")]
        {
            reduce::mean_dim_autotune(tensor, dim)
        }

        #[cfg(not(feature = "autotune"))]
        {
            let output = init_reduce_output(&tensor, dim);
            reduce::mean_dim(tensor, output, dim)
        }
    }

    fn to_full_precision<const D: usize>(
@@ -457,11 +478,11 @@ where
    }

    fn argmax<const D: usize>(tensor: FloatTensor<Self, D>, dim: usize) -> IntTensor<Self, D> {
        kernel::argmax(tensor, dim)
        reduce::argmax(tensor, dim)
    }

    fn argmin<const D: usize>(tensor: FloatTensor<Self, D>, dim: usize) -> IntTensor<Self, D> {
        kernel::argmin(tensor, dim)
        reduce::argmin(tensor, dim)
    }

    fn into_int<const D: usize>(tensor: FloatTensor<Self, D>) -> IntTensor<Self, D> {
@@ -1,4 +1,6 @@
use super::numeric;

use crate::kernel::reduce::{self, init_reduce_output};
use crate::kernel::{unary_default, unary_inplace_default};
use crate::{
    element::{FloatElement, IntElement},
@@ -257,23 +259,25 @@ where
    }

    fn int_sum<const D: usize>(tensor: IntTensor<Self, D>) -> IntTensor<Self, 1> {
        kernel::sum(tensor)
        kernel::reduce::sum(tensor)
    }

    fn int_sum_dim<const D: usize>(tensor: IntTensor<Self, D>, dim: usize) -> IntTensor<Self, D> {
        kernel::sum_dim(tensor, dim)
        let output = init_reduce_output(&tensor, dim);
        reduce::sum_dim(tensor, output, dim)
    }

    fn int_mean_dim<const D: usize>(tensor: IntTensor<Self, D>, dim: usize) -> IntTensor<Self, D> {
        kernel::mean_dim(tensor, dim)
        let output = init_reduce_output(&tensor, dim);
        reduce::mean_dim(tensor, output, dim)
    }

    fn int_argmax<const D: usize>(tensor: IntTensor<Self, D>, dim: usize) -> IntTensor<Self, D> {
        kernel::argmax(tensor, dim)
        kernel::reduce::argmax(tensor, dim)
    }

    fn int_argmin<const D: usize>(tensor: IntTensor<Self, D>, dim: usize) -> IntTensor<Self, D> {
        kernel::argmin(tensor, dim)
        kernel::reduce::argmin(tensor, dim)
    }

    fn int_clamp_min<const D: usize>(
@@ -0,0 +1,92 @@
@group(0)
@binding(0)
var<storage, read> input: array<{{ elem }}>;

@group(0)
@binding(1)
var<storage, read_write> output: array<{{ elem }}>;

@group(0)
@binding(2)
var<storage, read> info: array<u32>;

var<workgroup> shared_memory: array<{{ elem }}, {{ shared_size }}>;

const WORKGROUP_SIZE_X = {{ workgroup_size_x }}u;

@compute
@workgroup_size({{ workgroup_size_x }}, {{ workgroup_size_y }}, 1)
fn main(
    @builtin(local_invocation_id) local_invocation_id: vec3<u32>,
    @builtin(num_workgroups) num_workgroups: vec3<u32>,
    @builtin(workgroup_id) workgroup_id: vec3<u32>,
) {
    let workgroup_size_x = {{ workgroup_size_x }}u;
    let workgroup_size_y = {{ workgroup_size_y }}u;

    // Which reduce_group this workgroup handles (absolute id, not position)
    let reduce_group_id = workgroup_id.y * num_workgroups.x + workgroup_id.x;

    // nth thread in the workgroup
    let local_id = local_invocation_id.y * workgroup_size_x + local_invocation_id.x;

    // rank of the tensors
    let rank: u32 = info[0];
    // dimension on which to reduce (in 0..rank)
    let dim_reduce = info[4u * rank + 1u];
    // how many input values of one reduce_group each thread is responsible for
    let n_input_values_per_thread = info[4u * rank + 2u];

    let stride_input_dim_reduce = info[dim_reduce + 1u];
    let shape_input_dim_reduce = info[dim_reduce + 1u + 2u * rank];
    var n_threads = workgroup_size_x * workgroup_size_y;

    var index_offset: u32 = 0u;

    for (var i: u32 = 0u; i < rank; i++) {
        let stride_input = info[i + 1u];
        let stride_output = info[i + 1u + rank];
        let shape_output = info[i + 1u + 3u * rank];

        let num_block = reduce_group_id / stride_output % shape_output;
        index_offset += num_block * stride_input;
    }

    // Ensure shared memory starts at 0
    shared_memory[local_id] = {{ elem }}(0);

    for (var i = 0u; i < n_input_values_per_thread; i++) {
        let nth = local_id + i * n_threads;
        if nth < shape_input_dim_reduce {
            let current_position = index_offset + nth * stride_input_dim_reduce;
            let value = input[current_position];

            {{ update }}
        }
    }

    workgroupBarrier();

    let reduce_factor = 2u;
    while n_threads > 1u {
        n_threads /= reduce_factor;

        if local_id < n_threads {
            for (var i = 1u; i < reduce_factor; i++) {
                let read_position = local_id + i * n_threads;
                let value = shared_memory[read_position];

                {{ update }}
            }
        }

        workgroupBarrier();
    }

    if local_id == 0u {
        let output_position = reduce_group_id;
        let final_value = shared_memory[0u];

        {{ assign }}
    }
}
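For reference, the second half of this kernel is a classic workgroup tree reduction; the same logic can be mirrored on the CPU as below (an illustrative Rust sketch, not part of the PR, assuming the buffer length is a power of two like shared_size = workgroup_size_x * workgroup_size_y here, and a sum update as in SumDimSharedMemory):

    // CPU sketch of the workgroup tree reduction above, with
    // {{ update }} instantiated as "shared_memory[local_id] += value;".
    fn tree_reduce_sum(mut shared_memory: Vec<f32>) -> f32 {
        let reduce_factor = 2;
        let mut n_threads = shared_memory.len(); // power of two
        while n_threads > 1 {
            n_threads /= reduce_factor;
            // On the GPU every local_id below is a separate thread,
            // synchronized by workgroupBarrier() between iterations.
            for local_id in 0..n_threads {
                for i in 1..reduce_factor {
                    let read_position = local_id + i * n_threads;
                    shared_memory[local_id] += shared_memory[read_position];
                }
            }
        }
        shared_memory[0]
    }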