mirror of https://github.com/tracel-ai/burn.git
[Refactor] Just-In-Time Compilation Pipeline (#1313)
This commit is contained in:
parent 24287237d1
commit 843dd492c2

@@ -0,0 +1,261 @@
use super::dialect::gpu;
use crate::codegen::dialect::gpu::{
    Binding, ComputeShader, Elem, Item, Location, Variable, Vectorization, Visibility,
    WorkgroupSize,
};

/// The compilation struct allows you to create a [compute shader](ComputeShader) based on
/// [compilation info](CompilationInfo) and [compilation settings](CompilationSettings).
#[derive(Clone)]
pub struct Compilation {
    info: CompilationInfo,
    input_bindings: Vec<Binding>,
    output_bindings: Vec<Binding>,
    named_bindings: Vec<(String, Binding)>,
}

/// The information necessary to compile a [compute shader](ComputeShader).
#[derive(Clone)]
pub struct CompilationInfo {
    pub inputs: Vec<InputInfo>,
    pub outputs: Vec<OutputInfo>,
    pub scope: gpu::Scope,
    pub mappings: Vec<InplaceMapping>,
}

/// Indicates an output that can be replaced by an input.
#[derive(new, Clone, Copy)]
pub struct InplaceMapping {
    /// Input position.
    pub pos_input: usize,
    /// Output position.
    pub pos_output: usize,
}

#[derive(Default)]
pub struct CompilationSettings {
    vectorization: Vectorization,
    inplace_available: bool,
    workgroup_size: WorkgroupSize,
}

impl CompilationSettings {
    /// Compile the shader with vectorization enabled.
    #[allow(dead_code)]
    pub fn vectorize(mut self, vectorization: Vectorization) -> Self {
        self.vectorization = vectorization;
        self
    }
    /// Compile the shader with inplace enabled.
    ///
    /// Notes:
    ///
    /// This won't guarantee that the shader will use input arrays as outputs, since it is only
    /// possible when [inplace mappings](InplaceMapping) are provided in the
    /// [compilation info](CompilationInfo).
    pub fn inplace(mut self, available: bool) -> Self {
        self.inplace_available = available;
        self
    }
    /// Set the workgroup size.
    #[allow(dead_code)] // Only used for fusion for now.
    pub fn workgroup_size(mut self, workgroup_size: WorkgroupSize) -> Self {
        self.workgroup_size = workgroup_size;
        self
    }
}

/// Information related to an input.
#[derive(Clone)]
pub enum InputInfo {
    Array { item: Item, visibility: Visibility },
    Scalar { elem: Elem, size: usize },
}

/// Information related to an output.
#[derive(Clone)]
pub enum OutputInfo {
    /// Write the local variable to a new array.
    ///
    /// This will create a new binding in the [compute shader](ComputeShader).
    Array { item: Item, local: u16 },
    /// Write the local variable to an existing input binding.
    Input { item: Item, input: u16, local: u16 },
}

impl Compilation {
    /// Starts a new compilation.
    pub fn new(info: CompilationInfo) -> Self {
        Self {
            info,
            input_bindings: Default::default(),
            output_bindings: Default::default(),
            named_bindings: Default::default(),
        }
    }

    /// Performs the compilation with the provided [settings](CompilationSettings).
    pub fn compile(mut self, settings: CompilationSettings) -> ComputeShader {
        self.info.scope.vectorize(settings.vectorization);

        self.register_inputs(&settings);
        self.register_outputs(&settings);

        let inputs = self.input_bindings;
        let outputs = self.output_bindings;
        let mut named = Vec::with_capacity(2);

        named.push((
            "info".to_string(),
            Binding {
                item: Item::Scalar(Elem::UInt),
                visibility: Visibility::Read,
                location: Location::Storage,
                size: None, // We avoid putting the length here since it will force a new kernel
                            // for each tensor rank.
            },
        ));

        for (name, binding) in self.named_bindings.into_iter() {
            named.push((name, binding));
        }

        ComputeShader {
            inputs,
            outputs,
            named,
            workgroup_size: settings.workgroup_size,
            body: self.info.scope,
            num_workgroups: true,
            global_invocation_id: true,
        }
    }

    fn register_inputs(&mut self, settings: &CompilationSettings) {
        for input in self.info.inputs.drain(..) {
            match input {
                InputInfo::Array { item, visibility } => {
                    let item = item.vectorize(settings.vectorization);

                    self.input_bindings.push(Binding {
                        item: bool_item(item),
                        visibility,
                        location: Location::Storage,
                        size: None,
                    });
                }
                InputInfo::Scalar { elem, size } => {
                    let elem = bool_elem(elem);

                    self.named_bindings.push((
                        format!("scalars_{}", elem),
                        Binding {
                            item: Item::Scalar(elem),
                            visibility: Visibility::Read,
                            location: Location::Storage,
                            size: Some(size),
                        },
                    ));
                }
            }
        }
    }

    fn register_outputs(&mut self, settings: &CompilationSettings) {
        let mut index = 0;

        if settings.inplace_available {
            let mut mappings = Vec::new();
            core::mem::swap(&mut self.info.mappings, &mut mappings);

            for mapping in mappings {
                self.register_inplace_mapping(mapping);
            }
        }

        for array in self.info.outputs.drain(..) {
            match array {
                OutputInfo::Array { item, local } => {
                    let item = item.vectorize(settings.vectorization);
                    let elem_adapted = bool_item(item);

                    self.output_bindings.push(Binding {
                        item: elem_adapted,
                        visibility: Visibility::ReadWrite,
                        location: Location::Storage,
                        size: None,
                    });
                    self.info.scope.write_global(
                        Variable::Local(local, item, self.info.scope.depth),
                        Variable::GlobalOutputArray(index, elem_adapted),
                    );
                    index += 1;
                }
                OutputInfo::Input { item, input, local } => {
                    let item = item.vectorize(settings.vectorization);

                    self.info.scope.write_global(
                        Variable::Local(local, item, self.info.scope.depth),
                        Variable::GlobalInputArray(input, bool_item(item)),
                    );
                }
            }
        }
    }

    fn register_inplace_mapping(&mut self, mapping: InplaceMapping) {
        let output = match self.info.outputs.get_mut(mapping.pos_output) {
            Some(output) => output,
            None => return, // No output to update.
        };

        let (item, local) = match output {
            OutputInfo::Array { item, local } => (item, local),
            OutputInfo::Input {
                item: _,
                input: _,
                local: _,
            } => return, // Output already updated.
        };

        let item = match self.input_bindings.get_mut(mapping.pos_input) {
            Some(binding) => {
                // Update input visibility.
                binding.visibility = Visibility::ReadWrite;
                // Inputs modified inplace should be read without any specified layout.
                self.info
                    .scope
                    .update_read(mapping.pos_input as u16, gpu::ReadingStrategy::Plain);

                // Use the same item as the input.
                //
                // The output can be different (i.e. inplace boolean operations on float bindings).
                binding.item
            }
            None => *item,
        };

        // Update the output.
        *output = OutputInfo::Input {
            item,
            input: mapping.pos_input as u16,
            local: *local,
        };
    }
}

fn bool_item(ty: Item) -> Item {
    match ty {
        Item::Vec4(elem) => Item::Vec4(bool_elem(elem)),
        Item::Vec3(elem) => Item::Vec3(bool_elem(elem)),
        Item::Vec2(elem) => Item::Vec2(bool_elem(elem)),
        Item::Scalar(elem) => Item::Scalar(bool_elem(elem)),
    }
}

fn bool_elem(elem: Elem) -> Elem {
    match elem {
        // U32 are used for bool tensors
        Elem::Bool => Elem::UInt,
        _ => elem,
    }
}
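
For orientation, a minimal, hedged sketch of how the pieces above fit together (`info` is a placeholder for data produced by the tracing phase; the settings are illustrative):

// Assemble a shader from traced compilation info, opting into Vec4
// vectorization and in-place outputs via the builder methods above.
let shader: ComputeShader = Compilation::new(info).compile(
    CompilationSettings::default()
        .vectorize(Vectorization::Vec4)
        .inplace(true),
);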

@@ -0,0 +1,57 @@
use super::{
    gpu, Elem, Item, Metadata, Operator, ReadGlobalAlgo, ReadGlobalWithLayoutAlgo, Scope, Variable,
};
use crate::codegen::dialect::gpu::BinaryOperator;

impl ReadGlobalAlgo {
    pub fn expand(self, scope: &mut Scope) {
        scope.register(Operator::Index(BinaryOperator {
            lhs: self.global,
            rhs: Variable::Id,
            out: self.out,
        }));
    }
}

impl ReadGlobalWithLayoutAlgo {
    pub fn expand(self, scope: &mut Scope) {
        let out = self.out;
        let tensor = self.global;
        let layout = self.layout;
        let index_item_ty = Item::Scalar(Elem::UInt);
        let index_local = scope.create_local(index_item_ty);
        let zero: Variable = 0u32.into();
        let id = Variable::Id;
        let offset: Variable = match self.global.item() {
            Item::Vec4(_) => 4u32,
            Item::Vec3(_) => 3u32,
            Item::Vec2(_) => 2u32,
            Item::Scalar(_) => 1u32,
        }
        .into();

        gpu!(scope, index_local = zero);
        gpu!(
            scope,
            range(zero, Variable::Rank).for_each(|i, scope| {
                let stride = scope.create_local(index_item_ty);
                let stride_layout = scope.create_local(index_item_ty);
                let shape = scope.create_local(index_item_ty);
                let tmp = scope.create_local(index_item_ty);

                gpu!(scope, stride = stride(tensor, i));
                gpu!(scope, shape = shape(tensor, i));
                gpu!(scope, stride_layout = stride(layout, i));

                gpu!(scope, tmp = id * offset);
                gpu!(scope, tmp = tmp / stride_layout);
                gpu!(scope, tmp = tmp % shape);
                gpu!(scope, tmp = tmp * stride);
                gpu!(scope, index_local = index_local + tmp);
            })
        );

        gpu!(scope, index_local = index_local / offset);
        gpu!(scope, out = tensor[index_local]);
    }
}
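
For the record, the loop above computes the standard strided re-indexing. With vector width w (1, 2, 3, or 4, from the `offset` match), layout strides L_i, tensor strides T_i, and shapes S_i, the element fetched for invocation `id` is

    index = (sum_i ((id * w / L_i) mod S_i) * T_i) / w

which maps the output-layout position of `id` back into a possibly non-contiguous or broadcast input tensor.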

@@ -1,7 +0,0 @@
use super::Operation;
use serde::{Deserialize, Serialize};

#[derive(Debug, Clone, Serialize, Deserialize, new)]
pub struct Body {
    pub operators: Vec<Operation>,
}

@@ -0,0 +1,260 @@
use super::Variable;

macro_rules! gpu {
    // out = lhs + rhs
    ($scope:expr, $out:ident = $lhs:ident + $rhs:expr) => {
        gpu!($scope, $out = add($lhs, $rhs))
    };
    // out = add(lhs, rhs)
    ($scope:expr, $out:ident = add($lhs:expr, $rhs:expr)) => {
        $scope.register($crate::codegen::dialect::gpu::Operator::Add(
            gpu!(binary $lhs, $rhs, $out)
        ));
    };
    // out = lhs - rhs
    ($scope:expr, $out:ident = $lhs:ident - $rhs:expr) => {
        gpu!($scope, $out = sub($lhs, $rhs))
    };
    // out = sub(lhs, rhs)
    ($scope:expr, $out:ident = sub($lhs:expr, $rhs:expr)) => {
        $scope.register($crate::codegen::dialect::gpu::Operator::Sub(
            gpu!(binary $lhs, $rhs, $out)
        ));
    };
    // out = lhs * rhs
    ($scope:expr, $out:ident = $lhs:ident * $rhs:expr) => {
        gpu!($scope, $out = mul($lhs, $rhs))
    };
    // out = mul(lhs, rhs)
    ($scope:expr, $out:ident = mul($lhs:expr, $rhs:expr)) => {
        $scope.register($crate::codegen::dialect::gpu::Operator::Mul(
            gpu!(binary $lhs, $rhs, $out)
        ));
    };
    // out = lhs / rhs
    ($scope:expr, $out:ident = $lhs:ident / $rhs:expr) => {
        gpu!($scope, $out = div($lhs, $rhs))
    };
    // out = div(lhs, rhs)
    ($scope:expr, $out:ident = div($lhs:expr, $rhs:expr)) => {
        $scope.register($crate::codegen::dialect::gpu::Operator::Div(
            gpu!(binary $lhs, $rhs, $out)
        ));
    };
    // out = lhs % rhs
    ($scope:expr, $out:ident = $lhs:ident % $rhs:expr) => {
        gpu!($scope, $out = modulo($lhs, $rhs))
    };
    // out = modulo(lhs, rhs)
    ($scope:expr, $out:ident = modulo($lhs:expr, $rhs:expr)) => {
        $scope.register($crate::codegen::dialect::gpu::Operator::Modulo(
            gpu!(binary $lhs, $rhs, $out)
        ));
    };
    // out = lhs ^ rhs
    ($scope:expr, $out:ident = $lhs:ident ^ $rhs:expr) => {
        gpu!($scope, $out = powf($lhs, $rhs))
    };
    // out = powf(lhs, rhs)
    ($scope:expr, $out:ident = powf($lhs:expr, $rhs:expr)) => {
        $scope.register($crate::codegen::dialect::gpu::Operator::Powf(
            gpu!(binary $lhs, $rhs, $out)
        ));
    };
    // out = lhs == rhs
    ($scope:expr, $out:ident = $lhs:ident == $rhs:expr) => {
        gpu!($scope, $out = equal($lhs, $rhs))
    };
    // out = equal(lhs, rhs)
    ($scope:expr, $out:ident = equal($lhs:expr, $rhs:expr)) => {
        $scope.register($crate::codegen::dialect::gpu::Operator::Equal(
            gpu!(binary $lhs, $rhs, $out)
        ));
    };
    // out = lhs > rhs
    ($scope:expr, $out:ident = $lhs:ident > $rhs:expr) => {
        gpu!($scope, $out = greater($lhs, $rhs))
    };
    // out = greater(lhs, rhs)
    ($scope:expr, $out:ident = greater($lhs:expr, $rhs:expr)) => {
        $scope.register($crate::codegen::dialect::gpu::Operator::Greater(
            gpu!(binary $lhs, $rhs, $out)
        ));
    };
    // out = lhs >= rhs
    ($scope:expr, $out:ident = $lhs:ident >= $rhs:expr) => {
        gpu!($scope, $out = greater_equal($lhs, $rhs))
    };
    // out = greater_equal(lhs, rhs)
    ($scope:expr, $out:ident = greater_equal($lhs:expr, $rhs:expr)) => {
        $scope.register($crate::codegen::dialect::gpu::Operator::GreaterEqual(
            gpu!(binary $lhs, $rhs, $out)
        ));
    };
    // out = lhs < rhs
    ($scope:expr, $out:ident = $lhs:ident < $rhs:expr) => {
        gpu!($scope, $out = lower($lhs, $rhs))
    };
    // out = lower(lhs, rhs)
    ($scope:expr, $out:ident = lower($lhs:expr, $rhs:expr)) => {
        $scope.register($crate::codegen::dialect::gpu::Operator::Lower(
            gpu!(binary $lhs, $rhs, $out)
        ));
    };
    // out = lhs <= rhs
    ($scope:expr, $out:ident = $lhs:ident <= $rhs:expr) => {
        gpu!($scope, $out = lower_equal($lhs, $rhs))
    };
    // out = lower_equal(lhs, rhs)
    ($scope:expr, $out:ident = lower_equal($lhs:expr, $rhs:expr)) => {
        $scope.register($crate::codegen::dialect::gpu::Operator::LowerEqual(
            gpu!(binary $lhs, $rhs, $out)
        ));
    };
    // out = lhs[rhs]
    ($scope:expr, $out:ident = $lhs:ident[$rhs:expr]) => {
        gpu!($scope, $out = index($lhs, $rhs))
    };
    // out = index(lhs, rhs)
    ($scope:expr, $out:ident = index($lhs:expr, $rhs:expr)) => {
        $scope.register($crate::codegen::dialect::gpu::Operator::Index(
            gpu!(binary $lhs, $rhs, $out)
        ));
    };
    // out = |input|
    ($scope:expr, $out:ident = |$input:ident|) => {
        gpu!($scope, $out = abs($input))
    };
    // out = abs(input)
    ($scope:expr, $out:ident = abs($input:expr)) => {
        $scope.register($crate::codegen::dialect::gpu::Operator::Abs(
            gpu!(unary $input, $out)
        ));
    };
    // out = exp(input)
    ($scope:expr, $out:ident = exp($input:expr)) => {
        $scope.register($crate::codegen::dialect::gpu::Operator::Exp(
            gpu!(unary $input, $out)
        ));
    };
    // out = log(input)
    ($scope:expr, $out:ident = log($input:expr)) => {
        $scope.register($crate::codegen::dialect::gpu::Operator::Log(
            gpu!(unary $input, $out)
        ));
    };
    // out = log1p(input)
    ($scope:expr, $out:ident = log1p($input:expr)) => {
        $scope.register($crate::codegen::dialect::gpu::Operator::Log1p(
            gpu!(unary $input, $out)
        ));
    };
    // out = cos(input)
    ($scope:expr, $out:ident = cos($input:expr)) => {
        $scope.register($crate::codegen::dialect::gpu::Operator::Cos(
            gpu!(unary $input, $out)
        ));
    };
    // out = sin(input)
    ($scope:expr, $out:ident = sin($input:expr)) => {
        $scope.register($crate::codegen::dialect::gpu::Operator::Sin(
            gpu!(unary $input, $out)
        ));
    };
    // out = tanh(input)
    ($scope:expr, $out:ident = tanh($input:expr)) => {
        $scope.register($crate::codegen::dialect::gpu::Operator::Tanh(
            gpu!(unary $input, $out)
        ));
    };
    // out = sqrt(input)
    ($scope:expr, $out:ident = sqrt($input:expr)) => {
        $scope.register($crate::codegen::dialect::gpu::Operator::Sqrt(
            gpu!(unary $input, $out)
        ));
    };
    // out = erf(input)
    ($scope:expr, $out:ident = erf($input:expr)) => {
        $scope.register($crate::codegen::dialect::gpu::Operator::Erf(
            gpu!(unary $input, $out)
        ));
    };
    // eval arg
    ($scope:expr, eval $arg:expr) => {
        gpu!($scope, $arg);
    };
    // out = input
    ($scope:expr, $out:ident = $input:ident) => {
        $scope.register($crate::codegen::dialect::gpu::Operator::AssignLocal(
            gpu!(unary $input, $out)
        ));
    };
    // out = shape(input, dim)
    ($scope:expr, $out:ident = shape($input:expr, $dim:expr)) => {
        $scope.register(Metadata::Shape {
            dim: $dim,
            var: $input,
            out: $out,
        });
    };
    // out = stride(input, dim)
    ($scope:expr, $out:ident = stride($input:expr, $dim:expr)) => {
        $scope.register(Metadata::Stride {
            dim: $dim,
            var: $input,
            out: $out,
        });
    };
    // range(start, end).for_each(body)
    ($scope:expr, range($start:expr, $end:expr).for_each($arg:expr)) => {
        $crate::codegen::dialect::gpu::RangeLoop::register($scope, $start.into(), $end.into(), $arg);
    };
    (binary $lhs:expr, $rhs:expr, $out:expr) => {
        $crate::codegen::dialect::gpu::BinaryOperator {
            lhs: $lhs.into(),
            rhs: $rhs.into(),
            out: $out.into(),
        }
    };
    (unary $input:expr, $out:expr) => {
        $crate::codegen::dialect::gpu::UnaryOperator {
            input: $input.into(),
            out: $out.into(),
        }
    };
}

impl From<bool> for Variable {
    fn from(value: bool) -> Self {
        Self::ConstantScalar(if value { 1.0 } else { 0.0 }, super::Elem::Bool)
    }
}

impl From<i32> for Variable {
    fn from(value: i32) -> Self {
        Self::ConstantScalar(value as f64, super::Elem::Int)
    }
}

impl From<f32> for Variable {
    fn from(value: f32) -> Self {
        Self::ConstantScalar(value as f64, super::Elem::Float)
    }
}

impl From<u32> for Variable {
    fn from(value: u32) -> Self {
        Self::ConstantScalar(value as f64, super::Elem::UInt)
    }
}

impl From<usize> for Variable {
    fn from(value: usize) -> Self {
        Self::ConstantScalar(value as f64, super::Elem::UInt)
    }
}

pub(crate) use gpu;
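
As a quick illustration, a call like `gpu!(scope, out = lhs + rhs)` routes through the `add` and `binary` arms above and is equivalent to writing (variables assumed to already exist):

// Effective expansion of gpu!(scope, out = lhs + rhs):
scope.register(crate::codegen::dialect::gpu::Operator::Add(
    crate::codegen::dialect::gpu::BinaryOperator {
        lhs: lhs.into(),
        rhs: rhs.into(),
        out: out.into(),
    },
));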

@@ -1,11 +1,15 @@
mod body;
pub(crate) mod algorithm;

mod macros;
mod operation;
mod scope;
mod shader;
mod variable;
mod vectorization;

pub(crate) use body::*;
pub(crate) use macros::gpu;
pub(crate) use operation::*;
pub(crate) use scope::*;
pub(crate) use shader::*;
pub(crate) use variable::*;
pub(crate) use vectorization::*;

@@ -1,56 +1,169 @@
use super::Variable;
use super::{Elem, Item, Scope, Variable};
use serde::{Deserialize, Serialize};

/// All operations that can be used in a GPU compute shader.
///
/// Notes:
///
/// [Operator] and [Algorithm] can be vectorized, but [Metadata] and [Loop] can't.
/// Therefore, during tracing, only operators and algorithms can be registered, and during the
/// compilation phase, the algorithms are expanded.
///
/// Algorithm expansions can safely use [Metadata] and [Loop] operations.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[allow(dead_code)] // Some variants might not be used with different flags
pub enum Operation {
    Add(BinaryOperation),
    Sub(BinaryOperation),
    Mul(BinaryOperation),
    Div(BinaryOperation),
    Abs(UnaryOperation),
    Exp(UnaryOperation),
    Log(UnaryOperation),
    Log1p(UnaryOperation),
    Cos(UnaryOperation),
    Sin(UnaryOperation),
    Tanh(UnaryOperation),
    Powf(BinaryOperation),
    Sqrt(UnaryOperation),
    Erf(UnaryOperation),
    Recip(UnaryOperation),
    Equal(BinaryOperation),
    Lower(BinaryOperation),
    Clamp(ClampOperation),
    Greater(BinaryOperation),
    LowerEqual(BinaryOperation),
    GreaterEqual(BinaryOperation),
    ConditionalAssign(ConditionalAssignOperation),
    AssignGlobal(UnaryOperation),
    AssignLocal(UnaryOperation),
    ReadGlobal(ReadGlobalOperation),
    ReadGlobalWithLayout(ReadGlobalWithLayoutOperation),
    Operator(Operator),
    Metadata(Metadata),
    Algorithm(Algorithm),
    Loop(Loop),
}

/// All operators that can be used in a GPU compute shader.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[allow(dead_code)] // Some variants might not be used with different flags
pub enum Operator {
    Add(BinaryOperator),
    Sub(BinaryOperator),
    Mul(BinaryOperator),
    Div(BinaryOperator),
    Abs(UnaryOperator),
    Exp(UnaryOperator),
    Log(UnaryOperator),
    Log1p(UnaryOperator),
    Cos(UnaryOperator),
    Sin(UnaryOperator),
    Tanh(UnaryOperator),
    Powf(BinaryOperator),
    Sqrt(UnaryOperator),
    Erf(UnaryOperator),
    Recip(UnaryOperator),
    Equal(BinaryOperator),
    Lower(BinaryOperator),
    Clamp(ClampOperator),
    Greater(BinaryOperator),
    LowerEqual(BinaryOperator),
    GreaterEqual(BinaryOperator),
    ConditionalAssign(ConditionalAssignOperator),
    AssignGlobal(UnaryOperator),
    AssignLocal(UnaryOperator),
    Modulo(BinaryOperator),
    Index(BinaryOperator),
}

/// Tensor operations that can't be executed with a simple [operator](Operator) should use an
/// algorithm.
///
/// Algorithms can be expanded to basic [operators](Operator) during compilation, but after
/// vectorization, since for loops and other constructs can't simply be vectorized. This also gives
/// the vectorization state to the expansion function, which may create a different set of
/// [operators](Operator) depending on the vectorization state.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum Algorithm {
    /// Read an input array with the given layout.
    ///
    /// Crucial for reading arrays that aren't contiguous and for performing correct broadcasting.
    ReadGlobalWithLayout(ReadGlobalWithLayoutAlgo),
    /// Read an input array.
    ReadGlobal(ReadGlobalAlgo),
}

/// Settings for the [Algorithm::ReadGlobalWithLayout] variant.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ReadGlobalWithLayoutAlgo {
    /// The array to be read.
    pub global: Variable,
    /// The layout to be used.
    pub layout: Variable,
    /// The output variable to write the result.
    pub out: Variable,
}

/// Settings for the [Algorithm::ReadGlobal] variant.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ReadGlobalAlgo {
    /// The array to be read.
    pub global: Variable,
    /// The output variable to write the result.
    pub out: Variable,
}

/// All loop variants.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum Loop {
    /// A basic range loop.
    Range(RangeLoop),
}

/// All metadata that can be accessed in a shader.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum Metadata {
    /// The stride of an array at the given dimension.
    Stride {
        dim: Variable,
        var: Variable,
        out: Variable,
    },
    /// The shape of an array at the given dimension.
    Shape {
        dim: Variable,
        var: Variable,
        out: Variable,
    },
}

/// Settings for the [Loop::Range] variant.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RangeLoop {
    /// The loop index variable.
    pub i: Variable,
    /// The start value.
    pub start: Variable,
    /// The end value.
    pub end: Variable,
    /// The scope that contains all operations and variables declared in the loop body.
    pub scope: Scope,
}

impl RangeLoop {
    /// Registers a range loop to the given scope.
    pub fn register<F: Fn(Variable, &mut Scope)>(
        parent_scope: &mut Scope,
        start: Variable,
        end: Variable,
        func: F,
    ) {
        let mut scope = parent_scope.child();
        let index_ty = Item::Scalar(Elem::UInt);
        let i = scope.create_local_undeclare(index_ty);

        func(i, &mut scope);

        let op = Self {
            i,
            start,
            end,
            scope,
        };
        parent_scope.register(Loop::Range(op));
    }
}

#[derive(Debug, Clone, Serialize, Deserialize)]
#[allow(dead_code)] // Some variants might not be used with different flags
pub struct BinaryOperation {
pub struct BinaryOperator {
    pub lhs: Variable,
    pub rhs: Variable,
    pub out: Variable,
}

#[derive(Debug, Clone, Serialize, Deserialize)]
#[allow(dead_code)] // Some variants might not be used with different flags
pub struct UnaryOperation {
pub struct UnaryOperator {
    pub input: Variable,
    pub out: Variable,
}

#[derive(Debug, Clone, Serialize, Deserialize)]
#[allow(dead_code)] // Some variants might not be used with different flags
pub struct ClampOperation {
pub struct ClampOperator {
    pub input: Variable,
    pub min_value: Variable,
    pub max_value: Variable,

@@ -58,8 +171,7 @@ pub struct ClampOperation {
}

#[derive(Debug, Clone, Serialize, Deserialize)]
#[allow(dead_code)] // Some variants might not be used with different flags
pub struct ConditionalAssignOperation {
pub struct ConditionalAssignOperator {
    pub cond: Variable,
    pub lhs: Variable,
    pub rhs: Variable,

@@ -67,15 +179,37 @@ pub struct ConditionalAssignOperation {
}

#[derive(Debug, Clone, Serialize, Deserialize)]
#[allow(dead_code)] // Some variants might not be used with different flags
pub struct ReadGlobalOperation {
pub struct ReadGlobalOperator {
    pub variable: Variable,
}

#[derive(Debug, Clone, Serialize, Deserialize)]
#[allow(dead_code)] // Some variants might not be used with different flags
pub struct ReadGlobalWithLayoutOperation {
pub struct ReadGlobalWithLayoutOperator {
    pub variable: Variable,
    pub tensor_read_pos: usize,
    pub tensor_layout_pos: usize,
}

impl From<Operator> for Operation {
    fn from(val: Operator) -> Self {
        Operation::Operator(val)
    }
}

impl From<Metadata> for Operation {
    fn from(val: Metadata) -> Self {
        Operation::Metadata(val)
    }
}

impl From<Algorithm> for Operation {
    fn from(val: Algorithm) -> Self {
        Operation::Algorithm(val)
    }
}

impl From<Loop> for Operation {
    fn from(val: Loop) -> Self {
        Operation::Loop(val)
    }
}
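
A hedged note on the `From` impls above: they are what let `Scope::register` (which takes `T: Into<Operation>`) accept any category directly, e.g. with hypothetical variables:

// Both calls convert into Operation through the From impls above.
scope.register(Operator::Add(BinaryOperator { lhs, rhs, out }));
scope.register(Metadata::Shape { dim, var: tensor, out });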

@@ -0,0 +1,257 @@
use super::{
    Algorithm, Elem, Item, Operation, Operator, ReadGlobalAlgo, ReadGlobalWithLayoutAlgo,
    UnaryOperator, Variable, Vectorization,
};
use serde::{Deserialize, Serialize};

/// The scope is the main [operation](Operation) and [variable](Variable) container that simplifies
/// the process of reading inputs, creating local variables and adding new operations.
///
/// Notes:
///
/// This type isn't responsible for creating [shader bindings](super::Binding) or figuring out which
/// variable can be written to.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Scope {
    pub depth: u8,
    pub operations: Vec<Operation>,
    locals: Vec<Variable>,
    reads_global: Vec<(Variable, ReadingStrategy, Variable)>,
    writes_global: Vec<(Variable, Variable)>,
    reads_scalar: Vec<(Variable, Variable)>,
    output_ref: Option<Variable>,
    undeclared: u16,
}

#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum ReadingStrategy {
    /// Each element will be read in a way that is compatible with the output layout.
    OutputLayout,
    /// Keep the current layout.
    Plain,
}

/// Information necessary when compiling a scope.
pub struct ScopeProcessing {
    /// The variable declarations.
    pub variables: Vec<Variable>,
    /// The operations.
    pub operations: Vec<Operation>,
}

impl Scope {
    /// Create a scope that is at the root of a
    /// [compute shader](crate::codegen::dialect::gpu::ComputeShader).
    ///
    /// A local scope can be created with the [child](Self::child) method.
    pub fn root() -> Self {
        Self {
            depth: 0,
            operations: Vec::new(),
            locals: Vec::new(),
            reads_global: Vec::new(),
            writes_global: Vec::new(),
            reads_scalar: Vec::new(),
            output_ref: None,
            undeclared: 0,
        }
    }

    /// Create a local variable of the given [item type](Item).
    pub fn create_local<I: Into<Item>>(&mut self, item: I) -> Variable {
        let item = item.into();
        let index = self.new_local_index();
        let local = Variable::Local(index, item, self.depth);
        self.locals.push(local);
        local
    }

    /// Create a new local variable without performing the declaration.
    ///
    /// Useful for _for loops_ and other algorithms that require control over initialization.
    pub fn create_local_undeclare(&mut self, item: Item) -> Variable {
        let index = self.new_local_index();
        let local = Variable::Local(index, item, self.depth);
        self.undeclared += 1;
        local
    }

    /// Reads an input array to a local variable.
    ///
    /// The index refers to the argument position of the array in the compute shader.
    pub fn read_array<I: Into<Item>>(&mut self, index: u16, item: I) -> Variable {
        self.read_input_strategy(index, item.into(), ReadingStrategy::OutputLayout)
    }

    /// Reads an input scalar to a local variable.
    ///
    /// The index refers to the scalar position for the same [element](Elem) type.
    pub fn read_scalar(&mut self, index: u16, elem: Elem) -> Variable {
        let local = Variable::LocalScalar(self.new_local_scalar_index(), elem, self.depth);
        let scalar = Variable::GlobalScalar(index, elem);

        self.reads_scalar.push((local, scalar));

        local
    }

    /// Retrieve the last local variable that was created.
    pub fn last_local_index(&self) -> Option<&Variable> {
        self.locals.last()
    }

    /// Vectorize the scope using the [vectorization](Vectorization) type.
    ///
    /// Notes:
    ///
    /// Scopes created _during_ compilation (after the tracing is done) should not be vectorized.
    pub fn vectorize(&mut self, vectorization: Vectorization) {
        self.operations
            .iter_mut()
            .for_each(|op| *op = op.vectorize(vectorization));
        self.locals
            .iter_mut()
            .for_each(|var| *var = var.vectorize(vectorization));
        self.reads_global.iter_mut().for_each(|(input, _, output)| {
            *input = input.vectorize(vectorization);
            *output = output.vectorize(vectorization);
        });
        self.writes_global.iter_mut().for_each(|(input, output)| {
            *input = input.vectorize(vectorization);
            *output = output.vectorize(vectorization);
        });
    }

    /// Writes a variable to the given output.
    ///
    /// Notes:
    ///
    /// This should only be used when doing compilation.
    pub(crate) fn write_global(&mut self, input: Variable, output: Variable) {
        if self.output_ref.is_none() {
            self.output_ref = Some(output);
        }
        self.writes_global.push((input, output));
    }

    /// Update the [reading strategy](ReadingStrategy) for an input array.
    ///
    /// Notes:
    ///
    /// This should only be used when doing compilation.
    pub(crate) fn update_read(&mut self, index: u16, strategy: ReadingStrategy) {
        if let Some((_, strategy_old, _)) = self
            .reads_global
            .iter_mut()
            .find(|(var, _, _)| var.index() == Some(index))
        {
            *strategy_old = strategy;
        }
    }

    /// Register an [operation](Operation) into the scope.
    pub fn register<T: Into<Operation>>(&mut self, operation: T) {
        self.operations.push(operation.into())
    }

    /// Create an empty child scope.
    pub fn child(&mut self) -> Self {
        Self {
            depth: self.depth + 1,
            operations: Vec::new(),
            locals: Vec::new(),
            reads_global: Vec::new(),
            writes_global: Vec::new(),
            reads_scalar: Vec::new(),
            output_ref: None,
            undeclared: 0,
        }
    }

    /// Returns the variables and operations to be declared and executed.
    ///
    /// Notes:
    ///
    /// New operations and variables can be created within the same scope without having name
    /// conflicts.
    pub fn process(&mut self) -> ScopeProcessing {
        self.undeclared += self.locals.len() as u16;

        let mut variables = Vec::new();
        core::mem::swap(&mut self.locals, &mut variables);

        let mut operations = Vec::new();

        for (input, strategy, local) in self.reads_global.drain(..) {
            match strategy {
                ReadingStrategy::OutputLayout => {
                    let output = self.output_ref.expect(
                        "Output should be set when processing an input with output layout.",
                    );
                    operations.push(Operation::Algorithm(Algorithm::ReadGlobalWithLayout(
                        ReadGlobalWithLayoutAlgo {
                            global: input,
                            layout: output,
                            out: local,
                        },
                    )));
                }
                ReadingStrategy::Plain => operations.push(Operation::Algorithm(
                    Algorithm::ReadGlobal(ReadGlobalAlgo {
                        global: input,
                        out: local,
                    }),
                )),
            }
        }

        for (local, scalar) in self.reads_scalar.drain(..) {
            operations.push(
                Operator::AssignLocal(UnaryOperator {
                    input: scalar,
                    out: local,
                })
                .into(),
            );
            variables.push(local);
        }

        for op in self.operations.drain(..) {
            operations.push(op);
        }

        for (input, out) in self.writes_global.drain(..) {
            operations.push(Operation::Operator(Operator::AssignGlobal(UnaryOperator {
                input,
                out,
            })))
        }

        ScopeProcessing {
            variables,
            operations,
        }
    }

    fn new_local_index(&self) -> u16 {
        self.locals.len() as u16 + self.undeclared
    }

    fn new_local_scalar_index(&self) -> u16 {
        self.reads_scalar.len() as u16
    }

    fn read_input_strategy(
        &mut self,
        index: u16,
        item: Item,
        strategy: ReadingStrategy,
    ) -> Variable {
        let input = Variable::GlobalInputArray(index, item);
        let index = self.new_local_index();
        let local = Variable::Local(index, item, self.depth);
        self.reads_global.push((input, strategy, local));
        self.locals.push(local);
        local
    }
}
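
A hedged end-to-end sketch of the tracing API above (element types are illustrative):

// Trace a simple elementwise add: read two inputs with the output layout,
// create a local for the result, and register the operator.
let mut scope = Scope::root();
let lhs = scope.read_array(0, Item::Scalar(Elem::Float));
let rhs = scope.read_array(1, Item::Scalar(Elem::Float));
let out = scope.create_local(Item::Scalar(Elem::Float));
gpu!(scope, out = lhs + rhs);
// Later, process() turns the pending reads into ReadGlobalWithLayout
// algorithms followed by the registered operator and the global writes.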

@@ -1,8 +1,9 @@
use super::Body;
use crate::kernel::WORKGROUP_DEFAULT;
use serde::{Deserialize, Serialize};
use std::fmt::Display;

use super::Scope;

#[derive(Debug, PartialEq, Eq, Clone, Copy, Serialize, Deserialize)]
pub enum Location {
    Storage,

@@ -16,7 +17,7 @@ pub enum Visibility {
    ReadWrite,
}

#[derive(Debug, Clone, PartialEq, Eq, Copy, Serialize, Deserialize)]
#[derive(Debug, Clone, PartialEq, Eq, Copy, Hash, Serialize, Deserialize)]
pub enum Elem {
    Float,
    Int,

@@ -24,6 +25,12 @@ pub enum Elem {
    Bool,
}

impl From<Elem> for Item {
    fn from(val: Elem) -> Self {
        Item::Scalar(val)
    }
}

impl Display for Elem {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {

@@ -87,5 +94,5 @@ pub struct ComputeShader {
    pub workgroup_size: WorkgroupSize,
    pub global_invocation_id: bool,
    pub num_workgroups: bool,
    pub body: Body,
    pub body: Scope,
}
@@ -1,11 +1,47 @@
use super::Item;
use super::{Elem, Item};
use serde::{Deserialize, Serialize};

#[derive(Debug, Clone, Serialize, Deserialize)]
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
pub enum Variable {
    Input(u16, Item),
    Scalar(u16, Item),
    Local(u16, Item),
    Output(u16, Item),
    Constant(f64, Item),
    GlobalInputArray(u16, Item),
    GlobalScalar(u16, Elem),
    GlobalOutputArray(u16, Item),
    Local(u16, Item, u8),
    LocalScalar(u16, Elem, u8),
    ConstantScalar(f64, Elem),
    Id,
    Rank,
}

impl Variable {
    pub fn const_value(&self) -> Option<f64> {
        match self {
            Variable::ConstantScalar(value, _) => Some(*value),
            _ => None,
        }
    }
    pub fn index(&self) -> Option<u16> {
        match self {
            Variable::GlobalInputArray(idx, _) => Some(*idx),
            Variable::GlobalScalar(idx, _) => Some(*idx),
            Variable::Local(idx, _, _) => Some(*idx),
            Variable::LocalScalar(idx, _, _) => Some(*idx),
            Variable::GlobalOutputArray(idx, _) => Some(*idx),
            Variable::ConstantScalar(_, _) => None,
            Variable::Id => None,
            Variable::Rank => None,
        }
    }
    pub fn item(&self) -> Item {
        match self {
            Variable::GlobalInputArray(_, item) => *item,
            Variable::GlobalOutputArray(_, item) => *item,
            Variable::GlobalScalar(_, elem) => Item::Scalar(*elem),
            Variable::Local(_, item, _) => *item,
            Variable::LocalScalar(_, elem, _) => Item::Scalar(*elem),
            Variable::ConstantScalar(_, elem) => Item::Scalar(*elem),
            Variable::Id => Item::Scalar(Elem::UInt),
            Variable::Rank => Item::Scalar(Elem::UInt),
        }
    }
}
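
A small hedged example of the accessors above (the variable is illustrative):

// Global arrays report their argument position and carry their item type.
let v = Variable::GlobalInputArray(2, Item::Vec4(Elem::Float));
assert_eq!(v.index(), Some(2)); // position 2 among the shader inputs
assert!(v.const_value().is_none()); // only ConstantScalar carries a value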

@@ -1,11 +1,11 @@
use super::{
    BinaryOperation, ClampOperation, ConditionalAssignOperation, Item, Operation,
    ReadGlobalOperation, ReadGlobalWithLayoutOperation, UnaryOperation, Variable,
    Algorithm, BinaryOperator, ClampOperator, ConditionalAssignOperator, Item, Operation, Operator,
    ReadGlobalAlgo, ReadGlobalWithLayoutAlgo, UnaryOperator, Variable,
};

/// Define a vectorization scheme.
#[allow(dead_code)]
#[derive(Copy, Clone, Debug)]
#[derive(Copy, Clone, Debug, Default)]
pub enum Vectorization {
    /// Use vec4 for vectorization.
    Vec4,

@@ -14,47 +14,99 @@ pub enum Vectorization {
    /// Use vec2 for vectorization.
    Vec2,
    /// Don't vectorize.
    #[default]
    Scalar,
}

impl Operation {
    pub fn vectorize(&self, vectorization: Vectorization) -> Self {
        match self {
            Operation::Add(op) => Operation::Add(op.vectorize(vectorization)),
            Operation::Sub(op) => Operation::Sub(op.vectorize(vectorization)),
            Operation::Mul(op) => Operation::Mul(op.vectorize(vectorization)),
            Operation::Div(op) => Operation::Div(op.vectorize(vectorization)),
            Operation::Abs(op) => Operation::Abs(op.vectorize(vectorization)),
            Operation::Exp(op) => Operation::Exp(op.vectorize(vectorization)),
            Operation::Log(op) => Operation::Log(op.vectorize(vectorization)),
            Operation::Log1p(op) => Operation::Log1p(op.vectorize(vectorization)),
            Operation::Cos(op) => Operation::Cos(op.vectorize(vectorization)),
            Operation::Sin(op) => Operation::Sin(op.vectorize(vectorization)),
            Operation::Tanh(op) => Operation::Tanh(op.vectorize(vectorization)),
            Operation::Powf(op) => Operation::Powf(op.vectorize(vectorization)),
            Operation::Sqrt(op) => Operation::Sqrt(op.vectorize(vectorization)),
            Operation::Erf(op) => Operation::Erf(op.vectorize(vectorization)),
            Operation::Recip(op) => Operation::Recip(op.vectorize(vectorization)),
            Operation::Equal(op) => Operation::Equal(op.vectorize(vectorization)),
            Operation::Lower(op) => Operation::Lower(op.vectorize(vectorization)),
            Operation::Clamp(op) => Operation::Clamp(op.vectorize(vectorization)),
            Operation::Greater(op) => Operation::Greater(op.vectorize(vectorization)),
            Operation::LowerEqual(op) => Operation::LowerEqual(op.vectorize(vectorization)),
            Operation::GreaterEqual(op) => Operation::GreaterEqual(op.vectorize(vectorization)),
            Operation::ConditionalAssign(op) => {
                Operation::ConditionalAssign(op.vectorize(vectorization))
            }
            Operation::AssignGlobal(op) => Operation::AssignGlobal(op.vectorize(vectorization)),
            Operation::AssignLocal(op) => Operation::AssignLocal(op.vectorize(vectorization)),
            Operation::ReadGlobal(op) => Operation::ReadGlobal(op.vectorize(vectorization)),
            Operation::ReadGlobalWithLayout(op) => {
                Operation::ReadGlobalWithLayout(op.vectorize(vectorization))
            }
            Operation::Operator(op) => Operation::Operator(op.vectorize(vectorization)),
            Operation::Algorithm(op) => Operation::Algorithm(op.vectorize(vectorization)),
            Operation::Metadata(_) => panic!(
                "Metadata can't be vectorized, they should only be generated after vectorization."
            ),
            Operation::Loop(_) => panic!(
                "Loops can't be vectorized, they should only be generated after vectorization."
            ),
        }
    }
}

impl BinaryOperation {
impl Algorithm {
    pub fn vectorize(&self, vectorization: Vectorization) -> Self {
        match self {
            Algorithm::ReadGlobalWithLayout(op) => {
                Algorithm::ReadGlobalWithLayout(op.vectorize(vectorization))
            }
            Algorithm::ReadGlobal(op) => Algorithm::ReadGlobal(op.vectorize(vectorization)),
        }
    }
}

impl ReadGlobalWithLayoutAlgo {
    pub fn vectorize(&self, vectorization: Vectorization) -> Self {
        Self {
            global: self.global.vectorize(vectorization),
            layout: self.layout.vectorize(vectorization),
            out: self.out.vectorize(vectorization),
        }
    }
}

impl ReadGlobalAlgo {
    pub fn vectorize(&self, vectorization: Vectorization) -> Self {
        Self {
            global: self.global.vectorize(vectorization),
            out: self.out.vectorize(vectorization),
        }
    }
}

impl Operator {
    pub fn vectorize(&self, vectorization: Vectorization) -> Self {
        match self {
            Operator::Add(op) => Operator::Add(op.vectorize(vectorization)),
            Operator::Index(op) => Operator::Index(op.vectorize(vectorization)),
            Operator::Sub(op) => Operator::Sub(op.vectorize(vectorization)),
            Operator::Mul(op) => Operator::Mul(op.vectorize(vectorization)),
            Operator::Div(op) => Operator::Div(op.vectorize(vectorization)),
            Operator::Abs(op) => Operator::Abs(op.vectorize(vectorization)),
            Operator::Exp(op) => Operator::Exp(op.vectorize(vectorization)),
            Operator::Log(op) => Operator::Log(op.vectorize(vectorization)),
            Operator::Log1p(op) => Operator::Log1p(op.vectorize(vectorization)),
            Operator::Cos(op) => Operator::Cos(op.vectorize(vectorization)),
            Operator::Sin(op) => Operator::Sin(op.vectorize(vectorization)),
            Operator::Tanh(op) => Operator::Tanh(op.vectorize(vectorization)),
            Operator::Powf(op) => Operator::Powf(op.vectorize(vectorization)),
            Operator::Sqrt(op) => Operator::Sqrt(op.vectorize(vectorization)),
            Operator::Erf(op) => Operator::Erf(op.vectorize(vectorization)),
            Operator::Recip(op) => Operator::Recip(op.vectorize(vectorization)),
            Operator::Equal(op) => Operator::Equal(op.vectorize(vectorization)),
            Operator::Lower(op) => Operator::Lower(op.vectorize(vectorization)),
            Operator::Clamp(op) => Operator::Clamp(op.vectorize(vectorization)),
            Operator::Greater(op) => Operator::Greater(op.vectorize(vectorization)),
            Operator::LowerEqual(op) => Operator::LowerEqual(op.vectorize(vectorization)),
            Operator::GreaterEqual(op) => Operator::GreaterEqual(op.vectorize(vectorization)),
            Operator::ConditionalAssign(op) => {
                Operator::ConditionalAssign(op.vectorize(vectorization))
            }
            Operator::AssignGlobal(op) => Operator::AssignGlobal(op.vectorize(vectorization)),
            Operator::AssignLocal(op) => {
                if let Variable::GlobalScalar(_, _) = op.input {
                    // Assign will not change the type of the output if the input can't be
                    // vectorized.
                    return Operator::AssignLocal(op.clone());
                }

                Operator::AssignLocal(op.vectorize(vectorization))
            }
            Operator::Modulo(op) => Operator::Modulo(op.vectorize(vectorization)),
        }
    }
}

impl BinaryOperator {
    pub fn vectorize(&self, vectorization: Vectorization) -> Self {
        let lhs = self.lhs.vectorize(vectorization);
        let rhs = self.rhs.vectorize(vectorization);

@@ -64,7 +116,7 @@ impl BinaryOperation {
    }
}

impl UnaryOperation {
impl UnaryOperator {
    pub fn vectorize(&self, vectorization: Vectorization) -> Self {
        let input = self.input.vectorize(vectorization);
        let out = self.out.vectorize(vectorization);

@@ -73,7 +125,7 @@ impl UnaryOperation {
    }
}

impl ClampOperation {
impl ClampOperator {
    pub fn vectorize(&self, vectorization: Vectorization) -> Self {
        let input = self.input.vectorize(vectorization);
        let out = self.out.vectorize(vectorization);

@@ -89,7 +141,7 @@ impl ClampOperation {
    }
}

impl ConditionalAssignOperation {
impl ConditionalAssignOperator {
    pub fn vectorize(&self, vectorization: Vectorization) -> Self {
        let cond = self.cond.vectorize(vectorization);
        let lhs = self.lhs.vectorize(vectorization);

@@ -105,39 +157,23 @@ impl ConditionalAssignOperation {
    }
}

impl ReadGlobalOperation {
    pub fn vectorize(&self, vectorization: Vectorization) -> Self {
        let variable = self.variable.vectorize(vectorization);

        Self { variable }
    }
}

impl ReadGlobalWithLayoutOperation {
    pub fn vectorize(&self, vectorization: Vectorization) -> Self {
        let variable = self.variable.vectorize(vectorization);
        let tensor_read_pos = self.tensor_read_pos;
        let tensor_layout_pos = self.tensor_layout_pos;

        Self {
            variable,
            tensor_read_pos,
            tensor_layout_pos,
        }
    }
}

impl Variable {
    pub fn vectorize(&self, vectorize: Vectorization) -> Self {
        match self {
            Variable::Input(index, item) => Variable::Input(*index, item.vectorize(vectorize)),
            Variable::Local(index, item) => Variable::Local(*index, item.vectorize(vectorize)),
            Variable::Output(index, item) => Variable::Output(*index, item.vectorize(vectorize)),
            Variable::Constant(index, item) => {
                Variable::Constant(*index, item.vectorize(vectorize))
            Variable::GlobalInputArray(index, item) => {
                Variable::GlobalInputArray(*index, item.vectorize(vectorize))
            }
            Variable::Scalar(index, item) => Variable::Scalar(*index, *item), // Don't vectorize
                                                                              // scalar variables.
            Variable::Local(index, item, name) => {
                Variable::Local(*index, item.vectorize(vectorize), *name)
            }
            Variable::GlobalOutputArray(index, item) => {
                Variable::GlobalOutputArray(*index, item.vectorize(vectorize))
            }
            Variable::ConstantScalar(_, _) => *self,
            Variable::GlobalScalar(_, _) => *self,
            Variable::Id => *self,
            Variable::Rank => *self,
            Variable::LocalScalar(_, _, _) => *self,
        }
    }
}
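
A hedged sketch of the effect, assuming `Item::vectorize` widens a scalar item to the requested width (which is what the callers above rely on):

// Locals carry their item type, so vectorizing widens it in place;
// a GlobalScalar would be returned unchanged, per the match above.
let var = Variable::Local(0, Item::Scalar(Elem::Float), 0);
let vec4 = var.vectorize(Vectorization::Vec4);
// vec4 == Variable::Local(0, Item::Vec4(Elem::Float), 0)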

@@ -3,11 +3,22 @@ use std::fmt::Display;

#[derive(Debug, Clone)]
pub enum Variable {
    Input(u16, Item),
    Scalar(u16, Item, gpu::Elem),
    Local(u16, Item),
    Output(u16, Item),
    Constant(f64, Item),
    GlobalInputArray(u16, Item),
    GlobalOutputArray(u16, Item),
    GlobalScalar(u16, Elem, gpu::Elem),
    ConstantScalar(f64, Elem),
    Local {
        index: u16,
        item: Item,
        scope_depth: u8,
    },
    LocalScalar {
        index: u16,
        elem: Elem,
        scope_depth: u8,
    },
    Id,
    Rank,
}

#[derive(Debug, Clone, PartialEq, Eq, Copy)]

@@ -33,6 +44,26 @@ pub struct IndexedVariable {
}

impl Variable {
    pub fn is_always_scalar(&self) -> bool {
        match self {
            Variable::GlobalScalar(_, _, _) => true,
            Variable::ConstantScalar(_, _) => true,
            Variable::LocalScalar {
                index: _,
                elem: _,
                scope_depth: _,
            } => true,
            Variable::Id => true,
            Variable::Rank => true,
            Variable::GlobalInputArray(_, _) => false,
            Variable::GlobalOutputArray(_, _) => false,
            Variable::Local {
                index: _,
                item: _,
                scope_depth: _,
            } => false,
        }
    }
    pub fn index(&self, index: usize) -> IndexedVariable {
        IndexedVariable {
            var: self.clone(),

@@ -40,15 +71,29 @@ impl Variable {
        }
    }

    pub fn item(&self) -> &Item {
    pub fn item(&self) -> Item {
        match self {
            Self::Input(_, e) => e,
            Self::Scalar(_, e, _) => e,
            Self::Local(_, e) => e,
            Self::Output(_, e) => e,
            Self::Constant(_, e) => e,
            Self::GlobalInputArray(_, e) => *e,
            Self::GlobalOutputArray(_, e) => *e,
            Self::Local {
                index: _,
                item,
                scope_depth: _,
            } => *item,
            Self::ConstantScalar(_, e) => Item::Scalar(*e),
            Self::GlobalScalar(_, e, _) => Item::Scalar(*e),
            Self::Id => Item::Scalar(Elem::U32),
            Self::Rank => Item::Scalar(Elem::U32),
            Self::LocalScalar {
                index: _,
                elem,
                scope_depth: _,
            } => Item::Scalar(*elem),
        }
    }
    pub fn elem(&self) -> Elem {
        *self.item().elem()
    }
}

impl Item {

@@ -98,39 +143,24 @@ impl Display for Item {
impl Display for Variable {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Variable::Input(number, _) => f.write_fmt(format_args!("input_{number}")),
            Variable::Local(number, _) => f.write_fmt(format_args!("local_{number}")),
            Variable::Output(number, _) => f.write_fmt(format_args!("output_{number}")),
            Variable::Scalar(number, _, elem) => {
            Variable::GlobalInputArray(number, _) => f.write_fmt(format_args!("input_{number}")),
            Variable::LocalScalar {
                index,
                elem: _,
                scope_depth,
            } => f.write_fmt(format_args!("s_{scope_depth}_{index}")),
            Variable::Local {
                index,
                item: _,
                scope_depth,
            } => f.write_fmt(format_args!("l_{scope_depth}_{index}")),
            Variable::GlobalOutputArray(number, _) => f.write_fmt(format_args!("output_{number}")),
            Variable::GlobalScalar(number, _, elem) => {
                f.write_fmt(format_args!("scalars_{elem}[{number}]"))
            }
            Variable::Constant(number, item) => match item {
                Item::Vec4(elem) => f.write_fmt(format_args!(
                    "
vec4(
    {elem}({number}),
    {elem}({number}),
    {elem}({number}),
    {elem}({number}),
)"
                )),
                Item::Vec3(elem) => f.write_fmt(format_args!(
                    "
vec3(
    {elem}({number}),
    {elem}({number}),
    {elem}({number}),
)"
                )),
                Item::Vec2(elem) => f.write_fmt(format_args!(
                    "
vec2(
    {elem}({number}),
    {elem}({number}),
)"
                )),
                Item::Scalar(elem) => f.write_fmt(format_args!("{elem}({number})")),
            },
            Variable::ConstantScalar(number, elem) => f.write_fmt(format_args!("{elem}({number})")),
            Variable::Id => f.write_str("id"),
            Variable::Rank => f.write_str("rank"),
        }
    }
}

@@ -149,8 +179,8 @@ impl Display for IndexedVariable {
        let index = self.index;

        match self.var {
            Variable::Scalar(_, _, _) => f.write_fmt(format_args!("{var}")),
            _ => match should_index(item) {
            Variable::GlobalScalar(_, _, _) => f.write_fmt(format_args!("{var}")),
            _ => match should_index(&item) {
                true => f.write_fmt(format_args!("{var}[{index}]")),
                false => f.write_fmt(format_args!("{var}")),
            },

@@ -1,4 +1,4 @@
use super::Operation;
use super::Instruction;
use std::fmt::Display;

/// A body is composed of a list of [operations](Operation).

@@ -6,11 +6,11 @@ use std::fmt::Display;
/// Note that the body assumes that the kernel will run on a 2D grid defined by the workgroup size
/// X and Y, but with Z=1.
#[derive(Debug, Clone)]
pub struct Body {
    pub operators: Vec<Operation>,
pub struct Scope {
    pub operators: Vec<Instruction>,
}

impl Display for Body {
impl Display for Scope {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str(
            "let id = global_id.y * (num_workgroups.x * WORKGROUP_SIZE_X) + global_id.x;\n",

@@ -19,7 +19,6 @@ impl Display for Body {

        for ops in self.operators.iter() {
            f.write_fmt(format_args!("{ops}"))?;
            f.write_str("\n")?;
        }

        Ok(())
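
For context, the emitted preamble linearizes the 2D dispatch: with a row length of num_workgroups.x * WORKGROUP_SIZE_X invocations along x, `id = global_id.y * row_length + global_id.x` assigns each invocation a unique linear index, consistent with the Z=1 assumption documented above.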
|
@ -11,6 +11,8 @@ use std::marker::PhantomData;
|
|||
/// Wgsl Compiler.
|
||||
#[derive(Clone)]
|
||||
pub struct Compiler<F: FloatElement, I: IntElement> {
|
||||
num_inputs: usize,
|
||||
num_outputs: usize,
|
||||
_float: PhantomData<F>,
|
||||
_int: PhantomData<I>,
|
||||
}
|
||||
|
@ -24,6 +26,8 @@ impl<F: FloatElement, I: IntElement> core::fmt::Debug for Compiler<F, I> {
|
|||
impl<F: FloatElement, I: IntElement> Default for Compiler<F, I> {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
num_inputs: 0,
|
||||
num_outputs: 0,
|
||||
_float: PhantomData,
|
||||
_int: PhantomData,
|
||||
}
|
||||
|
@ -37,7 +41,8 @@ impl<F: FloatElement, I: IntElement> compiler::Compiler for Compiler<F, I> {
|
|||
type FullPrecisionCompiler = Compiler<f32, i32>;
|
||||
|
||||
fn compile(shader: gpu::ComputeShader) -> Self::Representation {
|
||||
Self::compile_shader(shader)
|
||||
let mut compiler = Self::default();
|
||||
compiler.compile_shader(shader)
|
||||
}
|
||||
|
||||
fn elem_size(elem: gpu::Elem) -> usize {
|
||||
|
@@ -46,6 +51,37 @@ impl<F: FloatElement, I: IntElement> compiler::Compiler for Compiler<F, I> {
}

impl<F: FloatElement, I: IntElement> Compiler<F, I> {
    fn compile_shader(&mut self, mut value: gpu::ComputeShader) -> wgsl::ComputeShader {
        self.num_inputs = value.inputs.len();
        self.num_outputs = value.outputs.len();

        let body = self.compile_scope(&mut value.body);
        let extensions = register_extensions(&body);

        wgsl::ComputeShader {
            inputs: value
                .inputs
                .into_iter()
                .map(Self::compile_binding)
                .collect(),
            outputs: value
                .outputs
                .into_iter()
                .map(Self::compile_binding)
                .collect(),
            named: value
                .named
                .into_iter()
                .map(|(name, binding)| (name, Self::compile_binding(binding)))
                .collect(),
            workgroup_size: value.workgroup_size,
            global_invocation_id: value.global_invocation_id,
            num_workgroups: value.num_workgroups,
            body,
            extensions,
        }
    }

    fn compile_item(item: gpu::Item) -> Item {
        match item {
            gpu::Item::Vec4(elem) => wgsl::Item::Vec4(Self::compile_elem(elem)),
@@ -66,155 +102,252 @@ impl<F: FloatElement, I: IntElement> Compiler<F, I> {

    fn compile_variable(value: gpu::Variable) -> wgsl::Variable {
        match value {
            gpu::Variable::Input(index, item) => {
                wgsl::Variable::Input(index, Self::compile_item(item))
            gpu::Variable::GlobalInputArray(index, item) => {
                wgsl::Variable::GlobalInputArray(index, Self::compile_item(item))
            }
            gpu::Variable::Scalar(index, item) => {
                let elem = item.elem();
                wgsl::Variable::Scalar(index, Self::compile_item(item), elem)
            gpu::Variable::GlobalScalar(index, elem) => {
                wgsl::Variable::GlobalScalar(index, Self::compile_elem(elem), elem)
            }
            gpu::Variable::Local(index, item) => {
                wgsl::Variable::Local(index, Self::compile_item(item))
            gpu::Variable::Local(index, item, scope_depth) => wgsl::Variable::Local {
                index,
                item: Self::compile_item(item),
                scope_depth,
            },
            gpu::Variable::LocalScalar(index, elem, scope_depth) => wgsl::Variable::LocalScalar {
                index,
                elem: Self::compile_elem(elem),
                scope_depth,
            },
            gpu::Variable::GlobalOutputArray(index, item) => {
                wgsl::Variable::GlobalOutputArray(index, Self::compile_item(item))
            }
            gpu::Variable::Output(index, item) => {
                wgsl::Variable::Output(index, Self::compile_item(item))
            gpu::Variable::ConstantScalar(index, elem) => {
                wgsl::Variable::ConstantScalar(index, Self::compile_elem(elem))
            }
            gpu::Variable::Constant(index, item) => {
                wgsl::Variable::Constant(index, Self::compile_item(item))
            gpu::Variable::Id => wgsl::Variable::Id,
            gpu::Variable::Rank => wgsl::Variable::Rank,
        }
    }

    fn compile_scope(&self, value: &mut gpu::Scope) -> wgsl::Scope {
        let mut operations = Vec::new();
        let processing = value.process();

        for var in processing.variables {
            operations.push(wgsl::Instruction::DeclareVariable {
                var: Self::compile_variable(var),
            });
        }

        processing
            .operations
            .into_iter()
            .for_each(|op| self.compile_operation(&mut operations, op, value));

        wgsl::Scope {
            operators: operations,
        }
    }
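A hypothetical sketch of the two-pass lowering `compile_scope` performs, with strings standing in for the real instruction types: declarations produced by processing the scope are hoisted first, then the operations are lowered in registration order.

fn lower_scope(declared_vars: Vec<String>, operations: Vec<String>) -> Vec<String> {
    let mut instructions = Vec::new();
    for var in declared_vars {
        // Declarations come first so every later instruction can assign to them.
        instructions.push(format!("var {var};"));
    }
    for op in operations {
        instructions.push(op); // Body instructions keep their original order.
    }
    instructions
}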

    fn compile_operation(
        &self,
        instructions: &mut Vec<wgsl::Instruction>,
        operation: gpu::Operation,
        scope: &mut gpu::Scope,
    ) {
        match operation {
            gpu::Operation::Operator(op) => instructions.push(self.compile_instruction(op)),
            gpu::Operation::Algorithm(algo) => self.compile_algorithm(instructions, algo, scope),
            gpu::Operation::Metadata(op) => instructions.push(self.compile_metadata(op)),
            gpu::Operation::Loop(val) => instructions.push(self.compile_loop(val)),
        }
    }

    fn compile_algorithm(
        &self,
        instructions: &mut Vec<wgsl::Instruction>,
        algo: gpu::Algorithm,
        scope: &mut gpu::Scope,
    ) {
        let mut compile = |scope: &mut gpu::Scope| {
            let compiled = self.compile_scope(scope).operators;
            instructions.extend(compiled);
        };

        match algo {
            gpu::Algorithm::ReadGlobalWithLayout(algo) => {
                algo.expand(scope);
                compile(scope);
            }
            gpu::Algorithm::ReadGlobal(algo) => {
                algo.expand(scope);
                compile(scope);
            }
        }
    }
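The expand-then-compile pattern above can be summarized by this illustrative sketch (the trait and types are placeholders, not the crate's API): an algorithm first writes primitive operations into the scope, and the freshly expanded scope is then compiled like any other.

trait Expand {
    fn expand(&self, scope: &mut Vec<String>);
}

fn compile_algorithm_sketch<A: Expand>(
    algo: A,
    scope: &mut Vec<String>,
    instructions: &mut Vec<String>,
) {
    algo.expand(scope); // 1. Lower the algorithm into primitive operations.
    instructions.extend(scope.drain(..)); // 2. Compile the expanded scope as usual.
}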

    fn compile_body(value: gpu::Body) -> wgsl::Body {
        wgsl::Body {
            operators: value
                .operators
                .into_iter()
                .map(Self::compile_operation)
                .collect(),
    fn compile_loop(&self, loop_val: gpu::Loop) -> wgsl::Instruction {
        match loop_val {
            gpu::Loop::Range(mut range_loop) => wgsl::Instruction::RangeLoop {
                i: Self::compile_variable(range_loop.i),
                start: Self::compile_variable(range_loop.start),
                end: Self::compile_variable(range_loop.end),
                instructions: self.compile_scope(&mut range_loop.scope).operators,
            },
        }
    }

    fn compile_operation(value: gpu::Operation) -> wgsl::Operation {
    fn compile_metadata(&self, metadata: gpu::Metadata) -> wgsl::Instruction {
        match metadata {
            gpu::Metadata::Stride { dim, var, out } => {
                let position = match var {
                    gpu::Variable::GlobalInputArray(idx, _) => idx as usize,
                    gpu::Variable::GlobalOutputArray(idx, _) => self.num_inputs + idx as usize,
                    _ => panic!("Only Input and Output have a stride, got: {:?}", var),
                };
                wgsl::Instruction::Stride {
                    dim: Self::compile_variable(dim),
                    position,
                    out: Self::compile_variable(out),
                }
            }
            gpu::Metadata::Shape { dim, var, out } => {
                let position = match var {
                    gpu::Variable::GlobalInputArray(idx, _) => idx as usize,
                    gpu::Variable::GlobalOutputArray(idx, _) => self.num_inputs + idx as usize,
                    _ => panic!("Only Input and Output have a shape, got {:?}", var),
                };
                wgsl::Instruction::Shape {
                    dim: Self::compile_variable(dim),
                    position,
                    out: Self::compile_variable(out),
                }
            }
        }
    }
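A sketch of the position rule the Stride/Shape metadata uses above: global input arrays occupy the first `num_inputs` slots of the runtime `info` buffer, and output arrays follow them.

fn metadata_position(index: usize, num_inputs: usize, is_output: bool) -> usize {
    if is_output {
        num_inputs + index // Outputs are laid out after all inputs.
    } else {
        index
    }
}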

    fn compile_instruction(&self, value: gpu::Operator) -> wgsl::Instruction {
        match value {
            gpu::Operation::Add(op) => wgsl::Operation::Add {
            gpu::Operator::Add(op) => wgsl::Instruction::Add {
                lhs: Self::compile_variable(op.lhs),
                rhs: Self::compile_variable(op.rhs),
                out: Self::compile_variable(op.out),
            },
            gpu::Operation::Sub(op) => wgsl::Operation::Sub {
            gpu::Operator::Index(op) => wgsl::Instruction::Index {
                lhs: Self::compile_variable(op.lhs),
                rhs: Self::compile_variable(op.rhs),
                out: Self::compile_variable(op.out),
            },
            gpu::Operation::Mul(op) => wgsl::Operation::Mul {
            gpu::Operator::Modulo(op) => wgsl::Instruction::Modulo {
                lhs: Self::compile_variable(op.lhs),
                rhs: Self::compile_variable(op.rhs),
                out: Self::compile_variable(op.out),
            },
            gpu::Operation::Div(op) => wgsl::Operation::Div {
            gpu::Operator::Sub(op) => wgsl::Instruction::Sub {
                lhs: Self::compile_variable(op.lhs),
                rhs: Self::compile_variable(op.rhs),
                out: Self::compile_variable(op.out),
            },
            gpu::Operation::Abs(op) => wgsl::Operation::Abs {
                input: Self::compile_variable(op.input),
                out: Self::compile_variable(op.out),
            },
            gpu::Operation::Exp(op) => wgsl::Operation::Exp {
                input: Self::compile_variable(op.input),
                out: Self::compile_variable(op.out),
            },
            gpu::Operation::Log(op) => wgsl::Operation::Log {
                input: Self::compile_variable(op.input),
                out: Self::compile_variable(op.out),
            },
            gpu::Operation::Log1p(op) => wgsl::Operation::Log1p {
                input: Self::compile_variable(op.input),
                out: Self::compile_variable(op.out),
            },
            gpu::Operation::Cos(op) => wgsl::Operation::Cos {
                input: Self::compile_variable(op.input),
                out: Self::compile_variable(op.out),
            },
            gpu::Operation::Sin(op) => wgsl::Operation::Sin {
                input: Self::compile_variable(op.input),
                out: Self::compile_variable(op.out),
            },
            gpu::Operation::Tanh(op) => wgsl::Operation::Tanh {
                input: Self::compile_variable(op.input),
                out: Self::compile_variable(op.out),
            },
            gpu::Operation::Powf(op) => wgsl::Operation::Powf {
            gpu::Operator::Mul(op) => wgsl::Instruction::Mul {
                lhs: Self::compile_variable(op.lhs),
                rhs: Self::compile_variable(op.rhs),
                out: Self::compile_variable(op.out),
            },
            gpu::Operation::Sqrt(op) => wgsl::Operation::Sqrt {
                input: Self::compile_variable(op.input),
                out: Self::compile_variable(op.out),
            },
            gpu::Operation::Erf(op) => wgsl::Operation::Erf {
                input: Self::compile_variable(op.input),
                out: Self::compile_variable(op.out),
            },
            gpu::Operation::Recip(op) => wgsl::Operation::Recip {
                input: Self::compile_variable(op.input),
                out: Self::compile_variable(op.out),
            },
            gpu::Operation::Equal(op) => wgsl::Operation::Equal {
            gpu::Operator::Div(op) => wgsl::Instruction::Div {
                lhs: Self::compile_variable(op.lhs),
                rhs: Self::compile_variable(op.rhs),
                out: Self::compile_variable(op.out),
            },
            gpu::Operation::Lower(op) => wgsl::Operation::Lower {
            gpu::Operator::Abs(op) => wgsl::Instruction::Abs {
                input: Self::compile_variable(op.input),
                out: Self::compile_variable(op.out),
            },
            gpu::Operator::Exp(op) => wgsl::Instruction::Exp {
                input: Self::compile_variable(op.input),
                out: Self::compile_variable(op.out),
            },
            gpu::Operator::Log(op) => wgsl::Instruction::Log {
                input: Self::compile_variable(op.input),
                out: Self::compile_variable(op.out),
            },
            gpu::Operator::Log1p(op) => wgsl::Instruction::Log1p {
                input: Self::compile_variable(op.input),
                out: Self::compile_variable(op.out),
            },
            gpu::Operator::Cos(op) => wgsl::Instruction::Cos {
                input: Self::compile_variable(op.input),
                out: Self::compile_variable(op.out),
            },
            gpu::Operator::Sin(op) => wgsl::Instruction::Sin {
                input: Self::compile_variable(op.input),
                out: Self::compile_variable(op.out),
            },
            gpu::Operator::Tanh(op) => wgsl::Instruction::Tanh {
                input: Self::compile_variable(op.input),
                out: Self::compile_variable(op.out),
            },
            gpu::Operator::Powf(op) => wgsl::Instruction::Powf {
                lhs: Self::compile_variable(op.lhs),
                rhs: Self::compile_variable(op.rhs),
                out: Self::compile_variable(op.out),
            },
            gpu::Operation::Clamp(op) => wgsl::Operation::Clamp {
            gpu::Operator::Sqrt(op) => wgsl::Instruction::Sqrt {
                input: Self::compile_variable(op.input),
                out: Self::compile_variable(op.out),
            },
            gpu::Operator::Erf(op) => wgsl::Instruction::Erf {
                input: Self::compile_variable(op.input),
                out: Self::compile_variable(op.out),
            },
            gpu::Operator::Recip(op) => wgsl::Instruction::Recip {
                input: Self::compile_variable(op.input),
                out: Self::compile_variable(op.out),
            },
            gpu::Operator::Equal(op) => wgsl::Instruction::Equal {
                lhs: Self::compile_variable(op.lhs),
                rhs: Self::compile_variable(op.rhs),
                out: Self::compile_variable(op.out),
            },
            gpu::Operator::Lower(op) => wgsl::Instruction::Lower {
                lhs: Self::compile_variable(op.lhs),
                rhs: Self::compile_variable(op.rhs),
                out: Self::compile_variable(op.out),
            },
            gpu::Operator::Clamp(op) => wgsl::Instruction::Clamp {
                input: Self::compile_variable(op.input),
                min_value: Self::compile_variable(op.min_value),
                max_value: Self::compile_variable(op.max_value),
                out: Self::compile_variable(op.out),
            },
            gpu::Operation::Greater(op) => wgsl::Operation::Greater {
            gpu::Operator::Greater(op) => wgsl::Instruction::Greater {
                lhs: Self::compile_variable(op.lhs),
                rhs: Self::compile_variable(op.rhs),
                out: Self::compile_variable(op.out),
            },
            gpu::Operation::LowerEqual(op) => wgsl::Operation::LowerEqual {
            gpu::Operator::LowerEqual(op) => wgsl::Instruction::LowerEqual {
                lhs: Self::compile_variable(op.lhs),
                rhs: Self::compile_variable(op.rhs),
                out: Self::compile_variable(op.out),
            },
            gpu::Operation::GreaterEqual(op) => wgsl::Operation::GreaterEqual {
            gpu::Operator::GreaterEqual(op) => wgsl::Instruction::GreaterEqual {
                lhs: Self::compile_variable(op.lhs),
                rhs: Self::compile_variable(op.rhs),
                out: Self::compile_variable(op.out),
            },
            gpu::Operation::ConditionalAssign(op) => wgsl::Operation::ConditionalAssign {
            gpu::Operator::ConditionalAssign(op) => wgsl::Instruction::ConditionalAssign {
                cond: Self::compile_variable(op.cond),
                lhs: Self::compile_variable(op.lhs),
                rhs: Self::compile_variable(op.rhs),
                out: Self::compile_variable(op.out),
            },
            gpu::Operation::AssignGlobal(op) => wgsl::Operation::AssignGlobal {
            gpu::Operator::AssignGlobal(op) => wgsl::Instruction::AssignGlobal {
                input: Self::compile_variable(op.input),
                out: Self::compile_variable(op.out),
            },
            gpu::Operation::AssignLocal(op) => wgsl::Operation::AssignLocal {
            gpu::Operator::AssignLocal(op) => wgsl::Instruction::AssignLocal {
                input: Self::compile_variable(op.input),
                out: Self::compile_variable(op.out),
            },
            gpu::Operation::ReadGlobal(op) => wgsl::Operation::ReadGlobal {
                variable: Self::compile_variable(op.variable),
            },
            gpu::Operation::ReadGlobalWithLayout(op) => wgsl::Operation::ReadGlobalWithLayout {
                variable: Self::compile_variable(op.variable),
                tensor_read_pos: op.tensor_read_pos,
                tensor_layout_pos: op.tensor_layout_pos,
            },
        }
    }
@@ -240,37 +373,9 @@ impl<F: FloatElement, I: IntElement> Compiler<F, I> {
            size: value.size,
        }
    }

    fn compile_shader(value: gpu::ComputeShader) -> wgsl::ComputeShader {
        let body = Self::compile_body(value.body);
        let extensions = register_extensions(&body);

        wgsl::ComputeShader {
            inputs: value
                .inputs
                .into_iter()
                .map(Self::compile_binding)
                .collect(),
            outputs: value
                .outputs
                .into_iter()
                .map(Self::compile_binding)
                .collect(),
            named: value
                .named
                .into_iter()
                .map(|(name, binding)| (name, Self::compile_binding(binding)))
                .collect(),
            workgroup_size: value.workgroup_size,
            global_invocation_id: value.global_invocation_id,
            num_workgroups: value.num_workgroups,
            body,
            extensions,
        }
    }
}

fn register_extensions(body: &wgsl::Body) -> Vec<wgsl::Extension> {
fn register_extensions(body: &wgsl::Scope) -> Vec<wgsl::Extension> {
    let mut extensions = Vec::new();

    let mut register_extension = |extension: wgsl::Extension| {
@@ -282,20 +387,21 @@ fn register_extensions(body: &wgsl::Body) -> Vec<wgsl::Extension> {
    // Since not all operators are native to WGSL, we need to add the custom ones.
    for op in body.operators.iter() {
        match op {
            wgsl::Operation::Powf { lhs: _, rhs, out } => match rhs {
                wgsl::Variable::Scalar(_, _, _) => {
                    register_extension(wgsl::Extension::PowfScalar(*out.item()));
            wgsl::Instruction::Powf { lhs: _, rhs, out } => {
                register_extension(wgsl::Extension::PowfPrimitive(out.item()));

                if rhs.is_always_scalar() {
                    register_extension(wgsl::Extension::PowfScalar(out.item()));
                } else {
                    register_extension(wgsl::Extension::Powf(out.item()));
                }
                _ => {
                    register_extension(wgsl::Extension::Powf(*out.item()));
                }
            },
            wgsl::Operation::Erf { input, out: _ } => {
                register_extension(wgsl::Extension::Erf(*input.item()));
            }
            wgsl::Instruction::Erf { input, out: _ } => {
                register_extension(wgsl::Extension::Erf(input.item()));
            }
            #[cfg(target_os = "macos")]
            wgsl::Operation::Tanh { input, out: _ } => {
                register_extension(wgsl::Extension::SafeTanh(*input.item()))
            wgsl::Instruction::Tanh { input, out: _ } => {
                register_extension(wgsl::Extension::SafeTanh(input.item()))
            }
            _ => {}
        }
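A sketch of the powf extension selection above (names are illustrative): every powf use pulls in the primitive helper, then either the scalar-broadcast variant or the fully element-wise variant depending on the right-hand side.

#[derive(Debug, PartialEq)]
enum PowfHelper {
    Primitive,
    Scalar,
    Vectorized,
}

fn powf_helpers(rhs_is_always_scalar: bool) -> Vec<PowfHelper> {
    let mut helpers = vec![PowfHelper::Primitive];
    if rhs_is_always_scalar {
        helpers.push(PowfHelper::Scalar); // rhs broadcast over every lane.
    } else {
        helpers.push(PowfHelper::Vectorized); // rhs indexed per lane.
    }
    helpers
}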
@@ -5,6 +5,7 @@ use std::fmt::Display;
#[derive(Debug, PartialEq, Eq, Clone)]
pub enum Extension {
    PowfScalar(Item),
    PowfPrimitive(Item),
    Powf(Item),
    Erf(Item),
    #[cfg(target_os = "macos")]
@@ -15,6 +16,7 @@ impl Display for Extension {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Extension::PowfScalar(elem) => format_powf_scalar(f, elem),
            Extension::PowfPrimitive(elem) => format_powf_primitive(f, elem),
            Extension::Powf(elem) => format_powf(f, elem),
            Extension::Erf(elem) => format_erf(f, elem),
            #[cfg(target_os = "macos")]
@@ -24,12 +26,10 @@ impl Display for Extension {
}

fn format_powf_scalar(f: &mut core::fmt::Formatter<'_>, item: &Item) -> core::fmt::Result {
    base_powf_fmt(f, item)?;

    match item {
        Item::Vec4(elem) => f.write_fmt(format_args!(
            "
fn powf(lhs: {item}, rhs: {elem}) -> {item} {{
fn powf_scalar(lhs: {item}, rhs: {elem}) -> {item} {{
    return vec4(
        powf_primitive(lhs[0], rhs),
        powf_primitive(lhs[1], rhs),
@@ -41,7 +41,7 @@ fn powf(lhs: {item}, rhs: {elem}) -> {item} {{
        )),
        Item::Vec3(elem) => f.write_fmt(format_args!(
            "
fn powf(lhs: {item}, rhs: {elem}) -> {item} {{
fn powf_scalar(lhs: {item}, rhs: {elem}) -> {item} {{
    return vec3(
        powf_primitive(lhs[0], rhs),
        powf_primitive(lhs[1], rhs),
@@ -52,7 +52,7 @@ fn powf(lhs: {item}, rhs: {elem}) -> {item} {{
        )),
        Item::Vec2(elem) => f.write_fmt(format_args!(
            "
fn powf(lhs: {item}, rhs: {elem}) -> {item} {{
fn powf_scalar(lhs: {item}, rhs: {elem}) -> {item} {{
    return vec2(
        powf_primitive(lhs[0], rhs),
        powf_primitive(lhs[1], rhs),
@@ -62,7 +62,7 @@ fn powf(lhs: {item}, rhs: {elem}) -> {item} {{
        )),
        Item::Scalar(elem) => f.write_fmt(format_args!(
            "
fn powf(lhs: {elem}, rhs: {elem}) -> {elem} {{
fn powf_scalar(lhs: {elem}, rhs: {elem}) -> {elem} {{
    return powf_primitive(lhs, rhs);
}}
"
@@ -70,7 +70,10 @@ fn powf(lhs: {elem}, rhs: {elem}) -> {elem} {{
    }
}

fn base_powf_fmt(f: &mut std::fmt::Formatter<'_>, item: &Item) -> Result<(), std::fmt::Error> {
fn format_powf_primitive(
    f: &mut std::fmt::Formatter<'_>,
    item: &Item,
) -> Result<(), std::fmt::Error> {
    let elem = item.elem();
    f.write_fmt(format_args!(
        "
@@ -96,12 +99,10 @@ fn powf_primitive(lhs: {elem}, rhs: {elem}) -> {elem} {{
}

fn format_powf(f: &mut core::fmt::Formatter<'_>, item: &Item) -> core::fmt::Result {
    base_powf_fmt(f, item)?;

    match item {
        Item::Vec4(elem) => f.write_fmt(format_args!(
        Item::Vec4(_) => f.write_fmt(format_args!(
            "
fn powf(lhs: {item}, rhs: {elem}) -> {item} {{
fn powf(lhs: {item}, rhs: {item}) -> {item} {{
    return vec4(
        powf_primitive(lhs[0], rhs[0]),
        powf_primitive(lhs[1], rhs[1]),
@@ -111,9 +112,9 @@ fn powf(lhs: {item}, rhs: {elem}) -> {item} {{
    }}
"
        )),
        Item::Vec3(elem) => f.write_fmt(format_args!(
        Item::Vec3(_) => f.write_fmt(format_args!(
            "
fn powf(lhs: {item}, rhs: {elem}) -> {item} {{
fn powf(lhs: {item}, rhs: {item}) -> {item} {{
    return vec3(
        powf_primitive(lhs[0], rhs[0]),
        powf_primitive(lhs[1], rhs[1]),
@@ -122,9 +123,9 @@ fn powf(lhs: {item}, rhs: {elem}) -> {item} {{
    }}
"
        )),
        Item::Vec2(elem) => f.write_fmt(format_args!(
        Item::Vec2(_) => f.write_fmt(format_args!(
            "
fn powf(lhs: {item}, rhs: {elem}) -> {item} {{
fn powf(lhs: {item}, rhs: {item}) -> {item} {{
    return vec2(
        powf_primitive(lhs[0], rhs[0]),
        powf_primitive(lhs[1], rhs[1]),
@@ -1,15 +1,28 @@
use super::base::{Item, Variable};
use std::fmt::Display;

/// All operations that can be used in a WGSL compute shader.
/// All instructions that can be used in a WGSL compute shader.
#[derive(Debug, Clone)]
#[allow(dead_code)] // Some variants might not be used with different flags
pub enum Operation {
pub enum Instruction {
    DeclareVariable {
        var: Variable,
    },
    Add {
        lhs: Variable,
        rhs: Variable,
        out: Variable,
    },
    Index {
        lhs: Variable,
        rhs: Variable,
        out: Variable,
    },
    Modulo {
        lhs: Variable,
        rhs: Variable,
        out: Variable,
    },
    Sub {
        lhs: Variable,
        rhs: Variable,
@@ -115,72 +128,99 @@ pub enum Operation {
        input: Variable,
        out: Variable,
    },
    ReadGlobal {
        variable: Variable,
    Stride {
        dim: Variable,
        position: usize,
        out: Variable,
    },
    /// Read the tensor in a way to be compatible with another tensor layout.
    ReadGlobalWithLayout {
        variable: Variable,
        tensor_read_pos: usize,
        tensor_layout_pos: usize,
    Shape {
        dim: Variable,
        position: usize,
        out: Variable,
    },
    RangeLoop {
        i: Variable,
        start: Variable,
        end: Variable,
        instructions: Vec<Instruction>,
    },
}

impl Display for Operation {
impl Display for Instruction {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Operation::Add { lhs, rhs, out } => {
                f.write_fmt(format_args!("let {out} = {lhs} + {rhs};"))
            Instruction::DeclareVariable { var } => {
                let item = var.item();
                f.write_fmt(format_args!("var {var}: {item};\n"))
            }
            Operation::Sub { lhs, rhs, out } => {
                f.write_fmt(format_args!("let {out} = {lhs} - {rhs};"))
            Instruction::Add { lhs, rhs, out } => {
                f.write_fmt(format_args!("{out} = {lhs} + {rhs};\n"))
            }
            Operation::Mul { lhs, rhs, out } => {
                f.write_fmt(format_args!("let {out} = {lhs} * {rhs};"))
            Instruction::Index { lhs, rhs, out } => {
                let item = out.item();
                let lhs = match lhs {
                    Variable::GlobalInputArray(index, _) => format!("input_{index}_global"),
                    Variable::GlobalOutputArray(index, _) => format!("output_{index}_global"),
                    _ => format!("{lhs}"),
                };
                f.write_fmt(format_args!("{out} = {item}({lhs}[{rhs}]);\n"))
            }
            Operation::Div { lhs, rhs, out } => {
                f.write_fmt(format_args!("let {out} = {lhs} / {rhs};"))
            Instruction::Modulo { lhs, rhs, out } => {
                f.write_fmt(format_args!("{out} = {lhs} % {rhs};\n"))
            }
            Operation::Abs { input, out } => f.write_fmt(format_args!("let {out} = abs({input});")),
            Operation::Exp { input, out } => f.write_fmt(format_args!("let {out} = exp({input});")),
            Operation::Log { input, out } => f.write_fmt(format_args!("let {out} = log({input});")),
            Operation::Clamp {
            Instruction::Sub { lhs, rhs, out } => {
                f.write_fmt(format_args!("{out} = {lhs} - {rhs};\n"))
            }
            Instruction::Mul { lhs, rhs, out } => {
                f.write_fmt(format_args!("{out} = {lhs} * {rhs};\n"))
            }
            Instruction::Div { lhs, rhs, out } => {
                f.write_fmt(format_args!("{out} = {lhs} / {rhs};\n"))
            }
            Instruction::Abs { input, out } => f.write_fmt(format_args!("{out} = abs({input});\n")),
            Instruction::Exp { input, out } => f.write_fmt(format_args!("{out} = exp({input});\n")),
            Instruction::Log { input, out } => f.write_fmt(format_args!("{out} = log({input});\n")),
            Instruction::Clamp {
                input,
                min_value,
                max_value,
                out,
            } => f.write_fmt(format_args!(
                "let {out} = clamp({input}, {min_value}, {max_value});"
                "{out} = clamp({input}, {min_value}, {max_value});\n"
            )),
            Operation::Powf { lhs, rhs, out } => {
                f.write_fmt(format_args!("let {out} = powf({lhs}, {rhs});"))
            Instruction::Powf { lhs, rhs, out } => {
                if rhs.is_always_scalar() {
                    f.write_fmt(format_args!("{out} = powf_scalar({lhs}, {rhs});\n"))
                } else {
                    f.write_fmt(format_args!("{out} = powf({lhs}, {rhs});\n"))
                }
            }
            Operation::Sqrt { input, out } => {
                f.write_fmt(format_args!("let {out} = sqrt({input});"))
            Instruction::Sqrt { input, out } => {
                f.write_fmt(format_args!("{out} = sqrt({input});\n"))
            }
            Operation::Log1p { input, out } => {
                f.write_fmt(format_args!("let {out} = log({input} + 1.0);"))
            Instruction::Log1p { input, out } => {
                f.write_fmt(format_args!("{out} = log({input} + 1.0);\n"))
            }
            Operation::Cos { input, out } => f.write_fmt(format_args!("let {out} = cos({input});")),
            Operation::Sin { input, out } => f.write_fmt(format_args!("let {out} = sin({input});")),
            Operation::Tanh { input, out } => {
            Instruction::Cos { input, out } => f.write_fmt(format_args!("{out} = cos({input});\n")),
            Instruction::Sin { input, out } => f.write_fmt(format_args!("{out} = sin({input});\n")),
            Instruction::Tanh { input, out } => {
                #[cfg(target_os = "macos")]
                let result = f.write_fmt(format_args!("let {out} = safe_tanh({input});"));
                let result = f.write_fmt(format_args!("{out} = safe_tanh({input});\n"));
                #[cfg(not(target_os = "macos"))]
                let result = f.write_fmt(format_args!("let {out} = tanh({input});"));
                let result = f.write_fmt(format_args!("{out} = tanh({input});\n"));

                result
            }
            Operation::Erf { input, out } => f.write_fmt(format_args!("let {out} = erf({input});")),
            Operation::Recip { input, out } => {
                f.write_fmt(format_args!("let {out} = 1.0 / {input};"))
            Instruction::Erf { input, out } => f.write_fmt(format_args!("{out} = erf({input});\n")),
            Instruction::Recip { input, out } => {
                f.write_fmt(format_args!("{out} = 1.0 / {input};"))
            }
            Operation::Equal { lhs, rhs, out } => comparison(lhs, rhs, out, "==", f),
            Operation::Lower { lhs, rhs, out } => comparison(lhs, rhs, out, "<", f),
            Operation::Greater { lhs, rhs, out } => comparison(lhs, rhs, out, ">", f),
            Operation::LowerEqual { lhs, rhs, out } => comparison(lhs, rhs, out, "<=", f),
            Operation::GreaterEqual { lhs, rhs, out } => comparison(lhs, rhs, out, ">=", f),
            Operation::AssignGlobal { input, out } => {
            Instruction::Equal { lhs, rhs, out } => comparison(lhs, rhs, out, "==", f),
            Instruction::Lower { lhs, rhs, out } => comparison(lhs, rhs, out, "<", f),
            Instruction::Greater { lhs, rhs, out } => comparison(lhs, rhs, out, ">", f),
            Instruction::LowerEqual { lhs, rhs, out } => comparison(lhs, rhs, out, "<=", f),
            Instruction::GreaterEqual { lhs, rhs, out } => comparison(lhs, rhs, out, ">=", f),
            Instruction::AssignGlobal { input, out } => {
                let elem_out = out.item();
                let elem_in = input.item();

@@ -211,76 +251,18 @@ impl Display for Operation {
                        );"
                    )),
                    Item::Scalar(elem) => {
                        f.write_fmt(format_args!("{out}_global[id] = {elem}({input});"))
                        f.write_fmt(format_args!("{out}_global[id] = {elem}({input});\n"))
                    }
                }
            } else {
                f.write_fmt(format_args!("{out}_global[id] = {elem_out}({input});"))
                f.write_fmt(format_args!("{out}_global[id] = {elem_out}({input});\n"))
            }
            }
            Operation::AssignLocal { input, out } => {
                let elem = out.item();
                f.write_fmt(format_args!("let {out} = {elem}({input});"))
            Instruction::AssignLocal { input, out } => {
                let item = out.item();
                f.write_fmt(format_args!("{out} = {item}({input});\n"))
            }
            Operation::ReadGlobal { variable } => match variable {
                Variable::Input(number, _elem) => f.write_fmt(format_args!(
                    "let input_{number} = input_{number}_global[id];"
                )),
                Variable::Local(_, _) => panic!("can't read global local variable."),
                Variable::Output(number, _elem) => f.write_fmt(format_args!(
                    "let output_{number} = output_{number}_global[id];"
                )),
                Variable::Scalar(_, _, _) => panic!("Can't read global scalar variable."),
                Variable::Constant(_, _) => panic!("Can't read global constant variable."),
            },
            Operation::ReadGlobalWithLayout {
                variable,
                tensor_read_pos: position,
                tensor_layout_pos: position_out,
            } => {
                let (global, local, elem) = match variable {
                    Variable::Input(number, elem) => (
                        format!("input_{number}_global"),
                        format!("input_{number}"),
                        elem,
                    ),
                    Variable::Local(_, _) => panic!("can't read global local variable."),
                    Variable::Output(number, elem) => (
                        format!("output_{number}_global"),
                        format!("output_{number}"),
                        elem,
                    ),
                    Variable::Scalar(_, _, _) => panic!("Can't read global scalar variable."),
                    Variable::Constant(_, _) => panic!("Can't read global constant variable."),
                };

                let offset = match elem {
                    Item::Vec4(_) => 4,
                    Item::Vec3(_) => 3,
                    Item::Vec2(_) => 2,
                    Item::Scalar(_) => 1,
                };

                f.write_fmt(format_args!(
                    "
var index_{local}: u32 = 0u;

for (var i: u32 = 1u; i <= rank; i++) {{
    let position = {position}u * (2u * rank);
    let position_out = {position_out}u * (2u * rank);

    let stride = info[position + i];
    let stride_out = info[position_out + i];
    let shape = info[position + rank + i];

    index_{local} += (id * {offset}u) / stride_out % shape * stride;
}}

let {local} = {elem}({global}[index_{local} / {offset}u]);
"
                ))
            }
            Operation::ConditionalAssign {
            Instruction::ConditionalAssign {
                cond,
                lhs,
                rhs,
@@ -301,7 +283,6 @@ let {local} = {elem}({global}[index_{local} / {offset}u]);

                        f.write_fmt(format_args!(
                            "
var {out}: {elem};
if {cond}[0] {{
    {out}[0] = {lhs0};
}} else {{
@@ -335,7 +316,6 @@ if {cond}[3] {{

                        f.write_fmt(format_args!(
                            "
var {out}: {elem};
if {cond}[0] {{
    {out}[0] = {lhs0};
}} else {{
@@ -362,7 +342,6 @@ if {cond}[2] {{

                        f.write_fmt(format_args!(
                            "
var {out}: {elem};
if {cond}[0] {{
    {out}[0] = {lhs0};
}} else {{
@@ -378,7 +357,6 @@ if {cond}[1] {{
                    }
                    Item::Scalar(_) => f.write_fmt(format_args!(
                        "
var {out}: {elem};
if {cond} {{
    {out} = {lhs};
}} else {{
@@ -388,6 +366,29 @@ if {cond} {{
                    )),
                }
            }
            Instruction::Stride { dim, position, out } => f.write_fmt(format_args!(
                "{out} = info[({position}u * (2u * rank)) + {dim} + 1u];\n"
            )),
            Instruction::Shape { dim, position, out } => f.write_fmt(format_args!(
                "{out} = info[({position}u * (2u * rank)) + rank + {dim} + 1u];\n"
            )),
            Instruction::RangeLoop {
                i,
                start,
                end,
                instructions,
            } => {
                f.write_fmt(format_args!(
                    "
for (var {i}: u32 = {start}; {i} < {end}; {i}++) {{
"
                ))?;
                for instruction in instructions {
                    f.write_fmt(format_args!("{instruction}"))?;
                }

                f.write_str("}\n")
            }
        }
    }
}
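A sketch, under the `info` layout assumed by the instructions above, of the indexing they emit: each tensor at `position` stores its `rank` strides followed by its `rank` shape dims after a leading entry, and `remap_index` performs the same coordinate re-projection as the generated ReadGlobalWithLayout loop (the vectorization offset is omitted for clarity).

fn stride_entry(position: usize, rank: usize, dim: usize) -> usize {
    position * (2 * rank) + dim + 1
}

fn shape_entry(position: usize, rank: usize, dim: usize) -> usize {
    position * (2 * rank) + rank + dim + 1
}

fn remap_index(id: usize, strides_in: &[usize], strides_out: &[usize], shape: &[usize]) -> usize {
    let mut index = 0;
    for d in 0..shape.len() {
        // `id / stride_out % shape` recovers the coordinate along dim `d`,
        // which is then projected back through the input stride.
        index += id / strides_out[d] % shape[d] * strides_in[d];
    }
    index
}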
@@ -412,7 +413,7 @@ fn comparison(

    f.write_fmt(format_args!(
        "
let {out} = vec4({lhs0} {op} {rhs0}, {lhs1} {op} {rhs1}, {lhs2} {op} {rhs2}, {lhs3} {op} {rhs3});
{out} = vec4({lhs0} {op} {rhs0}, {lhs1} {op} {rhs1}, {lhs2} {op} {rhs2}, {lhs3} {op} {rhs3});
"
    ))
}
@@ -426,7 +427,7 @@ let {out} = vec4({lhs0} {op} {rhs0}, {lhs1} {op} {rhs1}, {lhs2} {op} {rhs2}, {lhs3} {op} {rhs3});

    f.write_fmt(format_args!(
        "
let {out} = vec3({lhs0} {op} {rhs0}, {lhs1} {op} {rhs1}, {lhs2} {op} {rhs2});
{out} = vec3({lhs0} {op} {rhs0}, {lhs1} {op} {rhs1}, {lhs2} {op} {rhs2});
"
    ))
}
@@ -438,12 +439,12 @@ let {out} = vec3({lhs0} {op} {rhs0}, {lhs1} {op} {rhs1}, {lhs2} {op} {rhs2});

    f.write_fmt(format_args!(
        "
let {out} = vec2({lhs0} {op} {rhs0}, {lhs1} {op} {rhs1});
{out} = vec2({lhs0} {op} {rhs0}, {lhs1} {op} {rhs1});
"
    ))
}
        Item::Scalar(_) => match rhs.item() {
            Item::Scalar(_) => f.write_fmt(format_args!("let {out} = {lhs} {op} {rhs};")),
            Item::Scalar(_) => f.write_fmt(format_args!("{out} = {lhs} {op} {rhs};\n")),
            _ => panic!("Can only compare a scalar when the output is a scalar"),
        },
    }
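A sketch of the per-lane expansion `comparison` performs for vectorized items, shown for vec4 (the helper name is illustrative):

fn comparison_vec4(out: &str, lhs: &str, rhs: &str, op: &str) -> String {
    let lane = |i: usize| format!("{lhs}[{i}] {op} {rhs}[{i}]");
    // Each lane is compared separately so the result matches the output item type.
    format!(
        "{out} = vec4({}, {}, {}, {});",
        lane(0),
        lane(1),
        lane(2),
        lane(3)
    )
}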
@@ -1,4 +1,4 @@
use super::{Body, Extension, Item};
use super::{Extension, Item, Scope};
use crate::codegen::dialect::gpu::WorkgroupSize;
use std::fmt::Display;
@@ -31,7 +31,7 @@ pub struct ComputeShader {
    pub workgroup_size: WorkgroupSize,
    pub global_invocation_id: bool,
    pub num_workgroups: bool,
    pub body: Body,
    pub body: Scope,
    pub extensions: Vec<Extension>,
}
@@ -1,343 +1,8 @@
use burn_compute::client::ComputeClient;

use crate::codegen::dialect::gpu::{
    Binding, Body, ComputeShader, Elem, Item, Location, Operation, ReadGlobalOperation,
    ReadGlobalWithLayoutOperation, UnaryOperation, Variable, Vectorization, Visibility,
    WorkgroupSize,
};
use crate::compute::StaticKernel;
use crate::element::JitElement;
use crate::kernel::{elemwise_workgroup, StaticKernelSource, WORKGROUP_DEFAULT};
use crate::Runtime;
use std::marker::PhantomData;

/// Kernel creation input phase, see [kernel codegen](ElemWiseKernelCodegen) for more details.
pub struct InputPhase;
/// Kernel creation body phase, see [kernel codegen](ElemWiseKernelCodegen) for more details.
pub struct BodyPhase;
/// Kernel creation output phase, see [kernel codegen](ElemWiseKernelCodegen) for more details.
pub struct OutputPhase;
/// Kernel compilation phase, see [kernel codegen](ElemWiseKernelCodegen) for more details.
pub struct CompilationPhase;

#[derive(new, Clone, Copy)]
pub struct InplaceMapping {
    pub position_input: usize,
    pub position_output: usize,
}

/// Allows creating custom wgsl kernels based on configured inputs, body and outputs.
///
/// This type has 4 phases that must be executed in order, but don't worry: the type system won't
/// allow you to make mistakes.
///
/// 1. [Input Phase](InputPhase)
///    This phase focuses on registering the input arrays and scalars that are going to be used by
///    the kernel.
/// 2. [Body Phase](BodyPhase)
///    After the input phase is done, all the operations that happen in the body must be
///    registered.
/// 3. [Output Phase](OutputPhase)
///    This step focuses on registering all output arrays or inputs that the kernel needs to write to.
/// 4. [Compilation Phase](CompilationPhase)
///    Now that all other phases are completed, we can actually compile the kernel.
pub struct ElemWiseKernelCodegen<Phase = InputPhase> {
    operations: Vec<Operation>,
    input_bindings: Vec<Binding>,
    output_bindings: Vec<Binding>,
    named_bindings: Vec<(String, Binding)>,
    vectorization: Vectorization,
    mappings_inplace: Vec<InplaceMapping>,
    workgroup_size: WorkgroupSize,
    _phase: PhantomData<Phase>,
}
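A minimal, self-contained sketch of the typestate pattern this builder relies on (condensed to three phases; all names are illustrative, not the crate's API): each phase is a marker type and every step consumes the builder and returns the next phase, so calling the steps out of order does not type-check.

use std::marker::PhantomData;

struct InputPhase;
struct BodyPhase;
struct CompilationPhase;

struct Codegen<Phase> {
    ops: Vec<String>,
    _phase: PhantomData<Phase>,
}

impl Codegen<InputPhase> {
    fn new() -> Self {
        Codegen { ops: Vec::new(), _phase: PhantomData }
    }

    // Registering inputs moves the builder to the body phase.
    fn inputs(self, names: &[&str]) -> Codegen<BodyPhase> {
        let mut ops = self.ops;
        ops.extend(names.iter().map(|n| format!("read {n}")));
        Codegen { ops, _phase: PhantomData }
    }
}

impl Codegen<BodyPhase> {
    // Registering the body moves the builder to the compilation phase.
    fn body(self, lines: &[&str]) -> Codegen<CompilationPhase> {
        let mut ops = self.ops;
        ops.extend(lines.iter().map(|l| l.to_string()));
        Codegen { ops, _phase: PhantomData }
    }
}

impl Codegen<CompilationPhase> {
    // Only a fully configured builder can be compiled.
    fn compile(self) -> String {
        self.ops.join("\n")
    }
}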

pub enum Input {
    Array {
        item: Item,
        visibility: Visibility,
        strategy: ReadingStrategy,
    },
    Scalar {
        elem: Elem,
        size: usize,
    },
}

pub enum ReadingStrategy {
    /// Each element will be read in a way to be compatible with the output layout.
    OutputLayout,
    /// Keep the current layout.
    Plain,
}

#[derive(Clone)]
pub enum Output {
    Array { item: Item, local: u16 },
    Input { item: Item, input: u16, local: u16 },
}

impl Default for ElemWiseKernelCodegen<InputPhase> {
    fn default() -> Self {
        Self {
            operations: Vec::new(),
            input_bindings: Vec::new(),
            output_bindings: Vec::new(),
            named_bindings: Vec::new(),
            vectorization: Vectorization::Scalar,
            mappings_inplace: Vec::new(),
            workgroup_size: WorkgroupSize::default(),
            _phase: PhantomData,
        }
    }
}

impl ElemWiseKernelCodegen<InputPhase> {
    pub fn new() -> Self {
        Self::default()
    }

    #[allow(dead_code)]
    pub fn vectorize(mut self, vectorization: Vectorization) -> Self {
        self.vectorization = vectorization;
        self
    }

    #[allow(dead_code)]
    pub fn inplace(mut self, mappings: &[InplaceMapping]) -> Self {
        self.mappings_inplace = mappings.to_vec();
        self
    }
    /// Register the inputs used by the kernel.
    pub fn inputs(mut self, inputs: &[Input]) -> ElemWiseKernelCodegen<BodyPhase> {
        let mut index: u16 = 0;

        for input in inputs {
            match input {
                Input::Array {
                    item,
                    visibility,
                    strategy,
                } => {
                    let item = item.vectorize(self.vectorization);

                    self.input_bindings.push(Binding {
                        item: bool_item(item),
                        visibility: *visibility,
                        location: Location::Storage,
                        size: None,
                    });

                    match strategy {
                        ReadingStrategy::OutputLayout => {
                            self.operations.push(Operation::ReadGlobalWithLayout(
                                ReadGlobalWithLayoutOperation {
                                    variable: Variable::Input(index, item),
                                    tensor_read_pos: index as usize,
                                    tensor_layout_pos: 0, // Will set the right value during the output
                                    // phase.
                                },
                            ));
                        }
                        ReadingStrategy::Plain => {
                            self.operations
                                .push(Operation::ReadGlobal(ReadGlobalOperation {
                                    variable: Variable::Input(index, item),
                                }));
                        }
                    }

                    index += 1;
                }
                Input::Scalar { elem, size } => {
                    let elem = bool_elem(*elem);

                    self.named_bindings.push((
                        format!("scalars_{}", elem),
                        Binding {
                            item: Item::Scalar(elem),
                            visibility: Visibility::Read,
                            location: Location::Storage,
                            size: Some(*size),
                        },
                    ));
                }
            }
        }

        ElemWiseKernelCodegen {
            operations: self.operations,
            input_bindings: self.input_bindings,
            output_bindings: self.output_bindings,
            named_bindings: self.named_bindings,
            vectorization: self.vectorization,
            mappings_inplace: self.mappings_inplace,
            workgroup_size: self.workgroup_size,
            _phase: PhantomData,
        }
    }
}
impl ElemWiseKernelCodegen<BodyPhase> {
    /// Register the [operators](Operator) that the kernel must execute in the order provided.
    pub fn body(mut self, operators: &[Operation]) -> ElemWiseKernelCodegen<OutputPhase> {
        for ops in operators.iter() {
            self.operations.push(ops.vectorize(self.vectorization));
        }

        ElemWiseKernelCodegen {
            operations: self.operations,
            input_bindings: self.input_bindings,
            output_bindings: self.output_bindings,
            named_bindings: self.named_bindings,
            vectorization: self.vectorization,
            mappings_inplace: self.mappings_inplace,
            workgroup_size: self.workgroup_size,
            _phase: PhantomData,
        }
    }
}
impl ElemWiseKernelCodegen<OutputPhase> {
    /// Register the outputs with their local variable index.
    ///
    /// Note that the index corresponds to the registered [operator](Operator) number at the
    /// [body phase](BodyPhase).
    /// So the 4th operator registered creates the local variable 3 (N-1, since the first index is 0).
    pub fn outputs(mut self, outputs: &[Output]) -> ElemWiseKernelCodegen<CompilationPhase> {
        let mut index = 0;
        let mut position_out = 0;

        let mut outputs = outputs.to_vec();

        for mapping in self.mappings_inplace.iter() {
            match outputs.get_mut(mapping.position_output) {
                Some(output) => match output {
                    Output::Array { item, local } => {
                        *output = Output::Input {
                            item: *item,
                            input: mapping.position_input as u16,
                            local: *local,
                        };
                    }
                    Output::Input {
                        item: _,
                        input: _,
                        local: _,
                    } => continue,
                },
                None => continue,
            }

            if let Some(binding) = self.input_bindings.get_mut(mapping.position_input) {
                binding.visibility = Visibility::ReadWrite
            }
        }

        for array in &outputs {
            match array {
                Output::Array { item, local } => {
                    let item = item.vectorize(self.vectorization);
                    let elem_adapted = bool_item(item);

                    self.output_bindings.push(Binding {
                        item: elem_adapted,
                        visibility: Visibility::ReadWrite,
                        location: Location::Storage,
                        size: None,
                    });
                    self.operations
                        .push(Operation::AssignGlobal(UnaryOperation {
                            input: Variable::Local(*local, item),
                            out: Variable::Output(index, elem_adapted),
                        }));
                    index += 1;

                    if index == 1 {
                        position_out = self.input_bindings.len(); // First output when we have a
                        // new array for the output.
                    }
                }
                Output::Input { item, input, local } => {
                    let item = item.vectorize(self.vectorization);

                    self.operations
                        .push(Operation::AssignGlobal(UnaryOperation {
                            input: Variable::Local(*local, item),
                            out: Variable::Input(*input, bool_item(item)),
                        }));
                    position_out = *input as usize; // Input number when we use inplace operation.
                }
            }
        }

        // We set the output number that will be used for the stride definition.
        for i in 0..self.input_bindings.len() {
            if let Some(Operation::ReadGlobalWithLayout(ReadGlobalWithLayoutOperation {
                variable: _,
                tensor_read_pos: _,
                tensor_layout_pos,
            })) = self.operations.get_mut(i)
            {
                {
                    *tensor_layout_pos = position_out;
                }
            };
        }

        ElemWiseKernelCodegen {
            operations: self.operations,
            input_bindings: self.input_bindings,
            output_bindings: self.output_bindings,
            named_bindings: self.named_bindings,
            vectorization: self.vectorization,
            mappings_inplace: self.mappings_inplace,
            workgroup_size: self.workgroup_size,
            _phase: PhantomData,
        }
    }
}
impl ElemWiseKernelCodegen<CompilationPhase> {
    #[allow(dead_code)] // Only used for fusion for now.
    pub fn workgroup_size(mut self, workgroup_size: WorkgroupSize) -> Self {
        self.workgroup_size = workgroup_size;
        self
    }

    /// Compile the kernel into a [compute shader](ComputeShader).
    pub fn compile(self) -> ComputeShader {
        let inputs = self.input_bindings;
        let outputs = self.output_bindings;
        let mut named = Vec::with_capacity(2);

        named.push((
            "info".to_string(),
            Binding {
                item: Item::Scalar(Elem::UInt),
                visibility: Visibility::Read,
                location: Location::Storage,
                size: None, // We avoid putting the length here since it will force a new kernel
                // for each tensor rank.
            },
        ));

        for (name, binding) in self.named_bindings.into_iter() {
            named.push((name, binding));
        }

        ComputeShader {
            inputs,
            outputs,
            named,
            workgroup_size: self.workgroup_size,
            body: Body::new(self.operations),
            num_workgroups: true,
            global_invocation_id: true,
        }
    }
}
use burn_compute::client::ComputeClient;

#[derive(new)]
pub struct StaticHandle<'a, R: Runtime> {
@@ -433,20 +98,3 @@ pub(crate) fn calculate_num_elems_dyn_rank(shape: &[usize]) -> usize {
    }
    num_elems
}
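An equivalent one-liner for the helper above (a sketch, not from the commit): the element count of a dynamically-ranked tensor is the product of its dimensions.

fn num_elems(shape: &[usize]) -> usize {
    shape.iter().product() // An empty shape yields 1, matching a scalar.
}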

fn bool_item(ty: Item) -> Item {
    match ty {
        Item::Vec4(elem) => Item::Vec4(bool_elem(elem)),
        Item::Vec3(elem) => Item::Vec3(bool_elem(elem)),
        Item::Vec2(elem) => Item::Vec2(bool_elem(elem)),
        Item::Scalar(elem) => Item::Scalar(bool_elem(elem)),
    }
}

fn bool_elem(elem: Elem) -> Elem {
    match elem {
        // U32 are used for bool tensors
        Elem::Bool => Elem::UInt,
        _ => elem,
    }
}
@@ -1,7 +1,9 @@
mod compilation;
pub(crate) mod compiler;
pub(crate) mod dialect;

mod kernel;

pub(crate) use compilation::*;
pub(crate) use compiler::*;
pub(crate) use kernel::*;
@@ -75,7 +75,7 @@ mod tests {
    use super::*;
    use crate::{
        binary,
        codegen::dialect::gpu::{BinaryOperation, Elem, Item, Operation, Variable},
        codegen::dialect::gpu::{BinaryOperator, Elem, Operator, Scope},
        kernel::{KernelSettings, WORKGROUP_DEFAULT},
        tests::{TestCompiler, TestRuntime},
        Runtime, WgpuDevice,
@@ -84,10 +84,10 @@
    #[test]
    fn can_run_kernel() {
        binary!(
            operation: |elem: Elem| Operation::Add(BinaryOperation {
                lhs: Variable::Input(0, Item::Scalar(elem)),
                rhs: Variable::Input(1, Item::Scalar(elem)),
                out: Variable::Local(0, Item::Scalar(elem)),
            operation: |scope: &mut Scope, elem: Elem| Operator::Add(BinaryOperator {
                lhs: scope.read_array(0, elem),
                rhs: scope.read_array(1, elem),
                out: scope.create_local(elem),
            }),
            compiler: TestCompiler,
            elem_in: f32,
@@ -70,7 +70,7 @@ impl<R: Runtime> FusionBackend for JitBackend<R> {
    type OptimizationState = WgpuOptimizationState;
    type Optimization = WgpuOptimization<R>;
    type FusionDevice = R::Device;
    type Handle = WgpuFusionHandle<R>;
    type Handle = JitFusionHandle<R>;
    type FusionClient = MutexFusionClient<Self>;

    fn optimizations(
@@ -126,7 +126,7 @@ pub fn strides_dyn_rank(shape: &[usize]) -> Vec<usize> {
}

/// Handle to be used when fusing operations.
pub struct WgpuFusionHandle<R: Runtime> {
pub struct JitFusionHandle<R: Runtime> {
    /// Compute client for wgpu.
    pub client: ComputeClient<R::Server, R::Channel>,
    /// The buffer where the data are stored.
@@ -136,13 +136,17 @@ pub struct WgpuFusionHandle<R: Runtime> {
    pub(crate) strides: Vec<usize>,
}

impl<R: Runtime> core::fmt::Debug for WgpuFusionHandle<R> {
    fn fmt(&self, _f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        todo!()
impl<R: Runtime> core::fmt::Debug for JitFusionHandle<R> {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_fmt(format_args!(
            "JitFusionHandle {{ device: {:?}, runtime: {}}}",
            self.device,
            R::name(),
        ))
    }
}

impl<R: Runtime> Clone for WgpuFusionHandle<R> {
impl<R: Runtime> Clone for JitFusionHandle<R> {
    fn clone(&self) -> Self {
        Self {
            client: self.client.clone(),
@@ -153,10 +157,10 @@ impl<R: Runtime> Clone for WgpuFusionHandle<R> {
    }
}

unsafe impl<R: Runtime> Send for WgpuFusionHandle<R> {}
unsafe impl<R: Runtime> Sync for WgpuFusionHandle<R> {}
unsafe impl<R: Runtime> Send for JitFusionHandle<R> {}
unsafe impl<R: Runtime> Sync for JitFusionHandle<R> {}

impl<R: Runtime> WgpuFusionHandle<R> {
impl<R: Runtime> JitFusionHandle<R> {
    pub(crate) fn into_tensor<const D: usize, E: JitElement>(
        self,
        shape: Shape<D>,
@@ -172,7 +176,7 @@ impl<R: Runtime> WgpuFusionHandle<R> {
    }
}

impl<R: Runtime, E: JitElement, const D: usize> From<JitTensor<R, E, D>> for WgpuFusionHandle<R> {
impl<R: Runtime, E: JitElement, const D: usize> From<JitTensor<R, E, D>> for JitFusionHandle<R> {
    fn from(value: JitTensor<R, E, D>) -> Self {
        Self {
            client: value.client,
@@ -1,11 +1,10 @@
use super::{optimization::ElementWise, CompilationPhase, Scalars};
use super::{optimization::ElementWise, CompilationPhase};
use crate::{
    codegen::dialect::gpu::{
        BinaryOperation, ConditionalAssignOperation, Elem, Item, Operation, UnaryOperation,
        Variable,
        BinaryOperator, ConditionalAssignOperator, Elem, Operator, UnaryOperator, Variable,
    },
    element::JitElement,
    fusion::WgpuOptimization,
    fusion::{tracing::TraceBuilder, WgpuOptimization},
    JitBackend, Runtime,
};
use burn_fusion::{
@@ -14,27 +13,20 @@ use burn_fusion::{
        NumericOperationDescription, OperationDescription, ScalarOperationDescription,
        UnaryOperationDescription,
    },
    OptimizationBuilder, OptimizationProperties, OptimizationStatus, TensorDescription, TensorId,
    OptimizationBuilder, OptimizationProperties, OptimizationStatus, TensorDescription,
};
use burn_tensor::{
    ops::{FloatElem, IntElem},
    Device, Element,
};
use hashbrown::HashMap;

/// Fused element wise operations that are normally memory bound.
pub(crate) struct ElementWiseBuilder<R: Runtime> {
    pub(crate) inputs: Vec<TensorDescription>,
    pub(crate) locals: HashMap<TensorId, u16>,
    pub(crate) tensors: HashMap<TensorId, (TensorDescription, Elem)>,
    pub(crate) scalars_float: usize,
    pub(crate) scalars_int: usize,
    pub(crate) scalars_uint: usize,
    pub(crate) booleans: usize,
    pub(crate) operators: Vec<Operation>,
    pub(crate) current_output_shape: Vec<usize>,
    pub(crate) status: OptimizationStatus,
    pub(crate) device: R::Device,
    builder: TraceBuilder,
    current_output_shape: Vec<usize>,
    status: OptimizationStatus,
    num_added: usize,
    device: R::Device,
}

impl<R: Runtime> OptimizationBuilder<WgpuOptimization<R>> for ElementWiseBuilder<R> {
@@ -81,22 +73,13 @@ impl<R: Runtime> OptimizationBuilder<WgpuOptimization<R>> for ElementWiseBuilder
        };

        self.status = OptimizationStatus::Open;
        self.num_added += 1;
    }

    fn build(&self) -> WgpuOptimization<R> {
        let inputs = self.input_descriptions();
        let outputs = self.output_descriptions();
        let locals = outputs
            .iter()
            .map(|out| *self.locals.get(&out.0.id).unwrap())
            .collect::<Vec<_>>();

        let op = ElementWise::new(
            inputs,
            outputs,
            locals,
            Scalars::new(self.scalars_float, self.scalars_uint, self.scalars_int),
            self.operators.clone(),
            self.builder.clone().build(),
            self.num_added,
            self.device.clone(),
            CompilationPhase,
        );
@@ -105,18 +88,12 @@ impl<R: Runtime> OptimizationBuilder<WgpuOptimization<R>> for ElementWiseBuilder
    }

    fn len(&self) -> usize {
        self.operators.len()
        self.num_added
    }

    fn reset(&mut self) {
        self.inputs.clear();
        self.locals.drain();
        self.tensors.clear();
        self.scalars_float = 0;
        self.scalars_int = 0;
        self.scalars_uint = 0;
        self.booleans = 0;
        self.operators.clear();
        self.builder = TraceBuilder::new();
        self.num_added = 0;
        self.status = OptimizationStatus::Open;
        self.current_output_shape.clear();
    }
@@ -126,11 +103,11 @@ impl<R: Runtime> OptimizationBuilder<WgpuOptimization<R>> for ElementWiseBuilder
    }

    fn properties(&self) -> OptimizationProperties {
        let ready = !self.operators.is_empty();
        let ready = self.num_added > 0;

        OptimizationProperties {
            ready,
            score: self.operators.len() as u64,
            score: self.num_added as u64,
        }
    }
}
@@ -138,269 +115,20 @@ impl<R: Runtime> OptimizationBuilder<WgpuOptimization<R>> for ElementWiseBuilder
impl<R: Runtime> ElementWiseBuilder<R> {
    pub fn new(device: Device<JitBackend<R>>) -> Self {
        Self {
            inputs: Vec::new(),
            locals: HashMap::new(),
            tensors: HashMap::new(),
            scalars_float: 0,
            scalars_int: 0,
            scalars_uint: 0,
            booleans: 0,
            operators: Vec::new(),
            builder: TraceBuilder::new(),
            num_added: 0,
            current_output_shape: Vec::new(),
            status: OptimizationStatus::Open,
            device,
        }
    }

    fn input_descriptions(&self) -> Vec<(TensorDescription, Elem)> {
        self.inputs
            .iter()
            .map(|input| {
                let updated_tensor = self.tensors.get(&input.id).unwrap();
                updated_tensor.clone()
            })
            .collect::<Vec<_>>()
    }

    fn output_descriptions(&self) -> Vec<(TensorDescription, Elem)> {
        let mut outputs = Vec::new();
        let mut local_tensor_ids_input = Vec::new();
        let mut local_tensor_ids_output = Vec::new();

        // Mark a variable in the provided list of tensor ids using the variable list.
        //
        // Only local variables can become outputs.
        let mark = |var: &Variable, list: &mut Vec<TensorId>| {
            if let Variable::Local(index, _) = var {
                if let Some((id, _)) = self
                    .locals
                    .iter()
                    .find(|(_id, position)| *position == index)
                {
                    if !list.contains(id) {
                        list.push(*id);
                    }
                }
            }
        };
        let mark_binary =
            |op: &BinaryOperation, inputs: &mut Vec<TensorId>, outputs: &mut Vec<TensorId>| {
                mark(&op.lhs, inputs);
                mark(&op.rhs, inputs);
                mark(&op.out, outputs);
            };
        let mark_unary =
            |op: &UnaryOperation, inputs: &mut Vec<TensorId>, outputs: &mut Vec<TensorId>| {
                mark(&op.input, inputs);
                mark(&op.out, outputs);
            };

        // For all operators, mark their local tensor id in the proper set.
        for ops in self.operators.iter() {
            match ops {
                Operation::AssignGlobal(_) => {
                    // Nothing to do here.
                }
                Operation::AssignLocal(op) => {
                    mark(&op.out, &mut local_tensor_ids_output);
                }
                Operation::ReadGlobalWithLayout(_) => {
                    // Nothing to do here.
                }
                Operation::ReadGlobal(_) => {
                    // Nothing to do here.
                }
                Operation::Add(op) => mark_binary(
                    op,
                    &mut local_tensor_ids_input,
                    &mut local_tensor_ids_output,
                ),
                Operation::Sub(op) => mark_binary(
                    op,
                    &mut local_tensor_ids_input,
                    &mut local_tensor_ids_output,
                ),
                Operation::Mul(op) => mark_binary(
                    op,
                    &mut local_tensor_ids_input,
                    &mut local_tensor_ids_output,
                ),
                Operation::Div(op) => mark_binary(
                    op,
                    &mut local_tensor_ids_input,
                    &mut local_tensor_ids_output,
                ),
                Operation::Exp(op) => mark_unary(
                    op,
                    &mut local_tensor_ids_input,
                    &mut local_tensor_ids_output,
                ),
                Operation::Abs(op) => mark_unary(
                    op,
                    &mut local_tensor_ids_input,
                    &mut local_tensor_ids_output,
                ),
                Operation::Erf(op) => mark_unary(
                    op,
                    &mut local_tensor_ids_input,
                    &mut local_tensor_ids_output,
                ),
                Operation::Log(op) => mark_unary(
                    op,
                    &mut local_tensor_ids_input,
                    &mut local_tensor_ids_output,
                ),
                Operation::Log1p(op) => mark_unary(
                    op,
                    &mut local_tensor_ids_input,
                    &mut local_tensor_ids_output,
                ),
                Operation::Cos(op) => mark_unary(
                    op,
                    &mut local_tensor_ids_input,
                    &mut local_tensor_ids_output,
                ),
                Operation::Sin(op) => mark_unary(
                    op,
                    &mut local_tensor_ids_input,
                    &mut local_tensor_ids_output,
                ),
                Operation::Tanh(op) => mark_unary(
                    op,
                    &mut local_tensor_ids_input,
                    &mut local_tensor_ids_output,
                ),
                Operation::Clamp(op) => {
                    mark(&op.input, &mut local_tensor_ids_input);
                    mark(&op.out, &mut local_tensor_ids_output);
                }
                Operation::Powf(op) => mark_binary(
                    op,
                    &mut local_tensor_ids_input,
                    &mut local_tensor_ids_output,
                ),
                Operation::Recip(op) => mark_unary(
                    op,
                    &mut local_tensor_ids_input,
                    &mut local_tensor_ids_output,
                ),
                Operation::Lower(op) => mark_binary(
                    op,
                    &mut local_tensor_ids_input,
                    &mut local_tensor_ids_output,
                ),
                Operation::Greater(op) => mark_binary(
                    op,
                    &mut local_tensor_ids_input,
                    &mut local_tensor_ids_output,
                ),
                Operation::LowerEqual(op) => mark_binary(
                    op,
                    &mut local_tensor_ids_input,
                    &mut local_tensor_ids_output,
                ),
                Operation::GreaterEqual(op) => mark_binary(
                    op,
                    &mut local_tensor_ids_input,
                    &mut local_tensor_ids_output,
                ),
                Operation::Equal(op) => mark_binary(
                    op,
                    &mut local_tensor_ids_input,
                    &mut local_tensor_ids_output,
                ),
                Operation::ConditionalAssign(op) => {
                    mark(&op.cond, &mut local_tensor_ids_input);
                    mark(&op.lhs, &mut local_tensor_ids_input);
                    mark(&op.rhs, &mut local_tensor_ids_input);
                    mark(&op.out, &mut local_tensor_ids_output);
                }
                Operation::Sqrt(op) => mark_unary(
                    op,
                    &mut local_tensor_ids_input,
                    &mut local_tensor_ids_output,
                ),
            }
        }

        // All output tensors that are never read by a following operation should be written to
        // since they are essentially the "logical" output of the shader.
        for out in local_tensor_ids_output {
            let is_read = local_tensor_ids_input.contains(&out);

            if !is_read {
                outputs.push(self.tensors.get(&out).unwrap().clone());
            }
        }

        // All tensors where their latest description is read only should be written to since they
        // are going to be used after the fused kernel by other operations.
        for entry in self.tensors.values() {
            let (tensor, _) = &entry;
            if let burn_fusion::TensorStatus::ReadOnly = tensor.status {
                if self.locals.contains_key(&tensor.id) {
                    outputs.push(entry.clone());
                }
            }
        }

        outputs
    }
|
||||
|
||||
    fn input_to_var(&mut self, tensor: &TensorDescription, elem: Elem) -> Variable {
        let already_exists = self.tensors.contains_key(&tensor.id);

        let variable = match already_exists {
            false => {
                // New input.
                let var = Variable::Input(self.inputs.len() as u16, Item::Scalar(elem));
                self.inputs.push(tensor.clone());
                var
            }
            true => match self.locals.get(&tensor.id) {
                // Is a local variable.
                Some(local_index) => Variable::Local(*local_index, Item::Scalar(elem)),
                // Isn't a local variable, so it must be an existing input.
                None => {
                    let input = self
                        .inputs
                        .iter()
                        .enumerate()
                        .find(|(_, input)| input.id == tensor.id)
                        .unwrap();
                    let input_index = input.0;
                    Variable::Input(input_index as u16, Item::Scalar(elem))
                }
            },
        };

        // Update the tensor description with the new version.
        self.tensors.insert(tensor.id, (tensor.clone(), elem));

        variable
    }

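The resolution order here matters: a tensor produced by a previously fused operation must resolve to its local variable, and a tensor read twice must reuse its input binding instead of registering a second one. A minimal stand-in sketch of that lookup order (hypothetical types, not the burn API):

    use std::collections::HashMap;

    // Illustrative stand-in for the resolution order: a local result of a prior op
    // wins, then an already-registered input, else a new input binding is created.
    #[derive(Debug, PartialEq)]
    enum Var {
        Input(u16),
        Local(u16),
    }

    fn resolve(id: u64, locals: &HashMap<u64, u16>, inputs: &mut Vec<u64>) -> Var {
        if let Some(local) = locals.get(&id) {
            return Var::Local(*local);
        }
        if let Some(pos) = inputs.iter().position(|input| *input == id) {
            return Var::Input(pos as u16);
        }
        inputs.push(id);
        Var::Input(inputs.len() as u16 - 1)
    }

    fn main() {
        let locals = HashMap::from([(7, 0)]);
        let mut inputs = vec![3];
        assert_eq!(resolve(7, &locals, &mut inputs), Var::Local(0)); // produced earlier in the kernel
        assert_eq!(resolve(3, &locals, &mut inputs), Var::Input(0)); // already read once
        assert_eq!(resolve(9, &locals, &mut inputs), Var::Input(1)); // new global input
    }
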
    fn output_to_var(&mut self, tensor: &TensorDescription, elem: Elem) -> Variable {
        // Update the tensor description to the new version.
        self.tensors.insert(tensor.id, (tensor.clone(), elem));

        // Output already registered as a local variable.
        if let Some(index) = self.locals.get(&tensor.id) {
            return Variable::Local(*index, Item::Scalar(elem));
        }

        // New local variable.
        let local_index = self.locals.len() as u16;
        self.locals.insert(tensor.id, local_index);
        Variable::Local(local_index, Item::Scalar(elem))
    }

    fn register_base<E: JitElement>(&mut self, ops: &BaseOperationDescription) -> bool {
        match ops {
            BaseOperationDescription::Equal(desc) => self.register_binary_ops(
                desc,
                (E::gpu_elem(), E::gpu_elem(), Elem::Bool),
                |lhs, rhs, out| Operation::Equal(BinaryOperation { lhs, rhs, out }),
                |lhs, rhs, out| Operator::Equal(BinaryOperator { lhs, rhs, out }),
            ),
            _ => false,
        }

@@ -410,47 +138,47 @@ impl<R: Runtime> ElementWiseBuilder<R> {
        match ops {
            FloatOperationDescription::Exp(desc) => {
                self.register_unary_ops(desc, (E::gpu_elem(), E::gpu_elem()), |input, out| {
                    Operation::Exp(UnaryOperation { input, out })
                    Operator::Exp(UnaryOperator { input, out })
                })
            }
            FloatOperationDescription::Log(desc) => {
                self.register_unary_ops(desc, (E::gpu_elem(), E::gpu_elem()), |input, out| {
                    Operation::Log(UnaryOperation { input, out })
                    Operator::Log(UnaryOperator { input, out })
                })
            }
            FloatOperationDescription::Log1p(desc) => {
                self.register_unary_ops(desc, (E::gpu_elem(), E::gpu_elem()), |input, out| {
                    Operation::Log1p(UnaryOperation { input, out })
                    Operator::Log1p(UnaryOperator { input, out })
                })
            }
            FloatOperationDescription::Cos(desc) => {
                self.register_unary_ops(desc, (E::gpu_elem(), E::gpu_elem()), |input, out| {
                    Operation::Cos(UnaryOperation { input, out })
                    Operator::Cos(UnaryOperator { input, out })
                })
            }
            FloatOperationDescription::Sin(desc) => {
                self.register_unary_ops(desc, (E::gpu_elem(), E::gpu_elem()), |input, out| {
                    Operation::Sin(UnaryOperation { input, out })
                    Operator::Sin(UnaryOperator { input, out })
                })
            }
            FloatOperationDescription::PowfScalar(desc) => self.register_scalar_ops(
                desc,
                (E::gpu_elem(), E::gpu_elem(), E::gpu_elem()),
                |lhs, rhs, out| Operation::Powf(BinaryOperation { lhs, rhs, out }),
                |lhs, rhs, out| Operator::Powf(BinaryOperator { lhs, rhs, out }),
            ),
            FloatOperationDescription::Tanh(desc) => {
                self.register_unary_ops(desc, (E::gpu_elem(), E::gpu_elem()), |input, out| {
                    Operation::Tanh(UnaryOperation { input, out })
                    Operator::Tanh(UnaryOperator { input, out })
                })
            }
            FloatOperationDescription::Erf(desc) => {
                self.register_unary_ops(desc, (E::gpu_elem(), E::gpu_elem()), |input, out| {
                    Operation::Erf(UnaryOperation { input, out })
                    Operator::Erf(UnaryOperator { input, out })
                })
            }
            FloatOperationDescription::Recip(desc) => {
                self.register_unary_ops(desc, (E::gpu_elem(), E::gpu_elem()), |input, out| {
                    Operation::Recip(UnaryOperation { input, out })
                    Operator::Recip(UnaryOperator { input, out })
                })
            }
            _ => false,

@@ -465,110 +193,111 @@ impl<R: Runtime> ElementWiseBuilder<R> {
            NumericOperationDescription::Add(desc) => self.register_binary_ops(
                desc,
                (E::gpu_elem(), E::gpu_elem(), E::gpu_elem()),
                |lhs, rhs, out| Operation::Add(BinaryOperation { lhs, rhs, out }),
                |lhs, rhs, out| Operator::Add(BinaryOperator { lhs, rhs, out }),
            ),
            NumericOperationDescription::AddScalar(desc) => self.register_scalar_ops(
                desc,
                (E::gpu_elem(), E::gpu_elem(), E::gpu_elem()),
                |lhs, rhs, out| Operation::Add(BinaryOperation { lhs, rhs, out }),
                |lhs, rhs, out| Operator::Add(BinaryOperator { lhs, rhs, out }),
            ),
            NumericOperationDescription::Sub(desc) => self.register_binary_ops(
                desc,
                (E::gpu_elem(), E::gpu_elem(), E::gpu_elem()),
                |lhs, rhs, out| Operation::Sub(BinaryOperation { lhs, rhs, out }),
                |lhs, rhs, out| Operator::Sub(BinaryOperator { lhs, rhs, out }),
            ),
            NumericOperationDescription::SubScalar(desc) => self.register_scalar_ops(
                desc,
                (E::gpu_elem(), E::gpu_elem(), E::gpu_elem()),
                |lhs, rhs, out| Operation::Sub(BinaryOperation { lhs, rhs, out }),
                |lhs, rhs, out| Operator::Sub(BinaryOperator { lhs, rhs, out }),
            ),
            NumericOperationDescription::Mul(desc) => self.register_binary_ops(
                desc,
                (E::gpu_elem(), E::gpu_elem(), E::gpu_elem()),
                |lhs, rhs, out| Operation::Mul(BinaryOperation { lhs, rhs, out }),
                |lhs, rhs, out| Operator::Mul(BinaryOperator { lhs, rhs, out }),
            ),
            NumericOperationDescription::MulScalar(desc) => self.register_scalar_ops(
                desc,
                (E::gpu_elem(), E::gpu_elem(), E::gpu_elem()),
                |lhs, rhs, out| Operation::Mul(BinaryOperation { lhs, rhs, out }),
                |lhs, rhs, out| Operator::Mul(BinaryOperator { lhs, rhs, out }),
            ),
            NumericOperationDescription::Div(desc) => self.register_binary_ops(
                desc,
                (E::gpu_elem(), E::gpu_elem(), E::gpu_elem()),
                |lhs, rhs, out| Operation::Div(BinaryOperation { lhs, rhs, out }),
                |lhs, rhs, out| Operator::Div(BinaryOperator { lhs, rhs, out }),
            ),
            NumericOperationDescription::DivScalar(desc) => self.register_scalar_ops(
                desc,
                (E::gpu_elem(), E::gpu_elem(), E::gpu_elem()),
                |lhs, rhs, out| Operation::Div(BinaryOperation { lhs, rhs, out }),
                |lhs, rhs, out| Operator::Div(BinaryOperator { lhs, rhs, out }),
            ),
            NumericOperationDescription::Abs(desc) => {
                self.register_unary_ops(desc, (E::gpu_elem(), E::gpu_elem()), |input, out| {
                    Operation::Abs(UnaryOperation { input, out })
                    Operator::Abs(UnaryOperator { input, out })
                })
            }
            NumericOperationDescription::Lower(desc) => self.register_binary_ops(
                desc,
                (E::gpu_elem(), E::gpu_elem(), Elem::Bool),
                |lhs, rhs, out| Operation::Lower(BinaryOperation { lhs, rhs, out }),
                |lhs, rhs, out| Operator::Lower(BinaryOperator { lhs, rhs, out }),
            ),
            NumericOperationDescription::LowerElem(desc) => self.register_scalar_ops(
                desc,
                (E::gpu_elem(), E::gpu_elem(), Elem::Bool),
                |lhs, rhs, out| Operation::Lower(BinaryOperation { lhs, rhs, out }),
                |lhs, rhs, out| Operator::Lower(BinaryOperator { lhs, rhs, out }),
            ),
            NumericOperationDescription::Greater(desc) => self.register_binary_ops(
                desc,
                (E::gpu_elem(), E::gpu_elem(), Elem::Bool),
                |lhs, rhs, out| Operation::Greater(BinaryOperation { lhs, rhs, out }),
                |lhs, rhs, out| Operator::Greater(BinaryOperator { lhs, rhs, out }),
            ),
            NumericOperationDescription::GreaterElem(desc) => self.register_scalar_ops(
                desc,
                (E::gpu_elem(), E::gpu_elem(), Elem::Bool),
                |lhs, rhs, out| Operation::Greater(BinaryOperation { lhs, rhs, out }),
                |lhs, rhs, out| Operator::Greater(BinaryOperator { lhs, rhs, out }),
            ),
            NumericOperationDescription::LowerEqual(desc) => self.register_binary_ops(
                desc,
                (E::gpu_elem(), E::gpu_elem(), Elem::Bool),
                |lhs, rhs, out| Operation::LowerEqual(BinaryOperation { lhs, rhs, out }),
                |lhs, rhs, out| Operator::LowerEqual(BinaryOperator { lhs, rhs, out }),
            ),
            NumericOperationDescription::LowerEqualElem(desc) => self.register_scalar_ops(
                desc,
                (E::gpu_elem(), E::gpu_elem(), Elem::Bool),
                |lhs, rhs, out| Operation::LowerEqual(BinaryOperation { lhs, rhs, out }),
                |lhs, rhs, out| Operator::LowerEqual(BinaryOperator { lhs, rhs, out }),
            ),
            NumericOperationDescription::GreaterEqual(desc) => self.register_binary_ops(
                desc,
                (E::gpu_elem(), E::gpu_elem(), Elem::Bool),
                |lhs, rhs, out| Operation::GreaterEqual(BinaryOperation { lhs, rhs, out }),
                |lhs, rhs, out| Operator::GreaterEqual(BinaryOperator { lhs, rhs, out }),
            ),
            NumericOperationDescription::GreaterEqualElem(desc) => self.register_scalar_ops(
                desc,
                (E::gpu_elem(), E::gpu_elem(), Elem::Bool),
                |lhs, rhs, out| Operation::GreaterEqual(BinaryOperation { lhs, rhs, out }),
                |lhs, rhs, out| Operator::GreaterEqual(BinaryOperator { lhs, rhs, out }),
            ),
            NumericOperationDescription::EqualElem(desc) => self.register_scalar_ops(
                desc,
                (E::gpu_elem(), E::gpu_elem(), Elem::Bool),
                |lhs, rhs, out| Operation::Equal(BinaryOperation { lhs, rhs, out }),
                |lhs, rhs, out| Operator::Equal(BinaryOperator { lhs, rhs, out }),
            ),
            NumericOperationDescription::MaskWhere(desc) => {
                if !self.output_is_compatible(&desc.out) {
                    return false;
                }

                let cond = self.input_to_var(&desc.mask, Elem::Bool);
                let lhs = self.input_to_var(&desc.value, E::gpu_elem());
                let rhs = self.input_to_var(&desc.tensor, E::gpu_elem());
                let out = self.output_to_var(&desc.out, E::gpu_elem());
                let cond = self.builder.input(&desc.mask, Elem::Bool);
                let lhs = self.builder.input(&desc.value, E::gpu_elem());
                let rhs = self.builder.input(&desc.tensor, E::gpu_elem());
                let out = self.builder.output(&desc.out, E::gpu_elem());

                let ops = Operation::ConditionalAssign(ConditionalAssignOperation {
                    cond,
                    lhs,
                    rhs,
                    out,
                });
                self.operators.push(ops);
                self.builder.register_operation(Operator::ConditionalAssign(
                    ConditionalAssignOperator {
                        cond,
                        lhs,
                        rhs,
                        out,
                    },
                ));

                true
            }

@@ -577,18 +306,19 @@ impl<R: Runtime> ElementWiseBuilder<R> {
                return false;
            }

            let cond = self.input_to_var(&desc.mask, Elem::Bool);
            let lhs = self.scalar_to_var(&desc.value, E::gpu_elem());
            let rhs = self.input_to_var(&desc.tensor, E::gpu_elem());
            let out = self.output_to_var(&desc.out, E::gpu_elem());
            let cond = self.builder.input(&desc.mask, Elem::Bool);
            let lhs = self.builder.scalar(&desc.value, E::gpu_elem());
            let rhs = self.builder.input(&desc.tensor, E::gpu_elem());
            let out = self.builder.output(&desc.out, E::gpu_elem());

            self.operators
                .push(Operation::ConditionalAssign(ConditionalAssignOperation {
            self.builder.register_operation(Operator::ConditionalAssign(
                ConditionalAssignOperator {
                    cond,
                    lhs,
                    rhs,
                    out,
                }));
                },
            ));

            true
        }

@@ -597,11 +327,11 @@ impl<R: Runtime> ElementWiseBuilder<R> {
            return false;
        }

        let input = Variable::Constant(1.0, Item::Scalar(E::gpu_elem()));
        let out = self.output_to_var(desc, E::gpu_elem());
        let input = Variable::ConstantScalar(1.0, E::gpu_elem());
        let out = self.builder.output(desc, E::gpu_elem());

        self.operators
            .push(Operation::AssignLocal(UnaryOperation { input, out }));
        self.builder
            .register_operation(Operator::AssignLocal(UnaryOperator { input, out }));

        true
    }

@@ -610,11 +340,11 @@ impl<R: Runtime> ElementWiseBuilder<R> {
            return false;
        }

        let input = Variable::Constant(0.0, Item::Scalar(E::gpu_elem()));
        let out = self.output_to_var(desc, E::gpu_elem());
        let input = Variable::ConstantScalar(0.0, E::gpu_elem());
        let out = self.builder.output(desc, E::gpu_elem());

        self.operators
            .push(Operation::AssignLocal(UnaryOperation { input, out }));
        self.builder
            .register_operation(Operator::AssignLocal(UnaryOperator { input, out }));

        true
    }

@@ -623,11 +353,11 @@ impl<R: Runtime> ElementWiseBuilder<R> {
            return false;
        }

        let input = self.scalar_to_var(elem, E::gpu_elem());
        let out = self.output_to_var(desc, E::gpu_elem());
        let input = self.builder.scalar(elem, E::gpu_elem());
        let out = self.builder.output(desc, E::gpu_elem());

        self.operators
            .push(Operation::AssignLocal(UnaryOperation { input, out }));
        self.builder
            .register_operation(Operator::AssignLocal(UnaryOperator { input, out }));

        true
    }

@@ -642,17 +372,17 @@ impl<R: Runtime> ElementWiseBuilder<R> {
        func: Func,
    ) -> bool
    where
        Func: Fn(Variable, Variable, Variable) -> Operation,
        Func: Fn(Variable, Variable, Variable) -> Operator,
    {
        if !self.output_is_compatible(&desc.out) {
            return false;
        }

        let lhs = self.input_to_var(&desc.lhs, elem_lhs);
        let rhs = self.input_to_var(&desc.rhs, elem_rhs);
        let out = self.output_to_var(&desc.out, elem_out);
        let lhs = self.builder.input(&desc.lhs, elem_lhs);
        let rhs = self.builder.input(&desc.rhs, elem_rhs);
        let out = self.builder.output(&desc.out, elem_out);

        self.operators.push(func(lhs, rhs, out));
        self.builder.register_operation(func(lhs, rhs, out));

        true
    }

@@ -664,16 +394,16 @@ impl<R: Runtime> ElementWiseBuilder<R> {
        func: Func,
    ) -> bool
    where
        Func: Fn(Variable, Variable) -> Operation,
        Func: Fn(Variable, Variable) -> Operator,
    {
        if !self.output_is_compatible(&desc.out) {
            return false;
        }

        let input = self.input_to_var(&desc.input, elem_input);
        let out = self.output_to_var(&desc.out, elem_out);
        let input = self.builder.input(&desc.input, elem_input);
        let out = self.builder.output(&desc.out, elem_out);

        self.operators.push(func(input, out));
        self.builder.register_operation(func(input, out));

        true
    }

@@ -685,41 +415,21 @@ impl<R: Runtime> ElementWiseBuilder<R> {
        func: Func,
    ) -> bool
    where
        Func: Fn(Variable, Variable, Variable) -> Operation,
        Func: Fn(Variable, Variable, Variable) -> Operator,
    {
        if !self.output_is_compatible(&desc.out) {
            return false;
        }

        let lhs = self.input_to_var(&desc.lhs, elem_lhs);
        let rhs = self.scalar_to_var(&desc.rhs, elem_rhs);
        let out = self.output_to_var(&desc.out, elem_out);
        let lhs = self.builder.input(&desc.lhs, elem_lhs);
        let rhs = self.builder.scalar(&desc.rhs, elem_rhs);
        let out = self.builder.output(&desc.out, elem_out);

        self.operators.push(func(lhs, rhs, out));
        self.builder.register_operation(func(lhs, rhs, out));

        true
    }

    fn scalar_to_var<E: Element>(&mut self, _value: &E, elem_type: Elem) -> Variable {
        match elem_type {
            Elem::Float => {
                self.scalars_float += 1;
                Variable::Scalar(self.scalars_float as u16 - 1, Item::Scalar(Elem::Float))
            }
            Elem::Int => {
                self.scalars_int += 1;
                Variable::Scalar(self.scalars_int as u16 - 1, Item::Scalar(Elem::Int))
            }
            Elem::UInt => {
                self.scalars_uint += 1;
                Variable::Scalar(self.scalars_uint as u16 - 1, Item::Scalar(Elem::UInt))
            }
            Elem::Bool => {
                panic!("Bool scalars not supported")
            }
        }
    }

    fn output_is_compatible(&mut self, out: &TensorDescription) -> bool {
        if self.current_output_shape.is_empty() {
            self.current_output_shape = out.shape.clone();

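Scalars never appear in the shader source; each one only records "the n-th scalar of its element type", and the concrete values are bound at launch time in one buffer per type. A stand-in sketch of the per-type counters (illustrative only):

    // Illustrative only: mirrors how scalar_to_var hands out per-type slot indices.
    #[derive(Default)]
    struct ScalarCounters {
        floats: u16,
        ints: u16,
    }

    impl ScalarCounters {
        fn next_float(&mut self) -> u16 {
            let index = self.floats;
            self.floats += 1;
            index
        }
        fn next_int(&mut self) -> u16 {
            let index = self.ints;
            self.ints += 1;
            index
        }
    }

    fn main() {
        let mut counters = ScalarCounters::default();
        assert_eq!(counters.next_float(), 0);
        assert_eq!(counters.next_float(), 1); // second f32 scalar gets slot 1
        assert_eq!(counters.next_int(), 0); // int scalars are counted separately
    }
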
@@ -4,7 +4,7 @@ use crate::{
    fusion::{
        kernel::{FusionKernel, OutputInfo, Priority, SelectedKernel},
        source::GpuKernelSource,
        WgpuFusionHandle,
        JitFusionHandle,
    },
    kernel::elemwise_workgroup,
    Runtime,

@@ -23,7 +23,7 @@ pub struct VecElementWise<R: Runtime> {
impl<R: Runtime> FusionKernel<R> for ScalarElementWise<R> {
    fn kernel(
        &self,
        handles_inputs: &[WgpuFusionHandle<R>],
        handles_inputs: &[JitFusionHandle<R>],
        inputs: &[&TensorDescription],
        outputs: &[&TensorDescription],
    ) -> SelectedKernel {

@@ -32,7 +32,7 @@ impl<R: Runtime> FusionKernel<R> for ScalarElementWise<R> {

    fn priority(
        &self,
        _handles_inputs: &[WgpuFusionHandle<R>],
        _handles_inputs: &[JitFusionHandle<R>],
        _inputs: &[&TensorDescription],
        _outputs: &[&TensorDescription],
    ) -> Priority {

@@ -43,7 +43,7 @@ impl<R: Runtime> FusionKernel<R> for ScalarElementWise<R> {
impl<R: Runtime> FusionKernel<R> for VecElementWise<R> {
    fn kernel(
        &self,
        handles_inputs: &[WgpuFusionHandle<R>],
        handles_inputs: &[JitFusionHandle<R>],
        inputs: &[&TensorDescription],
        outputs: &[&TensorDescription],
    ) -> SelectedKernel {

@@ -52,11 +52,11 @@ impl<R: Runtime> FusionKernel<R> for VecElementWise<R> {

    fn priority(
        &self,
        handles_inputs: &[WgpuFusionHandle<R>],
        handles_inputs: &[JitFusionHandle<R>],
        inputs: &[&TensorDescription],
        _outputs: &[&TensorDescription],
    ) -> Priority {
        let is_unavailable_input = |handle: &WgpuFusionHandle<R>, desc: &TensorDescription| {
        let is_unavailable_input = |handle: &JitFusionHandle<R>, desc: &TensorDescription| {
            let rank = handle.strides.len();

            // The last dimension's stride should be 1; otherwise vecX reads won't be contiguous.

@@ -96,7 +96,7 @@ impl<R: Runtime> FusionKernel<R> for VecElementWise<R> {
impl<R: Runtime> ElementWiseSource<R> {
    fn kernel(
        &self,
        handles_inputs: &[WgpuFusionHandle<R>],
        handles_inputs: &[JitFusionHandle<R>],
        inputs: &[&TensorDescription],
        outputs: &[&TensorDescription],
    ) -> SelectedKernel {

@@ -110,7 +110,7 @@ impl<R: Runtime> ElementWiseSource<R> {

        match inplace_available(&self.mappings, handles_inputs) {
            true => {
                let reference_tensor = inputs[self.mappings[0].position_input];
                let reference_tensor = inputs[self.mappings[0].pos_input];
                let num_elems = calculate_num_elems_dyn_rank(&reference_tensor.shape);
                let workgroup = elemwise_workgroup(num_elems / self.factor, workgroup_size);
                let kernel = Box::new(DynamicKernel::new(self.source_inplace.clone(), workgroup));

@@ -172,7 +172,7 @@ impl<R: Runtime> ElementWiseSource<R> {
        let mut inplace_output2input = vec![None; num_output];

        for mapping in mappings.iter() {
            inplace_output2input[mapping.position_output] = Some(mapping.position_input);
            inplace_output2input[mapping.pos_output] = Some(mapping.pos_input);
        }

        Self {

@@ -214,14 +214,14 @@ impl<R: Runtime> VecElementWise<R> {

fn inplace_available<R: Runtime>(
    mappings: &[InplaceMapping],
    handles_inputs: &[WgpuFusionHandle<R>],
    handles_inputs: &[JitFusionHandle<R>],
) -> bool {
    if mappings.is_empty() {
        return false;
    }

    for mapping in mappings.iter() {
        let handle = &handles_inputs[mapping.position_input];
        let handle = &handles_inputs[mapping.pos_input];

        if !handle.handle.can_mut() {
            return false;

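An in-place launch is only valid when every mapped input buffer is exclusively owned, which is what the `can_mut` check above enforces. A self-contained sketch of that predicate under simplified stand-in types:

    // Illustrative stand-ins: an in-place launch is only safe when every mapped
    // input buffer can be mutated (no other live reference to it).
    struct Handle {
        can_mut: bool,
    }
    struct Mapping {
        pos_input: usize,
    }

    fn inplace_available(mappings: &[Mapping], handles: &[Handle]) -> bool {
        !mappings.is_empty() && mappings.iter().all(|m| handles[m.pos_input].can_mut)
    }

    fn main() {
        let handles = vec![Handle { can_mut: true }, Handle { can_mut: false }];
        assert!(inplace_available(&[Mapping { pos_input: 0 }], &handles));
        assert!(!inplace_available(&[Mapping { pos_input: 1 }], &handles)); // shared buffer
        assert!(!inplace_available(&[], &handles)); // no mapping, nothing to reuse
    }
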
@@ -4,163 +4,57 @@ use super::{
    FusionElemWiseAutotuneKey,
};
use crate::{
    codegen::dialect::gpu::{Elem, Item, Operation, Vectorization, Visibility, WorkgroupSize},
    codegen::{ElemWiseKernelCodegen, InplaceMapping, Input, Output, ReadingStrategy},
    codegen::{
        dialect::gpu::{Vectorization, WorkgroupSize},
        Compilation, CompilationInfo, CompilationSettings,
    },
    compute::JitAutotuneKey,
    fusion::{kernel::FusionKernelSet, source::GpuKernelSource},
    fusion::{kernel::FusionKernelSet, source::GpuKernelSource, tracing::Trace},
    JitBackend, Runtime,
};
use burn_common::id::IdGenerator;
use burn_compute::client::ComputeClient;
use burn_fusion::{stream::Context, TensorDescription};
use burn_fusion::stream::Context;
use serde::{Deserialize, Serialize};

#[derive(new)]
pub struct ElementWise<R: Runtime, Phase = ExecutionPhase<R>> {
    pub(super) inputs: Vec<(TensorDescription, Elem)>,
    pub(super) outputs: Vec<(TensorDescription, Elem)>,
    pub(super) locals: Vec<u16>,
    pub(super) scalars: Scalars,
    pub(super) operators: Vec<Operation>,
    pub(super) trace: Trace,
    pub(super) num_operations: usize,
    pub(super) device: R::Device,
    pub(super) phase: Phase,
}

#[derive(new, Clone, Serialize, Deserialize)]
pub struct Scalars {
    pub(super) num_f32: usize,
    pub(super) num_u32: usize,
    pub(super) num_i32: usize,
}

/// Phase where the kernel should be compiled.
pub struct CompilationPhase;

/// Phase where the kernel should be executed.
#[derive(new)]
pub struct ExecutionPhase<R: Runtime> {
    /// Kernel set with default workgroup size.
    pub(super) kernel_set_1: FusionKernelSet<R>,
    /// Kernel set with custom workgroup size.
    pub(super) kernel_set_2: FusionKernelSet<R>,
}

#[derive(Serialize, Deserialize)]
#[derive(new, Serialize, Deserialize)]
pub struct ElementWiseState {
    inputs: Vec<(TensorDescription, Elem)>,
    outputs: Vec<(TensorDescription, Elem)>,
    scalars: Scalars,
    operators: Vec<Operation>,
    locals: Vec<u16>,
    trace: Trace,
    num_operations: usize,
}

impl<R: Runtime> ElementWise<R, CompilationPhase> {
    pub(crate) fn compile(self) -> ElementWise<R, ExecutionPhase<R>> {
        let mut inputs = self
            .inputs
            .iter()
            .map(|(_tensor, elem)| Input::Array {
                item: Item::Scalar(*elem),
                visibility: Visibility::Read,
                strategy: ReadingStrategy::OutputLayout,
            })
            .collect::<Vec<_>>();
        let info = self.trace.compiling();

        let outputs = self
            .outputs
            .iter()
            .zip(self.locals.iter())
            .map(|((_tensor, elem), local)| Output::Array {
                item: Item::Scalar(*elem),
                local: *local,
            })
            .collect::<Vec<_>>();

        if self.scalars.num_f32 > 0 {
            inputs.push(Input::Scalar {
                elem: Elem::Float,
                size: self.scalars.num_f32,
            })
        }

        if self.scalars.num_u32 > 0 {
            inputs.push(Input::Scalar {
                elem: Elem::UInt,
                size: self.scalars.num_u32,
            })
        }

        if self.scalars.num_i32 > 0 {
            inputs.push(Input::Scalar {
                elem: Elem::Int,
                size: self.scalars.num_i32,
            })
        }

        let mut potential_inplace = self
            .inputs
            .iter()
            .zip(inputs.iter())
            .enumerate()
            .filter(|(_pos, ((desc, _elem), _input))| match desc.status {
                burn_fusion::TensorStatus::ReadOnly => false,
                burn_fusion::TensorStatus::ReadWrite => true,
                burn_fusion::TensorStatus::NotInit => false,
            })
            .map(|(pos, ((desc, elem), input))| (pos, desc, elem, input))
            .collect::<Vec<_>>();

        let mappings = self
            .outputs
            .iter()
            .zip(outputs.iter())
            .enumerate()
            .filter_map(|(pos, ((desc, elem), _output))| {
                if potential_inplace.is_empty() {
                    return None;
                }

                let mut chosen = None;
                for (index, (_pos_input, desc_input, elem_input, _input)) in
                    potential_inplace.iter().enumerate()
                {
                    if chosen.is_some() {
                        break;
                    }
                    if desc.shape == desc_input.shape && *elem_input == elem {
                        chosen = Some(index);
                    }
                }

                match chosen {
                    Some(index) => {
                        let input = potential_inplace.remove(index);
                        Some(InplaceMapping::new(input.0, pos))
                    }
                    None => None,
                }
            })
            .collect::<Vec<_>>();

        let kernel_set_1 = build_kernel_set::<R>(
            &inputs,
            &outputs,
            &self.operators,
            &mappings,
            WorkgroupSize::default(),
        );
        let kernel_set_2 = build_kernel_set::<R>(
            &inputs,
            &outputs,
            &self.operators,
            &mappings,
            WorkgroupSize::new(16, 16, 1),
        );
        let kernel_set_1 = build_kernel_set::<R>(&info, WorkgroupSize::default());
        let kernel_set_2 = build_kernel_set::<R>(&info, WorkgroupSize::new(16, 16, 1));

        ElementWise {
            inputs: self.inputs,
            outputs: self.outputs,
            scalars: self.scalars,
            trace: self.trace,
            device: self.device,
            operators: self.operators,
            locals: self.locals,
            phase: ExecutionPhase::new(kernel_set_1, kernel_set_2),
            num_operations: self.num_operations,
        }
    }
}

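The `Phase` type parameter is a typestate: `compile()` consumes the `CompilationPhase` value and returns one in `ExecutionPhase`, so executing an uncompiled optimization cannot even be written. A minimal stand-in sketch of the pattern (not the actual burn types):

    // Illustrative stand-in for the typestate pattern above: the phase is a type
    // parameter, so `execute` simply does not exist until `compile` has run.
    struct Kernel<Phase> {
        phase: Phase,
    }
    struct Compiling;
    struct Executing {
        shader: String,
    }

    impl Kernel<Compiling> {
        fn compile(self) -> Kernel<Executing> {
            Kernel {
                phase: Executing { shader: String::from("...") },
            }
        }
    }

    impl Kernel<Executing> {
        fn execute(&self) -> &str {
            &self.phase.shader
        }
    }

    fn main() {
        let kernel = Kernel { phase: Compiling }.compile();
        // `Kernel<Compiling>` has no `execute`, so misuse is a compile error.
        assert_eq!(kernel.execute(), "...");
    }
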
@@ -170,7 +64,7 @@ impl<R: Runtime> ElementWise<R, ExecutionPhase<R>> {
        let client = R::client(&self.device);

        let key = JitAutotuneKey::FusionElemWise(FusionElemWiseAutotuneKey::new(
            self.operators.len(),
            self.num_operations,
            self.autotune_shape(context),
        ));

@@ -187,22 +81,14 @@ impl<R: Runtime> ElementWise<R, ExecutionPhase<R>> {
        client: ComputeClient<R::Server, R::Channel>,
        fastest_set_index: usize,
    ) {
        let info = self.trace.running();
        let kernel_set = match fastest_set_index {
            0 => &self.phase.kernel_set_1,
            1 => &self.phase.kernel_set_2,
            _ => panic!("Should be 0 or 1, got {fastest_set_index}"),
        };

        let kernel = kernel_set.select(
            &self.inputs.iter().map(|a| &a.0).collect::<Vec<_>>(),
            &self.outputs.iter().map(|a| &a.0).collect::<Vec<_>>(),
            self.scalars.num_f32,
            self.scalars.num_i32,
            context,
            self.device.clone(),
            client,
            true,
        );
        let kernel = kernel_set.select(&info, context, self.device.clone(), client, true);

        kernel.execute();
    }

@@ -213,31 +99,24 @@ impl<R: Runtime> ElementWise<R, ExecutionPhase<R>> {
        client: ComputeClient<R::Server, R::Channel>,
        key: JitAutotuneKey,
    ) {
        let info = self.trace.running();

        let kernel_1 = self.phase.kernel_set_1.select(
            &self.inputs.iter().map(|a| &a.0).collect::<Vec<_>>(),
            &self.outputs.iter().map(|a| &a.0).collect::<Vec<_>>(),
            self.scalars.num_f32,
            self.scalars.num_i32,
            &info,
            context,
            self.device.clone(),
            client.clone(),
            false, // Should not mutate the context.
        );
        let kernel_2 = self.phase.kernel_set_1.select(
            &self.inputs.iter().map(|a| &a.0).collect::<Vec<_>>(),
            &self.outputs.iter().map(|a| &a.0).collect::<Vec<_>>(),
            self.scalars.num_f32,
            self.scalars.num_i32,
            &info,
            context,
            self.device.clone(),
            client.clone(),
            false, // Should not mutate the context.
        );
        let kernel_default = self.phase.kernel_set_1.select(
            &self.inputs.iter().map(|a| &a.0).collect::<Vec<_>>(),
            &self.outputs.iter().map(|a| &a.0).collect::<Vec<_>>(),
            self.scalars.num_f32,
            self.scalars.num_i32,
            &info,
            context,
            self.device.clone(),
            client.clone(),

@@ -253,7 +132,7 @@ impl<R: Runtime> ElementWise<R, ExecutionPhase<R>> {
    }

    pub(crate) fn len(&self) -> usize {
        self.operators.len()
        self.num_operations
    }

    /// The first output is chosen when possible, otherwise the first input is chosen.

|
@ -261,13 +140,15 @@ impl<R: Runtime> ElementWise<R, ExecutionPhase<R>> {
|
|||
&self,
|
||||
context: &mut Context<'a, JitBackend<R>>,
|
||||
) -> &'a [usize] {
|
||||
if let Some(tensor) = self.outputs.first() {
|
||||
let tensor = context.tensors.get(&tensor.0.id).unwrap();
|
||||
let info = self.trace.running();
|
||||
|
||||
if let Some(tensor) = info.outputs.first() {
|
||||
let tensor = context.tensors.get(&tensor.id).unwrap();
|
||||
return &tensor.shape;
|
||||
}
|
||||
|
||||
if let Some(tensor) = self.inputs.first() {
|
||||
let tensor = context.tensors.get(&tensor.0.id).unwrap();
|
||||
if let Some(tensor) = info.inputs.first() {
|
||||
let tensor = context.tensors.get(&tensor.id).unwrap();
|
||||
return &tensor.shape;
|
||||
}
|
||||
|
||||
|
@@ -281,109 +162,86 @@ impl<R: Runtime> ElementWise<R, ExecutionPhase<R>> {
        // It is still unclear if the deserialization would be that much faster than
        // simply recompiling it.
        ElementWise {
            inputs: state.inputs,
            outputs: state.outputs,
            scalars: state.scalars,
            trace: state.trace,
            device: device.clone(),
            locals: state.locals,
            operators: state.operators,
            phase: CompilationPhase,
            num_operations: state.num_operations,
        }
        .compile()
    }

    pub(crate) fn to_state(&self) -> ElementWiseState {
        ElementWiseState {
            inputs: self.inputs.clone(),
            outputs: self.outputs.clone(),
            scalars: self.scalars.clone(),
            operators: self.operators.clone(),
            locals: self.locals.clone(),
            trace: self.trace.clone(),
            num_operations: self.num_operations,
        }
    }
}

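Since only the trace is serialized, restoring state always pays the cost of `compile()` again, as `from_state` above shows. A tiny round-trip sketch, assuming `serde_json` is available as a dev-dependency (illustrative types only):

    use serde::{Deserialize, Serialize};

    // Illustrative only: persisting the trace instead of the compiled shader means a
    // state round-trip always re-runs compilation after deserialization.
    #[derive(Serialize, Deserialize, PartialEq, Debug)]
    struct State {
        num_operations: usize,
    }

    fn main() {
        let state = State { num_operations: 4 };
        let bytes = serde_json::to_vec(&state).unwrap();
        let restored: State = serde_json::from_slice(&bytes).unwrap();
        assert_eq!(restored, state); // then recompile from the restored trace
    }
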
fn build_kernel_set<R: Runtime>(
    inputs: &[Input],
    outputs: &[Output],
    operators: &[Operation],
    mappings: &[InplaceMapping],
    info: &CompilationInfo,
    workgroup_size: WorkgroupSize,
) -> FusionKernelSet<R> {
    let scalar = ScalarElementWise::<R>::new(
        GpuKernelSource::new(
            IdGenerator::generate(),
            ElemWiseKernelCodegen::new()
                .inputs(inputs)
                .body(operators)
                .outputs(outputs)
                .workgroup_size(workgroup_size)
                .compile(),
            Compilation::new(info.clone())
                .compile(CompilationSettings::default().workgroup_size(workgroup_size)),
        ),
        GpuKernelSource::new(
            IdGenerator::generate(),
            ElemWiseKernelCodegen::new()
                .inplace(mappings)
                .inputs(inputs)
                .body(operators)
                .outputs(outputs)
                .workgroup_size(workgroup_size)
                .compile(),
            Compilation::new(info.clone()).compile(
                CompilationSettings::default()
                    .inplace(true)
                    .workgroup_size(workgroup_size),
            ),
        ),
        mappings.to_vec(),
        outputs.len(),
        info.mappings.to_vec(),
        info.outputs.len(),
    );

    let vec2 = VecElementWise::<R>::new(
        GpuKernelSource::new(
            IdGenerator::generate(),
            ElemWiseKernelCodegen::new()
                .vectorize(Vectorization::Vec2)
                .inputs(inputs)
                .body(operators)
                .outputs(outputs)
                .workgroup_size(workgroup_size)
                .compile(),
            Compilation::new(info.clone()).compile(
                CompilationSettings::default()
                    .vectorize(Vectorization::Vec2)
                    .workgroup_size(workgroup_size),
            ),
        ),
        GpuKernelSource::new(
            IdGenerator::generate(),
            ElemWiseKernelCodegen::new()
                .vectorize(Vectorization::Vec2)
                .inplace(mappings)
                .inputs(inputs)
                .body(operators)
                .outputs(outputs)
                .workgroup_size(workgroup_size)
                .compile(),
            Compilation::new(info.clone()).compile(
                CompilationSettings::default()
                    .inplace(true)
                    .vectorize(Vectorization::Vec2)
                    .workgroup_size(workgroup_size),
            ),
        ),
        mappings.to_vec(),
        outputs.len(),
        info.mappings.to_vec(),
        info.outputs.len(),
        2,
    );
    let vec4 = VecElementWise::<R>::new(
        GpuKernelSource::new(
            IdGenerator::generate(),
            ElemWiseKernelCodegen::new()
                .vectorize(Vectorization::Vec4)
                .inputs(inputs)
                .body(operators)
                .outputs(outputs)
                .workgroup_size(workgroup_size)
                .compile(),
            Compilation::new(info.clone()).compile(
                CompilationSettings::default()
                    .vectorize(Vectorization::Vec4)
                    .workgroup_size(workgroup_size),
            ),
        ),
        GpuKernelSource::new(
            IdGenerator::generate(),
            ElemWiseKernelCodegen::new()
                .vectorize(Vectorization::Vec4)
                .inplace(mappings)
                .inputs(inputs)
                .body(operators)
                .outputs(outputs)
                .workgroup_size(workgroup_size)
                .compile(),
            Compilation::new(info.clone()).compile(
                CompilationSettings::default()
                    .inplace(true)
                    .vectorize(Vectorization::Vec4)
                    .workgroup_size(workgroup_size),
            ),
        ),
        mappings.to_vec(),
        outputs.len(),
        info.mappings.to_vec(),
        info.outputs.len(),
        4,
    );

@@ -1,6 +1,6 @@
use crate::compute::Kernel;
use crate::fusion::strides_dyn_rank;
use crate::fusion::WgpuFusionHandle;
use crate::fusion::JitFusionHandle;
use crate::JitBackend;
use crate::Runtime;
use burn_compute::client::ComputeClient;

@@ -11,6 +11,8 @@ use burn_fusion::{TensorDescription, TensorStatus};
use burn_tensor::Device;
use std::sync::Arc;

use super::tracing::ExecutionInfo;

/// Many kernels can be used for the same set of tensor operations fused into one.
///
/// This type makes it easy to group those potential kernels and execute the best one depending on the context.

@@ -104,14 +106,14 @@ pub trait FusionKernel<R: Runtime>: Send + Sync {
    /// Returns the priority of this kernel based on the input and output information.
    fn priority(
        &self,
        handles_inputs: &[WgpuFusionHandle<R>],
        handles_inputs: &[JitFusionHandle<R>],
        inputs: &[&TensorDescription],
        outputs: &[&TensorDescription],
    ) -> Priority;
    /// Returns a [selected kernel](SelectedKernel) that can be executed by the compute server.
    fn kernel(
        &self,
        handles_inputs: &[WgpuFusionHandle<R>],
        handles_inputs: &[JitFusionHandle<R>],
        inputs: &[&TensorDescription],
        outputs: &[&TensorDescription],
    ) -> SelectedKernel;

@@ -119,20 +121,21 @@ pub trait FusionKernel<R: Runtime>: Send + Sync {

impl<R: Runtime> FusionKernelSet<R> {
    /// Select the best kernel based on the given information.
    #[allow(clippy::too_many_arguments)]
    pub fn select(
        &self,
        inputs: &[&TensorDescription],
        outputs: &[&TensorDescription],
        scalars_f32: usize,
        scalars_i32: usize,
        running_info: &ExecutionInfo<'_>,
        context: &mut Context<'_, JitBackend<R>>,
        device: Device<JitBackend<R>>,
        client: ComputeClient<R::Server, R::Channel>,
        stateful: bool,
    ) -> ExecutableKernel<R> {
        let (handles_input, inputs_description_updated, outputs_description_updated) =
            process_inputs_outputs(inputs, outputs, context, stateful);
            process_inputs_outputs(
                &running_info.inputs,
                &running_info.outputs,
                context,
                stateful,
            );

        let selected = self.select_kernel(
            &handles_input,

@@ -140,19 +143,27 @@ impl<R: Runtime> FusionKernelSet<R> {
            &outputs_description_updated,
        );

        let rank_input = inputs.first().map(|desc| desc.shape.len()).unwrap_or(1);
        let rank_output = outputs.first().map(|desc| desc.shape.len()).unwrap_or(1);
        let rank_input = running_info
            .inputs
            .first()
            .map(|desc| desc.shape.len())
            .unwrap_or(1);
        let rank_output = running_info
            .outputs
            .first()
            .map(|desc| desc.shape.len())
            .unwrap_or(1);
        let rank = usize::max(rank_input, rank_output);

        let num_tensors = inputs.len() + outputs.len();
        let num_tensors = running_info.inputs.len() + running_info.outputs.len();
        // The buffer starts with the rank, then each tensor's shape and strides.
        let info_size = (num_tensors * rank * 2) + 1;

        let mut num_handles = num_tensors + 1;
        if scalars_f32 > 0 {
        if running_info.scalars.num_float > 0 {
            num_handles += 1;
        }
        if scalars_i32 > 0 {
        if running_info.scalars.num_int > 0 {
            num_handles += 1;
        }

@@ -175,7 +186,7 @@ impl<R: Runtime> FusionKernelSet<R> {
                // Use the input inplace for this output.
                OutputInfo::Inplace { input_index } => {
                    let handle = handles.get(*input_index).unwrap().clone();
                    let handle_fusion = WgpuFusionHandle {
                    let handle_fusion = JitFusionHandle {
                        client: client.clone(),
                        device: device.clone(),
                        strides: strides_dyn_rank(&tensor.shape),

@@ -185,7 +196,7 @@ impl<R: Runtime> FusionKernelSet<R> {
                }
                // Create a new buffer for this output.
                OutputInfo::Array { size } => {
                    let handle_fusion = WgpuFusionHandle {
                    let handle_fusion = JitFusionHandle {
                        client: client.clone(),
                        device: device.clone(),
                        strides: strides_dyn_rank(&tensor.shape),

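The metadata buffer layout explains the `info_size` formula above: one u32 for the rank, then strides and shape for every tensor. A stand-in sketch of that layout (illustrative only):

    // Illustrative only: the metadata buffer is one u32 for the rank, followed by
    // the strides and shape of every tensor.
    fn info_buffer(rank: u32, tensors: &[(&[u32], &[u32])]) -> Vec<u32> {
        let mut info = vec![rank];
        for (strides, shape) in tensors {
            info.extend_from_slice(strides);
            info.extend_from_slice(shape);
        }
        info
    }

    fn main() {
        // Two rank-2 tensors of shape [3, 4] with contiguous strides [4, 1].
        let info = info_buffer(2, &[(&[4, 1], &[3, 4]), (&[4, 1], &[3, 4])]);
        assert_eq!(info.len(), 2 * 2 * 2 + 1); // (num_tensors * rank * 2) + 1
    }
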
@@ -203,13 +214,16 @@ impl<R: Runtime> FusionKernelSet<R> {
        handles.push(client.create(bytemuck::cast_slice(&info)));

        // Finally we finish with the named bindings.
        if scalars_f32 > 0 {
            handles
                .push(client.create(bytemuck::cast_slice(&context.scalar_floats[0..scalars_f32])));
        if running_info.scalars.num_float > 0 {
            handles.push(client.create(bytemuck::cast_slice(
                &context.scalar_floats[0..running_info.scalars.num_float],
            )));
        }

        if scalars_i32 > 0 {
            handles.push(client.create(bytemuck::cast_slice(&context.scalar_ints[0..scalars_i32])));
        if running_info.scalars.num_int > 0 {
            handles.push(client.create(bytemuck::cast_slice(
                &context.scalar_ints[0..running_info.scalars.num_int],
            )));
        }

        // We have to register the output handles to the context.

@@ -222,7 +236,7 @@ impl<R: Runtime> FusionKernelSet<R> {

    fn select_kernel(
        &self,
        handles_input: &[WgpuFusionHandle<R>],
        handles_input: &[JitFusionHandle<R>],
        inputs: &[&TensorDescription],
        outputs: &[&TensorDescription],
    ) -> SelectedKernel {

@@ -249,7 +263,7 @@ impl<R: Runtime> FusionKernelSet<R> {
fn register_info_tensor<R: Runtime>(
    info: &mut Vec<u32>,
    tensor: &TensorDescription,
    handle: &WgpuFusionHandle<R>,
    handle: &JitFusionHandle<R>,
) {
    if info.is_empty() {
        info.push(handle.strides.len() as u32);

@@ -269,7 +283,7 @@ fn process_inputs_outputs<'a, R: Runtime>(
    context: &'a mut Context<'_, JitBackend<R>>,
    stateful: bool,
) -> (
    Vec<WgpuFusionHandle<R>>,
    Vec<JitFusionHandle<R>>,
    Vec<&'a TensorDescription>,
    Vec<&'a TensorDescription>,
) {

@@ -3,6 +3,7 @@ mod elemwise;
pub(crate) mod kernel;
pub(crate) mod source;
pub(crate) mod tracing;

pub use base::*;
pub(crate) use elemwise::*;

@@ -0,0 +1,8 @@
use serde::{Deserialize, Serialize};
#[derive(Default, Clone, Serialize, Deserialize)]
pub struct Scalars {
    pub(crate) num_float: usize,
    pub(crate) num_int: usize,
    pub(crate) num_uint: usize,
    pub(crate) num_bool: usize,
}

@@ -0,0 +1,352 @@
use super::{trace::Trace, Scalars};
use crate::codegen::dialect::gpu::{self, Operation, Variable};
use burn_fusion::{TensorDescription, TensorId};
use burn_tensor::Element;
use hashbrown::HashMap;

/// Helper for building a [trace](Trace); it performs most of the conversions between the
/// operations provided by [burn_fusion] and the [gpu dialect](gpu).
#[derive(Clone)]
pub struct TraceBuilder {
    // Input tensor descriptions with the variables created after reading from global memory.
    inputs: Vec<(TensorDescription, Variable)>,
    // Each output tensor id with the local variable index created by the operation.
    output_to_local: HashMap<TensorId, u16>,
    tensors: HashMap<TensorId, (TensorDescription, gpu::Elem)>,
    scalars: Scalars,
    scope: gpu::Scope,
}

impl TraceBuilder {
    /// Create a new builder.
    pub fn new() -> Self {
        Self {
            inputs: Vec::new(),
            output_to_local: HashMap::new(),
            tensors: HashMap::new(),
            scalars: Scalars::default(),
            scope: gpu::Scope::root(),
        }
    }

    /// Register a [gpu operation](gpu::Operation).
    pub fn register_operation<T: Into<gpu::Operation>>(&mut self, value: T) {
        self.scope.register(value)
    }

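Registration is deliberately simple: operations are appended to a growing scope and only replayed when the trace is compiled. A stand-in sketch of that accumulation (illustrative types, not the real scope):

    // Illustrative only: the builder appends converted operations to a scope;
    // nothing is compiled until the trace is consumed later.
    #[derive(Default)]
    struct Scope {
        operations: Vec<String>,
    }

    impl Scope {
        fn register(&mut self, op: impl Into<String>) {
            self.operations.push(op.into());
        }
    }

    fn main() {
        let mut scope = Scope::default();
        scope.register("add");
        scope.register("exp");
        assert_eq!(scope.operations.len(), 2); // replayed at compilation time
    }
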
    /// Create a variable from an input [tensor description](TensorDescription).
    pub fn input(&mut self, tensor: &TensorDescription, elem: gpu::Elem) -> gpu::Variable {
        let already_exists = self.tensors.contains_key(&tensor.id);

        let variable = match already_exists {
            false => {
                // New input.
                let index = self.inputs.len() as u16;
                let item = gpu::Item::Scalar(elem);

                let local = self.scope.read_array(index, item);
                self.inputs.push((tensor.clone(), local));
                local
            }
            true => match self.output_to_local.get(&tensor.id) {
                // Is a local variable.
                Some(local_index) => {
                    gpu::Variable::Local(*local_index, gpu::Item::Scalar(elem), self.scope.depth)
                }
                // Isn't an operation output variable, so it must be an existing input.
                None => self
                    .inputs
                    .iter()
                    .find(|(input, _local)| input.id == tensor.id)
                    .map(|(_, local)| *local)
                    .unwrap(),
            },
        };

        // Update the tensor description with the new version.
        self.tensors.insert(tensor.id, (tensor.clone(), elem));

        variable
    }

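A key invariant of `input()` above is that reading the same tensor twice must not emit a second global read. A minimal stand-in sketch of the caching behavior (not the real API):

    use std::collections::HashMap;

    // Illustrative only: the first read of a tensor creates a variable; any later
    // read of the same tensor returns the cached variable instead of a new read.
    #[derive(Clone, Copy, PartialEq, Debug)]
    struct Var(u16);

    #[derive(Default)]
    struct Reads {
        seen: HashMap<u64, Var>,
    }

    impl Reads {
        fn input(&mut self, id: u64) -> Var {
            let next = Var(self.seen.len() as u16);
            *self.seen.entry(id).or_insert(next)
        }
    }

    fn main() {
        let mut reads = Reads::default();
        let a = reads.input(1);
        let b = reads.input(1);
        assert_eq!(a, b); // one read_array emitted, then reused
    }
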
    /// Create a variable from an output [tensor description](TensorDescription).
    pub fn output(&mut self, tensor: &TensorDescription, elem: gpu::Elem) -> gpu::Variable {
        // Update the tensor description to the new version.
        self.tensors.insert(tensor.id, (tensor.clone(), elem));

        // Output already registered as a local variable.
        if let Some(index) = self.output_to_local.get(&tensor.id) {
            return gpu::Variable::Local(*index, gpu::Item::Scalar(elem), self.scope.depth);
        }

        let variable = self.scope.create_local(gpu::Item::Scalar(elem));
        let local_index = variable.index().unwrap();
        self.output_to_local.insert(tensor.id, local_index);
        variable
    }

    /// Create a variable from an input [scalar](Element).
    pub fn scalar<E: Element>(&mut self, _value: &E, elem_type: gpu::Elem) -> gpu::Variable {
        match elem_type {
            gpu::Elem::Float => {
                let var = self
                    .scope
                    .read_scalar(self.scalars.num_float as u16, elem_type);
                self.scalars.num_float += 1;
                var
            }
            gpu::Elem::Int => {
                let var = self
                    .scope
                    .read_scalar(self.scalars.num_int as u16, elem_type);
                self.scalars.num_int += 1;
                var
            }
            gpu::Elem::UInt => {
                let var = self
                    .scope
                    .read_scalar(self.scalars.num_uint as u16, elem_type);
                self.scalars.num_uint += 1;
                var
            }
            gpu::Elem::Bool => {
                let var = self
                    .scope
                    .read_scalar(self.scalars.num_bool as u16, elem_type);
                self.scalars.num_bool += 1;
                var
            }
        }
    }

    /// Build the [trace](Trace).
    pub fn build(self) -> Trace {
        let inputs = self.input_descriptions();
        let outputs = self.output_descriptions();
        let locals = outputs
            .iter()
            .map(|out| *self.output_to_local.get(&out.0.id).unwrap())
            .collect::<Vec<_>>();

        Trace::new(inputs, outputs, locals, self.scalars, self.scope)
    }

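`build()` pairs each selected output with the index of the local variable that produced it, so codegen knows which local to write back to global memory. A stand-in sketch of that pairing (illustrative only):

    use std::collections::HashMap;

    // Illustrative only: map each chosen output tensor id to the local variable
    // index that computed it.
    fn locals_for(outputs: &[u64], output_to_local: &HashMap<u64, u16>) -> Vec<u16> {
        outputs.iter().map(|id| output_to_local[id]).collect()
    }

    fn main() {
        let output_to_local = HashMap::from([(10, 0), (11, 2)]);
        assert_eq!(locals_for(&[10, 11], &output_to_local), vec![0, 2]);
    }
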
    fn input_descriptions(&self) -> Vec<(TensorDescription, gpu::Elem)> {
        self.inputs
            .iter()
            .map(|(input, _local)| {
                let updated_tensor = self.tensors.get(&input.id).unwrap();
                updated_tensor.clone()
            })
            .collect::<Vec<_>>()
    }

    fn output_descriptions(&self) -> Vec<(TensorDescription, gpu::Elem)> {
        let mut outputs = Vec::new();
        let mut local_tensor_ids_input = Vec::new();
        let mut local_tensor_ids_output = Vec::new();

        // Add a variable's tensor id to the provided list, using the local-variable mapping.
        //
        // Only local variables can become outputs.
        let mark = |var: &gpu::Variable, list: &mut Vec<TensorId>| {
            if let gpu::Variable::Local(index, _, _) = var {
                if let Some((id, _)) = self
                    .output_to_local
                    .iter()
                    .find(|(_id, position)| *position == index)
                {
                    if !list.contains(id) {
                        list.push(*id);
                    }
                }
            }
        };
        let mark_binary =
            |op: &gpu::BinaryOperator, inputs: &mut Vec<TensorId>, outputs: &mut Vec<TensorId>| {
                mark(&op.lhs, inputs);
                mark(&op.rhs, inputs);
                mark(&op.out, outputs);
            };
        let mark_unary =
            |op: &gpu::UnaryOperator, inputs: &mut Vec<TensorId>, outputs: &mut Vec<TensorId>| {
                mark(&op.input, inputs);
                mark(&op.out, outputs);
            };

        // For all operators, mark their local tensor id in the proper set.
        for op in self.scope.operations.iter() {
            match op {
                Operation::Operator(op) => match op {
                    gpu::Operator::AssignGlobal(_) => {
                        // Nothing to do here.
                    }
                    gpu::Operator::AssignLocal(op) => {
                        mark(&op.out, &mut local_tensor_ids_output);
                    }
                    gpu::Operator::Add(op)
                    | gpu::Operator::Index(op)
                    | gpu::Operator::Sub(op)
                    | gpu::Operator::Mul(op)
                    | gpu::Operator::Div(op)
                    | gpu::Operator::Powf(op)
                    | gpu::Operator::Lower(op)
                    | gpu::Operator::Greater(op)
                    | gpu::Operator::LowerEqual(op)
                    | gpu::Operator::GreaterEqual(op)
                    | gpu::Operator::Equal(op)
                    | gpu::Operator::Modulo(op) => mark_binary(
                        op,
                        &mut local_tensor_ids_input,
                        &mut local_tensor_ids_output,
                    ),
                    gpu::Operator::Exp(op)
                    | gpu::Operator::Abs(op)
                    | gpu::Operator::Erf(op)
                    | gpu::Operator::Log(op)
                    | gpu::Operator::Log1p(op)
                    | gpu::Operator::Cos(op)
                    | gpu::Operator::Sin(op)
                    | gpu::Operator::Tanh(op)
                    | gpu::Operator::Recip(op)
                    | gpu::Operator::Sqrt(op) => mark_unary(
                        op,
                        &mut local_tensor_ids_input,
                        &mut local_tensor_ids_output,
                    ),
                    gpu::Operator::Clamp(op) => {
                        mark(&op.input, &mut local_tensor_ids_input);
                        mark(&op.out, &mut local_tensor_ids_output);
                    }
                    gpu::Operator::ConditionalAssign(op) => {
                        mark(&op.cond, &mut local_tensor_ids_input);
                        mark(&op.lhs, &mut local_tensor_ids_input);
                        mark(&op.rhs, &mut local_tensor_ids_input);
                        mark(&op.out, &mut local_tensor_ids_output);
                    }
                },
                Operation::Algorithm(algo) => match algo {
                    gpu::Algorithm::ReadGlobalWithLayout(_) | gpu::Algorithm::ReadGlobal(_) => {
                        // Nothing to do here.
                    }
                },
                Operation::Metadata(_) | Operation::Loop(_) => {
                    // Nothing to do, these never impact read-write access to bindings.
                }
            }
        }

        // All output tensors that are never read by a following operation should be written to,
        // since they are essentially the "logical" outputs of the shader.
        for out in local_tensor_ids_output {
            let is_read = local_tensor_ids_input.contains(&out);

            if !is_read {
                outputs.push(self.tensors.get(&out).unwrap().clone());
            }
        }

        // All tensors whose latest description is read-only should also be written to, since
        // they are going to be used after the fused kernel by other operations.
        for entry in self.tensors.values() {
            let (tensor, _) = &entry;
            if let burn_fusion::TensorStatus::ReadOnly = tensor.status {
                if self.output_to_local.contains_key(&tensor.id) {
                    outputs.push(entry.clone());
                }
            }
        }

        outputs
    }
}

@@ -0,0 +1,7 @@
mod base;
mod builder;
mod trace;

pub use base::*;
pub use builder::*;
pub use trace::*;

@@ -0,0 +1,133 @@
use super::Scalars;
use crate::codegen::{dialect::gpu, CompilationInfo, InplaceMapping, InputInfo, OutputInfo};
use burn_fusion::TensorDescription;
use serde::{Deserialize, Serialize};

/// A trace encapsulates all information necessary to perform the compilation and execution of
/// captured [tensor operations](burn_fusion::stream::OperationDescription).
///
/// A trace should be built using a [builder](super::TraceBuilder).
#[derive(new, Clone, Serialize, Deserialize)]
pub struct Trace {
    inputs: Vec<(TensorDescription, gpu::Elem)>,
    outputs: Vec<(TensorDescription, gpu::Elem)>,
    locals: Vec<u16>,
    scalars: Scalars,
    scope: gpu::Scope,
}

/// Information necessary to execute a kernel.
|
||||
pub struct ExecutionInfo<'a> {
|
||||
/// Tensor inputs.
|
||||
pub inputs: Vec<&'a TensorDescription>,
|
||||
/// Tensor outputs.
|
||||
pub outputs: Vec<&'a TensorDescription>,
|
||||
/// Scalar inputs.
|
||||
pub scalars: &'a Scalars,
|
||||
}
|
||||
|
||||
impl Trace {
|
||||
/// Collect information related to running the trace.
|
||||
pub fn running(&self) -> ExecutionInfo<'_> {
|
||||
ExecutionInfo {
|
||||
inputs: self.inputs.iter().map(|a| &a.0).collect::<Vec<_>>(),
|
||||
outputs: self.outputs.iter().map(|a| &a.0).collect::<Vec<_>>(),
|
||||
scalars: &self.scalars,
|
||||
}
|
||||
}
|
||||
|
||||
/// Collect information related to compiling the trace.
|
||||
pub fn compiling(&self) -> CompilationInfo {
|
||||
let mut inputs = self
|
||||
.inputs
|
||||
.iter()
|
||||
.map(|(_tensor, elem)| InputInfo::Array {
|
||||
item: gpu::Item::Scalar(*elem),
|
||||
visibility: gpu::Visibility::Read,
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
let outputs = self
|
||||
.outputs
|
||||
.iter()
|
||||
.zip(self.locals.iter())
|
||||
.map(|((_tensor, elem), local)| OutputInfo::Array {
|
||||
item: gpu::Item::Scalar(*elem),
|
||||
local: *local,
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
if self.scalars.num_float > 0 {
|
||||
inputs.push(InputInfo::Scalar {
|
||||
elem: gpu::Elem::Float,
|
||||
size: self.scalars.num_float,
|
||||
})
|
||||
}
|
||||
|
||||
if self.scalars.num_uint > 0 {
|
||||
inputs.push(InputInfo::Scalar {
|
||||
elem: gpu::Elem::UInt,
|
||||
size: self.scalars.num_uint,
|
||||
})
|
||||
}
|
||||
|
||||
if self.scalars.num_int > 0 {
|
||||
inputs.push(InputInfo::Scalar {
|
||||
elem: gpu::Elem::Int,
|
||||
size: self.scalars.num_int,
|
||||
})
|
||||
}
|
||||
|
||||
let mut potential_inplace = self
|
||||
.inputs
|
||||
.iter()
|
||||
.zip(inputs.iter())
|
||||
.enumerate()
|
||||
.filter(|(_pos, ((desc, _elem), _input))| match desc.status {
|
||||
burn_fusion::TensorStatus::ReadOnly => false,
|
||||
burn_fusion::TensorStatus::ReadWrite => true,
|
||||
burn_fusion::TensorStatus::NotInit => false,
|
||||
})
|
||||
.map(|(pos, ((desc, elem), input))| (pos, desc, elem, input))
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
let mappings = self
|
||||
.outputs
|
||||
.iter()
|
||||
.zip(outputs.iter())
|
||||
.enumerate()
|
||||
.filter_map(|(pos, ((desc, elem), _output))| {
|
||||
if potential_inplace.is_empty() {
|
||||
return None;
|
||||
}
|
||||
|
||||
let mut chosen = None;
|
||||
for (index, (_pos_input, desc_input, elem_input, _input)) in
|
||||
potential_inplace.iter().enumerate()
|
||||
{
|
||||
if chosen.is_some() {
|
||||
break;
|
||||
}
|
||||
if desc.shape == desc_input.shape && *elem_input == elem {
|
||||
chosen = Some(index);
|
||||
}
|
||||
}
|
||||
|
||||
match chosen {
|
||||
Some(index) => {
|
||||
let input = potential_inplace.remove(index);
|
||||
Some(InplaceMapping::new(input.0, pos))
|
||||
}
|
||||
None => None,
|
||||
}
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
CompilationInfo {
|
||||
inputs,
|
||||
outputs,
|
||||
scope: self.scope.clone(),
|
||||
mappings,
|
||||
}
|
||||
}
|
||||
}
|
|
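The in-place selection in `compiling` pairs each output with the first still-unclaimed read-write input that has the same shape and element type, removing claimed candidates so an input backs at most one output. A stand-alone sketch of that pairing strategy, with illustrative stand-ins for the real `TensorDescription` metadata:

```rust
/// Illustrative stand-ins for the tensor metadata used above.
#[allow(dead_code)]
#[derive(Clone, Copy, PartialEq)]
enum Elem {
    F32,
    I32,
}

struct Desc {
    shape: Vec<usize>,
    elem: Elem,
    read_write: bool,
}

/// Each output claims the first still-unclaimed read-write input with the
/// same shape and element type; a claimed input backs at most one output.
fn pair_inplace(inputs: &[Desc], outputs: &[Desc]) -> Vec<(usize, usize)> {
    let mut candidates: Vec<usize> = inputs
        .iter()
        .enumerate()
        .filter(|(_, d)| d.read_write)
        .map(|(i, _)| i)
        .collect();

    let mut mappings = Vec::new();
    for (pos_out, out) in outputs.iter().enumerate() {
        if let Some(slot) = candidates
            .iter()
            .position(|&i| inputs[i].shape == out.shape && inputs[i].elem == out.elem)
        {
            mappings.push((candidates.remove(slot), pos_out));
        }
    }
    mappings
}

fn main() {
    let desc = |shape: &[usize], elem, rw| Desc {
        shape: shape.to_vec(),
        elem,
        read_write: rw,
    };
    let inputs = [desc(&[2, 3], Elem::F32, true), desc(&[2, 3], Elem::F32, false)];
    let outputs = [desc(&[2, 3], Elem::F32, false)];
    // Only the read-write input at position 0 can be reused for output 0.
    assert_eq!(pair_inplace(&inputs, &outputs), vec![(0, 0)]);
}
```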
@ -49,6 +49,46 @@ macro_rules! binary {
            _o: core::marker::PhantomData<O>,
        }

        #[allow(clippy::redundant_closure_call)]
        fn compile<C, I, O>(
            settings: $crate::codegen::CompilationSettings,
            mappings: Vec<$crate::codegen::InplaceMapping>,
        ) -> $crate::kernel::SourceTemplate
        where
            C: $crate::codegen::Compiler,
            I: $crate::element::JitElement,
            O: $crate::element::JitElement
        {
            let mut scope = $crate::codegen::dialect::gpu::Scope::root();
            let op = $ops(&mut scope, I::gpu_elem());
            scope.register(op);

            let local = scope.last_local_index().unwrap().index().unwrap();

            let lhs = $crate::codegen::InputInfo::Array {
                item: $crate::codegen::dialect::gpu::Item::Scalar(I::gpu_elem()),
                visibility: $crate::codegen::dialect::gpu::Visibility::Read,
            };
            let rhs = $crate::codegen::InputInfo::Array {
                item: $crate::codegen::dialect::gpu::Item::Scalar(I::gpu_elem()),
                visibility: $crate::codegen::dialect::gpu::Visibility::Read,
            };
            let out = $crate::codegen::OutputInfo::Array {
                item: $crate::codegen::dialect::gpu::Item::Scalar(O::gpu_elem()),
                local,
            };
            let info = $crate::codegen::CompilationInfo {
                inputs: vec![lhs, rhs],
                outputs: vec![out],
                scope,
                mappings,
            };
            let shader = $crate::codegen::Compilation::new(info).compile(settings);

            let compiled = C::compile(shader);
            $crate::kernel::SourceTemplate::new(compiled.to_string())
        }

        #[allow(clippy::redundant_closure_call)]
        impl<C, I, O> $crate::kernel::StaticKernelSource for Ops<C, I, O>
        where

@ -57,28 +97,8 @@ macro_rules! binary {
            O: $crate::element::JitElement
        {
            fn source() -> $crate::kernel::SourceTemplate {
                let shader = $crate::codegen::ElemWiseKernelCodegen::new()
                    .inputs(&[
                        $crate::codegen::Input::Array {
                            item: $crate::codegen::dialect::gpu::Item::Scalar(I::gpu_elem()),
                            visibility: $crate::codegen::dialect::gpu::Visibility::Read,
                            strategy: $crate::codegen::ReadingStrategy::OutputLayout,
                        },
                        $crate::codegen::Input::Array {
                            item: $crate::codegen::dialect::gpu::Item::Scalar(I::gpu_elem()),
                            visibility: $crate::codegen::dialect::gpu::Visibility::Read,
                            strategy: $crate::codegen::ReadingStrategy::OutputLayout,
                        },
                    ])
                    .body(&[$ops(I::gpu_elem())])
                    .outputs(&[$crate::codegen::Output::Array {
                        item: $crate::codegen::dialect::gpu::Item::Scalar(O::gpu_elem()),
                        local: 0,
                    }])
                    .compile();

                let compiled = C::compile(shader);
                $crate::kernel::SourceTemplate::new(compiled.to_string())
                let settings = $crate::codegen::CompilationSettings::default();
                compile::<C, I, O>(settings, Vec::new())
            }
        }

@ -91,29 +111,13 @@ macro_rules! binary {
            O: $crate::element::JitElement
        {
            fn source() -> $crate::kernel::SourceTemplate {
                let shader = $crate::codegen::ElemWiseKernelCodegen::new()
                    .inputs(&[
                        $crate::codegen::Input::Array {
                            item: $crate::codegen::dialect::gpu::Item::Scalar(I::gpu_elem()),
                            visibility: $crate::codegen::dialect::gpu::Visibility::ReadWrite,
                            strategy: $crate::codegen::ReadingStrategy::Plain,
                        },
                        $crate::codegen::Input::Array {
                            item: $crate::codegen::dialect::gpu::Item::Scalar(I::gpu_elem()),
                            visibility: $crate::codegen::dialect::gpu::Visibility::Read,
                            strategy: $crate::codegen::ReadingStrategy::OutputLayout,
                        },
                    ])
                    .body(&[$ops(I::gpu_elem())])
                    .outputs(&[$crate::codegen::Output::Input {
                        item: $crate::codegen::dialect::gpu::Item::Scalar(I::gpu_elem()),
                        input: 0,
                        local: 0,
                    }])
                    .compile();

                let compiled = C::compile(shader);
                $crate::kernel::SourceTemplate::new(compiled.to_string())
                let settings = $crate::codegen::CompilationSettings::default()
                    .inplace(true);
                let mapping = $crate::codegen::InplaceMapping {
                    pos_input: 0,
                    pos_output: 0,
                };
                compile::<C, I, O>(settings, vec![mapping])
            }
        }

@ -126,29 +130,13 @@ macro_rules! binary {
            O: $crate::element::JitElement
        {
            fn source() -> $crate::kernel::SourceTemplate {
                let shader = $crate::codegen::ElemWiseKernelCodegen::new()
                    .inputs(&[
                        $crate::codegen::Input::Array {
                            item: $crate::codegen::dialect::gpu::Item::Scalar(I::gpu_elem()),
                            visibility: $crate::codegen::dialect::gpu::Visibility::Read,
                            strategy: $crate::codegen::ReadingStrategy::OutputLayout,
                        },
                        $crate::codegen::Input::Array {
                            item: $crate::codegen::dialect::gpu::Item::Scalar(I::gpu_elem()),
                            visibility: $crate::codegen::dialect::gpu::Visibility::ReadWrite,
                            strategy: $crate::codegen::ReadingStrategy::Plain,
                        },
                    ])
                    .body(&[$ops(I::gpu_elem())])
                    .outputs(&[$crate::codegen::Output::Input {
                        item: $crate::codegen::dialect::gpu::Item::Scalar(I::gpu_elem()),
                        input: 1,
                        local: 0,
                    }])
                    .compile();

                let compiled = C::compile(shader);
                $crate::kernel::SourceTemplate::new(compiled.to_string())
                let settings = $crate::codegen::CompilationSettings::default()
                    .inplace(true);
                let mapping = $crate::codegen::InplaceMapping {
                    pos_input: 1,
                    pos_output: 0,
                };
                compile::<C, I, O>(settings, vec![mapping])
            }
        }
    };
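The shared `compile` helper above assumes the operation's destination is the most recently created local in the scope: `last_local_index` recovers it, and its index becomes the output binding's `local`. A toy model of that convention (not the real `Scope`):

```rust
/// Toy scope: locals are handed out in creation order.
struct Scope {
    locals: Vec<u16>,
}

impl Scope {
    fn create_local(&mut self) -> u16 {
        let index = self.locals.len() as u16;
        self.locals.push(index);
        index
    }

    fn last_local_index(&self) -> Option<u16> {
        self.locals.last().copied()
    }
}

fn main() {
    let mut scope = Scope { locals: Vec::new() };
    let out = scope.create_local(); // the op's destination
    assert_eq!(scope.last_local_index(), Some(out));
}
```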
@ -1,6 +1,6 @@
use super::unary;
use crate::{
    codegen::dialect::gpu::{ClampOperation, Item, Operation, Variable},
    codegen::dialect::gpu::{ClampOperator, Operator, Scope},
    element::JitElement,
    tensor::JitTensor,
    unary, Runtime,

@ -12,11 +12,11 @@ pub(crate) fn clamp<R: Runtime, E: JitElement, const D: usize>(
    max_value: E,
) -> JitTensor<R, E, D> {
    unary!(
        operation: |elem| Operation::Clamp(ClampOperation {
            input: Variable::Input(0, Item::Scalar(elem)),
            min_value: Variable::Scalar(0, Item::Scalar(elem)),
            max_value: Variable::Scalar(1, Item::Scalar(elem)),
            out: Variable::Local(0, Item::Scalar(elem)),
        operation: |scope: &mut Scope, elem| Operator::Clamp(ClampOperator {
            input: scope.read_array(0, elem),
            min_value: scope.read_scalar(0, elem),
            max_value: scope.read_scalar(1, elem),
            out: scope.create_local(elem),
        }),
        compiler: R::Compiler,
        scalar 2
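The `scalar 2` argument tells the macro that this kernel consumes two scalars; `read_scalar(0, ..)` and `read_scalar(1, ..)` then index the scalar buffer in launch order, so min comes before max here. A toy per-element version of the resulting kernel (assumed convention, illustrative only):

```rust
// Illustrative only: min/max arrive through a scalar buffer, indexed in
// the order the kernel was launched with.
fn clamp_elem(x: f32, scalars: &[f32]) -> f32 {
    let min_value = scalars[0]; // read_scalar(0, elem)
    let max_value = scalars[1]; // read_scalar(1, elem)
    x.max(min_value).min(max_value)
}

fn main() {
    assert_eq!(clamp_elem(2.0, &[0.0, 1.0]), 1.0);
}
```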
@ -1,6 +1,6 @@
use crate::{
    binary,
    codegen::dialect::gpu::{BinaryOperation, Elem, Item, Operation, Variable},
    codegen::dialect::gpu::{BinaryOperator, Elem, Operator, Scope},
    element::JitElement,
    kernel::StaticKernelSource,
    kernel::{binary::binary, unary::unary},

@ -51,10 +51,10 @@ pub fn equal<R: Runtime, E: JitElement, const D: usize>(
    rhs: JitTensor<R, E, D>,
) -> JitTensor<R, u32, D> {
    comparison!(
        binary: |elem: Elem| Operation::Equal(BinaryOperation {
            lhs: Variable::Input(0, Item::Scalar(elem)),
            rhs: Variable::Input(1, Item::Scalar(elem)),
            out: Variable::Local(0, Item::Scalar(Elem::Bool)),
        binary: |scope: &mut Scope, elem: Elem| Operator::Equal(BinaryOperator {
            lhs: scope.read_array(0, elem),
            rhs: scope.read_array(1, elem),
            out: scope.create_local(Elem::Bool),
        }),
        runtime: R,
        input: lhs; rhs,

@ -67,10 +67,10 @@ pub fn greater<R: Runtime, E: JitElement, const D: usize>(
    rhs: JitTensor<R, E, D>,
) -> JitTensor<R, u32, D> {
    comparison!(
        binary: |elem: Elem| Operation::Greater(BinaryOperation {
            lhs: Variable::Input(0, Item::Scalar(elem)),
            rhs: Variable::Input(1, Item::Scalar(elem)),
            out: Variable::Local(0, Item::Scalar(Elem::Bool)),
        binary: |scope: &mut Scope, elem: Elem| Operator::Greater(BinaryOperator {
            lhs: scope.read_array(0, elem),
            rhs: scope.read_array(1, elem),
            out: scope.create_local(Elem::Bool),
        }),
        runtime: R,
        input: lhs; rhs,

@ -83,10 +83,10 @@ pub fn greater_equal<R: Runtime, E: JitElement, const D: usize>(
    rhs: JitTensor<R, E, D>,
) -> JitTensor<R, u32, D> {
    comparison!(
        binary: |elem: Elem| Operation::GreaterEqual(BinaryOperation {
            lhs: Variable::Input(0, Item::Scalar(elem)),
            rhs: Variable::Input(1, Item::Scalar(elem)),
            out: Variable::Local(0, Item::Scalar(Elem::Bool)),
        binary: |scope: &mut Scope, elem: Elem| Operator::GreaterEqual(BinaryOperator {
            lhs: scope.read_array(0, elem),
            rhs: scope.read_array(1, elem),
            out: scope.create_local(Elem::Bool),
        }),
        runtime: R,
        input: lhs; rhs,

@ -99,10 +99,10 @@ pub fn lower<R: Runtime, E: JitElement, const D: usize>(
    rhs: JitTensor<R, E, D>,
) -> JitTensor<R, u32, D> {
    comparison!(
        binary: |elem: Elem| Operation::Lower(BinaryOperation {
            lhs: Variable::Input(0, Item::Scalar(elem)),
            rhs: Variable::Input(1, Item::Scalar(elem)),
            out: Variable::Local(0, Item::Scalar(Elem::Bool)),
        binary: |scope: &mut Scope, elem: Elem| Operator::Lower(BinaryOperator {
            lhs: scope.read_array(0, elem),
            rhs: scope.read_array(1, elem),
            out: scope.create_local(Elem::Bool),
        }),
        runtime: R,
        input: lhs; rhs,

@ -115,10 +115,10 @@ pub fn lower_equal<R: Runtime, E: JitElement, const D: usize>(
    rhs: JitTensor<R, E, D>,
) -> JitTensor<R, u32, D> {
    comparison!(
        binary: |elem: Elem| Operation::LowerEqual(BinaryOperation {
            lhs: Variable::Input(0, Item::Scalar(elem)),
            rhs: Variable::Input(1, Item::Scalar(elem)),
            out: Variable::Local(0, Item::Scalar(Elem::Bool)),
        binary: |scope: &mut Scope, elem: Elem| Operator::LowerEqual(BinaryOperator {
            lhs: scope.read_array(0, elem),
            rhs: scope.read_array(1, elem),
            out: scope.create_local(Elem::Bool),
        }),
        runtime: R,
        input: lhs; rhs,

@ -131,10 +131,10 @@ pub fn equal_elem<R: Runtime, E: JitElement, const D: usize>(
    rhs: E,
) -> JitTensor<R, u32, D> {
    comparison!(
        unary: |elem: Elem| Operation::Equal(BinaryOperation {
            lhs: Variable::Input(0, Item::Scalar(elem)),
            rhs: Variable::Scalar(0, Item::Scalar(elem)),
            out: Variable::Local(0, Item::Scalar(Elem::Bool)),
        unary: |scope: &mut Scope, elem: Elem| Operator::Equal(BinaryOperator {
            lhs: scope.read_array(0, elem),
            rhs: scope.read_scalar(0, elem),
            out: scope.create_local(Elem::Bool),
        }),
        runtime: R,
        input: lhs; rhs,

@ -147,10 +147,10 @@ pub fn greater_elem<R: Runtime, E: JitElement, const D: usize>(
    rhs: E,
) -> JitTensor<R, u32, D> {
    comparison!(
        unary: |elem: Elem| Operation::Greater(BinaryOperation {
            lhs: Variable::Input(0, Item::Scalar(elem)),
            rhs: Variable::Scalar(0, Item::Scalar(elem)),
            out: Variable::Local(0, Item::Scalar(Elem::Bool)),
        unary: |scope: &mut Scope, elem: Elem| Operator::Greater(BinaryOperator {
            lhs: scope.read_array(0, elem),
            rhs: scope.read_scalar(0, elem),
            out: scope.create_local(Elem::Bool),
        }),
        runtime: R,
        input: lhs; rhs,

@ -163,10 +163,10 @@ pub fn lower_elem<R: Runtime, E: JitElement, const D: usize>(
    rhs: E,
) -> JitTensor<R, u32, D> {
    comparison!(
        unary: |elem: Elem| Operation::Lower(BinaryOperation {
            lhs: Variable::Input(0, Item::Scalar(elem)),
            rhs: Variable::Scalar(0, Item::Scalar(elem)),
            out: Variable::Local(0, Item::Scalar(Elem::Bool)),
        unary: |scope: &mut Scope, elem: Elem| Operator::Lower(BinaryOperator {
            lhs: scope.read_array(0, elem),
            rhs: scope.read_scalar(0, elem),
            out: scope.create_local(Elem::Bool),
        }),
        runtime: R,
        input: lhs; rhs,

@ -179,10 +179,10 @@ pub fn greater_equal_elem<R: Runtime, E: JitElement, const D: usize>(
    rhs: E,
) -> JitTensor<R, u32, D> {
    comparison!(
        unary: |elem: Elem| Operation::GreaterEqual(BinaryOperation {
            lhs: Variable::Input(0, Item::Scalar(elem)),
            rhs: Variable::Scalar(0, Item::Scalar(elem)),
            out: Variable::Local(0, Item::Scalar(Elem::Bool)),
        unary: |scope: &mut Scope, elem: Elem| Operator::GreaterEqual(BinaryOperator {
            lhs: scope.read_array(0, elem),
            rhs: scope.read_scalar(0, elem),
            out: scope.create_local(Elem::Bool),
        }),
        runtime: R,
        input: lhs; rhs,

@ -195,10 +195,10 @@ pub fn lower_equal_elem<R: Runtime, E: JitElement, const D: usize>(
    rhs: E,
) -> JitTensor<R, u32, D> {
    comparison!(
        unary: |elem: Elem| Operation::LowerEqual(BinaryOperation {
            lhs: Variable::Input(0, Item::Scalar(elem)),
            rhs: Variable::Scalar(0, Item::Scalar(elem)),
            out: Variable::Local(0, Item::Scalar(Elem::Bool)),
        unary: |scope: &mut Scope, elem: Elem| Operator::LowerEqual(BinaryOperator {
            lhs: scope.read_array(0, elem),
            rhs: scope.read_scalar(0, elem),
            out: scope.create_local(Elem::Bool),
        }),
        runtime: R,
        input: lhs; rhs,
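Two shapes of comparison kernel fall out of this file: tensor-tensor versions go through the `binary:` arm (two array reads), tensor-scalar versions through the `unary:` arm (one array read plus a scalar slot); both write a `Bool` local that lands in a `u32` tensor. A toy CPU equivalent of the two shapes (illustrative, not the generated kernels):

```rust
// Tensor-tensor: two array reads per element.
fn greater(lhs: &[f32], rhs: &[f32]) -> Vec<u32> {
    lhs.iter().zip(rhs).map(|(a, b)| (a > b) as u32).collect()
}

// Tensor-scalar: one array read plus a scalar slot.
fn greater_elem(lhs: &[f32], rhs: f32) -> Vec<u32> {
    lhs.iter().map(|a| (*a > rhs) as u32).collect()
}

fn main() {
    assert_eq!(greater(&[1.0, 3.0], &[2.0, 2.0]), vec![0, 1]);
    assert_eq!(greater_elem(&[1.0, 3.0], 2.0), vec![0, 1]);
}
```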
@ -55,6 +55,42 @@ macro_rules! unary {
            _e: core::marker::PhantomData<E>,
        }

        #[allow(clippy::redundant_closure_call)]
        fn compile<C, E>(
            settings: $crate::codegen::CompilationSettings,
            mappings: Vec<$crate::codegen::InplaceMapping>,
        ) -> $crate::kernel::SourceTemplate
        where
            C: $crate::codegen::Compiler,
            E: $crate::element::JitElement
        {
            let mut scope = $crate::codegen::dialect::gpu::Scope::root();
            let op = $ops(&mut scope, E::gpu_elem());
            scope.register(op);

            let local = scope.last_local_index().unwrap().index().unwrap();

            let input = $crate::codegen::InputInfo::Array {
                item: $crate::codegen::dialect::gpu::Item::Scalar(E::gpu_elem()),
                visibility: $crate::codegen::dialect::gpu::Visibility::Read,
            };
            let out = $crate::codegen::OutputInfo::Array {
                item: $crate::codegen::dialect::gpu::Item::Scalar(E::gpu_elem()),
                local,
            };
            let info = $crate::codegen::CompilationInfo {
                inputs: vec![input],
                outputs: vec![out],
                scope,
                mappings,
            };
            let shader = $crate::codegen::Compilation::new(info).compile(settings);

            let compiled = C::compile(shader);
            $crate::kernel::SourceTemplate::new(compiled.to_string())
        }

        #[allow(clippy::redundant_closure_call)]
        impl<C, E> $crate::kernel::StaticKernelSource for Ops<C, E>
        where

@ -62,21 +98,8 @@ macro_rules! unary {
            E: $crate::element::JitElement,
        {
            fn source() -> $crate::kernel::SourceTemplate {
                let shader = $crate::codegen::ElemWiseKernelCodegen::new()
                    .inputs(&[$crate::codegen::Input::Array {
                        item: $crate::codegen::dialect::gpu::Item::Scalar(E::gpu_elem()),
                        visibility: $crate::codegen::dialect::gpu::Visibility::Read,
                        strategy: $crate::codegen::ReadingStrategy::OutputLayout,
                    }])
                    .body(&[$ops(E::gpu_elem())])
                    .outputs(&[$crate::codegen::Output::Array {
                        item: $crate::codegen::dialect::gpu::Item::Scalar(E::gpu_elem()),
                        local: 0,
                    }])
                    .compile();

                let compiled = C::compile(shader);
                $crate::kernel::SourceTemplate::new(compiled.to_string())
                let settings = $crate::codegen::CompilationSettings::default();
                compile::<C, E>(settings, Vec::new())
            }
        }

@ -87,22 +110,13 @@ macro_rules! unary {
            E: $crate::element::JitElement,
        {
            fn source() -> $crate::kernel::SourceTemplate {
                let shader = $crate::codegen::ElemWiseKernelCodegen::new()
                    .inputs(&[$crate::codegen::Input::Array {
                        item: $crate::codegen::dialect::gpu::Item::Scalar(E::gpu_elem()),
                        visibility: $crate::codegen::dialect::gpu::Visibility::ReadWrite,
                        strategy: $crate::codegen::ReadingStrategy::Plain,
                    }])
                    .body(&[$ops(E::gpu_elem())])
                    .outputs(&[$crate::codegen::Output::Input {
                        item: $crate::codegen::dialect::gpu::Item::Scalar(E::gpu_elem()),
                        input: 0,
                        local: 0,
                    }])
                    .compile();

                let compiled = C::compile(shader);
                $crate::kernel::SourceTemplate::new(compiled.to_string())
                let settings = $crate::codegen::CompilationSettings::default()
                    .inplace(true);
                let mapping = $crate::codegen::InplaceMapping {
                    pos_input: 0,
                    pos_output: 0,
                };
                compile::<C, E>(settings, vec![mapping])
            }
        }
    };

@ -120,6 +134,46 @@ macro_rules! unary {
            _e: core::marker::PhantomData<E>,
        }

        #[allow(clippy::redundant_closure_call)]
        fn compile<C, E>(
            settings: $crate::codegen::CompilationSettings,
            mappings: Vec<$crate::codegen::InplaceMapping>,
        ) -> $crate::kernel::SourceTemplate
        where
            C: $crate::codegen::Compiler,
            E: $crate::element::JitElement
        {
            let mut scope = $crate::codegen::dialect::gpu::Scope::root();
            let op = $ops(&mut scope, E::gpu_elem());
            scope.register(op);

            let local = scope.last_local_index().unwrap().index().unwrap();

            let input = $crate::codegen::InputInfo::Array {
                item: $crate::codegen::dialect::gpu::Item::Scalar(E::gpu_elem()),
                visibility: $crate::codegen::dialect::gpu::Visibility::Read,
            };
            let scalars = $crate::codegen::InputInfo::Scalar {
                elem: E::gpu_elem(),
                size: $num,
            };
            let out = $crate::codegen::OutputInfo::Array {
                item: $crate::codegen::dialect::gpu::Item::Scalar(E::gpu_elem()),
                local,
            };
            let info = $crate::codegen::CompilationInfo {
                inputs: vec![input, scalars],
                outputs: vec![out],
                scope,
                mappings,
            };
            let shader = $crate::codegen::Compilation::new(info).compile(settings);

            let compiled = C::compile(shader);
            $crate::kernel::SourceTemplate::new(compiled.to_string())
        }

        #[allow(clippy::redundant_closure_call)]
        impl<C, E> $crate::kernel::StaticKernelSource for Ops<C, E>
        where

@ -127,27 +181,8 @@ macro_rules! unary {
            E: $crate::element::JitElement,
        {
            fn source() -> $crate::kernel::SourceTemplate {
                let shader = $crate::codegen::ElemWiseKernelCodegen::new()
                    .inputs(&[
                        $crate::codegen::Input::Array {
                            item: $crate::codegen::dialect::gpu::Item::Scalar(E::gpu_elem()),
                            visibility: $crate::codegen::dialect::gpu::Visibility::Read,
                            strategy: $crate::codegen::ReadingStrategy::OutputLayout,
                        },
                        $crate::codegen::Input::Scalar {
                            elem: E::gpu_elem(),
                            size: $num,
                        },
                    ])
                    .body(&[$ops(E::gpu_elem())])
                    .outputs(&[$crate::codegen::Output::Array {
                        item: $crate::codegen::dialect::gpu::Item::Scalar(E::gpu_elem()),
                        local: 0,
                    }])
                    .compile();

                let compiled = C::compile(shader);
                $crate::kernel::SourceTemplate::new(compiled.to_string())
                let settings = $crate::codegen::CompilationSettings::default();
                compile::<C, E>(settings, Vec::new())
            }
        }

@ -158,28 +193,13 @@ macro_rules! unary {
            E: $crate::element::JitElement,
        {
            fn source() -> $crate::kernel::SourceTemplate {
                let shader = $crate::codegen::ElemWiseKernelCodegen::new()
                    .inputs(&[
                        $crate::codegen::Input::Array {
                            item: $crate::codegen::dialect::gpu::Item::Scalar(E::gpu_elem()),
                            visibility: $crate::codegen::dialect::gpu::Visibility::ReadWrite,
                            strategy: $crate::codegen::ReadingStrategy::Plain,
                        },
                        $crate::codegen::Input::Scalar {
                            elem: E::gpu_elem(),
                            size: $num,
                        },
                    ])
                    .body(&[$ops(E::gpu_elem())])
                    .outputs(&[$crate::codegen::Output::Input {
                        item: $crate::codegen::dialect::gpu::Item::Scalar(E::gpu_elem()),
                        input: 0,
                        local: 0,
                    }])
                    .compile();

                let compiled = C::compile(shader);
                $crate::kernel::SourceTemplate::new(compiled.to_string())
                let settings = $crate::codegen::CompilationSettings::default()
                    .inplace(true);
                let mapping = $crate::codegen::InplaceMapping {
                    pos_input: 0,
                    pos_output: 0,
                };
                compile::<C, E>(settings, vec![mapping])
            }
        }
    };
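The scalar-carrying arm above differs from the first only by declaring an extra `InputInfo::Scalar { elem, size: $num }` binding, so a kernel using N scalars gets a single scalar buffer of length N that `read_scalar(i, ..)` indexes into. A toy equivalent of such a kernel, here shaped like `add_scalar` (illustrative only):

```rust
// Illustrative: one scalar binding of size N; read_scalar(i, ..) maps to
// indexing that buffer.
fn unary_with_scalars(input: &[f32], scalars: &[f32]) -> Vec<f32> {
    input.iter().map(|x| x + scalars[0]).collect()
}

fn main() {
    assert_eq!(unary_with_scalars(&[1.0, 2.0], &[10.0]), vec![11.0, 12.0]);
}
```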
@ -243,14 +263,14 @@ where
#[cfg(test)]
mod tests {
    use super::*;
    use crate::codegen::dialect::gpu::{Item, Operation, UnaryOperation, Variable};
    use crate::codegen::dialect::gpu::{Operator, Scope, UnaryOperator};
    use crate::tests::{ReferenceBackend, TestBackend, TestCompiler, TestRuntime};
    use burn_tensor::{Distribution, Tensor};

    unary!(
        operation: |elem| Operation::Tanh(UnaryOperation {
            input: Variable::Input(0, Item::Scalar(elem)),
            out: Variable::Local(0, Item::Scalar(elem)),
        operation: |scope: &mut Scope, elem| Operator::Tanh(UnaryOperator {
            input: scope.read_array(0, elem),
            out: scope.create_local(elem),
        }),
        compiler: TestCompiler
    );
@ -1,7 +1,5 @@
use super::numeric;
use crate::codegen::dialect::gpu::{
    BinaryOperation, Elem, Item, Operation, UnaryOperation, Variable,
};
use crate::codegen::dialect::gpu::{BinaryOperator, Elem, Operator, Scope, UnaryOperator};
#[cfg(not(feature = "autotune"))]
use crate::kernel::matmul::init_matmul_output;
#[cfg(feature = "autotune")]

@ -364,9 +362,9 @@ impl<R: Runtime> FloatTensorOps<Self> for JitBackend<R> {

    fn float_exp<const D: usize>(tensor: FloatTensor<Self, D>) -> FloatTensor<Self, D> {
        unary!(
            operation: |elem: Elem| Operation::Exp(UnaryOperation {
                input: Variable::Input(0, Item::Scalar(elem)),
                out: Variable::Local(0, Item::Scalar(elem)),
            operation: |scope: &mut Scope, elem: Elem| Operator::Exp(UnaryOperator {
                input: scope.read_array(0, elem),
                out: scope.create_local(elem),
            }),
            runtime: R,
            input: tensor,

@ -376,9 +374,9 @@ impl<R: Runtime> FloatTensorOps<Self> for JitBackend<R> {

    fn float_log<const D: usize>(tensor: FloatTensor<Self, D>) -> FloatTensor<Self, D> {
        unary!(
            operation: |elem: Elem| Operation::Log(UnaryOperation {
                input: Variable::Input(0, Item::Scalar(elem)),
                out: Variable::Local(0, Item::Scalar(elem)),
            operation: |scope: &mut Scope, elem: Elem| Operator::Log(UnaryOperator {
                input: scope.read_array(0, elem),
                out: scope.create_local(elem),
            }),
            runtime: R,
            input: tensor,

@ -388,9 +386,9 @@ impl<R: Runtime> FloatTensorOps<Self> for JitBackend<R> {

    fn float_log1p<const D: usize>(tensor: FloatTensor<Self, D>) -> FloatTensor<Self, D> {
        unary!(
            operation: |elem: Elem| Operation::Log1p(UnaryOperation {
                input: Variable::Input(0, Item::Scalar(elem)),
                out: Variable::Local(0, Item::Scalar(elem)),
            operation: |scope: &mut Scope, elem: Elem| Operator::Log1p(UnaryOperator {
                input: scope.read_array(0, elem),
                out: scope.create_local(elem),
            }),
            runtime: R,
            input: tensor,

@ -403,10 +401,10 @@ impl<R: Runtime> FloatTensorOps<Self> for JitBackend<R> {
        rhs: f32,
    ) -> FloatTensor<Self, D> {
        unary!(
            operation: |elem: Elem| Operation::Powf(BinaryOperation {
                lhs: Variable::Input(0, Item::Scalar(elem)),
                rhs: Variable::Scalar(0, Item::Scalar(elem)),
                out: Variable::Local(0, Item::Scalar(elem)),
            operation: |scope: &mut Scope, elem: Elem| Operator::Powf(BinaryOperator {
                lhs: scope.read_array(0, elem),
                rhs: scope.read_scalar(0, elem),
                out: scope.create_local(elem),
            }),
            runtime: R,
            input: lhs; rhs.elem(),

@ -416,9 +414,9 @@ impl<R: Runtime> FloatTensorOps<Self> for JitBackend<R> {

    fn float_sqrt<const D: usize>(tensor: FloatTensor<Self, D>) -> FloatTensor<Self, D> {
        unary!(
            operation: |elem: Elem| Operation::Sqrt(UnaryOperation {
                input: Variable::Input(0, Item::Scalar(elem)),
                out: Variable::Local(0, Item::Scalar(elem)),
            operation: |scope: &mut Scope, elem: Elem| Operator::Sqrt(UnaryOperator {
                input: scope.read_array(0, elem),
                out: scope.create_local(elem),
            }),
            runtime: R,
            input: tensor,

@ -428,9 +426,9 @@ impl<R: Runtime> FloatTensorOps<Self> for JitBackend<R> {

    fn float_abs<const D: usize>(tensor: FloatTensor<Self, D>) -> FloatTensor<Self, D> {
        unary!(
            operation: |elem: Elem| Operation::Abs(UnaryOperation {
                input: Variable::Input(0, Item::Scalar(elem)),
                out: Variable::Local(0, Item::Scalar(elem)),
            operation: |scope: &mut Scope, elem: Elem| Operator::Abs(UnaryOperator {
                input: scope.read_array(0, elem),
                out: scope.create_local(elem),
            }),
            runtime: R,
            input: tensor,

@ -440,9 +438,9 @@ impl<R: Runtime> FloatTensorOps<Self> for JitBackend<R> {

    fn float_cos<const D: usize>(tensor: FloatTensor<Self, D>) -> FloatTensor<Self, D> {
        unary!(
            operation: |elem: Elem| Operation::Cos(UnaryOperation {
                input: Variable::Input(0, Item::Scalar(elem)),
                out: Variable::Local(0, Item::Scalar(elem)),
            operation: |scope: &mut Scope, elem: Elem| Operator::Cos(UnaryOperator {
                input: scope.read_array(0, elem),
                out: scope.create_local(elem),
            }),
            runtime: R,
            input: tensor,

@ -452,9 +450,9 @@ impl<R: Runtime> FloatTensorOps<Self> for JitBackend<R> {

    fn float_sin<const D: usize>(tensor: FloatTensor<Self, D>) -> FloatTensor<Self, D> {
        unary!(
            operation: |elem: Elem| Operation::Sin(UnaryOperation {
                input: Variable::Input(0, Item::Scalar(elem)),
                out: Variable::Local(0, Item::Scalar(elem)),
            operation: |scope: &mut Scope, elem: Elem| Operator::Sin(UnaryOperator {
                input: scope.read_array(0, elem),
                out: scope.create_local(elem),
            }),
            runtime: R,
            input: tensor,

@ -464,9 +462,9 @@ impl<R: Runtime> FloatTensorOps<Self> for JitBackend<R> {

    fn float_tanh<const D: usize>(tensor: FloatTensor<Self, D>) -> FloatTensor<Self, D> {
        unary!(
            operation: |elem: Elem| Operation::Tanh(UnaryOperation {
                input: Variable::Input(0, Item::Scalar(elem)),
                out: Variable::Local(0, Item::Scalar(elem)),
            operation: |scope: &mut Scope, elem: Elem| Operator::Tanh(UnaryOperator {
                input: scope.read_array(0, elem),
                out: scope.create_local(elem),
            }),
            runtime: R,
            input: tensor,

@ -476,9 +474,9 @@ impl<R: Runtime> FloatTensorOps<Self> for JitBackend<R> {

    fn float_erf<const D: usize>(tensor: FloatTensor<Self, D>) -> FloatTensor<Self, D> {
        unary!(
            operation: |elem: Elem| Operation::Erf(UnaryOperation {
                input: Variable::Input(0, Item::Scalar(elem)),
                out: Variable::Local(0, Item::Scalar(elem)),
            operation: |scope: &mut Scope, elem: Elem| Operator::Erf(UnaryOperator {
                input: scope.read_array(0, elem),
                out: scope.create_local(elem),
            }),
            runtime: R,
            input: tensor,

@ -521,9 +519,9 @@ impl<R: Runtime> FloatTensorOps<Self> for JitBackend<R> {

    fn float_recip<const D: usize>(tensor: FloatTensor<Self, D>) -> FloatTensor<Self, D> {
        unary!(
            operation: |elem: Elem| Operation::Recip(UnaryOperation {
                input: Variable::Input(0, Item::Scalar(elem)),
                out: Variable::Local(0, Item::Scalar(elem)),
            operation: |scope: &mut Scope, elem: Elem| Operator::Recip(UnaryOperator {
                input: scope.read_array(0, elem),
                out: scope.create_local(elem),
            }),
            runtime: R,
            input: tensor,
@ -1,5 +1,5 @@
use super::numeric;
use crate::codegen::dialect::gpu::{Elem, Item, Operation, UnaryOperation, Variable};
use crate::codegen::dialect::gpu::{Elem, Item, Operator, Scope, UnaryOperator};
use crate::kernel::reduce::{self, init_reduce_output};
use crate::{kernel, unary, JitBackend, Runtime};
use burn_tensor::ops::{BoolTensor, Device, FloatTensor, IntElem, IntTensor};

@ -281,9 +281,9 @@ impl<R: Runtime> IntTensorOps<Self> for JitBackend<R> {

    fn int_abs<const D: usize>(tensor: IntTensor<Self, D>) -> IntTensor<Self, D> {
        unary!(
            operation: |elem: Elem| Operation::Abs(UnaryOperation {
                input: Variable::Input(0, Item::Scalar(elem)),
                out: Variable::Local(0, Item::Scalar(elem)),
            operation: |scope: &mut Scope, elem: Elem| Operator::Abs(UnaryOperator {
                input: scope.read_array(0, Item::Scalar(elem)),
                out: scope.create_local(elem),
            }),
            runtime: R,
            input: tensor,
@ -1,6 +1,4 @@
use crate::codegen::dialect::gpu::{
    BinaryOperation, Elem, Item, Operation, UnaryOperation, Variable,
};
use crate::codegen::dialect::gpu::{BinaryOperator, Elem, Operator, Scope, UnaryOperator};
use crate::{binary, Runtime};
use crate::{element::JitElement, tensor::JitTensor, unary};
use burn_compute::client::ComputeClient;

@ -25,9 +23,9 @@ pub fn full_device<R: Runtime, E: JitElement, const D: usize>(
    let empty = empty_device(client, device, shape);

    unary!(
        operation: |elem: Elem| Operation::AssignLocal(UnaryOperation {
            input: Variable::Scalar(0, Item::Scalar(elem)),
            out: Variable::Local(0, Item::Scalar(elem)),
        operation: |scope: &mut Scope, elem: Elem| Operator::AssignLocal(UnaryOperator {
            input: scope.read_scalar(0, elem),
            out: scope.create_local(elem),
        }),
        runtime: R,
        input: empty; value,

@ -84,10 +82,10 @@ pub fn add<R: Runtime, E: JitElement, const D: usize>(
    rhs: JitTensor<R, E, D>,
) -> JitTensor<R, E, D> {
    binary!(
        operation: |elem: Elem| Operation::Add(BinaryOperation {
            lhs: Variable::Input(0, Item::Scalar(elem)),
            rhs: Variable::Input(1, Item::Scalar(elem)),
            out: Variable::Local(0, Item::Scalar(elem)),
        operation: |scope: &mut Scope, elem: Elem| Operator::Add(BinaryOperator {
            lhs: scope.read_array(0, elem),
            rhs: scope.read_array(1, elem),
            out: scope.create_local(elem),
        }),
        runtime: R,
        input: lhs; rhs,

@ -100,10 +98,10 @@ pub fn add_scalar<R: Runtime, E: JitElement, const D: usize>(
    rhs: E,
) -> JitTensor<R, E, D> {
    unary!(
        operation: |elem: Elem| Operation::Add(BinaryOperation {
            lhs: Variable::Input(0, Item::Scalar(elem)),
            rhs: Variable::Scalar(0, Item::Scalar(elem)),
            out: Variable::Local(0, Item::Scalar(elem)),
        operation: |scope: &mut Scope, elem: Elem| Operator::Add(BinaryOperator {
            lhs: scope.read_array(0, elem),
            rhs: scope.read_scalar(0, elem),
            out: scope.create_local(elem),
        }),
        runtime: R,
        input: lhs; rhs,

@ -116,10 +114,10 @@ pub fn sub<R: Runtime, E: JitElement, const D: usize>(
    rhs: JitTensor<R, E, D>,
) -> JitTensor<R, E, D> {
    binary!(
        operation: |elem: Elem| Operation::Sub(BinaryOperation {
            lhs: Variable::Input(0, Item::Scalar(elem)),
            rhs: Variable::Input(1, Item::Scalar(elem)),
            out: Variable::Local(0, Item::Scalar(elem)),
        operation: |scope: &mut Scope, elem: Elem| Operator::Sub(BinaryOperator {
            lhs: scope.read_array(0, elem),
            rhs: scope.read_array(1, elem),
            out: scope.create_local(elem),
        }),
        runtime: R,
        input: lhs; rhs,

@ -132,10 +130,10 @@ pub fn sub_scalar<R: Runtime, E: JitElement, const D: usize>(
    rhs: E,
) -> JitTensor<R, E, D> {
    unary!(
        operation: |elem: Elem| Operation::Sub(BinaryOperation {
            lhs: Variable::Input(0, Item::Scalar(elem)),
            rhs: Variable::Scalar(0, Item::Scalar(elem)),
            out: Variable::Local(0, Item::Scalar(elem)),
        operation: |scope: &mut Scope, elem: Elem| Operator::Sub(BinaryOperator {
            lhs: scope.read_array(0, elem),
            rhs: scope.read_scalar(0, elem),
            out: scope.create_local(elem),
        }),
        runtime: R,
        input: lhs; rhs,

@ -148,10 +146,10 @@ pub fn mul<R: Runtime, E: JitElement, const D: usize>(
    rhs: JitTensor<R, E, D>,
) -> JitTensor<R, E, D> {
    binary!(
        operation: |elem: Elem| Operation::Mul(BinaryOperation {
            lhs: Variable::Input(0, Item::Scalar(elem)),
            rhs: Variable::Input(1, Item::Scalar(elem)),
            out: Variable::Local(0, Item::Scalar(elem)),
        operation: |scope: &mut Scope, elem: Elem| Operator::Mul(BinaryOperator {
            lhs: scope.read_array(0, elem),
            rhs: scope.read_array(1, elem),
            out: scope.create_local(elem),
        }),
        runtime: R,
        input: lhs; rhs,

@ -164,10 +162,10 @@ pub fn mul_scalar<R: Runtime, E: JitElement, const D: usize>(
    rhs: E,
) -> JitTensor<R, E, D> {
    unary!(
        operation: |elem: Elem| Operation::Mul(BinaryOperation {
            lhs: Variable::Input(0, Item::Scalar(elem)),
            rhs: Variable::Scalar(0, Item::Scalar(elem)),
            out: Variable::Local(0, Item::Scalar(elem)),
        operation: |scope: &mut Scope, elem: Elem| Operator::Mul(BinaryOperator {
            lhs: scope.read_array(0, elem),
            rhs: scope.read_scalar(0, elem),
            out: scope.create_local(elem),
        }),
        runtime: R,
        input: lhs; rhs,

@ -180,10 +178,10 @@ pub fn div<R: Runtime, E: JitElement, const D: usize>(
    rhs: JitTensor<R, E, D>,
) -> JitTensor<R, E, D> {
    binary!(
        operation: |elem: Elem| Operation::Div(BinaryOperation {
            lhs: Variable::Input(0, Item::Scalar(elem)),
            rhs: Variable::Input(1, Item::Scalar(elem)),
            out: Variable::Local(0, Item::Scalar(elem)),
        operation: |scope: &mut Scope, elem: Elem| Operator::Div(BinaryOperator {
            lhs: scope.read_array(0, elem),
            rhs: scope.read_array(1, elem),
            out: scope.create_local(elem),
        }),
        runtime: R,
        input: lhs; rhs,

@ -196,10 +194,10 @@ pub fn div_scalar<R: Runtime, E: JitElement, const D: usize>(
    rhs: E,
) -> JitTensor<R, E, D> {
    unary!(
        operation: |elem: Elem| Operation::Div(BinaryOperation {
            lhs: Variable::Input(0, Item::Scalar(elem)),
            rhs: Variable::Scalar(0, Item::Scalar(elem)),
            out: Variable::Local(0, Item::Scalar(elem)),
        operation: |scope: &mut Scope, elem: Elem| Operator::Div(BinaryOperator {
            lhs: scope.read_array(0, elem),
            rhs: scope.read_scalar(0, elem),
            out: scope.create_local(elem),
        }),
        runtime: R,
        input: lhs; rhs,

@ -212,10 +210,10 @@ pub fn pow<R: Runtime, E: JitElement, const D: usize>(
    rhs: JitTensor<R, E, D>,
) -> JitTensor<R, E, D> {
    binary!(
        operation: |elem: Elem| Operation::Powf(BinaryOperation {
            lhs: Variable::Input(0, Item::Scalar(elem)),
            rhs: Variable::Input(1, Item::Scalar(elem)),
            out: Variable::Local(0, Item::Scalar(elem)),
        operation: |scope: &mut Scope, elem: Elem| Operator::Powf(BinaryOperator {
            lhs: scope.read_array(0, elem),
            rhs: scope.read_array(1, elem),
            out: scope.create_local(elem),
        }),
        runtime: R,
        input: lhs; rhs,
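`full_device` above needs no dedicated fill kernel: `AssignLocal` copies the scalar value into the output local for every element, so initialization reuses the ordinary unary machinery. A toy CPU analogue of that trick (illustrative only):

```rust
// Illustrative: a "fill" expressed as a unary kernel that ignores the
// element index and assigns one scalar everywhere.
fn full(len: usize, value: f32) -> Vec<f32> {
    (0..len).map(|_| value).collect()
}

fn main() {
    assert_eq!(full(3, 7.0), vec![7.0, 7.0, 7.0]);
}
```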
@ -1,4 +1,4 @@
use crate::codegen::dialect::gpu::{Elem, Item, Operation, UnaryOperation, Variable};
use crate::codegen::dialect::gpu::{Elem, Operator, Scope, UnaryOperator};
use crate::element::JitElement;
use crate::{unary, Runtime};
use burn_compute::client::ComputeClient;

@ -145,9 +145,9 @@ where
        //
        // The solution is just to use a simple unary compute shader.
        unary!(
            operation: |elem: Elem| Operation::AssignLocal(UnaryOperation {
                input: Variable::Input(0, Item::Scalar(elem)),
                out: Variable::Local(0, Item::Scalar(elem)),
            operation: |scope: &mut Scope, elem: Elem| Operator::AssignLocal(UnaryOperator {
                input: scope.read_array(0, elem),
                out: scope.create_local(elem),
            }),
            runtime: R,
            input: self.clone(),
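The same `AssignLocal` trick doubles as a plain copy here: reading each input element through the layout-aware indexing and assigning it to a fresh output local yields a contiguous clone regardless of the source's strides. A toy analogue with an explicit 2D stride remap (illustrative only; the real kernel delegates indexing to the read strategy):

```rust
// Illustrative: copying through an index computation produces a contiguous
// buffer even when the source uses arbitrary strides.
fn to_contiguous_2d(src: &[f32], shape: (usize, usize), strides: (usize, usize)) -> Vec<f32> {
    let (rows, cols) = shape;
    let mut out = Vec::with_capacity(rows * cols);
    for r in 0..rows {
        for c in 0..cols {
            out.push(src[r * strides.0 + c * strides.1]); // AssignLocal(input -> local)
        }
    }
    out
}

fn main() {
    // A 2x2 "transposed" view over [1, 2, 3, 4]: strides (1, 2).
    assert_eq!(
        to_contiguous_2d(&[1.0, 2.0, 3.0, 4.0], (2, 2), (1, 2)),
        vec![1.0, 3.0, 2.0, 4.0]
    );
}
```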