Feat/wgpu backend setup (#376)

2023-06-02 11:52:47 -04:00 · 2023-06-02 11:52:47 -04:00 · 974fdfaba1
parent 483f9acca5
commit 974fdfaba1
40 changed files with 2485 additions and 24 deletions
--- a/Cargo.toml
+++ b/Cargo.toml
@ -14,6 +14,7 @@ members = [
  "burn-ndarray",
  "burn-no-std-tests",
  "burn-tch",
+  "burn-wgpu",
  "burn-tensor-testgen",
  "burn-tensor",
  "burn-train",
@ -53,6 +54,11 @@ syn = "2.0"
 tempfile = "3.5.0"
 thiserror = "1.0.40"
 topological-sort = "0.2.2"
+
+# WGPU stuff
+wgpu = "0.16.0"
+futures-intrusive = "0.5"
+pollster = "0.3"
 #
 # The following packages disable the "std" feature for no_std compatibility
 #
--- a/burn-core/src/nn/attention/mask.rs
+++ b/burn-core/src/nn/attention/mask.rs
@ -19,7 +19,7 @@ pub fn generate_autoregressive_mask<B: Backend>(

    mask = mask.to_device(device).repeat(0, batch_size);

-    mask.equal_elem(1_i64)
+    mask.equal_elem(1_i64.elem::<i64>())
 }

 pub struct GeneratePaddingMask<B: Backend> {
--- a/burn-tensor/src/tensor/api/base.rs
+++ b/burn-tensor/src/tensor/api/base.rs
@ -256,12 +256,6 @@ where
        K::equal(self.primitive, other.primitive)
    }

-    /// Applies element wise equal comparison and returns a boolean tensor.
-    pub fn equal_elem<E: Into<K::Elem>>(self, other: E) -> Tensor<B, D, Bool> {
-        let elem: K::Elem = other.into();
-        K::equal_elem::<D>(self.primitive, elem)
-    }
-
    /// Concatenates all tensors into a new one along the given dimension.
    ///
    /// # Panics
@ -400,7 +394,6 @@ pub trait BasicOps<B: Backend>: TensorKind<B> {
        lhs: Self::Primitive<D>,
        rhs: Self::Primitive<D>,
    ) -> Tensor<B, D, Bool>;
-    fn equal_elem<const D: usize>(lhs: Self::Primitive<D>, rhs: Self::Elem) -> Tensor<B, D, Bool>;
    fn elem_type_name() -> &'static str {
        core::any::type_name::<Self::Elem>()
    }
@ -478,10 +471,6 @@ impl<B: Backend> BasicOps<B> for Float {
    ) -> Tensor<B, D, Bool> {
        Tensor::new(B::equal(lhs, rhs))
    }
-
-    fn equal_elem<const D: usize>(lhs: Self::Primitive<D>, rhs: Self::Elem) -> Tensor<B, D, Bool> {
-        Tensor::new(B::equal_elem(lhs, rhs))
-    }
 }

 impl<B: Backend> BasicOps<B> for Int {
@ -553,10 +542,6 @@ impl<B: Backend> BasicOps<B> for Int {
        Tensor::new(B::int_equal(lhs, rhs))
    }

-    fn equal_elem<const D: usize>(lhs: Self::Primitive<D>, rhs: Self::Elem) -> Tensor<B, D, Bool> {
-        Tensor::new(B::int_equal_elem(lhs, rhs))
-    }
-
    fn cat<const D: usize>(vectors: Vec<Self::Primitive<D>>, dim: usize) -> Self::Primitive<D> {
        B::int_cat(vectors, dim)
    }
@ -631,10 +616,6 @@ impl<B: Backend> BasicOps<B> for Bool {
        Tensor::new(B::bool_equal(lhs, rhs))
    }

-    fn equal_elem<const D: usize>(lhs: Self::Primitive<D>, rhs: Self::Elem) -> Tensor<B, D, Bool> {
-        Tensor::new(B::bool_equal_elem(lhs, rhs))
-    }
-
    fn cat<const D: usize>(vectors: Vec<Self::Primitive<D>>, dim: usize) -> Self::Primitive<D> {
        B::bool_cat(vectors, dim)
    }
--- a/burn-tensor/src/tensor/api/numeric.rs
+++ b/burn-tensor/src/tensor/api/numeric.rs
@ -133,6 +133,11 @@ where
        Self::new(K::sum_dim(self.primitive, dim))
    }

+    /// Applies element wise equal comparison and returns a boolean tensor.
+    pub fn equal_elem<E: Element>(self, other: E) -> Tensor<B, D, Bool> {
+        K::equal_elem::<D>(self.primitive, other.elem())
+    }
+
    /// Applies element wise greater comparison and returns a boolean tensor.
    ///
    /// # Panics
@ -413,6 +418,7 @@ where
    fn sum_dim<const D: usize>(tensor: Self::Primitive<D>, dim: usize) -> Self::Primitive<D>;
    fn mean<const D: usize>(tensor: Self::Primitive<D>) -> Self::Primitive<1>;
    fn mean_dim<const D: usize>(tensor: Self::Primitive<D>, dim: usize) -> Self::Primitive<D>;
+    fn equal_elem<const D: usize>(lhs: Self::Primitive<D>, rhs: Self::Elem) -> Tensor<B, D, Bool>;
    fn greater<const D: usize>(
        lhs: Self::Primitive<D>,
        rhs: Self::Primitive<D>,
@ -559,6 +565,9 @@ impl<B: Backend> Numeric<B> for Int {
        B::int_mean_dim(tensor, dim)
    }

+    fn equal_elem<const D: usize>(lhs: Self::Primitive<D>, rhs: Self::Elem) -> Tensor<B, D, Bool> {
+        Tensor::new(B::int_equal_elem(lhs, rhs))
+    }
    fn greater<const D: usize>(
        lhs: Self::Primitive<D>,
        rhs: Self::Primitive<D>,
@ -777,6 +786,9 @@ impl<B: Backend> Numeric<B> for Float {
        B::mean_dim(tensor, dim)
    }

+    fn equal_elem<const D: usize>(lhs: Self::Primitive<D>, rhs: Self::Elem) -> Tensor<B, D, Bool> {
+        Tensor::new(B::equal_elem(lhs, rhs))
+    }
    fn greater<const D: usize>(
        lhs: Self::Primitive<D>,
        rhs: Self::Primitive<D>,
--- a/burn-tensor/src/tensor/backend/base.rs
+++ b/burn-tensor/src/tensor/backend/base.rs
@ -79,7 +79,7 @@ pub trait Backend:
    /// Tensor primitive to be used for all int operations.
    type IntTensorPrimitive<const D: usize>: Clone + Send + Sync + 'static + core::fmt::Debug;
    /// Int element type.
-    type IntElem: Element + From<i64> + Into<i64>;
+    type IntElem: Element;

    /// Tensor primitive to be used for all bool operations.
    type BoolTensorPrimitive<const D: usize>: Clone + Send + Sync + 'static + core::fmt::Debug;
--- a/burn-tensor/src/tensor/ops/tensor.rs
+++ b/burn-tensor/src/tensor/ops/tensor.rs
@ -22,7 +22,9 @@ pub trait TensorOps<B: Backend> {
    }
    fn shape<const D: usize>(tensor: &B::TensorPrimitive<D>) -> Shape<D>;
    fn to_data<const D: usize>(tensor: &B::TensorPrimitive<D>) -> Data<B::FloatElem, D>;
-    fn into_data<const D: usize>(tensor: B::TensorPrimitive<D>) -> Data<B::FloatElem, D>;
+    fn into_data<const D: usize>(tensor: B::TensorPrimitive<D>) -> Data<B::FloatElem, D> {
+        Self::to_data(&tensor)
+    }
    fn device<const D: usize>(tensor: &B::TensorPrimitive<D>) -> B::Device;
    fn to_device<const D: usize>(
        tensor: B::TensorPrimitive<D>,
@ -102,7 +104,9 @@ pub trait TensorOps<B: Backend> {
        lhs: B::TensorPrimitive<D>,
        rhs: B::TensorPrimitive<D>,
    ) -> B::TensorPrimitive<D>;
-    fn neg<const D: usize>(tensor: B::TensorPrimitive<D>) -> B::TensorPrimitive<D>;
+    fn neg<const D: usize>(tensor: B::TensorPrimitive<D>) -> B::TensorPrimitive<D> {
+        Self::mul_scalar(tensor, (-1.0_f32).elem::<B::FloatElem>())
+    }
    fn transpose<const D: usize>(tensor: B::TensorPrimitive<D>) -> B::TensorPrimitive<D> {
        Self::swap_dims(tensor, D - 2, D - 1)
    }
--- a/burn-tensor/src/tests/ops/add.rs
+++ b/burn-tensor/src/tests/ops/add.rs
@ -15,4 +15,17 @@ mod tests {
        let data_expected = Data::from([[6.0, 8.0, 10.0], [12.0, 14.0, 16.0]]);
        assert_eq!(data_expected, data_actual);
    }
+
+    #[test]
+    fn test_add_broadcast() {
+        let data_1 = Data::from([[0.0, 1.0, 2.0]]);
+        let data_2 = Data::from([[3.0, 4.0, 5.0], [6.0, 7.0, 8.0]]);
+        let tensor_1 = Tensor::<TestBackend, 2>::from_data(data_1);
+        let tensor_2 = Tensor::<TestBackend, 2>::from_data(data_2);
+
+        let data_actual = (tensor_1 + tensor_2).into_data();
+
+        let data_expected = Data::from([[3.0, 5.0, 7.0], [6.0, 8.0, 10.0]]);
+        assert_eq!(data_expected, data_actual);
+    }
 }
--- a/burn-tensor/src/tests/ops/log.rs
+++ b/burn-tensor/src/tests/ops/log.rs
@ -4,7 +4,7 @@ mod tests {
    use burn_tensor::{Data, Tensor};

    #[test]
-    fn should_support_exp_ops() {
+    fn should_support_log_ops() {
        let data = Data::from([[0.0, 1.0, 2.0], [3.0, 4.0, 5.0]]);
        let tensor = Tensor::<TestBackend, 2>::from_data(data);

--- a/burn-wgpu/Cargo.toml
+++ b/burn-wgpu/Cargo.toml
@ -0,0 +1,32 @@
+[package]
+authors = ["nathanielsimard <nathaniel.simard.42@gmail.com>"]
+categories = ["science"]
+description = "WGPU backend for burn"
+edition = "2021"
+keywords = ["deep-learning", "machine-learning", "data"]
+license = "MIT/Apache-2.0"
+name = "burn-wgpu"
+readme = "README.md"
+repository = "https://github.com/burn-rs/burn/tree/main/burn-wgpu"
+version = "0.8.0"
+
+[dependencies]
+burn-tensor = {path = "../burn-tensor", version = "0.8.0"}
+burn-common = {path = "../burn-common", version = "0.8.0"}
+derive-new = {workspace = true}
+bytemuck = {workspace = true}
+rand = {workspace = true}
+num-traits = {workspace = true}
+
+# WGPU stuff
+wgpu = {workspace = true}
+futures-intrusive = {workspace = true}
+pollster = {workspace = true}
+
+[dev-dependencies]
+burn-autodiff = {path = "../burn-autodiff", version = "0.8.0", default-features = false, features = [
+  "export_tests",
+]}
+burn-tensor = {path = "../burn-tensor", version = "0.8.0", default-features = false, features = [
+  "export_tests",
+]}
--- a/burn-wgpu/LICENSE-APACHE
+++ b/burn-wgpu/LICENSE-APACHE
@ -0,0 +1 @@
+../LICENSE-APACHE
--- a/burn-wgpu/LICENSE-MIT
+++ b/burn-wgpu/LICENSE-MIT
@ -0,0 +1 @@
+../LICENSE-MIT
--- a/burn-wgpu/README.md
+++ b/burn-wgpu/README.md
@ -0,0 +1,3 @@
+# Burn WGPU Backend
+
+[Burn](https://github.com/burn-rs/burn) WGPU backend
--- a/burn-wgpu/src/backend.rs
+++ b/burn-wgpu/src/backend.rs
@ -0,0 +1,45 @@
+use burn_tensor::backend::Backend;
+use rand::{rngs::StdRng, SeedableRng};
+
+use crate::{
+    element::{FloatElement, IntElement},
+    tensor::WGPUTensor,
+    GraphicsAPI, WGPUDevice,
+};
+use std::{marker::PhantomData, sync::Mutex};
+
+pub(crate) static SEED: Mutex<Option<StdRng>> = Mutex::new(None);
+
+#[derive(Debug, Default, Clone)]
+pub struct WGPUBackend<G: GraphicsAPI, F: FloatElement, I: IntElement> {
+    _g: PhantomData<G>,
+    _f: PhantomData<F>,
+    _i: PhantomData<I>,
+}
+
+impl<G: GraphicsAPI + 'static, F: FloatElement, I: IntElement> Backend for WGPUBackend<G, F, I> {
+    type Device = WGPUDevice;
+    type FullPrecisionBackend = WGPUBackend<G, f32, i32>;
+
+    type FullPrecisionElem = f32;
+    type FloatElem = F;
+    type IntElem = I;
+
+    type TensorPrimitive<const D: usize> = WGPUTensor<F, D>;
+    type IntTensorPrimitive<const D: usize> = WGPUTensor<I, D>;
+    type BoolTensorPrimitive<const D: usize> = WGPUTensor<u32, D>;
+
+    fn name() -> String {
+        String::from("wgpu")
+    }
+
+    fn seed(seed: u64) {
+        let rng = StdRng::seed_from_u64(seed);
+        let mut seed = SEED.lock().unwrap();
+        *seed = Some(rng);
+    }
+
+    fn ad_enabled() -> bool {
+        false
+    }
+}
--- a/burn-wgpu/src/context.rs
+++ b/burn-wgpu/src/context.rs
@ -0,0 +1,256 @@
+use burn_common::id::IdGenerator;
+use std::{
+    any::TypeId,
+    borrow::Cow,
+    collections::HashMap,
+    sync::{Arc, Mutex},
+};
+
+use wgpu::{
+    util::{BufferInitDescriptor, DeviceExt},
+    Buffer, DeviceDescriptor, DeviceType, ShaderModule, ShaderModuleDescriptor,
+};
+
+use crate::{kernel::KernelGenerator, GraphicsAPI, WGPUDevice};
+
+/// The context is the basic struct that allows to execute GPU kernel on devices.
+///
+/// You can access a context for a [wgpu device](WGPUDevice) using [get_context](crate::pool::get_context).
+#[derive(Debug)]
+pub struct Context {
+    id: String,
+    queue: wgpu::Queue,
+    device_wgpu: wgpu::Device,
+    cache: Mutex<HashMap<TypeId, Arc<ShaderModule>>>,
+    pub(crate) device: WGPUDevice,
+}
+
+#[derive(new, Clone, Debug)]
+pub struct WorkGroup {
+    pub x: u32,
+    pub y: u32,
+    pub z: u32,
+}
+
+impl Context {
+    pub(crate) fn new<G: GraphicsAPI>(device: &WGPUDevice) -> Self {
+        // Instantiates instance of WebGPU
+        let instance = wgpu::Instance::default();
+
+        // `request_adapter` instantiates the general connection to the GPU
+        let adapters = instance.enumerate_adapters(G::backend().into());
+        let mut adapters = adapters
+            .filter(|adapter| {
+                let device_type = adapter.get_info().device_type;
+                match device {
+                    WGPUDevice::DiscreteGPU(_) => device_type == DeviceType::DiscreteGpu,
+                    WGPUDevice::IntegratedGPU(_) => device_type == DeviceType::IntegratedGpu,
+                    WGPUDevice::VirtualGPU(_) => device_type == DeviceType::VirtualGpu,
+                    WGPUDevice::CPU => device_type == DeviceType::Cpu,
+                }
+            })
+            .collect::<Vec<_>>();
+
+        let adapter = match device {
+            WGPUDevice::DiscreteGPU(num) => {
+                assert!(adapters.len() > *num, "No Discrete GPU device found");
+                adapters.remove(*num)
+            }
+            WGPUDevice::IntegratedGPU(num) => {
+                assert!(adapters.len() > *num, "No Integrated GPU device found");
+                adapters.remove(*num)
+            }
+            WGPUDevice::VirtualGPU(num) => {
+                assert!(adapters.len() > *num, "No Virtual GPU device found");
+                adapters.remove(*num)
+            }
+            WGPUDevice::CPU => {
+                assert!(!adapters.is_empty(), "No CPU device found");
+                adapters.remove(0)
+            }
+        };
+
+        let device_wgpu = device.clone();
+        let (device, queue) = pollster::block_on(adapter.request_device(
+            &DeviceDescriptor {
+                label: None,
+                features: wgpu::Features::empty(),
+                limits: wgpu::Limits::downlevel_defaults(),
+            },
+            None,
+        ))
+        .expect("Unable to request the device with the adapter");
+
+        Self {
+            id: IdGenerator::generate(),
+            queue,
+            device_wgpu: device,
+            device: device_wgpu,
+            cache: Mutex::new(HashMap::new()),
+        }
+    }
+
+    /// Create a new buffer with the provided size.
+    pub fn create_buffer(&self, size: usize) -> Buffer {
+        self.device_wgpu.create_buffer(&wgpu::BufferDescriptor {
+            label: None,
+            size: size as u64,
+            usage: wgpu::BufferUsages::COPY_DST
+                | wgpu::BufferUsages::STORAGE
+                | wgpu::BufferUsages::COPY_SRC,
+            mapped_at_creation: false,
+        })
+    }
+
+    /// Create a new buffer initialized with the provided bytes.
+    pub fn create_buffer_with_data(&self, data: &[u8]) -> Buffer {
+        let buffer_src = self.device_wgpu.create_buffer_init(&BufferInitDescriptor {
+            label: Some("Buffer Src"),
+            contents: data,
+            usage: wgpu::BufferUsages::COPY_SRC,
+        });
+
+        let buffer = self.create_buffer(buffer_src.size() as usize);
+
+        // Create a command encoder
+        let mut encoder =
+            self.device_wgpu
+                .create_command_encoder(&wgpu::CommandEncoderDescriptor {
+                    label: Some("Command Encoder"),
+                });
+
+        // Copy data from the staging buffer to the target buffer
+        encoder.copy_buffer_to_buffer(&buffer_src, 0, &buffer, 0, buffer_src.size());
+
+        // Submit the command encoder to the queue
+        self.queue.submit(std::iter::once(encoder.finish()));
+
+        buffer
+    }
+
+    /// Read a buffer from the GPU and return its content as bytes.
+    pub fn buffer_to_data(&self, buffer: &Buffer) -> Vec<u8> {
+        let size = buffer.size();
+
+        let buffer_dest = self.device_wgpu.create_buffer(&wgpu::BufferDescriptor {
+            label: None,
+            size,
+            usage: wgpu::BufferUsages::MAP_READ | wgpu::BufferUsages::COPY_DST,
+            mapped_at_creation: false,
+        });
+
+        // Create a command encoder
+        let mut encoder =
+            self.device_wgpu
+                .create_command_encoder(&wgpu::CommandEncoderDescriptor {
+                    label: Some("Command Encoder"),
+                });
+
+        encoder.copy_buffer_to_buffer(buffer, 0, &buffer_dest, 0, size);
+
+        self.queue.submit(std::iter::once(encoder.finish()));
+
+        let buffer_slice = buffer_dest.slice(..);
+        let (sender, receiver) = futures_intrusive::channel::shared::oneshot_channel();
+        buffer_slice.map_async(wgpu::MapMode::Read, move |v| {
+            sender
+                .send(v)
+                .expect("Unable to send buffer slice result to async channel.")
+        });
+
+        self.device_wgpu.poll(wgpu::Maintain::Wait);
+
+        let result = pollster::block_on(receiver.receive());
+
+        if let Some(Ok(())) = result {
+            let data = buffer_slice.get_mapped_range();
+            let result = bytemuck::cast_slice(&data).to_vec();
+
+            drop(data);
+            buffer_dest.unmap();
+            result
+        } else {
+            panic!("Unable to read buffer {:?}", result)
+        }
+    }
+
+    /// Compile a kernel template if not present in the cache.
+    pub fn compile<K: KernelGenerator>(&self) -> Arc<ShaderModule> {
+        let mut cache = self.cache.lock().unwrap();
+        let template_id = TypeId::of::<K>();
+
+        if let Some(module) = cache.get(&template_id) {
+            return module.clone();
+        }
+
+        let source = K::generate();
+
+        let module = self
+            .device_wgpu
+            .create_shader_module(ShaderModuleDescriptor {
+                label: None,
+                source: wgpu::ShaderSource::Wgsl(Cow::Borrowed(source.as_ref())),
+            });
+        let module = Arc::new(module);
+
+        cache.insert(template_id, module.clone());
+
+        module
+    }
+
+    /// Execute a kernel using the provided buffers.
+    ///
+    /// # Notes
+    ///
+    /// This function isn't safe, buffer can be mutated by the GPU. The users must ensure that a
+    /// buffer can be mutated when lauching a compute shaders with write access to a buffer.
+    ///
+    /// Buffer positions are used as bindings when lauching a compute kernel.
+    pub fn execute(&self, work_group: &WorkGroup, kernel: &ShaderModule, buffers: &[&Buffer]) {
+        let pipeline = self
+            .device_wgpu
+            .create_compute_pipeline(&wgpu::ComputePipelineDescriptor {
+                label: None,
+                layout: None,
+                module: kernel,
+                entry_point: "main",
+            });
+
+        let group_layout = pipeline.get_bind_group_layout(0);
+
+        let entries = buffers
+            .iter()
+            .enumerate()
+            .map(|(i, buffer)| wgpu::BindGroupEntry {
+                binding: i as u32,
+                resource: buffer.as_entire_binding(),
+            })
+            .collect::<Vec<_>>();
+
+        let bind_group = self
+            .device_wgpu
+            .create_bind_group(&wgpu::BindGroupDescriptor {
+                label: None,
+                layout: &group_layout,
+                entries: &entries,
+            });
+
+        let mut encoder = self
+            .device_wgpu
+            .create_command_encoder(&wgpu::CommandEncoderDescriptor { label: None });
+        let mut compute = encoder.begin_compute_pass(&wgpu::ComputePassDescriptor { label: None });
+        compute.set_pipeline(&pipeline);
+        compute.set_bind_group(0, &bind_group, &[]);
+
+        compute.dispatch_workgroups(work_group.x, work_group.y, work_group.z);
+        std::mem::drop(compute);
+
+        self.queue.submit(Some(encoder.finish()));
+    }
+}
+
+impl PartialEq for Context {
+    fn eq(&self, other: &Self) -> bool {
+        self.id == other.id
+    }
+}
--- a/burn-wgpu/src/device.rs
+++ b/burn-wgpu/src/device.rs
@ -0,0 +1,25 @@
+/// The device struct when using the `wgpu` backend.
+///
+/// Note that you need to provide the device index when using a GPU backend.
+///
+/// # Example
+///
+/// ```no_run
+/// use burn_wgpu::WGPUDevice;
+///
+/// let device_gpu_1 = WGPUDevice::DiscreteGPU(0); // First discrete GPU found.
+/// let device_gpu_2 = WGPUDevice::DiscreteGPU(1);  // Second discrete GPU found.
+/// ```
+#[derive(Clone, Debug, Hash, PartialEq, Eq)]
+pub enum WGPUDevice {
+    DiscreteGPU(usize),
+    IntegratedGPU(usize),
+    VirtualGPU(usize),
+    CPU,
+}
+
+impl Default for WGPUDevice {
+    fn default() -> Self {
+        Self::CPU
+    }
+}
--- a/burn-wgpu/src/element.rs
+++ b/burn-wgpu/src/element.rs
@ -0,0 +1,66 @@
+use burn_tensor::Element;
+
+pub trait WGPUElement: core::fmt::Debug + 'static + Clone
+where
+    Self: Sized,
+{
+    fn type_name() -> &'static str;
+    fn as_bytes(slice: &[Self]) -> &[u8];
+    fn from_bytes(bytes: &[u8]) -> &[Self];
+}
+
+pub trait FloatElement: WGPUElement + Element {}
+
+pub trait IntElement: WGPUElement + Element {}
+
+impl WGPUElement for u32 {
+    fn type_name() -> &'static str {
+        "u32"
+    }
+    fn as_bytes(slice: &[Self]) -> &[u8] {
+        bytemuck::cast_slice(slice)
+    }
+    fn from_bytes(bytes: &[u8]) -> &[Self] {
+        bytemuck::cast_slice(bytes)
+    }
+}
+
+impl WGPUElement for i32 {
+    fn type_name() -> &'static str {
+        "i32"
+    }
+    fn as_bytes(slice: &[Self]) -> &[u8] {
+        bytemuck::cast_slice(slice)
+    }
+    fn from_bytes(bytes: &[u8]) -> &[Self] {
+        bytemuck::cast_slice(bytes)
+    }
+}
+
+impl WGPUElement for i64 {
+    fn type_name() -> &'static str {
+        "i64"
+    }
+    fn as_bytes(slice: &[Self]) -> &[u8] {
+        bytemuck::cast_slice(slice)
+    }
+    fn from_bytes(bytes: &[u8]) -> &[Self] {
+        bytemuck::cast_slice(bytes)
+    }
+}
+
+impl WGPUElement for f32 {
+    fn type_name() -> &'static str {
+        "f32"
+    }
+    fn as_bytes(slice: &[Self]) -> &[u8] {
+        bytemuck::cast_slice(slice)
+    }
+    fn from_bytes(bytes: &[u8]) -> &[Self] {
+        bytemuck::cast_slice(bytes)
+    }
+}
+
+impl FloatElement for f32 {}
+impl IntElement for i32 {}
+impl IntElement for i64 {}
--- a/burn-wgpu/src/graphics.rs
+++ b/burn-wgpu/src/graphics.rs
@ -0,0 +1,61 @@
+/// The basic trait to specify which graphics API to use as Backend.
+///
+/// Options are:
+///   - [Vulkan](Vulkan)
+///   - [Metal](Metal)
+///   - [OpenGL](OpenGL)
+///   - [DirectX 11](Dx11)
+///   - [DirectX 12](Dx12)
+///   - [WebGPU](WebGPU)
+pub trait GraphicsAPI: Send + Sync + core::fmt::Debug + Default + Clone + 'static {
+    fn backend() -> wgpu::Backend;
+}
+
+#[derive(Default, Debug, Clone)]
+pub struct Vulkan;
+#[derive(Default, Debug, Clone)]
+pub struct Metal;
+#[derive(Default, Debug, Clone)]
+pub struct OpenGL;
+#[derive(Default, Debug, Clone)]
+pub struct Dx11;
+#[derive(Default, Debug, Clone)]
+pub struct Dx12;
+#[derive(Default, Debug, Clone)]
+pub struct WebGPU;
+
+impl GraphicsAPI for Vulkan {
+    fn backend() -> wgpu::Backend {
+        wgpu::Backend::Vulkan
+    }
+}
+
+impl GraphicsAPI for Metal {
+    fn backend() -> wgpu::Backend {
+        wgpu::Backend::Metal
+    }
+}
+
+impl GraphicsAPI for OpenGL {
+    fn backend() -> wgpu::Backend {
+        wgpu::Backend::Gl
+    }
+}
+
+impl GraphicsAPI for Dx11 {
+    fn backend() -> wgpu::Backend {
+        wgpu::Backend::Dx11
+    }
+}
+
+impl GraphicsAPI for Dx12 {
+    fn backend() -> wgpu::Backend {
+        wgpu::Backend::Dx12
+    }
+}
+
+impl GraphicsAPI for WebGPU {
+    fn backend() -> wgpu::Backend {
+        wgpu::Backend::BrowserWebGpu
+    }
+}
--- a/burn-wgpu/src/kernel/base.rs
+++ b/burn-wgpu/src/kernel/base.rs
@ -0,0 +1,87 @@
+use crate::element::WGPUElement;
+use std::marker::PhantomData;
+
+/// Generate wgpu kernel source code to create [compute shader modules](wgpu::ShaderModule).
+pub trait KernelGenerator: 'static {
+    /// Source code concrete type.
+    type Source: AsRef<str>;
+
+    /// Generate the source code.
+    fn generate() -> Self::Source;
+}
+
+#[macro_export]
+macro_rules! kernel_wgsl {
+    (
+        $struct:ident,
+        $file:expr
+    ) => {
+        #[derive(new)]
+        pub struct $struct;
+
+        impl $crate::kernel::KernelGenerator for $struct {
+            type Source = &'static str;
+
+            fn generate() -> Self::Source {
+                include_str!($file)
+            }
+        }
+    };
+}
+
+/// Generate kernel source code by replacing some information using templating.
+pub struct KernelSettings<
+    K: KernelGenerator,
+    E: WGPUElement,
+    I: WGPUElement,
+    const WORKGROUP_X_SIZE: usize,
+    const WORKGROUP_Y_SIZE: usize,
+    const WORKGROUP_Z_SIZE: usize,
+> {
+    _k: PhantomData<K>,
+    _e: PhantomData<E>,
+    _i: PhantomData<I>,
+}
+
+impl<
+        K: KernelGenerator,
+        E: WGPUElement,
+        I: WGPUElement,
+        const WORKGROUP_X_SIZE: usize,
+        const WORKGROUP_Y_SIZE: usize,
+        const WORKGROUP_Z_SIZE: usize,
+    > KernelGenerator
+    for KernelSettings<K, E, I, WORKGROUP_X_SIZE, WORKGROUP_Y_SIZE, WORKGROUP_Z_SIZE>
+{
+    type Source = String;
+
+    fn generate() -> String {
+        let mut source = K::generate().as_ref().to_string();
+
+        source = source.replace("WORKGROUP_SIZE_X", &WORKGROUP_X_SIZE.to_string());
+        source = source.replace("WORKGROUP_SIZE_Y", &WORKGROUP_Y_SIZE.to_string());
+        source = source.replace("WORKGROUP_SIZE_Z", &WORKGROUP_Y_SIZE.to_string());
+        source = source.replace("elem", E::type_name());
+        source = source.replace("int", I::type_name());
+
+        source
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use core::any::TypeId;
+
+    #[test]
+    fn test_kernel_type_id() {
+        kernel_wgsl!(Add, "../template/binary_elemwise.wgsl");
+
+        let type_id_1 = TypeId::of::<KernelSettings<Add, f32, i32, 2, 3, 4>>();
+        let type_id_2 = TypeId::of::<KernelSettings<Add, f32, i32, 2, 3, 5>>();
+        let type_id_3 = TypeId::of::<KernelSettings<Add, f32, i32, 2, 3, 4>>();
+
+        assert_ne!(type_id_1, type_id_2);
+        assert_eq!(type_id_1, type_id_3);
+    }
+}
--- a/burn-wgpu/src/kernel/binary_elemwise.rs
+++ b/burn-wgpu/src/kernel/binary_elemwise.rs
@ -0,0 +1,159 @@
+use super::{KernelGenerator, KernelSettings};
+use crate::{context::WorkGroup, element::WGPUElement, kernel_wgsl, tensor::WGPUTensor};
+use burn_tensor::Shape;
+use num_traits::ToPrimitive;
+use std::sync::Arc;
+
+kernel_wgsl!(BinaryElemwiseRaw, "../template/binary_elemwise.wgsl");
+kernel_wgsl!(
+    BinaryElemwiseInplaceRaw,
+    "../template/binary_elemwise_inplace.wgsl"
+);
+
+#[macro_export]
+macro_rules! binary_elemwise {
+    (
+        $struct:ident,
+        $ops:expr
+    ) => {
+        pub struct $struct;
+
+        impl $crate::kernel::KernelGenerator for $struct {
+            type Source = String;
+
+            fn generate() -> Self::Source {
+                let source = $crate::kernel::BinaryElemwiseRaw::generate().to_string();
+                let body = format!(
+                    "output[global_id.x] = lhs[index_lhs] {} rhs[index_rhs]",
+                    $ops
+                );
+                source.replace("BODY", &body)
+            }
+        }
+    };
+}
+
+#[macro_export]
+macro_rules! binary_elemwise_inplace {
+    (
+        $struct:ident,
+        $ops:expr
+    ) => {
+        pub struct $struct;
+
+        impl $crate::kernel::KernelGenerator for $struct {
+            type Source = String;
+
+            fn generate() -> Self::Source {
+                let source = $crate::kernel::BinaryElemwiseInplaceRaw::generate().to_string();
+                let body = format!(
+                    "lhs[global_id.x] = lhs[global_id.x] {} rhs[index_rhs];",
+                    $ops
+                );
+                source.replace("BODY", &body)
+            }
+        }
+    };
+}
+
+pub fn binary_elemwise<K: KernelGenerator, E: WGPUElement, const D: usize>(
+    lhs: WGPUTensor<E, D>,
+    rhs: WGPUTensor<E, D>,
+) -> WGPUTensor<E, D> {
+    lhs.assert_is_on_save_device(&rhs);
+
+    let mut shape_out = [0; D];
+    lhs.shape
+        .dims
+        .iter()
+        .zip(rhs.shape.dims.iter())
+        .enumerate()
+        .for_each(|(index, (dim_lhs, dim_rhs))| {
+            shape_out[index] = usize::max(*dim_lhs, *dim_rhs);
+        });
+
+    let shape_out = Shape::new(shape_out);
+
+    let buffer = lhs
+        .context
+        .create_buffer(shape_out.num_elements() * core::mem::size_of::<E>());
+    let output = WGPUTensor::new(lhs.context.clone(), shape_out, Arc::new(buffer));
+
+    let kernel = lhs
+        .context
+        .compile::<KernelSettings<K, E, i32, 256, 1, 1>>();
+    let mut info: Vec<u32> = vec![D.to_u32().unwrap()];
+
+    lhs.strides
+        .into_iter()
+        .for_each(|v| info.push(v.to_u32().unwrap()));
+    rhs.strides
+        .into_iter()
+        .for_each(|v| info.push(v.to_u32().unwrap()));
+    lhs.shape
+        .dims
+        .into_iter()
+        .for_each(|v| info.push(v.to_u32().unwrap()));
+    rhs.shape
+        .dims
+        .into_iter()
+        .for_each(|v| info.push(v.to_u32().unwrap()));
+    let info_buffers = lhs
+        .context
+        .create_buffer_with_data(bytemuck::cast_slice(&info));
+
+    lhs.context.execute(
+        &WorkGroup::new(
+            f32::ceil(output.shape.num_elements() as f32 / 256_f32) as u32,
+            1,
+            1,
+        ),
+        &kernel,
+        &[&lhs.buffer, &rhs.buffer, &output.buffer, &info_buffers],
+    );
+
+    output
+}
+pub fn binary_elemwise_inplace<K: KernelGenerator, E: WGPUElement, const D: usize>(
+    lhs: WGPUTensor<E, D>,
+    rhs: WGPUTensor<E, D>,
+) -> WGPUTensor<E, D> {
+    lhs.assert_is_on_save_device(&rhs);
+
+    let mut shape_out = [0; D];
+    lhs.shape
+        .dims
+        .iter()
+        .zip(rhs.shape.dims.iter())
+        .enumerate()
+        .for_each(|(index, (dim_lhs, dim_rhs))| {
+            shape_out[index] = usize::max(*dim_lhs, *dim_rhs);
+        });
+
+    let kernel = lhs
+        .context
+        .compile::<KernelSettings<K, E, i32, 256, 1, 1>>();
+    let mut info: Vec<u32> = vec![D.to_u32().unwrap()];
+    rhs.strides
+        .into_iter()
+        .for_each(|v| info.push(v.to_u32().unwrap()));
+    rhs.shape
+        .dims
+        .into_iter()
+        .for_each(|v| info.push(v.to_u32().unwrap()));
+    let info_buffers = lhs
+        .context
+        .create_buffer_with_data(bytemuck::cast_slice(&info));
+
+    lhs.context.execute(
+        &WorkGroup::new(
+            f32::ceil(lhs.shape.num_elements() as f32 / 256_f32) as u32,
+            1,
+            1,
+        ),
+        &kernel,
+        &[&lhs.buffer, &rhs.buffer, &info_buffers],
+    );
+
+    lhs
+}
--- a/burn-wgpu/src/kernel/mod.rs
+++ b/burn-wgpu/src/kernel/mod.rs
@ -0,0 +1,9 @@
+mod base;
+mod binary_elemwise;
+mod unary;
+mod unary_scalar;
+
+pub use base::*;
+pub use binary_elemwise::*;
+pub use unary::*;
+pub use unary_scalar::*;
--- a/burn-wgpu/src/kernel/unary.rs
+++ b/burn-wgpu/src/kernel/unary.rs
@ -0,0 +1,120 @@
+use super::{KernelGenerator, KernelSettings};
+use crate::{context::WorkGroup, element::WGPUElement, kernel_wgsl, tensor::WGPUTensor};
+use std::sync::Arc;
+
+kernel_wgsl!(UnaryRaw, "../template/unary.wgsl");
+kernel_wgsl!(UnaryInplaceRaw, "../template/unary_inplace.wgsl");
+
+#[macro_export]
+macro_rules! unary {
+    (
+        $struct:ident,
+        func $func:expr
+    ) => {
+        pub struct $struct;
+
+        impl $crate::kernel::KernelGenerator for $struct {
+            type Source = String;
+
+            fn generate() -> Self::Source {
+                let source = $crate::kernel::UnaryRaw::generate().to_string();
+                let body = format!("output[global_id.x] = {}(input[global_id.x]);", $func);
+                source.replace("BODY", &body)
+            }
+        }
+    };
+    (
+        $struct:ident,
+        body $body:expr
+    ) => {
+        pub struct $struct;
+
+        impl $crate::kernel::KernelGenerator for $struct {
+            type Source = String;
+
+            fn generate() -> Self::Source {
+                let source = $crate::kernel::UnaryRaw::generate().to_string();
+                source.replace("BODY", $body)
+            }
+        }
+    };
+}
+
+#[macro_export]
+macro_rules! unary_inplace {
+    (
+        $struct:ident,
+        func $func:expr
+    ) => {
+        pub struct $struct;
+
+        impl $crate::kernel::KernelGenerator for $struct {
+            type Source = String;
+
+            fn generate() -> Self::Source {
+                let source = $crate::kernel::UnaryInplaceRaw::generate().to_string();
+                let body = format!("input[global_id.x] = {}(input[global_id.x]);", $func);
+                source.replace("BODY", &body)
+            }
+        }
+    };
+    (
+        $struct:ident,
+        body $body:expr
+    ) => {
+        pub struct $struct;
+
+        impl $crate::kernel::KernelGenerator for $struct {
+            type Source = String;
+
+            fn generate() -> Self::Source {
+                let source = $crate::kernel::UnaryInplaceRaw::generate().to_string();
+                source.replace("BODY", $body)
+            }
+        }
+    };
+}
+
+pub fn unary<K: KernelGenerator, E: WGPUElement, const D: usize>(
+    input: WGPUTensor<E, D>,
+) -> WGPUTensor<E, D> {
+    let buffer = input
+        .context
+        .create_buffer(input.shape.num_elements() * core::mem::size_of::<E>());
+    let output = WGPUTensor::new(input.context.clone(), input.shape, Arc::new(buffer));
+    let kernel = input
+        .context
+        .compile::<KernelSettings<K, E, i32, 256, 1, 1>>();
+
+    input.context.execute(
+        &WorkGroup::new(
+            f32::ceil(output.shape.num_elements() as f32 / 256_f32) as u32,
+            1,
+            1,
+        ),
+        &kernel,
+        &[&input.buffer, &output.buffer],
+    );
+
+    output
+}
+
+pub fn unary_inplace<K: KernelGenerator, E: WGPUElement, const D: usize>(
+    input: WGPUTensor<E, D>,
+) -> WGPUTensor<E, D> {
+    let kernel = input
+        .context
+        .compile::<KernelSettings<K, E, i32, 256, 1, 1>>();
+
+    input.context.execute(
+        &WorkGroup::new(
+            f32::ceil(input.shape.num_elements() as f32 / 256_f32) as u32,
+            1,
+            1,
+        ),
+        &kernel,
+        &[&input.buffer],
+    );
+
+    input
+}
--- a/burn-wgpu/src/kernel/unary_scalar.rs
+++ b/burn-wgpu/src/kernel/unary_scalar.rs
@ -0,0 +1,135 @@
+use super::{KernelGenerator, KernelSettings};
+use crate::{context::WorkGroup, element::WGPUElement, kernel_wgsl, tensor::WGPUTensor};
+use std::sync::Arc;
+
+kernel_wgsl!(UnaryScalarRaw, "../template/unary_scalar.wgsl");
+kernel_wgsl!(
+    UnaryScalarInplaceRaw,
+    "../template/unary_scalar_inplace.wgsl"
+);
+
+#[macro_export]
+macro_rules! unary_scalar {
+    (
+        $struct:ident,
+        ops $ops:expr
+    ) => {
+        pub struct $struct;
+
+        impl $crate::kernel::KernelGenerator for $struct {
+            type Source = String;
+
+            fn generate() -> Self::Source {
+                let source = $crate::kernel::UnaryScalarRaw::generate().to_string();
+                let body = format!("output[global_id.x] = lhs[global_id.x] {} rhs;", $ops);
+
+                source.replace("BODY", &body)
+            }
+        }
+    };
+
+    (
+        $struct:ident,
+        func $func:expr
+    ) => {
+        pub struct $struct;
+
+        impl $crate::kernel::KernelGenerator for $struct {
+            type Source = String;
+
+            fn generate() -> Self::Source {
+                let source = $crate::kernel::UnaryScalarRaw::generate().to_string();
+                let body = format!("output[global_id.x] = {}(lhs[global_id.x], rhs);", $func);
+
+                source.replace("BODY", &body)
+            }
+        }
+    };
+}
+
+#[macro_export]
+macro_rules! unary_scalar_inplace {
+    (
+        $struct:ident,
+        ops $ops:expr
+    ) => {
+        pub struct $struct;
+
+        impl $crate::kernel::KernelGenerator for $struct {
+            type Source = String;
+
+            fn generate() -> Self::Source {
+                let source = $crate::kernel::UnaryScalarInplaceRaw::generate().to_string();
+                let body = format!("lhs[global_id.x] = lhs[global_id.x] {} rhs;", $ops);
+
+                source.replace("BODY", &body)
+            }
+        }
+    };
+
+    (
+        $struct:ident,
+        func $func:expr
+    ) => {
+        pub struct $struct;
+
+        impl $crate::kernel::KernelGenerator for $struct {
+            type Source = String;
+
+            fn generate() -> Self::Source {
+                let source = $crate::kernel::UnaryScalarInplaceRaw::generate().to_string();
+                let body = format!("lhs[global_id.x] = {}(lhs[global_id.x], rhs);", $func);
+
+                source.replace("BODY", &body)
+            }
+        }
+    };
+}
+
+pub fn unary_scalar<K: KernelGenerator, E: WGPUElement, const D: usize>(
+    lhs: WGPUTensor<E, D>,
+    scalar: E,
+) -> WGPUTensor<E, D> {
+    let buffer = lhs
+        .context
+        .create_buffer(lhs.shape.num_elements() * core::mem::size_of::<E>());
+    let output = WGPUTensor::new(lhs.context.clone(), lhs.shape, Arc::new(buffer));
+    let kernel = lhs
+        .context
+        .compile::<KernelSettings<K, E, i32, 256, 1, 1>>();
+    let rhs_buffer = lhs.context.create_buffer_with_data(E::as_bytes(&[scalar]));
+
+    lhs.context.execute(
+        &WorkGroup::new(
+            f32::ceil(output.shape.num_elements() as f32 / 256_f32) as u32,
+            1,
+            1,
+        ),
+        &kernel,
+        &[&lhs.buffer, &rhs_buffer, &output.buffer],
+    );
+
+    output
+}
+
+pub fn unary_scalar_inplace<K: KernelGenerator, E: WGPUElement, const D: usize>(
+    lhs: WGPUTensor<E, D>,
+    scalar: E,
+) -> WGPUTensor<E, D> {
+    let kernel = lhs
+        .context
+        .compile::<KernelSettings<K, E, i32, 256, 1, 1>>();
+    let rhs_buffer = lhs.context.create_buffer_with_data(E::as_bytes(&[scalar]));
+
+    lhs.context.execute(
+        &WorkGroup::new(
+            f32::ceil(lhs.shape.num_elements() as f32 / 256_f32) as u32,
+            1,
+            1,
+        ),
+        &kernel,
+        &[&lhs.buffer, &rhs_buffer],
+    );
+
+    lhs
+}
--- a/burn-wgpu/src/lib.rs
+++ b/burn-wgpu/src/lib.rs
@ -0,0 +1,39 @@
+#[macro_use]
+extern crate derive_new;
+
+mod ops;
+
+pub(crate) mod context;
+pub(crate) mod element;
+pub(crate) mod kernel;
+pub(crate) mod pool;
+pub(crate) mod tensor;
+
+mod device;
+pub use device::*;
+
+mod backend;
+pub use backend::*;
+
+mod graphics;
+pub use graphics::*;
+
+#[cfg(test)]
+mod tests {
+    type TestBackend = crate::WGPUBackend<crate::Vulkan, f32, i64>;
+
+    burn_tensor::testgen_add!();
+    burn_tensor::testgen_sub!();
+    burn_tensor::testgen_div!();
+    burn_tensor::testgen_mul!();
+    burn_tensor::testgen_neg!();
+    burn_tensor::testgen_powf!();
+    burn_tensor::testgen_exp!();
+    burn_tensor::testgen_log!();
+    burn_tensor::testgen_relu!();
+
+    // Once all operations will be implemented.
+    // type TestTensor<const D: usize> = burn_tensor::Tensor<TestBackend, D>;
+    // type TestTensorInt<const D: usize> = burn_tensor::Tensor<TestBackend, D, burn_tensor::Int>;
+    // burn_tensor::testgen_all!();
+}
--- a/burn-wgpu/src/ops/activation_ops.rs
+++ b/burn-wgpu/src/ops/activation_ops.rs
@ -0,0 +1,27 @@
+use burn_tensor::ops::ActivationOps;
+
+use crate::{
+    element::{FloatElement, IntElement},
+    kernel::{unary, unary_inplace},
+    unary, unary_inplace, GraphicsAPI, WGPUBackend,
+};
+
+use super::FloatTensor;
+
+impl<G, F, I> ActivationOps<WGPUBackend<G, F, I>> for WGPUBackend<G, F, I>
+where
+    G: GraphicsAPI + 'static,
+    F: FloatElement,
+    I: IntElement,
+{
+    fn relu<const D: usize>(tensor: FloatTensor<Self, D>) -> FloatTensor<Self, D> {
+        unary!(Relu, body "output[global_id.x] = max(input[global_id.x], 0.0);");
+        unary_inplace!(ReluInplace, body "input[global_id.x] = max(input[global_id.x], 0.0);");
+
+        if tensor.can_mut() {
+            return unary_inplace::<ReluInplace, F, D>(tensor);
+        }
+
+        unary::<Relu, F, D>(tensor)
+    }
+}
--- a/burn-wgpu/src/ops/base.rs
+++ b/burn-wgpu/src/ops/base.rs
@ -0,0 +1,59 @@
+use std::{marker::PhantomData, sync::Arc};
+
+use burn_tensor::{backend::Backend, Data, Shape};
+
+use crate::{element::WGPUElement, pool::get_context, tensor::WGPUTensor, GraphicsAPI, WGPUDevice};
+
+pub type FloatElem<B> = <B as Backend>::FloatElem;
+pub type Device<B> = <B as Backend>::Device;
+
+pub type FloatTensor<B, const D: usize> = <B as Backend>::TensorPrimitive<D>;
+
+pub type IntElem<B> = <B as Backend>::IntElem;
+pub type IntTensor<B, const D: usize> = <B as Backend>::IntTensorPrimitive<D>;
+pub type BoolTensor<B, const D: usize> = <B as Backend>::BoolTensorPrimitive<D>;
+
+pub struct BaseOps<G: GraphicsAPI> {
+    _g: PhantomData<G>,
+}
+
+impl<G: GraphicsAPI> BaseOps<G> {
+    pub fn from_data<E: WGPUElement, const D: usize>(
+        data: Data<E, D>,
+        device: &WGPUDevice,
+    ) -> WGPUTensor<E, D> {
+        let context = get_context::<G>(device);
+        let buffer = context.create_buffer_with_data(E::as_bytes(&data.value));
+
+        WGPUTensor::new(context, data.shape, Arc::new(buffer))
+    }
+
+    pub fn to_data<E: WGPUElement, const D: usize>(tensor: &WGPUTensor<E, D>) -> Data<E, D> {
+        let bytes = tensor.context.buffer_to_data(&tensor.buffer);
+        let values = E::from_bytes(&bytes);
+
+        Data::new(values.to_vec(), tensor.shape.clone())
+    }
+
+    pub fn to_device<E: WGPUElement, const D: usize>(
+        tensor: WGPUTensor<E, D>,
+        device: &WGPUDevice,
+    ) -> WGPUTensor<E, D> {
+        if &tensor.context.device == device {
+            return tensor;
+        }
+
+        let context = get_context::<G>(device);
+        tensor.to_context(context)
+    }
+
+    pub fn empty<E: WGPUElement, const D: usize>(
+        shape: Shape<D>,
+        device: &WGPUDevice,
+    ) -> WGPUTensor<E, D> {
+        let context = get_context::<G>(device);
+        let buffer = context.create_buffer(shape.num_elements() * core::mem::size_of::<E>());
+
+        WGPUTensor::new(context, shape, Arc::new(buffer))
+    }
+}
--- a/burn-wgpu/src/ops/bool_ops.rs
+++ b/burn-wgpu/src/ops/bool_ops.rs
@ -0,0 +1,106 @@
+use burn_tensor::{backend::Backend, ops::BoolTensorOps, Data, Shape};
+
+use crate::{
+    element::{FloatElement, IntElement},
+    GraphicsAPI, WGPUBackend,
+};
+
+use super::{BaseOps, BoolTensor, Device, IntTensor};
+
+impl<G, F, I> BoolTensorOps<WGPUBackend<G, F, I>> for WGPUBackend<G, F, I>
+where
+    G: GraphicsAPI + 'static,
+    F: FloatElement,
+    I: IntElement,
+{
+    fn bool_empty<const D: usize>(shape: Shape<D>, device: &Device<Self>) -> BoolTensor<Self, D> {
+        BaseOps::<G>::empty(shape, device)
+    }
+
+    fn bool_shape<const D: usize>(tensor: &BoolTensor<Self, D>) -> Shape<D> {
+        tensor.shape.clone()
+    }
+
+    fn bool_into_data<const D: usize>(tensor: BoolTensor<Self, D>) -> Data<bool, D> {
+        let data = BaseOps::<G>::to_data(&tensor);
+
+        Data::new(data.value.into_iter().map(|i| i != 0).collect(), data.shape)
+    }
+
+    fn bool_from_data<const D: usize>(
+        data: Data<bool, D>,
+        device: &Device<Self>,
+    ) -> BoolTensor<Self, D> {
+        let data: Data<u32, D> = Data::new(
+            data.value
+                .into_iter()
+                .map(|c| match c {
+                    true => 1,
+                    false => 0,
+                })
+                .collect(),
+            data.shape,
+        );
+        BaseOps::<G>::from_data(data, device)
+    }
+
+    fn bool_into_int<const D: usize>(_tensor: BoolTensor<Self, D>) -> IntTensor<Self, D> {
+        todo!()
+    }
+
+    fn bool_device<const D: usize>(
+        _tensor: &<WGPUBackend<G, F, I> as Backend>::BoolTensorPrimitive<D>,
+    ) -> <WGPUBackend<G, F, I> as Backend>::Device {
+        todo!()
+    }
+
+    fn bool_to_device<const D: usize>(
+        tensor: BoolTensor<Self, D>,
+        device: &Device<Self>,
+    ) -> BoolTensor<Self, D> {
+        BaseOps::<G>::to_device(tensor, device)
+    }
+
+    fn bool_reshape<const D1: usize, const D2: usize>(
+        _tensor: <WGPUBackend<G, F, I> as Backend>::BoolTensorPrimitive<D1>,
+        _shape: Shape<D2>,
+    ) -> <WGPUBackend<G, F, I> as Backend>::BoolTensorPrimitive<D2> {
+        todo!()
+    }
+
+    fn bool_index<const D1: usize, const D2: usize>(
+        _tensor: <WGPUBackend<G, F, I> as Backend>::BoolTensorPrimitive<D1>,
+        _indexes: [std::ops::Range<usize>; D2],
+    ) -> <WGPUBackend<G, F, I> as Backend>::BoolTensorPrimitive<D1> {
+        todo!()
+    }
+
+    fn bool_index_assign<const D1: usize, const D2: usize>(
+        _tensor: <WGPUBackend<G, F, I> as Backend>::BoolTensorPrimitive<D1>,
+        _indexes: [std::ops::Range<usize>; D2],
+        _value: <WGPUBackend<G, F, I> as Backend>::BoolTensorPrimitive<D1>,
+    ) -> <WGPUBackend<G, F, I> as Backend>::BoolTensorPrimitive<D1> {
+        todo!()
+    }
+
+    fn bool_cat<const D: usize>(
+        _tensors: Vec<<WGPUBackend<G, F, I> as Backend>::BoolTensorPrimitive<D>>,
+        _dim: usize,
+    ) -> <WGPUBackend<G, F, I> as Backend>::BoolTensorPrimitive<D> {
+        todo!()
+    }
+
+    fn bool_equal<const D: usize>(
+        _lhs: <WGPUBackend<G, F, I> as Backend>::BoolTensorPrimitive<D>,
+        _rhs: <WGPUBackend<G, F, I> as Backend>::BoolTensorPrimitive<D>,
+    ) -> <WGPUBackend<G, F, I> as Backend>::BoolTensorPrimitive<D> {
+        todo!()
+    }
+
+    fn bool_equal_elem<const D: usize>(
+        _lhs: <WGPUBackend<G, F, I> as Backend>::BoolTensorPrimitive<D>,
+        _rhs: bool,
+    ) -> <WGPUBackend<G, F, I> as Backend>::BoolTensorPrimitive<D> {
+        todo!()
+    }
+}
--- a/burn-wgpu/src/ops/float_ops.rs
+++ b/burn-wgpu/src/ops/float_ops.rs
@ -0,0 +1,406 @@
+use super::numeric::NumericOps;
+use super::{BaseOps, Device, FloatElem, FloatTensor};
+use crate::kernel::{unary, unary_inplace, unary_scalar, unary_scalar_inplace};
+use crate::{
+    element::{FloatElement, IntElement},
+    unary, unary_inplace, GraphicsAPI, WGPUBackend, SEED,
+};
+use crate::{unary_scalar, unary_scalar_inplace};
+use burn_common::rand::get_seeded_rng;
+use burn_tensor::ElementConversion;
+use burn_tensor::{backend::Backend, ops::TensorOps, Data, Distribution, Shape};
+
+impl<G, F, I> TensorOps<WGPUBackend<G, F, I>> for WGPUBackend<G, F, I>
+where
+    G: GraphicsAPI + 'static,
+    F: FloatElement,
+    I: IntElement,
+{
+    fn from_data<const D: usize>(
+        data: Data<FloatElem<Self>, D>,
+        device: &Device<Self>,
+    ) -> FloatTensor<Self, D> {
+        BaseOps::<G>::from_data(data, device)
+    }
+
+    fn random<const D: usize>(
+        shape: Shape<D>,
+        distribution: Distribution<FloatElem<Self>>,
+        device: &Device<Self>,
+    ) -> <WGPUBackend<G, F, I> as Backend>::TensorPrimitive<D> {
+        let mut seed = SEED.lock().unwrap();
+        let mut rng = if let Some(rng_seeded) = seed.as_ref() {
+            rng_seeded.clone()
+        } else {
+            get_seeded_rng()
+        };
+        let tensor = Self::from_data(Data::random(shape, distribution, &mut rng), device);
+        *seed = Some(rng);
+        tensor
+    }
+
+    fn shape<const D: usize>(tensor: &FloatTensor<Self, D>) -> Shape<D> {
+        tensor.shape.clone()
+    }
+
+    fn to_data<const D: usize>(tensor: &FloatTensor<Self, D>) -> Data<FloatElem<Self>, D> {
+        BaseOps::<G>::to_data(tensor)
+    }
+
+    fn device<const D: usize>(tensor: &FloatTensor<Self, D>) -> Device<Self> {
+        tensor.context.device.clone()
+    }
+
+    fn to_device<const D: usize>(
+        tensor: FloatTensor<Self, D>,
+        device: &Device<Self>,
+    ) -> FloatTensor<Self, D> {
+        BaseOps::<G>::to_device(tensor, device)
+    }
+
+    fn empty<const D: usize>(shape: Shape<D>, device: &Device<Self>) -> FloatTensor<Self, D> {
+        BaseOps::<G>::empty(shape, device)
+    }
+
+    fn add<const D: usize>(
+        lhs: FloatTensor<Self, D>,
+        rhs: FloatTensor<Self, D>,
+    ) -> FloatTensor<Self, D> {
+        NumericOps::add(lhs, rhs)
+    }
+
+    fn add_scalar<const D: usize>(
+        lhs: FloatTensor<Self, D>,
+        rhs: FloatElem<Self>,
+    ) -> FloatTensor<Self, D> {
+        NumericOps::add_scalar(lhs, rhs)
+    }
+
+    fn sub<const D: usize>(
+        lhs: FloatTensor<Self, D>,
+        rhs: FloatTensor<Self, D>,
+    ) -> FloatTensor<Self, D> {
+        NumericOps::sub(lhs, rhs)
+    }
+
+    fn sub_scalar<const D: usize>(
+        lhs: FloatTensor<Self, D>,
+        rhs: FloatElem<Self>,
+    ) -> FloatTensor<Self, D> {
+        NumericOps::sub_scalar(lhs, rhs)
+    }
+
+    fn mul<const D: usize>(
+        lhs: FloatTensor<Self, D>,
+        rhs: FloatTensor<Self, D>,
+    ) -> FloatTensor<Self, D> {
+        NumericOps::mul(lhs, rhs)
+    }
+
+    fn mul_scalar<const D: usize>(
+        lhs: FloatTensor<Self, D>,
+        rhs: FloatElem<Self>,
+    ) -> FloatTensor<Self, D> {
+        NumericOps::mul_scalar(lhs, rhs)
+    }
+
+    fn div<const D: usize>(
+        lhs: FloatTensor<Self, D>,
+        rhs: FloatTensor<Self, D>,
+    ) -> FloatTensor<Self, D> {
+        NumericOps::div(lhs, rhs)
+    }
+
+    fn div_scalar<const D: usize>(
+        lhs: FloatTensor<Self, D>,
+        rhs: FloatElem<Self>,
+    ) -> FloatTensor<Self, D> {
+        NumericOps::div_scalar(lhs, rhs)
+    }
+
+    fn matmul<const D: usize>(
+        _lhs: <WGPUBackend<G, F, I> as Backend>::TensorPrimitive<D>,
+        _rhs: <WGPUBackend<G, F, I> as Backend>::TensorPrimitive<D>,
+    ) -> <WGPUBackend<G, F, I> as Backend>::TensorPrimitive<D> {
+        todo!()
+    }
+
+    fn swap_dims<const D: usize>(
+        _tensor: <WGPUBackend<G, F, I> as Backend>::TensorPrimitive<D>,
+        _dim1: usize,
+        _dim2: usize,
+    ) -> <WGPUBackend<G, F, I> as Backend>::TensorPrimitive<D> {
+        todo!()
+    }
+
+    fn reshape<const D1: usize, const D2: usize>(
+        _tensor: <WGPUBackend<G, F, I> as Backend>::TensorPrimitive<D1>,
+        _shape: Shape<D2>,
+    ) -> <WGPUBackend<G, F, I> as Backend>::TensorPrimitive<D2> {
+        todo!()
+    }
+
+    fn gather<const D: usize>(
+        _dim: usize,
+        _tensor: <WGPUBackend<G, F, I> as Backend>::TensorPrimitive<D>,
+        _indexes: <WGPUBackend<G, F, I> as Backend>::IntTensorPrimitive<D>,
+    ) -> <WGPUBackend<G, F, I> as Backend>::TensorPrimitive<D> {
+        todo!()
+    }
+
+    fn scatter<const D: usize>(
+        _dim: usize,
+        _tensor: <WGPUBackend<G, F, I> as Backend>::TensorPrimitive<D>,
+        _indexes: <WGPUBackend<G, F, I> as Backend>::IntTensorPrimitive<D>,
+        _value: <WGPUBackend<G, F, I> as Backend>::TensorPrimitive<D>,
+    ) -> <WGPUBackend<G, F, I> as Backend>::TensorPrimitive<D> {
+        todo!()
+    }
+
+    fn index_select<const D: usize>(
+        _tensor: <WGPUBackend<G, F, I> as Backend>::TensorPrimitive<D>,
+        _dim: usize,
+        _indexes: <WGPUBackend<G, F, I> as Backend>::IntTensorPrimitive<1>,
+    ) -> <WGPUBackend<G, F, I> as Backend>::TensorPrimitive<D> {
+        todo!()
+    }
+
+    fn index_select_assign<const D1: usize, const D2: usize>(
+        _tensor: <WGPUBackend<G, F, I> as Backend>::TensorPrimitive<D1>,
+        _dim: usize,
+        _indexes: <WGPUBackend<G, F, I> as Backend>::IntTensorPrimitive<1>,
+        _value: <WGPUBackend<G, F, I> as Backend>::TensorPrimitive<D2>,
+    ) -> <WGPUBackend<G, F, I> as Backend>::TensorPrimitive<D1> {
+        todo!()
+    }
+
+    fn index<const D1: usize, const D2: usize>(
+        _tensor: <WGPUBackend<G, F, I> as Backend>::TensorPrimitive<D1>,
+        _indexes: [std::ops::Range<usize>; D2],
+    ) -> <WGPUBackend<G, F, I> as Backend>::TensorPrimitive<D1> {
+        todo!()
+    }
+
+    fn index_assign<const D1: usize, const D2: usize>(
+        _tensor: <WGPUBackend<G, F, I> as Backend>::TensorPrimitive<D1>,
+        _indexes: [std::ops::Range<usize>; D2],
+        _value: <WGPUBackend<G, F, I> as Backend>::TensorPrimitive<D1>,
+    ) -> <WGPUBackend<G, F, I> as Backend>::TensorPrimitive<D1> {
+        todo!()
+    }
+
+    fn mask_scatter<const D: usize>(
+        _tensor: <WGPUBackend<G, F, I> as Backend>::TensorPrimitive<D>,
+        _mask: <WGPUBackend<G, F, I> as Backend>::BoolTensorPrimitive<D>,
+        _source: <WGPUBackend<G, F, I> as Backend>::TensorPrimitive<D>,
+    ) -> <WGPUBackend<G, F, I> as Backend>::TensorPrimitive<D> {
+        todo!()
+    }
+
+    fn mask_fill<const D: usize>(
+        _tensor: <WGPUBackend<G, F, I> as Backend>::TensorPrimitive<D>,
+        _mask: <WGPUBackend<G, F, I> as Backend>::BoolTensorPrimitive<D>,
+        _value: <WGPUBackend<G, F, I> as Backend>::FloatElem,
+    ) -> <WGPUBackend<G, F, I> as Backend>::TensorPrimitive<D> {
+        todo!()
+    }
+
+    fn equal<const D: usize>(
+        _lhs: <WGPUBackend<G, F, I> as Backend>::TensorPrimitive<D>,
+        _rhs: <WGPUBackend<G, F, I> as Backend>::TensorPrimitive<D>,
+    ) -> <WGPUBackend<G, F, I> as Backend>::BoolTensorPrimitive<D> {
+        todo!()
+    }
+
+    fn equal_elem<const D: usize>(
+        _lhs: <WGPUBackend<G, F, I> as Backend>::TensorPrimitive<D>,
+        _rhs: <WGPUBackend<G, F, I> as Backend>::FloatElem,
+    ) -> <WGPUBackend<G, F, I> as Backend>::BoolTensorPrimitive<D> {
+        todo!()
+    }
+
+    fn greater<const D: usize>(
+        _lhs: <WGPUBackend<G, F, I> as Backend>::TensorPrimitive<D>,
+        _rhs: <WGPUBackend<G, F, I> as Backend>::TensorPrimitive<D>,
+    ) -> <WGPUBackend<G, F, I> as Backend>::BoolTensorPrimitive<D> {
+        todo!()
+    }
+
+    fn greater_elem<const D: usize>(
+        _lhs: <WGPUBackend<G, F, I> as Backend>::TensorPrimitive<D>,
+        _rhs: <WGPUBackend<G, F, I> as Backend>::FloatElem,
+    ) -> <WGPUBackend<G, F, I> as Backend>::BoolTensorPrimitive<D> {
+        todo!()
+    }
+
+    fn greater_equal<const D: usize>(
+        _lhs: <WGPUBackend<G, F, I> as Backend>::TensorPrimitive<D>,
+        _rhs: <WGPUBackend<G, F, I> as Backend>::TensorPrimitive<D>,
+    ) -> <WGPUBackend<G, F, I> as Backend>::BoolTensorPrimitive<D> {
+        todo!()
+    }
+
+    fn greater_equal_elem<const D: usize>(
+        _lhs: <WGPUBackend<G, F, I> as Backend>::TensorPrimitive<D>,
+        _rhs: <WGPUBackend<G, F, I> as Backend>::FloatElem,
+    ) -> <WGPUBackend<G, F, I> as Backend>::BoolTensorPrimitive<D> {
+        todo!()
+    }
+
+    fn lower<const D: usize>(
+        _lhs: <WGPUBackend<G, F, I> as Backend>::TensorPrimitive<D>,
+        _rhs: <WGPUBackend<G, F, I> as Backend>::TensorPrimitive<D>,
+    ) -> <WGPUBackend<G, F, I> as Backend>::BoolTensorPrimitive<D> {
+        todo!()
+    }
+
+    fn lower_elem<const D: usize>(
+        _lhs: <WGPUBackend<G, F, I> as Backend>::TensorPrimitive<D>,
+        _rhs: <WGPUBackend<G, F, I> as Backend>::FloatElem,
+    ) -> <WGPUBackend<G, F, I> as Backend>::BoolTensorPrimitive<D> {
+        todo!()
+    }
+
+    fn lower_equal<const D: usize>(
+        _lhs: <WGPUBackend<G, F, I> as Backend>::TensorPrimitive<D>,
+        _rhs: <WGPUBackend<G, F, I> as Backend>::TensorPrimitive<D>,
+    ) -> <WGPUBackend<G, F, I> as Backend>::BoolTensorPrimitive<D> {
+        todo!()
+    }
+
+    fn lower_equal_elem<const D: usize>(
+        _lhs: <WGPUBackend<G, F, I> as Backend>::TensorPrimitive<D>,
+        _rhs: <WGPUBackend<G, F, I> as Backend>::FloatElem,
+    ) -> <WGPUBackend<G, F, I> as Backend>::BoolTensorPrimitive<D> {
+        todo!()
+    }
+
+    fn sum<const D: usize>(
+        _tensor: <WGPUBackend<G, F, I> as Backend>::TensorPrimitive<D>,
+    ) -> <WGPUBackend<G, F, I> as Backend>::TensorPrimitive<1> {
+        todo!()
+    }
+
+    fn sum_dim<const D: usize>(
+        _tensor: <WGPUBackend<G, F, I> as Backend>::TensorPrimitive<D>,
+        _dim: usize,
+    ) -> <WGPUBackend<G, F, I> as Backend>::TensorPrimitive<D> {
+        todo!()
+    }
+
+    fn mean<const D: usize>(
+        _tensor: <WGPUBackend<G, F, I> as Backend>::TensorPrimitive<D>,
+    ) -> <WGPUBackend<G, F, I> as Backend>::TensorPrimitive<1> {
+        todo!()
+    }
+
+    fn mean_dim<const D: usize>(
+        _tensor: <WGPUBackend<G, F, I> as Backend>::TensorPrimitive<D>,
+        _dim: usize,
+    ) -> <WGPUBackend<G, F, I> as Backend>::TensorPrimitive<D> {
+        todo!()
+    }
+
+    fn to_full_precision<const D: usize>(
+        _tensor: &<WGPUBackend<G, F, I> as Backend>::TensorPrimitive<D>,
+    ) -> <<WGPUBackend<G, F, I> as Backend>::FullPrecisionBackend as Backend>::TensorPrimitive<D>
+    {
+        todo!()
+    }
+
+    fn from_full_precision<const D: usize>(
+        _tensor: <<WGPUBackend<G, F, I> as Backend>::FullPrecisionBackend as Backend>::TensorPrimitive<D>,
+    ) -> <WGPUBackend<G, F, I> as Backend>::TensorPrimitive<D> {
+        todo!()
+    }
+
+    fn exp<const D: usize>(lhs: FloatTensor<Self, D>) -> FloatTensor<Self, D> {
+        unary!(Exp, func "exp");
+        unary_inplace!(ExpInplace, func "exp");
+
+        if lhs.can_mut() {
+            return unary_inplace::<ExpInplace, F, D>(lhs);
+        }
+
+        unary::<Exp, F, D>(lhs)
+    }
+
+    fn log<const D: usize>(tensor: FloatTensor<Self, D>) -> FloatTensor<Self, D> {
+        unary!(Log, func "log");
+        unary_inplace!(LogInplace, func "log");
+
+        if tensor.can_mut() {
+            return unary_inplace::<LogInplace, F, D>(tensor);
+        }
+
+        unary::<Log, F, D>(tensor)
+    }
+
+    fn log1p<const D: usize>(
+        _tensor: <WGPUBackend<G, F, I> as Backend>::TensorPrimitive<D>,
+    ) -> <WGPUBackend<G, F, I> as Backend>::TensorPrimitive<D> {
+        todo!()
+    }
+
+    fn powf<const D: usize>(lhs: FloatTensor<Self, D>, rhs: f32) -> FloatTensor<Self, D> {
+        unary_scalar!(Powf, func "pow");
+        unary_scalar_inplace!(PowfInplace, func "pow");
+
+        if lhs.can_mut() {
+            return unary_scalar_inplace::<PowfInplace, F, D>(lhs, rhs.elem());
+        }
+
+        unary_scalar::<Powf, F, D>(lhs, rhs.elem())
+    }
+
+    fn sqrt<const D: usize>(
+        _tensor: <WGPUBackend<G, F, I> as Backend>::TensorPrimitive<D>,
+    ) -> <WGPUBackend<G, F, I> as Backend>::TensorPrimitive<D> {
+        todo!()
+    }
+
+    fn cos<const D: usize>(
+        _tensor: <WGPUBackend<G, F, I> as Backend>::TensorPrimitive<D>,
+    ) -> <WGPUBackend<G, F, I> as Backend>::TensorPrimitive<D> {
+        todo!()
+    }
+
+    fn sin<const D: usize>(
+        _tensor: <WGPUBackend<G, F, I> as Backend>::TensorPrimitive<D>,
+    ) -> <WGPUBackend<G, F, I> as Backend>::TensorPrimitive<D> {
+        todo!()
+    }
+
+    fn tanh<const D: usize>(
+        _tensor: <WGPUBackend<G, F, I> as Backend>::TensorPrimitive<D>,
+    ) -> <WGPUBackend<G, F, I> as Backend>::TensorPrimitive<D> {
+        todo!()
+    }
+
+    fn erf<const D: usize>(
+        _tensor: <WGPUBackend<G, F, I> as Backend>::TensorPrimitive<D>,
+    ) -> <WGPUBackend<G, F, I> as Backend>::TensorPrimitive<D> {
+        todo!()
+    }
+
+    fn cat<const D: usize>(
+        _tensors: Vec<<WGPUBackend<G, F, I> as Backend>::TensorPrimitive<D>>,
+        _dim: usize,
+    ) -> <WGPUBackend<G, F, I> as Backend>::TensorPrimitive<D> {
+        todo!()
+    }
+
+    fn argmax<const D: usize>(
+        _tensor: <WGPUBackend<G, F, I> as Backend>::TensorPrimitive<D>,
+        _dim: usize,
+    ) -> <WGPUBackend<G, F, I> as Backend>::IntTensorPrimitive<D> {
+        todo!()
+    }
+
+    fn argmin<const D: usize>(
+        _tensor: <WGPUBackend<G, F, I> as Backend>::TensorPrimitive<D>,
+        _dim: usize,
+    ) -> <WGPUBackend<G, F, I> as Backend>::IntTensorPrimitive<D> {
+        todo!()
+    }
+}
--- a/burn-wgpu/src/ops/int_ops.rs
+++ b/burn-wgpu/src/ops/int_ops.rs
@ -0,0 +1,314 @@
+use burn_tensor::{backend::Backend, ops::IntTensorOps, Data, Shape};
+
+use crate::{
+    element::{FloatElement, IntElement},
+    GraphicsAPI, WGPUBackend,
+};
+
+use super::{numeric::NumericOps, BaseOps, Device, IntElem, IntTensor};
+
+impl<G, F, I> IntTensorOps<WGPUBackend<G, F, I>> for WGPUBackend<G, F, I>
+where
+    G: GraphicsAPI + 'static,
+    F: FloatElement,
+    I: IntElement,
+{
+    fn int_empty<const D: usize>(shape: Shape<D>, device: &Device<Self>) -> IntTensor<Self, D> {
+        BaseOps::<G>::empty(shape, device)
+    }
+
+    fn int_shape<const D: usize>(
+        _tensor: &<WGPUBackend<G, F, I> as Backend>::IntTensorPrimitive<D>,
+    ) -> Shape<D> {
+        todo!()
+    }
+
+    fn int_into_data<const D: usize>(tensor: IntTensor<Self, D>) -> Data<I, D> {
+        BaseOps::<G>::to_data(&tensor)
+    }
+
+    fn int_from_data<const D: usize>(
+        data: Data<I, D>,
+        device: &Device<Self>,
+    ) -> IntTensor<Self, D> {
+        BaseOps::<G>::from_data(data, device)
+    }
+
+    fn int_device<const D: usize>(
+        _tensor: &<WGPUBackend<G, F, I> as Backend>::IntTensorPrimitive<D>,
+    ) -> <WGPUBackend<G, F, I> as Backend>::Device {
+        todo!()
+    }
+
+    fn int_to_device<const D: usize>(
+        tensor: IntTensor<Self, D>,
+        device: &Device<Self>,
+    ) -> IntTensor<Self, D> {
+        BaseOps::<G>::to_device(tensor, device)
+    }
+
+    fn int_reshape<const D1: usize, const D2: usize>(
+        _tensor: <WGPUBackend<G, F, I> as Backend>::IntTensorPrimitive<D1>,
+        _shape: Shape<D2>,
+    ) -> <WGPUBackend<G, F, I> as Backend>::IntTensorPrimitive<D2> {
+        todo!()
+    }
+
+    fn int_index<const D1: usize, const D2: usize>(
+        _tensor: <WGPUBackend<G, F, I> as Backend>::IntTensorPrimitive<D1>,
+        _indexes: [std::ops::Range<usize>; D2],
+    ) -> <WGPUBackend<G, F, I> as Backend>::IntTensorPrimitive<D1> {
+        todo!()
+    }
+
+    fn int_index_assign<const D1: usize, const D2: usize>(
+        _tensor: <WGPUBackend<G, F, I> as Backend>::IntTensorPrimitive<D1>,
+        _indexes: [std::ops::Range<usize>; D2],
+        _value: <WGPUBackend<G, F, I> as Backend>::IntTensorPrimitive<D1>,
+    ) -> <WGPUBackend<G, F, I> as Backend>::IntTensorPrimitive<D1> {
+        todo!()
+    }
+
+    fn int_mask_scatter<const D: usize>(
+        _tensor: <WGPUBackend<G, F, I> as Backend>::IntTensorPrimitive<D>,
+        _mask: <WGPUBackend<G, F, I> as Backend>::BoolTensorPrimitive<D>,
+        _source: <WGPUBackend<G, F, I> as Backend>::IntTensorPrimitive<D>,
+    ) -> <WGPUBackend<G, F, I> as Backend>::IntTensorPrimitive<D> {
+        todo!()
+    }
+
+    fn int_mask_fill<const D: usize>(
+        _tensor: <WGPUBackend<G, F, I> as Backend>::IntTensorPrimitive<D>,
+        _mask: <WGPUBackend<G, F, I> as Backend>::BoolTensorPrimitive<D>,
+        _value: <WGPUBackend<G, F, I> as Backend>::IntElem,
+    ) -> <WGPUBackend<G, F, I> as Backend>::IntTensorPrimitive<D> {
+        todo!()
+    }
+
+    fn int_gather<const D: usize>(
+        _dim: usize,
+        _tensor: <WGPUBackend<G, F, I> as Backend>::IntTensorPrimitive<D>,
+        _indexes: <WGPUBackend<G, F, I> as Backend>::IntTensorPrimitive<D>,
+    ) -> <WGPUBackend<G, F, I> as Backend>::IntTensorPrimitive<D> {
+        todo!()
+    }
+
+    fn int_scatter<const D: usize>(
+        _dim: usize,
+        _tensor: <WGPUBackend<G, F, I> as Backend>::IntTensorPrimitive<D>,
+        _indexes: <WGPUBackend<G, F, I> as Backend>::IntTensorPrimitive<D>,
+        _value: <WGPUBackend<G, F, I> as Backend>::IntTensorPrimitive<D>,
+    ) -> <WGPUBackend<G, F, I> as Backend>::IntTensorPrimitive<D> {
+        todo!()
+    }
+
+    fn int_index_select_dim<const D: usize>(
+        _tensor: <WGPUBackend<G, F, I> as Backend>::IntTensorPrimitive<D>,
+        _dim: usize,
+        _indexes: <WGPUBackend<G, F, I> as Backend>::IntTensorPrimitive<1>,
+    ) -> <WGPUBackend<G, F, I> as Backend>::IntTensorPrimitive<D> {
+        todo!()
+    }
+
+    fn int_index_select_dim_assign<const D1: usize, const D2: usize>(
+        _tensor: <WGPUBackend<G, F, I> as Backend>::IntTensorPrimitive<D1>,
+        _dim: usize,
+        _indexes: <WGPUBackend<G, F, I> as Backend>::IntTensorPrimitive<1>,
+        _value: <WGPUBackend<G, F, I> as Backend>::IntTensorPrimitive<D2>,
+    ) -> <WGPUBackend<G, F, I> as Backend>::IntTensorPrimitive<D1> {
+        todo!()
+    }
+
+    fn int_cat<const D: usize>(
+        _tensors: Vec<<WGPUBackend<G, F, I> as Backend>::IntTensorPrimitive<D>>,
+        _dim: usize,
+    ) -> <WGPUBackend<G, F, I> as Backend>::IntTensorPrimitive<D> {
+        todo!()
+    }
+
+    fn int_equal<const D: usize>(
+        _lhs: <WGPUBackend<G, F, I> as Backend>::IntTensorPrimitive<D>,
+        _rhs: <WGPUBackend<G, F, I> as Backend>::IntTensorPrimitive<D>,
+    ) -> <WGPUBackend<G, F, I> as Backend>::BoolTensorPrimitive<D> {
+        todo!()
+    }
+
+    fn int_equal_elem<const D: usize>(
+        _lhs: <WGPUBackend<G, F, I> as Backend>::IntTensorPrimitive<D>,
+        _rhs: <WGPUBackend<G, F, I> as Backend>::IntElem,
+    ) -> <WGPUBackend<G, F, I> as Backend>::BoolTensorPrimitive<D> {
+        todo!()
+    }
+
+    fn int_greater<const D: usize>(
+        _lhs: <WGPUBackend<G, F, I> as Backend>::IntTensorPrimitive<D>,
+        _rhs: <WGPUBackend<G, F, I> as Backend>::IntTensorPrimitive<D>,
+    ) -> <WGPUBackend<G, F, I> as Backend>::BoolTensorPrimitive<D> {
+        todo!()
+    }
+
+    fn int_greater_elem<const D: usize>(
+        _lhs: <WGPUBackend<G, F, I> as Backend>::IntTensorPrimitive<D>,
+        _rhs: <WGPUBackend<G, F, I> as Backend>::IntElem,
+    ) -> <WGPUBackend<G, F, I> as Backend>::BoolTensorPrimitive<D> {
+        todo!()
+    }
+
+    fn int_greater_equal<const D: usize>(
+        _lhs: <WGPUBackend<G, F, I> as Backend>::IntTensorPrimitive<D>,
+        _rhs: <WGPUBackend<G, F, I> as Backend>::IntTensorPrimitive<D>,
+    ) -> <WGPUBackend<G, F, I> as Backend>::BoolTensorPrimitive<D> {
+        todo!()
+    }
+
+    fn int_greater_equal_elem<const D: usize>(
+        _lhs: <WGPUBackend<G, F, I> as Backend>::IntTensorPrimitive<D>,
+        _rhs: <WGPUBackend<G, F, I> as Backend>::IntElem,
+    ) -> <WGPUBackend<G, F, I> as Backend>::BoolTensorPrimitive<D> {
+        todo!()
+    }
+
+    fn int_lower<const D: usize>(
+        _lhs: <WGPUBackend<G, F, I> as Backend>::IntTensorPrimitive<D>,
+        _rhs: <WGPUBackend<G, F, I> as Backend>::IntTensorPrimitive<D>,
+    ) -> <WGPUBackend<G, F, I> as Backend>::BoolTensorPrimitive<D> {
+        todo!()
+    }
+
+    fn int_lower_elem<const D: usize>(
+        _lhs: <WGPUBackend<G, F, I> as Backend>::IntTensorPrimitive<D>,
+        _rhs: <WGPUBackend<G, F, I> as Backend>::IntElem,
+    ) -> <WGPUBackend<G, F, I> as Backend>::BoolTensorPrimitive<D> {
+        todo!()
+    }
+
+    fn int_lower_equal<const D: usize>(
+        _lhs: <WGPUBackend<G, F, I> as Backend>::IntTensorPrimitive<D>,
+        _rhs: <WGPUBackend<G, F, I> as Backend>::IntTensorPrimitive<D>,
+    ) -> <WGPUBackend<G, F, I> as Backend>::BoolTensorPrimitive<D> {
+        todo!()
+    }
+
+    fn int_lower_equal_elem<const D: usize>(
+        _lhs: <WGPUBackend<G, F, I> as Backend>::IntTensorPrimitive<D>,
+        _rhs: <WGPUBackend<G, F, I> as Backend>::IntElem,
+    ) -> <WGPUBackend<G, F, I> as Backend>::BoolTensorPrimitive<D> {
+        todo!()
+    }
+
+    fn int_add<const D: usize>(
+        lhs: IntTensor<Self, D>,
+        rhs: IntTensor<Self, D>,
+    ) -> IntTensor<Self, D> {
+        NumericOps::add::<I, D>(lhs, rhs)
+    }
+
+    fn int_add_scalar<const D: usize>(
+        lhs: IntTensor<Self, D>,
+        rhs: IntElem<Self>,
+    ) -> IntTensor<Self, D> {
+        NumericOps::add_scalar(lhs, rhs)
+    }
+
+    fn int_sub<const D: usize>(
+        lhs: IntTensor<Self, D>,
+        rhs: IntTensor<Self, D>,
+    ) -> IntTensor<Self, D> {
+        NumericOps::sub(lhs, rhs)
+    }
+
+    fn int_sub_scalar<const D: usize>(
+        lhs: IntTensor<Self, D>,
+        rhs: IntElem<Self>,
+    ) -> IntTensor<Self, D> {
+        NumericOps::sub_scalar(lhs, rhs)
+    }
+
+    fn int_mul<const D: usize>(
+        lhs: IntTensor<Self, D>,
+        rhs: IntTensor<Self, D>,
+    ) -> IntTensor<Self, D> {
+        NumericOps::mul(lhs, rhs)
+    }
+
+    fn int_mul_scalar<const D: usize>(
+        lhs: IntTensor<Self, D>,
+        rhs: IntElem<Self>,
+    ) -> IntTensor<Self, D> {
+        NumericOps::mul_scalar(lhs, rhs)
+    }
+
+    fn int_div<const D: usize>(
+        lhs: IntTensor<Self, D>,
+        rhs: IntTensor<Self, D>,
+    ) -> IntTensor<Self, D> {
+        NumericOps::div(lhs, rhs)
+    }
+
+    fn int_div_scalar<const D: usize>(
+        lhs: IntTensor<Self, D>,
+        rhs: IntElem<Self>,
+    ) -> IntTensor<Self, D> {
+        NumericOps::div_scalar(lhs, rhs)
+    }
+
+    fn int_neg<const D: usize>(
+        _tensor: <WGPUBackend<G, F, I> as Backend>::IntTensorPrimitive<D>,
+    ) -> <WGPUBackend<G, F, I> as Backend>::IntTensorPrimitive<D> {
+        todo!()
+    }
+
+    fn int_zeros<const D: usize>(
+        _shape: Shape<D>,
+        _device: &<WGPUBackend<G, F, I> as Backend>::Device,
+    ) -> <WGPUBackend<G, F, I> as Backend>::IntTensorPrimitive<D> {
+        todo!()
+    }
+
+    fn int_ones<const D: usize>(
+        _shape: Shape<D>,
+        _device: &<WGPUBackend<G, F, I> as Backend>::Device,
+    ) -> <WGPUBackend<G, F, I> as Backend>::IntTensorPrimitive<D> {
+        todo!()
+    }
+
+    fn int_sum<const D: usize>(
+        _tensor: <WGPUBackend<G, F, I> as Backend>::IntTensorPrimitive<D>,
+    ) -> <WGPUBackend<G, F, I> as Backend>::IntTensorPrimitive<1> {
+        todo!()
+    }
+
+    fn int_sum_dim<const D: usize>(
+        _tensor: <WGPUBackend<G, F, I> as Backend>::IntTensorPrimitive<D>,
+        _dim: usize,
+    ) -> <WGPUBackend<G, F, I> as Backend>::IntTensorPrimitive<D> {
+        todo!()
+    }
+
+    fn int_mean<const D: usize>(
+        _tensor: <WGPUBackend<G, F, I> as Backend>::IntTensorPrimitive<D>,
+    ) -> <WGPUBackend<G, F, I> as Backend>::IntTensorPrimitive<1> {
+        todo!()
+    }
+
+    fn int_mean_dim<const D: usize>(
+        _tensor: <WGPUBackend<G, F, I> as Backend>::IntTensorPrimitive<D>,
+        _dim: usize,
+    ) -> <WGPUBackend<G, F, I> as Backend>::IntTensorPrimitive<D> {
+        todo!()
+    }
+
+    fn int_argmax<const D: usize>(
+        _tensor: <WGPUBackend<G, F, I> as Backend>::IntTensorPrimitive<D>,
+        _dim: usize,
+    ) -> <WGPUBackend<G, F, I> as Backend>::IntTensorPrimitive<D> {
+        todo!()
+    }
+
+    fn int_argmin<const D: usize>(
+        _tensor: <WGPUBackend<G, F, I> as Backend>::IntTensorPrimitive<D>,
+        _dim: usize,
+    ) -> <WGPUBackend<G, F, I> as Backend>::IntTensorPrimitive<D> {
+        todo!()
+    }
+}
--- a/burn-wgpu/src/ops/mod.rs
+++ b/burn-wgpu/src/ops/mod.rs
@ -0,0 +1,10 @@
+mod activation_ops;
+mod bool_ops;
+mod float_ops;
+mod int_ops;
+mod module_ops;
+
+mod base;
+pub(crate) use base::*;
+
+pub(crate) mod numeric;
--- a/burn-wgpu/src/ops/module_ops.rs
+++ b/burn-wgpu/src/ops/module_ops.rs
@ -0,0 +1,94 @@
+use burn_tensor::{backend::Backend, ops::ModuleOps};
+
+use crate::{
+    element::{FloatElement, IntElement},
+    GraphicsAPI, WGPUBackend,
+};
+
+impl<G, F, I> ModuleOps<WGPUBackend<G, F, I>> for WGPUBackend<G, F, I>
+where
+    G: GraphicsAPI + 'static,
+    F: FloatElement,
+    I: IntElement,
+{
+    fn embedding(
+        _weights: <WGPUBackend<G, F, I> as Backend>::TensorPrimitive<2>,
+        _indexes: <WGPUBackend<G, F, I> as Backend>::IntTensorPrimitive<2>,
+    ) -> <WGPUBackend<G, F, I> as Backend>::TensorPrimitive<3> {
+        todo!()
+    }
+
+    fn embedding_backward(
+        _weights: <WGPUBackend<G, F, I> as Backend>::TensorPrimitive<2>,
+        _output: <WGPUBackend<G, F, I> as Backend>::TensorPrimitive<3>,
+        _indexes: <WGPUBackend<G, F, I> as Backend>::IntTensorPrimitive<2>,
+    ) -> <WGPUBackend<G, F, I> as Backend>::TensorPrimitive<2> {
+        todo!()
+    }
+
+    fn conv2d(
+        _x: <WGPUBackend<G, F, I> as Backend>::TensorPrimitive<4>,
+        _weight: <WGPUBackend<G, F, I> as Backend>::TensorPrimitive<4>,
+        _bias: Option<<WGPUBackend<G, F, I> as Backend>::TensorPrimitive<1>>,
+        _options: burn_tensor::ops::ConvOptions<2>,
+    ) -> <WGPUBackend<G, F, I> as Backend>::TensorPrimitive<4> {
+        todo!()
+    }
+
+    fn conv_transpose2d(
+        _x: <WGPUBackend<G, F, I> as Backend>::TensorPrimitive<4>,
+        _weight: <WGPUBackend<G, F, I> as Backend>::TensorPrimitive<4>,
+        _bias: Option<<WGPUBackend<G, F, I> as Backend>::TensorPrimitive<1>>,
+        _options: burn_tensor::ops::ConvTransposeOptions<2>,
+    ) -> <WGPUBackend<G, F, I> as Backend>::TensorPrimitive<4> {
+        todo!()
+    }
+
+    fn avg_pool2d(
+        _x: <WGPUBackend<G, F, I> as Backend>::TensorPrimitive<4>,
+        _kernel_size: [usize; 2],
+        _stride: [usize; 2],
+        _padding: [usize; 2],
+    ) -> <WGPUBackend<G, F, I> as Backend>::TensorPrimitive<4> {
+        todo!()
+    }
+
+    fn avg_pool2d_backward(
+        _x: <WGPUBackend<G, F, I> as Backend>::TensorPrimitive<4>,
+        _grad: <WGPUBackend<G, F, I> as Backend>::TensorPrimitive<4>,
+        _kernel_size: [usize; 2],
+        _stride: [usize; 2],
+        _padding: [usize; 2],
+    ) -> <WGPUBackend<G, F, I> as Backend>::TensorPrimitive<4> {
+        todo!()
+    }
+
+    fn max_pool2d(
+        _x: <WGPUBackend<G, F, I> as Backend>::TensorPrimitive<4>,
+        _kernel_size: [usize; 2],
+        _stride: [usize; 2],
+        _padding: [usize; 2],
+    ) -> <WGPUBackend<G, F, I> as Backend>::TensorPrimitive<4> {
+        todo!()
+    }
+
+    fn max_pool2d_with_indexes(
+        _x: <WGPUBackend<G, F, I> as Backend>::TensorPrimitive<4>,
+        _kernel_size: [usize; 2],
+        _stride: [usize; 2],
+        _padding: [usize; 2],
+    ) -> burn_tensor::ops::MaxPool2dWithIndexes<WGPUBackend<G, F, I>> {
+        todo!()
+    }
+
+    fn max_pool2d_with_indexes_backward(
+        _x: <WGPUBackend<G, F, I> as Backend>::TensorPrimitive<4>,
+        _kernel_size: [usize; 2],
+        _stride: [usize; 2],
+        _padding: [usize; 2],
+        _output_grad: <WGPUBackend<G, F, I> as Backend>::TensorPrimitive<4>,
+        _indexes: <WGPUBackend<G, F, I> as Backend>::IntTensorPrimitive<4>,
+    ) -> burn_tensor::ops::MaxPool2dBackward<WGPUBackend<G, F, I>> {
+        todo!()
+    }
+}
--- a/burn-wgpu/src/ops/numeric.rs
+++ b/burn-wgpu/src/ops/numeric.rs
@ -0,0 +1,129 @@
+use crate::kernel::{binary_elemwise, binary_elemwise_inplace, unary_scalar, unary_scalar_inplace};
+use crate::{
+    binary_elemwise, binary_elemwise_inplace, element::WGPUElement, tensor::WGPUTensor,
+    unary_scalar, unary_scalar_inplace,
+};
+
+pub struct NumericOps;
+
+impl NumericOps {
+    pub fn add<E: WGPUElement, const D: usize>(
+        lhs: WGPUTensor<E, D>,
+        rhs: WGPUTensor<E, D>,
+    ) -> WGPUTensor<E, D> {
+        binary_elemwise!(Add, "+");
+        binary_elemwise_inplace!(AddInplace, "+");
+
+        if lhs.can_mut_broadcast(&rhs) {
+            return binary_elemwise_inplace::<AddInplace, E, D>(lhs, rhs);
+        }
+
+        if rhs.can_mut_broadcast(&lhs) {
+            return binary_elemwise_inplace::<AddInplace, E, D>(rhs, lhs);
+        }
+
+        binary_elemwise::<Add, E, D>(lhs, rhs)
+    }
+
+    pub fn add_scalar<E: WGPUElement, const D: usize>(
+        lhs: WGPUTensor<E, D>,
+        rhs: E,
+    ) -> WGPUTensor<E, D> {
+        unary_scalar!(AddScalar, ops "+");
+        unary_scalar_inplace!(AddScalarInplace, ops "+");
+
+        if lhs.can_mut() {
+            return unary_scalar_inplace::<AddScalarInplace, E, D>(lhs, rhs);
+        }
+
+        unary_scalar::<AddScalar, E, D>(lhs, rhs)
+    }
+
+    pub fn sub<E: WGPUElement, const D: usize>(
+        lhs: WGPUTensor<E, D>,
+        rhs: WGPUTensor<E, D>,
+    ) -> WGPUTensor<E, D> {
+        binary_elemwise!(Sub, "-");
+        binary_elemwise_inplace!(SubInplace, "-");
+
+        if lhs.can_mut_broadcast(&rhs) {
+            return binary_elemwise_inplace::<SubInplace, E, D>(lhs, rhs);
+        }
+
+        binary_elemwise::<Sub, E, D>(lhs, rhs)
+    }
+
+    pub fn sub_scalar<E: WGPUElement, const D: usize>(
+        lhs: WGPUTensor<E, D>,
+        rhs: E,
+    ) -> WGPUTensor<E, D> {
+        unary_scalar!(SubScalar, ops "-");
+        unary_scalar_inplace!(SubScalarInplace, ops "-");
+
+        if lhs.can_mut() {
+            return unary_scalar_inplace::<SubScalarInplace, E, D>(lhs, rhs);
+        }
+
+        unary_scalar::<SubScalar, E, D>(lhs, rhs)
+    }
+
+    pub fn mul<E: WGPUElement, const D: usize>(
+        lhs: WGPUTensor<E, D>,
+        rhs: WGPUTensor<E, D>,
+    ) -> WGPUTensor<E, D> {
+        binary_elemwise!(Mul, "*");
+        binary_elemwise_inplace!(MulInplace, "*");
+
+        if lhs.can_mut_broadcast(&rhs) {
+            return binary_elemwise_inplace::<MulInplace, E, D>(lhs, rhs);
+        }
+
+        if rhs.can_mut_broadcast(&lhs) {
+            return binary_elemwise_inplace::<MulInplace, E, D>(rhs, lhs);
+        }
+
+        binary_elemwise::<Mul, E, D>(lhs, rhs)
+    }
+
+    pub fn mul_scalar<E: WGPUElement, const D: usize>(
+        lhs: WGPUTensor<E, D>,
+        rhs: E,
+    ) -> WGPUTensor<E, D> {
+        unary_scalar!(MulScalar, ops "*");
+        unary_scalar_inplace!(MulScalarInplace, ops "*");
+
+        if lhs.can_mut() {
+            return unary_scalar_inplace::<MulScalarInplace, E, D>(lhs, rhs);
+        }
+
+        unary_scalar::<MulScalar, E, D>(lhs, rhs)
+    }
+
+    pub fn div<E: WGPUElement, const D: usize>(
+        lhs: WGPUTensor<E, D>,
+        rhs: WGPUTensor<E, D>,
+    ) -> WGPUTensor<E, D> {
+        binary_elemwise!(Div, "/");
+        binary_elemwise_inplace!(DivInplace, "/");
+
+        if lhs.can_mut_broadcast(&rhs) {
+            return binary_elemwise_inplace::<DivInplace, E, D>(lhs, rhs);
+        }
+
+        binary_elemwise::<Div, E, D>(lhs, rhs)
+    }
+
+    pub fn div_scalar<E: WGPUElement, const D: usize>(
+        lhs: WGPUTensor<E, D>,
+        rhs: E,
+    ) -> WGPUTensor<E, D> {
+        unary_scalar!(DivScalar, ops "/");
+        unary_scalar_inplace!(DivScalarInplace, ops "/");
+
+        if lhs.can_mut() {
+            return unary_scalar_inplace::<DivScalarInplace, E, D>(lhs, rhs);
+        }
+
+        unary_scalar::<DivScalar, E, D>(lhs, rhs)
+    }
+}
--- a/burn-wgpu/src/pool.rs
+++ b/burn-wgpu/src/pool.rs
@ -0,0 +1,63 @@
+use crate::{context::Context, GraphicsAPI, WGPUDevice};
+use std::{
+    any::TypeId,
+    collections::HashMap,
+    sync::{Arc, Mutex},
+};
+
+static POOL_CONTEXT: Mutex<Option<ContextPool>> = Mutex::new(None);
+
+#[derive(Default)]
+struct ContextPool {
+    contexts: HashMap<Key, Arc<Context>>,
+}
+
+#[derive(Clone, Debug, Hash, PartialEq, Eq)]
+struct Key {
+    api_id: TypeId,
+    device: WGPUDevice,
+}
+
+impl Key {
+    fn new<G: GraphicsAPI>(device: &WGPUDevice) -> Self {
+        Self {
+            api_id: TypeId::of::<G>(),
+            device: device.clone(),
+        }
+    }
+}
+
+/// Get a [context](Context) for the given [device](WGPUDevice).
+///
+/// # Notes
+///
+/// If a context already exist for the current [device](WGPUDevice), the same instance will be
+/// returned.
+pub fn get_context<G: GraphicsAPI>(device: &WGPUDevice) -> Arc<Context> {
+    let mut pool = POOL_CONTEXT.lock().unwrap();
+
+    let context = if let Some(pool) = pool.as_mut() {
+        // Fetch device in pool
+        match pool.contexts.get(&Key::new::<G>(device)) {
+            Some(context) => context.clone(),
+            None => {
+                // Init new device
+                let context = Arc::new(Context::new::<G>(device));
+                pool.contexts.insert(Key::new::<G>(device), context.clone());
+                context
+            }
+        }
+    } else {
+        // Initialize pool
+        let context = Arc::new(Context::new::<G>(device));
+        let mut new_pool = ContextPool::default();
+
+        new_pool
+            .contexts
+            .insert(Key::new::<G>(device), context.clone());
+        *pool = Some(new_pool);
+        context
+    };
+
+    context
+}
--- a/burn-wgpu/src/template/binary_elemwise.wgsl
+++ b/burn-wgpu/src/template/binary_elemwise.wgsl
@ -0,0 +1,35 @@
+@group(0)
+@binding(0)
+var<storage, read> lhs: array<elem>;
+
+@group(0)
+@binding(1)
+var<storage, read> rhs: array<elem>;
+
+@group(0)
+@binding(2)
+var<storage, read_write> output: array<elem>;
+
+@group(0)
+@binding(3)
+var<storage, read> info: array<u32>;
+
+@compute
+@workgroup_size(WORKGROUP_SIZE_X, 1, 1)
+fn main(@builtin(global_invocation_id) global_id: vec3<u32>) {
+    let dim: u32 = info[0];
+    var index_lhs: u32 = 0u;
+    var index_rhs: u32 = 0u;
+
+    for (var i: u32 = 0u; i < dim; i++) {
+        let stride_lhs = info[i + 1u];
+        let stride_rhs = info[i + 1u * dim + 1u];
+        let shape_lhs = info[i + 2u * dim + 1u];
+        let shape_rhs = info[i + 3u * dim + 1u];
+
+        index_lhs += global_id.x / stride_lhs % shape_lhs * stride_lhs;
+        index_rhs += global_id.x / stride_rhs % shape_rhs * stride_rhs;
+    }
+
+    BODY
+}
--- a/burn-wgpu/src/template/binary_elemwise_inplace.wgsl
+++ b/burn-wgpu/src/template/binary_elemwise_inplace.wgsl
@ -0,0 +1,27 @@
+@group(0)
+@binding(0)
+var<storage, read_write> lhs: array<elem>;
+
+@group(0)
+@binding(1)
+var<storage, read> rhs: array<elem>;
+
+@group(0)
+@binding(2)
+var<storage, read> info: array<u32>;
+
+@compute
+@workgroup_size(WORKGROUP_SIZE_X, 1, 1)
+fn main(@builtin(global_invocation_id) global_id: vec3<u32>) {
+    let dim: u32 = info[0];
+    var index_rhs: u32 = 0u;
+
+    for (var i: u32 = 0u; i < dim; i++) {
+        let stride_rhs = info[i + 1u];
+        let shape_rhs = info[i + 1u * dim + 1u];
+
+        index_rhs += global_id.x / stride_rhs % shape_rhs * stride_rhs;
+    }
+
+    BODY
+}
--- a/burn-wgpu/src/template/unary.wgsl
+++ b/burn-wgpu/src/template/unary.wgsl
@ -0,0 +1,13 @@
+@group(0)
+@binding(0)
+var<storage, read> input: array<elem>;
+
+@group(0)
+@binding(1)
+var<storage, read_write> output: array<elem>;
+
+@compute
+@workgroup_size(WORKGROUP_SIZE_X, 1, 1)
+fn main(@builtin(global_invocation_id) global_id: vec3<u32>) {
+    BODY
+}
--- a/burn-wgpu/src/template/unary_inplace.wgsl
+++ b/burn-wgpu/src/template/unary_inplace.wgsl
@ -0,0 +1,9 @@
+@group(0)
+@binding(0)
+var<storage, read_write> input: array<elem>;
+
+@compute
+@workgroup_size(WORKGROUP_SIZE_X, 1, 1)
+fn main(@builtin(global_invocation_id) global_id: vec3<u32>) {
+    BODY
+}
--- a/burn-wgpu/src/template/unary_scalar.wgsl
+++ b/burn-wgpu/src/template/unary_scalar.wgsl
@ -0,0 +1,17 @@
+@group(0)
+@binding(0)
+var<storage, read> lhs: array<elem>;
+
+@group(0)
+@binding(1)
+var<storage, read> rhs: elem;
+
+@group(0)
+@binding(2)
+var<storage, read_write> output: array<elem>;
+
+@compute
+@workgroup_size(WORKGROUP_SIZE_X, 1, 1)
+fn main(@builtin(global_invocation_id) global_id: vec3<u32>) {
+    BODY
+}
--- a/burn-wgpu/src/template/unary_scalar_inplace.wgsl
+++ b/burn-wgpu/src/template/unary_scalar_inplace.wgsl
@ -0,0 +1,13 @@
+@group(0)
+@binding(0)
+var<storage, read_write> lhs: array<elem>;
+
+@group(0)
+@binding(1)
+var<storage, read> rhs: elem;
+
+@compute
+@workgroup_size(WORKGROUP_SIZE_X, 1, 1)
+fn main(@builtin(global_invocation_id) global_id: vec3<u32>) {
+    BODY
+}
--- a/burn-wgpu/src/tensor/base.rs
+++ b/burn-wgpu/src/tensor/base.rs
@ -0,0 +1,82 @@
+use burn_tensor::Shape;
+use std::{marker::PhantomData, sync::Arc};
+use wgpu::Buffer;
+
+use crate::{context::Context, element::WGPUElement};
+
+#[derive(Debug, Clone)]
+pub struct WGPUTensor<E: WGPUElement, const D: usize> {
+    pub(crate) context: Arc<Context>,
+    pub(crate) buffer: Arc<Buffer>,
+    pub(crate) shape: Shape<D>,
+    pub(crate) strides: [usize; D],
+    elem: PhantomData<E>,
+}
+
+impl<E: WGPUElement, const D: usize> WGPUTensor<E, D> {
+    pub fn new(context: Arc<Context>, shape: Shape<D>, buffer: Arc<Buffer>) -> Self {
+        let mut strides = [0; D];
+
+        let mut current = 1;
+        shape
+            .dims
+            .iter()
+            .enumerate()
+            .rev()
+            .for_each(|(index, val)| {
+                strides[index] = current;
+                current *= val;
+            });
+
+        Self {
+            context,
+            buffer,
+            shape,
+            strides,
+            elem: PhantomData::default(),
+        }
+    }
+    pub fn to_context(&self, context: Arc<Context>) -> Self {
+        let data = self.context.buffer_to_data(&self.buffer);
+        let buffer = Arc::new(context.create_buffer_with_data(&data));
+
+        Self {
+            context,
+            buffer,
+            shape: self.shape.clone(),
+            strides: self.strides,
+            elem: PhantomData::default(),
+        }
+    }
+    pub fn can_mut_broadcast(&self, tensor_other: &WGPUTensor<E, D>) -> bool {
+        if Arc::strong_count(&self.buffer) > 1 {
+            return false;
+        }
+
+        for i in 0..D {
+            // Output tensor will be different from the mutable tensor.
+            if self.shape.dims[i] < tensor_other.shape.dims[i] {
+                return false;
+            }
+        }
+
+        true
+    }
+
+    pub fn can_mut(&self) -> bool {
+        if Arc::strong_count(&self.buffer) > 1 {
+            return false;
+        }
+
+        true
+    }
+
+    pub fn assert_is_on_save_device(&self, other: &Self) {
+        if self.context.device != other.context.device {
+            panic!(
+                "Both tensors should be on the same device {:?} != {:?}",
+                self.context.device, other.context.device
+            );
+        }
+    }
+}
--- a/burn-wgpu/src/tensor/mod.rs
+++ b/burn-wgpu/src/tensor/mod.rs
@ -0,0 +1,2 @@
+mod base;
+pub use base::*;