diff --git a/burn-book/src/advanced/backend-extension/custom-wgpu-kernel.md b/burn-book/src/advanced/backend-extension/custom-wgpu-kernel.md
index f82054bbf..e494dcd2f 100644
--- a/burn-book/src/advanced/backend-extension/custom-wgpu-kernel.md
+++ b/burn-book/src/advanced/backend-extension/custom-wgpu-kernel.md
@@ -194,7 +194,7 @@ impl DynamicKernel for FusedMatmulAddRelu {
 ```
 
 Subsequently, we'll go into implementing our custom backend trait for the WGPU backend.
-Note that we won't go into supporting the `fusion` feature flag in this tutorial, so 
+Note that we won't go into supporting the `fusion` feature flag in this tutorial, so
 we implement the trait for the raw `WgpuBackend` type.
 
 ```rust, ignore
diff --git a/burn-book/src/basic-workflow/data.md b/burn-book/src/basic-workflow/data.md
index 44a778143..01e8701fc 100644
--- a/burn-book/src/basic-workflow/data.md
+++ b/burn-book/src/basic-workflow/data.md
@@ -16,15 +16,15 @@ at `examples/guide/` [directory](https://github.com/tracel-ai/burn/tree/main/exa
 ```rust , ignore
 use burn::{
-    data::{dataloader::batcher::Batcher, dataset::vision::MNISTItem},
+    data::{dataloader::batcher::Batcher, dataset::vision::MnistItem},
     tensor::{backend::Backend, Data, ElementConversion, Int, Tensor},
 };
 
-pub struct MNISTBatcher<B: Backend> {
+pub struct MnistBatcher<B: Backend> {
     device: B::Device,
 }
 
-impl<B: Backend> MNISTBatcher<B> {
+impl<B: Backend> MnistBatcher<B> {
     pub fn new(device: B::Device) -> Self {
         Self { device }
     }
@@ -42,13 +42,13 @@ Next, we need to actually implement the batching logic.
 
 ```rust , ignore
 #[derive(Clone, Debug)]
-pub struct MNISTBatch<B: Backend> {
+pub struct MnistBatch<B: Backend> {
     pub images: Tensor<B, 3>,
     pub targets: Tensor<B, 1, Int>,
 }
 
-impl<B: Backend> Batcher<MNISTItem, MNISTBatch<B>> for MNISTBatcher<B> {
-    fn batch(&self, items: Vec<MNISTItem>) -> MNISTBatch<B> {
+impl<B: Backend> Batcher<MnistItem, MnistBatch<B>> for MnistBatcher<B> {
+    fn batch(&self, items: Vec<MnistItem>) -> MnistBatch<B> {
         let images = items
             .iter()
             .map(|item| Data::<f32, 2>::from(item.image))
@@ -71,7 +71,7 @@ impl<B: Backend> Batcher<MNISTItem, MNISTBatch<B>> for MNISTBatcher<B> {
         let images = Tensor::cat(images, 0).to_device(&self.device);
         let targets = Tensor::cat(targets, 0).to_device(&self.device);
 
-        MNISTBatch { images, targets }
+        MnistBatch { images, targets }
     }
 }
 ```
@@ -81,7 +81,7 @@ impl<B: Backend> Batcher<MNISTItem, MNISTBatch<B>> for MNISTBatcher<B> {
 
 The iterator pattern allows you to perform some tasks on a sequence of items in turn.
 
-In this example, an iterator is created over the `MNISTItem`s in the vector `items` by calling the
+In this example, an iterator is created over the `MnistItem`s in the vector `items` by calling the
 `iter` method.
 
 _Iterator adaptors_ are methods defined on the `Iterator` trait that produce different iterators by
@@ -100,7 +100,7 @@ If we go back to the example, we can break down and comment the expression used
 images.
 
 ```rust, ignore
-let images = items // take items Vec<MNISTItem>
+let images = items // take items Vec<MnistItem>
     .iter() // create an iterator over it
     .map(|item| Data::<f32, 2>::from(item.image)) // for each item, convert the image to float32 data struct
     .map(|data| Tensor::<B, 2>::from_data(data.convert(), &self.device)) // for each data struct, create a tensor on the device
@@ -115,8 +115,8 @@ Book.
 
-In the previous example, we implement the `Batcher` trait with a list of `MNISTItem` as input and a
-single `MNISTBatch` as output. The batch contains the images in the form of a 3D tensor, along with
+In the previous example, we implement the `Batcher` trait with a list of `MnistItem` as input and a
+single `MnistBatch` as output. The batch contains the images in the form of a 3D tensor, along with
 a targets tensor that contains the indexes of the correct digit class. The first step is to parse the
 image array into a `Data` struct. Burn provides the `Data` struct to encapsulate tensor storage
 information without being specific for a backend. When creating a tensor from data, we often need to
diff --git a/burn-book/src/basic-workflow/inference.md b/burn-book/src/basic-workflow/inference.md
index 40851192b..ad2a893ea 100644
--- a/burn-book/src/basic-workflow/inference.md
+++ b/burn-book/src/basic-workflow/inference.md
@@ -16,7 +16,7 @@ impl ModelConfig {
             conv1: Conv2dConfig::new([1, 8], [3, 3]).init_with(record.conv1),
             conv2: Conv2dConfig::new([8, 16], [3, 3]).init_with(record.conv2),
             pool: AdaptiveAvgPool2dConfig::new([8, 8]).init(),
-            activation: ReLU::new(),
+            activation: Relu::new(),
             linear1: LinearConfig::new(16 * 8 * 8, self.hidden_size).init_with(record.linear1),
             linear2: LinearConfig::new(self.hidden_size, self.num_classes)
                 .init_with(record.linear2),
@@ -33,7 +33,7 @@ manually. Everything is validated when loading the model with the record.
 Now let's create a simple `infer` method in a new file `src/inference.rs` which we will use to load
 our trained model.
 ```rust , ignore
-pub fn infer<B: Backend>(artifact_dir: &str, device: B::Device, item: MNISTItem) {
+pub fn infer<B: Backend>(artifact_dir: &str, device: B::Device, item: MnistItem) {
     let config = TrainingConfig::load(format!("{artifact_dir}/config.json"))
         .expect("Config should exist for the model");
     let record = CompactRecorder::new()
@@ -43,7 +43,7 @@ pub fn infer<B: Backend>(artifact_dir: &str, device: B::Device, item: MNISTItem)
     let model = config.model.init_with::<B>(record);
 
     let label = item.label;
-    let batcher = MNISTBatcher::new(device);
+    let batcher = MnistBatcher::new(device);
     let batch = batcher.batch(vec![item]);
     let output = model.forward(batch.images);
     let predicted = output.argmax(1).flatten::<1>(0, 1).into_scalar();
@@ -56,6 +56,6 @@ The first step is to load the configuration of the training to fetch the correct
 configuration. Then we can fetch the record using the same recorder as we used during training.
 Finally we can init the model with the configuration and the record before sending it to the wanted
 device for inference. For simplicity we can use the same batcher used during the training to pass
-from a MNISTItem to a tensor.
+from a MnistItem to a tensor.
 
 By running the infer function, you should see the predictions of your model!
diff --git a/burn-book/src/basic-workflow/model.md b/burn-book/src/basic-workflow/model.md
index 90092a4e8..381f2a5d2 100644
--- a/burn-book/src/basic-workflow/model.md
+++ b/burn-book/src/basic-workflow/model.md
@@ -35,7 +35,7 @@ use burn::{
     nn::{
         conv::{Conv2d, Conv2dConfig},
         pool::{AdaptiveAvgPool2d, AdaptiveAvgPool2dConfig},
-        Dropout, DropoutConfig, Linear, LinearConfig, ReLU,
+        Dropout, DropoutConfig, Linear, LinearConfig, Relu,
     },
     tensor::{backend::Backend, Tensor},
 };
@@ -48,7 +48,7 @@ pub struct Model<B: Backend> {
     dropout: Dropout,
     linear1: Linear<B>,
     linear2: Linear<B>,
-    activation: ReLU,
+    activation: Relu,
 }
 ```
 
@@ -98,7 +98,7 @@ There are two major things going on in this code sample.
 
 pub struct MyCustomModule<B: Backend> {
     linear1: Linear<B>,
     linear2: Linear<B>,
-    activation: ReLU,
+    activation: Relu,
 }
 ```
 
@@ -178,7 +178,7 @@ impl ModelConfig {
             conv1: Conv2dConfig::new([1, 8], [3, 3]).init(device),
             conv2: Conv2dConfig::new([8, 16], [3, 3]).init(device),
             pool: AdaptiveAvgPool2dConfig::new([8, 8]).init(),
-            activation: ReLU::new(),
+            activation: Relu::new(),
             linear1: LinearConfig::new(16 * 8 * 8, self.hidden_size).init(device),
             linear2: LinearConfig::new(self.hidden_size, self.num_classes).init(device),
             dropout: DropoutConfig::new(self.dropout).init(),
diff --git a/burn-book/src/basic-workflow/training.md b/burn-book/src/basic-workflow/training.md
index afedafdc8..a015fb2b8 100644
--- a/burn-book/src/basic-workflow/training.md
+++ b/burn-book/src/basic-workflow/training.md
@@ -43,23 +43,23 @@ Moving forward, we will proceed with the implementation of both the training and
 for our model.
 
 ```rust , ignore
-impl<B: AutodiffBackend> TrainStep<MNISTBatch<B>, ClassificationOutput<B>> for Model<B> {
-    fn step(&self, batch: MNISTBatch<B>) -> TrainOutput<ClassificationOutput<B>> {
+impl<B: AutodiffBackend> TrainStep<MnistBatch<B>, ClassificationOutput<B>> for Model<B> {
+    fn step(&self, batch: MnistBatch<B>) -> TrainOutput<ClassificationOutput<B>> {
         let item = self.forward_classification(batch.images, batch.targets);
 
         TrainOutput::new(self, item.loss.backward(), item)
     }
 }
 
-impl<B: Backend> ValidStep<MNISTBatch<B>, ClassificationOutput<B>> for Model<B> {
-    fn step(&self, batch: MNISTBatch<B>) -> ClassificationOutput<B> {
+impl<B: Backend> ValidStep<MnistBatch<B>, ClassificationOutput<B>> for Model<B> {
+    fn step(&self, batch: MnistBatch<B>) -> ClassificationOutput<B> {
         self.forward_classification(batch.images, batch.targets)
     }
 }
 ```
 
 Here we define the input and output types as generic arguments in the `TrainStep` and `ValidStep`.
-We will call them `MNISTBatch` and `ClassificationOutput`. In the training step, the computation of
+We will call them `MnistBatch` and `ClassificationOutput`. In the training step, the computation of
 gradients is straightforward, necessitating a simple invocation of `backward()` on the loss. Note
 that contrary to PyTorch, gradients are not stored alongside each tensor parameter, but are rather
 returned by the backward pass, as such: `let gradients = loss.backward();`. The gradient of a
@@ -81,8 +81,8 @@ which is generic over the `Backend` trait as has been covered before. These trai
 `burn::train` and define a common `step` method that should be implemented for all structs. Since
 the trait is generic over the input and output types, the trait implementation must specify the
 concrete types used. This is where the additional type constraints appear
-`<MNISTBatch<B>, ClassificationOutput<B>>`. As we saw previously, the concrete input type for the
-batch is `MNISTBatch`, and the output of the forward pass is `ClassificationOutput`. The `step`
+`<MnistBatch<B>, ClassificationOutput<B>>`. As we saw previously, the concrete input type for the
+batch is `MnistBatch`, and the output of the forward pass is `ClassificationOutput`. The `step`
 method signature matches the concrete input and output types.
 
For more details specific to constraints on generic types when defining methods, take a look at @@ -118,20 +118,20 @@ pub fn train(artifact_dir: &str, config: TrainingConfig, dev B::seed(config.seed); - let batcher_train = MNISTBatcher::::new(device.clone()); - let batcher_valid = MNISTBatcher::::new(device.clone()); + let batcher_train = MnistBatcher::::new(device.clone()); + let batcher_valid = MnistBatcher::::new(device.clone()); let dataloader_train = DataLoaderBuilder::new(batcher_train) .batch_size(config.batch_size) .shuffle(config.seed) .num_workers(config.num_workers) - .build(MNISTDataset::train()); + .build(MnistDataset::train()); let dataloader_test = DataLoaderBuilder::new(batcher_valid) .batch_size(config.batch_size) .shuffle(config.seed) .num_workers(config.num_workers) - .build(MNISTDataset::test()); + .build(MnistDataset::test()); let learner = LearnerBuilder::new(artifact_dir) .metric_train_numeric(AccuracyMetric::new()) diff --git a/burn-book/src/building-blocks/module.md b/burn-book/src/building-blocks/module.md index 1a95813f1..6a577eb74 100644 --- a/burn-book/src/building-blocks/module.md +++ b/burn-book/src/building-blocks/module.md @@ -160,4 +160,4 @@ Burn comes with built-in modules that you can use to build your own modules. | Burn API | PyTorch Equivalent | | ------------------ | --------------------- | | `CrossEntropyLoss` | `nn.CrossEntropyLoss` | -| `MSELoss` | `nn.MSELoss` | +| `MseLoss` | `nn.MSELoss` | diff --git a/burn-book/src/custom-training-loop.md b/burn-book/src/custom-training-loop.md index 95682987e..246088eba 100644 --- a/burn-book/src/custom-training-loop.md +++ b/burn-book/src/custom-training-loop.md @@ -40,21 +40,21 @@ pub fn run(device: &B::Device) { let mut optim = config.optimizer.init(); // Create the batcher. - let batcher_train = MNISTBatcher::::new(device.clone()); - let batcher_valid = MNISTBatcher::::new(device.clone()); + let batcher_train = MnistBatcher::::new(device.clone()); + let batcher_valid = MnistBatcher::::new(device.clone()); // Create the dataloaders. let dataloader_train = DataLoaderBuilder::new(batcher_train) .batch_size(config.batch_size) .shuffle(config.seed) .num_workers(config.num_workers) - .build(MNISTDataset::train()); + .build(MnistDataset::train()); let dataloader_test = DataLoaderBuilder::new(batcher_valid) .batch_size(config.batch_size) .shuffle(config.seed) .num_workers(config.num_workers) - .build(MNISTDataset::test()); + .build(MnistDataset::test()); ... } @@ -140,7 +140,7 @@ Note that after each epoch, we include a validation loop to assess our model's p previously unseen data. To disable gradient tracking during this validation step, we can invoke `model.valid()`, which provides a model on the inner backend without autodiff capabilities. It's important to emphasize that we've declared our validation batcher to be on the inner backend, -specifically `MNISTBatcher`; not using `model.valid()` will result in a compilation +specifically `MnistBatcher`; not using `model.valid()` will result in a compilation error. 
You can find the code above available as an @@ -195,7 +195,7 @@ where M: AutodiffModule, O: Optimizer, { - pub fn step(&mut self, _batch: MNISTBatch) { + pub fn step(&mut self, _batch: MnistBatch) { // } } @@ -214,7 +214,7 @@ the backend and add your trait constraint within its definition: ```rust, ignore #[allow(dead_code)] impl Learner2 { - pub fn step(&mut self, _batch: MNISTBatch) + pub fn step(&mut self, _batch: MnistBatch) where B: AutodiffBackend, M: AutodiffModule, diff --git a/burn-book/src/saving-and-loading.md b/burn-book/src/saving-and-loading.md index 5b784ab50..d3fdf3b7d 100644 --- a/burn-book/src/saving-and-loading.md +++ b/burn-book/src/saving-and-loading.md @@ -44,7 +44,7 @@ model definition as a simple example. pub struct Model { linear_in: Linear, linear_out: Linear, - activation: ReLU, + activation: Relu, } ``` @@ -59,7 +59,7 @@ impl Model { Model { linear_in: LinearConfig::new(10, 64).init_with(record.linear_in), linear_out: LinearConfig::new(64, 2).init_with(record.linear_out), - activation: ReLU::new(), + activation: Relu::new(), } } @@ -70,7 +70,7 @@ impl Model { Model { linear_in: l1, linear_out: l2, - activation: ReLU::new(), + activation: Relu::new(), } } } diff --git a/burn-core/src/nn/loss/mse.rs b/burn-core/src/nn/loss/mse.rs index e99e69c37..00945d004 100644 --- a/burn-core/src/nn/loss/mse.rs +++ b/burn-core/src/nn/loss/mse.rs @@ -5,17 +5,17 @@ use burn_tensor::{backend::Backend, Tensor}; /// Calculate the mean squared error loss from the input logits and the targets. #[derive(Clone, Debug)] -pub struct MSELoss { +pub struct MseLoss { backend: PhantomData, } -impl Default for MSELoss { +impl Default for MseLoss { fn default() -> Self { Self::new() } } -impl MSELoss { +impl MseLoss { /// Create the criterion. pub fn new() -> Self { Self { @@ -67,7 +67,7 @@ mod tests { let targets = Tensor::::from_data(Data::from([[2.0, 1.0], [3.0, 2.0]]), &device); - let mse = MSELoss::new(); + let mse = MseLoss::new(); let loss_no_reduction = mse.forward_no_reduction(logits.clone(), targets.clone()); let loss = mse.forward(logits.clone(), targets.clone(), Reduction::Auto); let loss_sum = mse.forward(logits, targets, Reduction::Sum); diff --git a/burn-core/src/nn/relu.rs b/burn-core/src/nn/relu.rs index 92e260c7e..bd8c92a4d 100644 --- a/burn-core/src/nn/relu.rs +++ b/burn-core/src/nn/relu.rs @@ -8,9 +8,9 @@ use crate::tensor::Tensor; /// /// `y = max(0, x)` #[derive(Module, Clone, Debug, Default)] -pub struct ReLU {} +pub struct Relu {} -impl ReLU { +impl Relu { /// Create the module. pub fn new() -> Self { Self {} diff --git a/burn-core/src/optim/adagrad.rs b/burn-core/src/optim/adagrad.rs index ab528c211..20edc3291 100644 --- a/burn-core/src/optim/adagrad.rs +++ b/burn-core/src/optim/adagrad.rs @@ -27,14 +27,14 @@ pub struct AdaGradConfig { /// AdaGrad optimizer pub struct AdaGrad { - lr_decay: LRDecay, + lr_decay: LrDecay, weight_decay: Option>, } /// AdaGrad state. #[derive(Record, Clone, new)] pub struct AdaGradState { - lr_decay: LRDecayState, + lr_decay: LrDecayState, } impl SimpleOptimizer for AdaGrad { @@ -81,7 +81,7 @@ impl AdaGradConfig { /// Returns an optimizer that can be used to optimize a module. pub fn init>(&self) -> impl Optimizer { let optim = AdaGrad { - lr_decay: LRDecay { + lr_decay: LrDecay { lr_decay: self.lr_decay, epsilon: self.epsilon, }, @@ -98,29 +98,29 @@ impl AdaGradConfig { /// Learning rate decay state (also includes sum state). 
#[derive(Record, new, Clone)] -pub struct LRDecayState { +pub struct LrDecayState { time: usize, sum: Tensor, } -struct LRDecay { +struct LrDecay { lr_decay: f64, epsilon: f32, } -impl LRDecay { +impl LrDecay { pub fn transform( &self, grad: Tensor, lr: LearningRate, - lr_decay_state: Option>, - ) -> (Tensor, LRDecayState) { + lr_decay_state: Option>, + ) -> (Tensor, LrDecayState) { let state = if let Some(mut state) = lr_decay_state { state.sum = state.sum.add(grad.clone().powf_scalar(2.)); state.time += 1; state } else { - LRDecayState::new(1, grad.clone().powf_scalar(2.)) + LrDecayState::new(1, grad.clone().powf_scalar(2.)) }; let new_lr = lr / (1. + (state.time as f64 - 1.) * self.lr_decay); @@ -133,7 +133,7 @@ impl LRDecay { } } -impl LRDecayState { +impl LrDecayState { /// Move state to device. /// /// # Arguments @@ -278,7 +278,7 @@ mod tests { { let config = AdaGradConfig::new(); AdaGrad { - lr_decay: LRDecay { + lr_decay: LrDecay { lr_decay: config.lr_decay, epsilon: config.epsilon, }, diff --git a/burn-core/src/optim/rmsprop.rs b/burn-core/src/optim/rmsprop.rs index 11b36ea94..fd6d2e683 100644 --- a/burn-core/src/optim/rmsprop.rs +++ b/burn-core/src/optim/rmsprop.rs @@ -12,19 +12,19 @@ use crate::optim::adaptor::OptimizerAdaptor; use crate::tensor::{backend::AutodiffBackend, Tensor}; use burn_tensor::backend::Backend; -/// Configuration to create the [RMSProp](RMSProp) optimizer. +/// Configuration to create the [RmsProp](RmsProp) optimizer. #[derive(Config)] -pub struct RMSPropConfig { +pub struct RmsPropConfig { /// Smoothing constant. #[config(default = 0.99)] alpha: f32, - /// momentum for RMSProp. + /// momentum for RmsProp. #[config(default = 0.9)] momentum: f32, /// A value required for numerical stability. #[config(default = 1e-5)] epsilon: f32, - /// if True, compute the centered RMSProp, the gradient is normalized by an estimation of its variance + /// if True, compute the centered RmsProp, the gradient is normalized by an estimation of its variance #[config(default = false)] centered: bool, /// [Weight decay](WeightDecayConfig) config. @@ -33,22 +33,22 @@ pub struct RMSPropConfig { grad_clipping: Option, } -impl RMSPropConfig { - /// Initialize RMSProp optimizer. +impl RmsPropConfig { + /// Initialize RmsProp optimizer. /// /// # Returns /// /// Returns an optimizer that can be used to optimize a module. pub fn init>( &self, - ) -> OptimizerAdaptor, M, B> { + ) -> OptimizerAdaptor, M, B> { let weight_decay = self.weight_decay.as_ref().map(WeightDecay::new); - let mut optim = OptimizerAdaptor::from(RMSProp { + let mut optim = OptimizerAdaptor::from(RmsProp { alpha: self.alpha, centered: self.centered, weight_decay, - momentum: RMSPropMomentum { + momentum: RmsPropMomentum { momentum: self.momentum, epsilon: self.epsilon, }, @@ -63,18 +63,18 @@ impl RMSPropConfig { } /// Optimizer that implements stochastic gradient descent with momentum. -/// The optimizer can be configured with [RMSPropConfig](RMSPropConfig). -pub struct RMSProp { +/// The optimizer can be configured with [RmsPropConfig](RmsPropConfig). 
+pub struct RmsProp { alpha: f32, // epsilon: f32, centered: bool, // momentum: Option>, - momentum: RMSPropMomentum, + momentum: RmsPropMomentum, weight_decay: Option>, } -impl SimpleOptimizer for RMSProp { - type State = RMSPropState; +impl SimpleOptimizer for RmsProp { + type State = RmsPropState; fn step( &self, @@ -117,7 +117,7 @@ impl SimpleOptimizer for RMSProp { .transform(grad, state_centered, state_momentum); // transition state - let state = RMSPropState::new(state_square_avg, state_centered, state_momentum); + let state = RmsPropState::new(state_square_avg, state_centered, state_momentum); // tensor param transform let delta = grad.mul_scalar(lr); @@ -135,12 +135,12 @@ impl SimpleOptimizer for RMSProp { } } -/// State of [RMSProp](RMSProp) +/// State of [RmsProp](RmsProp) #[derive(Record, Clone, new)] -pub struct RMSPropState { +pub struct RmsPropState { square_avg: SquareAvgState, centered: CenteredState, - momentum: Option>, + momentum: Option>, } /// [SquareAvgState](SquareAvgState) is to store and pass optimizer step params. @@ -249,24 +249,24 @@ impl CenteredState { } } -/// [RMSPropMomentum](RMSPropMomentum) is to store config status for optimizer. -/// (, which is stored in [optimizer](RMSProp) itself and not passed in during `step()` calculation) -pub struct RMSPropMomentum { +/// [RmsPropMomentum](RmsPropMomentum) is to store config status for optimizer. +/// (, which is stored in [optimizer](RmsProp) itself and not passed in during `step()` calculation) +pub struct RmsPropMomentum { momentum: f32, epsilon: f32, } -impl RMSPropMomentum { - /// transform [grad](Tensor) and [RMSPropMomentumState] to the next step +impl RmsPropMomentum { + /// transform [grad](Tensor) and [RmsPropMomentumState] to the next step fn transform( &self, grad: Tensor, centered_state: CenteredState, - momentum_state: Option>, + momentum_state: Option>, ) -> ( Tensor, CenteredState, - Option>, + Option>, ) { let grad = grad.div(centered_state.avg.clone().sqrt().add_scalar(self.epsilon)); @@ -278,7 +278,7 @@ impl RMSPropMomentum { ( buf.clone(), centered_state, - Some(RMSPropMomentumState { buf }), + Some(RmsPropMomentumState { buf }), ) } else { (grad, centered_state, None) @@ -286,13 +286,13 @@ impl RMSPropMomentum { } } -/// [RMSPropMomentumState](RMSPropMomentumState) is to store and pass optimizer step params. +/// [RmsPropMomentumState](RmsPropMomentumState) is to store and pass optimizer step params. #[derive(Record, Clone, new)] -pub struct RMSPropMomentumState { +pub struct RmsPropMomentumState { buf: Tensor, } -impl RMSPropMomentumState { +impl RmsPropMomentumState { /// Moves the state to a device. 
/// /// # Arguments @@ -378,7 +378,7 @@ mod tests { ) .require_grad(); - let mut optimizer = RMSPropConfig::new() + let mut optimizer = RmsPropConfig::new() .with_alpha(0.99) .with_epsilon(1e-8) .with_weight_decay(WeightDecayConfig::new(0.05).into()) @@ -453,7 +453,7 @@ mod tests { ) .require_grad(); - let mut optimizer = RMSPropConfig::new() + let mut optimizer = RmsPropConfig::new() .with_alpha(0.99) .with_epsilon(1e-8) .with_weight_decay(WeightDecayConfig::new(0.05).into()) @@ -529,9 +529,9 @@ mod tests { } fn create_rmsprop( - ) -> OptimizerAdaptor, nn::Linear, TestAutodiffBackend> + ) -> OptimizerAdaptor, nn::Linear, TestAutodiffBackend> { - RMSPropConfig { + RmsPropConfig { alpha: 0.99, epsilon: 1e-9, centered: false, diff --git a/burn-dataset/src/source/huggingface/downloader.rs b/burn-dataset/src/source/huggingface/downloader.rs index 9d3ef277d..9e0c152bd 100644 --- a/burn-dataset/src/source/huggingface/downloader.rs +++ b/burn-dataset/src/source/huggingface/downloader.rs @@ -49,12 +49,12 @@ pub enum ImporterError { /// use serde::{Deserialize, Serialize}; /// /// #[derive(Deserialize, Debug, Clone)] -/// struct MNISTItemRaw { +/// struct MnistItemRaw { /// pub image_bytes: Vec, /// pub label: usize, /// } /// -/// let train_ds:SqliteDataset = HuggingfaceDatasetLoader::new("mnist") +/// let train_ds:SqliteDataset = HuggingfaceDatasetLoader::new("mnist") /// .dataset("train") /// .unwrap(); pub struct HuggingfaceDatasetLoader { diff --git a/burn-dataset/src/vision/mnist.rs b/burn-dataset/src/vision/mnist.rs index 040442277..20173f65c 100644 --- a/burn-dataset/src/vision/mnist.rs +++ b/burn-dataset/src/vision/mnist.rs @@ -24,7 +24,7 @@ const HEIGHT: usize = 28; /// MNIST item. #[derive(Deserialize, Serialize, Debug, Clone)] -pub struct MNISTItem { +pub struct MnistItem { /// Image as a 2D array of floats. pub image: [[f32; WIDTH]; HEIGHT], @@ -33,16 +33,16 @@ pub struct MNISTItem { } #[derive(Deserialize, Debug, Clone)] -struct MNISTItemRaw { +struct MnistItemRaw { pub image_bytes: Vec, pub label: u8, } struct BytesToImage; -impl Mapper for BytesToImage { +impl Mapper for BytesToImage { /// Convert a raw MNIST item (image bytes) to a MNIST item (2D array image). - fn map(&self, item: &MNISTItemRaw) -> MNISTItem { + fn map(&self, item: &MnistItemRaw) -> MnistItem { // Ensure the image dimensions are correct. debug_assert_eq!(item.image_bytes.len(), WIDTH * HEIGHT); @@ -54,25 +54,25 @@ impl Mapper for BytesToImage { image_array[y][x] = *pixel as f32; } - MNISTItem { + MnistItem { image: image_array, label: item.label, } } } -type MappedDataset = MapperDataset, BytesToImage, MNISTItemRaw>; +type MappedDataset = MapperDataset, BytesToImage, MnistItemRaw>; /// The MNIST dataset consists of 70,000 28x28 black-and-white images in 10 classes (one for each digits), with 7,000 /// images per class. There are 60,000 training images and 10,000 test images. /// /// The data is downloaded from the web from the [CVDF mirror](https://github.com/cvdfoundation/mnist). -pub struct MNISTDataset { +pub struct MnistDataset { dataset: MappedDataset, } -impl Dataset for MNISTDataset { - fn get(&self, index: usize) -> Option { +impl Dataset for MnistDataset { + fn get(&self, index: usize) -> Option { self.dataset.get(index) } @@ -81,7 +81,7 @@ impl Dataset for MNISTDataset { } } -impl MNISTDataset { +impl MnistDataset { /// Creates a new train dataset. 
pub fn train() -> Self { Self::new("train") @@ -94,19 +94,19 @@ impl MNISTDataset { fn new(split: &str) -> Self { // Download dataset - let root = MNISTDataset::download(split); + let root = MnistDataset::download(split); // MNIST is tiny so we can load it in-memory // Train images (u8): 28 * 28 * 60000 = 47.04Mb // Test images (u8): 28 * 28 * 10000 = 7.84Mb - let images = MNISTDataset::read_images(&root, split); - let labels = MNISTDataset::read_labels(&root, split); + let images = MnistDataset::read_images(&root, split); + let labels = MnistDataset::read_labels(&root, split); - // Collect as vector of MNISTItemRaw + // Collect as vector of MnistItemRaw let items: Vec<_> = images .into_iter() .zip(labels) - .map(|(image_bytes, label)| MNISTItemRaw { image_bytes, label }) + .map(|(image_bytes, label)| MnistItemRaw { image_bytes, label }) .collect(); let dataset = InMemDataset::new(items); @@ -132,12 +132,12 @@ impl MNISTDataset { // Download split files match split { "train" => { - MNISTDataset::download_file(TRAIN_IMAGES, &split_dir); - MNISTDataset::download_file(TRAIN_LABELS, &split_dir); + MnistDataset::download_file(TRAIN_IMAGES, &split_dir); + MnistDataset::download_file(TRAIN_LABELS, &split_dir); } "test" => { - MNISTDataset::download_file(TEST_IMAGES, &split_dir); - MNISTDataset::download_file(TEST_LABELS, &split_dir); + MnistDataset::download_file(TEST_IMAGES, &split_dir); + MnistDataset::download_file(TEST_LABELS, &split_dir); } _ => panic!("Invalid split specified {}", split), }; diff --git a/burn-import/pytorch-tests/tests/linear/mod.rs b/burn-import/pytorch-tests/tests/linear/mod.rs index 2864d7b0e..7dfa05e82 100644 --- a/burn-import/pytorch-tests/tests/linear/mod.rs +++ b/burn-import/pytorch-tests/tests/linear/mod.rs @@ -1,6 +1,6 @@ use burn::{ module::Module, - nn::{Linear, LinearConfig, ReLU}, + nn::{Linear, LinearConfig, Relu}, tensor::{backend::Backend, Tensor}, }; @@ -8,7 +8,7 @@ use burn::{ pub struct Net { fc1: Linear, fc2: Linear, - relu: ReLU, + relu: Relu, } impl Net { @@ -16,7 +16,7 @@ impl Net { pub fn new_with(record: NetRecord) -> Self { let fc1 = LinearConfig::new(2, 3).init_with(record.fc1); let fc2 = LinearConfig::new(3, 4).init_with(record.fc2); - let relu = ReLU::default(); + let relu = Relu::default(); Self { fc1, fc2, relu } } diff --git a/burn-import/src/onnx/from_onnx.rs b/burn-import/src/onnx/from_onnx.rs index 940aba72a..9c3e9564a 100644 --- a/burn-import/src/onnx/from_onnx.rs +++ b/burn-import/src/onnx/from_onnx.rs @@ -9,7 +9,7 @@ use crate::onnx::{ proto_conversion::convert_node_proto, }; -use super::ir::{ArgType, Argument, Node, NodeType, ONNXGraph, Tensor}; +use super::ir::{ArgType, Argument, Node, NodeType, OnnxGraph, Tensor}; use super::protos::{ModelProto, TensorProto}; use super::{dim_inference::dim_inference, protos::ValueInfoProto}; @@ -33,14 +33,14 @@ const LIFT_CONSTANTS_FOR_NODE_TYPES: [NodeType; 7] = [ /// /// # Returns /// -/// * `ONNXGraph` - The graph representation of the onnx file +/// * `OnnxGraph` - The graph representation of the onnx file /// /// # Panics /// /// * If the file cannot be opened /// * If the file cannot be parsed /// * If the nodes are not topologically sorted -pub fn parse_onnx(onnx_path: &Path) -> ONNXGraph { +pub fn parse_onnx(onnx_path: &Path) -> OnnxGraph { log::info!("Parsing ONNX file: {}", onnx_path.display()); // Open the file @@ -118,7 +118,7 @@ pub fn parse_onnx(onnx_path: &Path) -> ONNXGraph { log::info!("Finished parsing ONNX file: {}", onnx_path.display()); - ONNXGraph { + OnnxGraph { nodes, inputs, 
outputs, diff --git a/burn-import/src/onnx/ir.rs b/burn-import/src/onnx/ir.rs index 65f03bc10..be27a70dd 100644 --- a/burn-import/src/onnx/ir.rs +++ b/burn-import/src/onnx/ir.rs @@ -129,7 +129,7 @@ pub enum Data { /// ONNX graph representation #[derive(Debug, Clone)] -pub struct ONNXGraph { +pub struct OnnxGraph { /// The nodes of the graph. pub nodes: Vec, diff --git a/burn-import/src/onnx/mod.rs b/burn-import/src/onnx/mod.rs index b1f1da971..b0d14549f 100644 --- a/burn-import/src/onnx/mod.rs +++ b/burn-import/src/onnx/mod.rs @@ -11,4 +11,4 @@ mod to_burn; pub use to_burn::*; pub use from_onnx::parse_onnx; -pub use ir::ONNXGraph; +pub use ir::OnnxGraph; diff --git a/burn-import/src/onnx/to_burn.rs b/burn-import/src/onnx/to_burn.rs index b0fbfb556..bd17bf1cd 100644 --- a/burn-import/src/onnx/to_burn.rs +++ b/burn-import/src/onnx/to_burn.rs @@ -45,7 +45,7 @@ use crate::{ use super::{ from_onnx::parse_onnx, - ir::{self, ArgType, Argument, Data, ElementType, ONNXGraph}, + ir::{self, ArgType, Argument, Data, ElementType, OnnxGraph}, op_configuration::{ avg_pool2d_config, clip_config, concat_config, dropout_config, reshape_config, softmax_config, @@ -218,7 +218,7 @@ impl ModelGen { } } -impl ONNXGraph { +impl OnnxGraph { /// Converts ONNX graph to Burn graph. pub fn into_burn(self) -> BurnGraph { let mut graph = BurnGraph::::default(); diff --git a/burn-no-std-tests/src/mlp.rs b/burn-no-std-tests/src/mlp.rs index 619f67676..db13e0614 100644 --- a/burn-no-std-tests/src/mlp.rs +++ b/burn-no-std-tests/src/mlp.rs @@ -28,7 +28,7 @@ pub struct MlpConfig { pub struct Mlp { linears: Vec>, dropout: nn::Dropout, - activation: nn::ReLU, + activation: nn::Relu, } impl Mlp { @@ -43,7 +43,7 @@ impl Mlp { Self { linears, dropout: nn::DropoutConfig::new(0.3).init(), - activation: nn::ReLU::new(), + activation: nn::Relu::new(), } } diff --git a/burn-train/src/metric/cuda.rs b/burn-train/src/metric/cuda.rs index e69e11ffc..37d131fdf 100644 --- a/burn-train/src/metric/cuda.rs +++ b/burn-train/src/metric/cuda.rs @@ -3,11 +3,11 @@ use crate::metric::{Metric, MetricEntry}; use nvml_wrapper::Nvml; /// Track basic cuda infos. -pub struct CUDAMetric { +pub struct CudaMetric { nvml: Option, } -impl CUDAMetric { +impl CudaMetric { /// Creates a new metric for CUDA. pub fn new() -> Self { Self { @@ -19,7 +19,7 @@ impl CUDAMetric { } } -impl Default for CUDAMetric { +impl Default for CudaMetric { fn default() -> Self { Self::new() } @@ -29,7 +29,7 @@ impl Adaptor<()> for T { fn adapt(&self) {} } -impl Metric for CUDAMetric { +impl Metric for CudaMetric { const NAME: &'static str = "CUDA Stats"; type Input = (); diff --git a/burn/src/lib.rs b/burn/src/lib.rs index 64cad0331..b59716e76 100644 --- a/burn/src/lib.rs +++ b/burn/src/lib.rs @@ -61,7 +61,7 @@ //! - `audio`: Enables audio datasets (SpeechCommandsDataset) //! - `sqlite`: Stores datasets in SQLite database //! - `sqlite_bundled`: Use bundled version of SQLite -//! - `vision`: Enables vision datasets (MNISTDataset) +//! - `vision`: Enables vision datasets (MnistDataset) //! - Backends //! - `wgpu`: Makes available the WGPU backend //! 
- `candle`: Makes available the Candle backend diff --git a/examples/custom-image-dataset/src/model.rs b/examples/custom-image-dataset/src/model.rs index 8de1e7608..7abc450db 100644 --- a/examples/custom-image-dataset/src/model.rs +++ b/examples/custom-image-dataset/src/model.rs @@ -3,7 +3,7 @@ use burn::{ nn::{ conv::{Conv2d, Conv2dConfig}, pool::{MaxPool2d, MaxPool2dConfig}, - Dropout, DropoutConfig, Linear, LinearConfig, PaddingConfig2d, ReLU, + Dropout, DropoutConfig, Linear, LinearConfig, PaddingConfig2d, Relu, }, tensor::{backend::Backend, Device, Tensor}, }; @@ -23,8 +23,8 @@ use burn::{ // │ maxpool │ // └────────────────────┘ #[derive(Module, Debug)] -pub struct CNN { - activation: ReLU, +pub struct Cnn { + activation: Relu, dropout: Dropout, pool: MaxPool2d, conv1: Conv2d, @@ -37,7 +37,7 @@ pub struct CNN { fc2: Linear, } -impl CNN { +impl Cnn { pub fn new(num_classes: usize, device: &Device) -> Self { let conv1 = Conv2dConfig::new([3, 32], [3, 3]) .with_padding(PaddingConfig2d::Same) @@ -68,7 +68,7 @@ impl CNN { let dropout = DropoutConfig::new(0.3).init(); Self { - activation: ReLU::new(), + activation: Relu::new(), dropout, pool, conv1, diff --git a/examples/custom-image-dataset/src/training.rs b/examples/custom-image-dataset/src/training.rs index 5f012377f..11bfdf522 100644 --- a/examples/custom-image-dataset/src/training.rs +++ b/examples/custom-image-dataset/src/training.rs @@ -3,7 +3,7 @@ use std::time::Instant; use crate::{ data::{ClassificationBatch, ClassificationBatcher}, dataset::CIFAR10Loader, - model::CNN, + model::Cnn, }; use burn::data::{dataloader::DataLoaderBuilder, dataset::vision::ImageFolderDataset}; use burn::train::{ @@ -26,7 +26,7 @@ use burn::{ const NUM_CLASSES: u8 = 10; const ARTIFACT_DIR: &str = "/tmp/custom-image-dataset"; -impl CNN { +impl Cnn { pub fn forward_classification( &self, images: Tensor, @@ -41,7 +41,7 @@ impl CNN { } } -impl TrainStep, ClassificationOutput> for CNN { +impl TrainStep, ClassificationOutput> for Cnn { fn step(&self, batch: ClassificationBatch) -> TrainOutput> { let item = self.forward_classification(batch.images, batch.targets); @@ -49,7 +49,7 @@ impl TrainStep, ClassificationOutput< } } -impl ValidStep, ClassificationOutput> for CNN { +impl ValidStep, ClassificationOutput> for Cnn { fn step(&self, batch: ClassificationBatch) -> ClassificationOutput { self.forward_classification(batch.images, batch.targets) } @@ -104,7 +104,7 @@ pub fn train(config: TrainingConfig, device: B::Device) { .devices(vec![device.clone()]) .num_epochs(config.num_epochs) .build( - CNN::new(NUM_CLASSES.into(), &device), + Cnn::new(NUM_CLASSES.into(), &device), config.optimizer.init(), config.learning_rate, ); diff --git a/examples/custom-renderer/src/lib.rs b/examples/custom-renderer/src/lib.rs index 30078c811..c6d84750b 100644 --- a/examples/custom-renderer/src/lib.rs +++ b/examples/custom-renderer/src/lib.rs @@ -1,11 +1,11 @@ -use burn::data::dataset::vision::MNISTDataset; +use burn::data::dataset::vision::MnistDataset; use burn::train::renderer::{MetricState, MetricsRenderer, TrainingProgress}; use burn::train::LearnerBuilder; use burn::{ config::Config, data::dataloader::DataLoaderBuilder, optim::AdamConfig, tensor::backend::AutodiffBackend, }; -use guide::{data::MNISTBatcher, model::ModelConfig}; +use guide::{data::MnistBatcher, model::ModelConfig}; #[derive(Config)] pub struct MnistTrainingConfig { @@ -52,21 +52,21 @@ pub fn run(device: B::Device) { let optim = config.optimizer.init(); // Create the batcher. 
- let batcher_train = MNISTBatcher::::new(device.clone()); - let batcher_valid = MNISTBatcher::::new(device.clone()); + let batcher_train = MnistBatcher::::new(device.clone()); + let batcher_valid = MnistBatcher::::new(device.clone()); // Create the dataloaders. let dataloader_train = DataLoaderBuilder::new(batcher_train) .batch_size(config.batch_size) .shuffle(config.seed) .num_workers(config.num_workers) - .build(MNISTDataset::train()); + .build(MnistDataset::train()); let dataloader_test = DataLoaderBuilder::new(batcher_valid) .batch_size(config.batch_size) .shuffle(config.seed) .num_workers(config.num_workers) - .build(MNISTDataset::test()); + .build(MnistDataset::test()); // artifact dir does not need to be provided when log_to_file is false let builder = LearnerBuilder::new("") diff --git a/examples/custom-training-loop/src/lib.rs b/examples/custom-training-loop/src/lib.rs index cfb702015..ea9183920 100644 --- a/examples/custom-training-loop/src/lib.rs +++ b/examples/custom-training-loop/src/lib.rs @@ -1,6 +1,6 @@ use std::marker::PhantomData; -use burn::data::dataset::vision::MNISTDataset; +use burn::data::dataset::vision::MnistDataset; use burn::{ config::Config, data::dataloader::DataLoaderBuilder, @@ -13,7 +13,7 @@ use burn::{ }, }; use guide::{ - data::{MNISTBatch, MNISTBatcher}, + data::{MnistBatch, MnistBatcher}, model::{Model, ModelConfig}, }; @@ -46,21 +46,21 @@ pub fn run(device: B::Device) { let mut optim = config.optimizer.init(); // Create the batcher. - let batcher_train = MNISTBatcher::::new(device.clone()); - let batcher_valid = MNISTBatcher::::new(device.clone()); + let batcher_train = MnistBatcher::::new(device.clone()); + let batcher_valid = MnistBatcher::::new(device.clone()); // Create the dataloaders. let dataloader_train = DataLoaderBuilder::new(batcher_train) .batch_size(config.batch_size) .shuffle(config.seed) .num_workers(config.num_workers) - .build(MNISTDataset::train()); + .build(MnistDataset::train()); let dataloader_test = DataLoaderBuilder::new(batcher_valid) .batch_size(config.batch_size) .shuffle(config.seed) .num_workers(config.num_workers) - .build(MNISTDataset::test()); + .build(MnistDataset::test()); // Iterate over our training and validation loop for X epochs. 
for epoch in 1..config.num_epochs + 1 { @@ -145,7 +145,7 @@ where B: AutodiffBackend, O: Optimizer, B>, { - pub fn step1(&mut self, _batch: MNISTBatch) { + pub fn step1(&mut self, _batch: MnistBatch) { // } } @@ -156,14 +156,14 @@ where B: AutodiffBackend, O: Optimizer, B>, { - pub fn step2(&mut self, _batch: MNISTBatch) { + pub fn step2(&mut self, _batch: MnistBatch) { // } } #[allow(dead_code)] impl Learner2 { - pub fn step3(&mut self, _batch: MNISTBatch) + pub fn step3(&mut self, _batch: MnistBatch) where B: AutodiffBackend, M: AutodiffModule, diff --git a/examples/guide/examples/guide.rs b/examples/guide/examples/guide.rs index 9ae3ee843..99bf33028 100644 --- a/examples/guide/examples/guide.rs +++ b/examples/guide/examples/guide.rs @@ -18,7 +18,7 @@ fn main() { guide::inference::infer::( artifact_dir, device, - burn::data::dataset::vision::MNISTDataset::test() + burn::data::dataset::vision::MnistDataset::test() .get(42) .unwrap(), ); diff --git a/examples/guide/src/data.rs b/examples/guide/src/data.rs index 2d8015961..f14bad72e 100644 --- a/examples/guide/src/data.rs +++ b/examples/guide/src/data.rs @@ -1,26 +1,26 @@ use burn::{ - data::{dataloader::batcher::Batcher, dataset::vision::MNISTItem}, + data::{dataloader::batcher::Batcher, dataset::vision::MnistItem}, tensor::{backend::Backend, Data, ElementConversion, Int, Tensor}, }; -pub struct MNISTBatcher { +pub struct MnistBatcher { device: B::Device, } -impl MNISTBatcher { +impl MnistBatcher { pub fn new(device: B::Device) -> Self { Self { device } } } #[derive(Clone, Debug)] -pub struct MNISTBatch { +pub struct MnistBatch { pub images: Tensor, pub targets: Tensor, } -impl Batcher> for MNISTBatcher { - fn batch(&self, items: Vec) -> MNISTBatch { +impl Batcher> for MnistBatcher { + fn batch(&self, items: Vec) -> MnistBatch { let images = items .iter() .map(|item| Data::::from(item.image)) @@ -40,6 +40,6 @@ impl Batcher> for MNISTBatcher { let images = Tensor::cat(images, 0); let targets = Tensor::cat(targets, 0); - MNISTBatch { images, targets } + MnistBatch { images, targets } } } diff --git a/examples/guide/src/inference.rs b/examples/guide/src/inference.rs index 80db2d864..e93ecd767 100644 --- a/examples/guide/src/inference.rs +++ b/examples/guide/src/inference.rs @@ -1,5 +1,5 @@ -use crate::{data::MNISTBatcher, training::TrainingConfig}; -use burn::data::dataset::vision::MNISTItem; +use crate::{data::MnistBatcher, training::TrainingConfig}; +use burn::data::dataset::vision::MnistItem; use burn::{ config::Config, data::dataloader::batcher::Batcher, @@ -7,7 +7,7 @@ use burn::{ tensor::backend::Backend, }; -pub fn infer(artifact_dir: &str, device: B::Device, item: MNISTItem) { +pub fn infer(artifact_dir: &str, device: B::Device, item: MnistItem) { let config = TrainingConfig::load(format!("{artifact_dir}/config.json")) .expect("Config should exist for the model"); let record = CompactRecorder::new() @@ -17,7 +17,7 @@ pub fn infer(artifact_dir: &str, device: B::Device, item: MNISTItem) let model = config.model.init_with::(record); let label = item.label; - let batcher = MNISTBatcher::new(device); + let batcher = MnistBatcher::new(device); let batch = batcher.batch(vec![item]); let output = model.forward(batch.images); let predicted = output.argmax(1).flatten::<1>(0, 1).into_scalar(); diff --git a/examples/guide/src/model.rs b/examples/guide/src/model.rs index 9db5ebd00..d0405757d 100644 --- a/examples/guide/src/model.rs +++ b/examples/guide/src/model.rs @@ -4,7 +4,7 @@ use burn::{ nn::{ conv::{Conv2d, Conv2dConfig}, 
pool::{AdaptiveAvgPool2d, AdaptiveAvgPool2dConfig}, - Dropout, DropoutConfig, Linear, LinearConfig, ReLU, + Dropout, DropoutConfig, Linear, LinearConfig, Relu, }, tensor::{backend::Backend, Tensor}, }; @@ -17,7 +17,7 @@ pub struct Model { dropout: Dropout, linear1: Linear, linear2: Linear, - activation: ReLU, + activation: Relu, } #[derive(Config, Debug)] @@ -35,7 +35,7 @@ impl ModelConfig { conv1: Conv2dConfig::new([1, 8], [3, 3]).init(device), conv2: Conv2dConfig::new([8, 16], [3, 3]).init(device), pool: AdaptiveAvgPool2dConfig::new([8, 8]).init(), - activation: ReLU::new(), + activation: Relu::new(), linear1: LinearConfig::new(16 * 8 * 8, self.hidden_size).init(device), linear2: LinearConfig::new(self.hidden_size, self.num_classes).init(device), dropout: DropoutConfig::new(self.dropout).init(), @@ -47,7 +47,7 @@ impl ModelConfig { conv1: Conv2dConfig::new([1, 8], [3, 3]).init_with(record.conv1), conv2: Conv2dConfig::new([8, 16], [3, 3]).init_with(record.conv2), pool: AdaptiveAvgPool2dConfig::new([8, 8]).init(), - activation: ReLU::new(), + activation: Relu::new(), linear1: LinearConfig::new(16 * 8 * 8, self.hidden_size).init_with(record.linear1), linear2: LinearConfig::new(self.hidden_size, self.num_classes) .init_with(record.linear2), diff --git a/examples/guide/src/training.rs b/examples/guide/src/training.rs index a0f0accb0..4ebd7e6e4 100644 --- a/examples/guide/src/training.rs +++ b/examples/guide/src/training.rs @@ -1,8 +1,8 @@ use crate::{ - data::{MNISTBatch, MNISTBatcher}, + data::{MnistBatch, MnistBatcher}, model::{Model, ModelConfig}, }; -use burn::data::dataset::vision::MNISTDataset; +use burn::data::dataset::vision::MnistDataset; use burn::train::{ metric::{AccuracyMetric, LossMetric}, ClassificationOutput, LearnerBuilder, TrainOutput, TrainStep, ValidStep, @@ -36,16 +36,16 @@ impl Model { } } -impl TrainStep, ClassificationOutput> for Model { - fn step(&self, batch: MNISTBatch) -> TrainOutput> { +impl TrainStep, ClassificationOutput> for Model { + fn step(&self, batch: MnistBatch) -> TrainOutput> { let item = self.forward_classification(batch.images, batch.targets); TrainOutput::new(self, item.loss.backward(), item) } } -impl ValidStep, ClassificationOutput> for Model { - fn step(&self, batch: MNISTBatch) -> ClassificationOutput { +impl ValidStep, ClassificationOutput> for Model { + fn step(&self, batch: MnistBatch) -> ClassificationOutput { self.forward_classification(batch.images, batch.targets) } } @@ -74,20 +74,20 @@ pub fn train(artifact_dir: &str, config: TrainingConfig, dev B::seed(config.seed); - let batcher_train = MNISTBatcher::::new(device.clone()); - let batcher_valid = MNISTBatcher::::new(device.clone()); + let batcher_train = MnistBatcher::::new(device.clone()); + let batcher_valid = MnistBatcher::::new(device.clone()); let dataloader_train = DataLoaderBuilder::new(batcher_train) .batch_size(config.batch_size) .shuffle(config.seed) .num_workers(config.num_workers) - .build(MNISTDataset::train()); + .build(MnistDataset::train()); let dataloader_test = DataLoaderBuilder::new(batcher_valid) .batch_size(config.batch_size) .shuffle(config.seed) .num_workers(config.num_workers) - .build(MNISTDataset::test()); + .build(MnistDataset::test()); let learner = LearnerBuilder::new(artifact_dir) .metric_train_numeric(AccuracyMetric::new()) diff --git a/examples/mnist/src/data.rs b/examples/mnist/src/data.rs index 68fbbc2d6..03d868297 100644 --- a/examples/mnist/src/data.rs +++ b/examples/mnist/src/data.rs @@ -1,26 +1,26 @@ use burn::{ - data::{dataloader::batcher::Batcher, 
dataset::vision::MNISTItem}, + data::{dataloader::batcher::Batcher, dataset::vision::MnistItem}, tensor::{backend::Backend, Data, ElementConversion, Int, Tensor}, }; -pub struct MNISTBatcher { +pub struct MnistBatcher { device: B::Device, } #[derive(Clone, Debug)] -pub struct MNISTBatch { +pub struct MnistBatch { pub images: Tensor, pub targets: Tensor, } -impl MNISTBatcher { +impl MnistBatcher { pub fn new(device: B::Device) -> Self { Self { device } } } -impl Batcher> for MNISTBatcher { - fn batch(&self, items: Vec) -> MNISTBatch { +impl Batcher> for MnistBatcher { + fn batch(&self, items: Vec) -> MnistBatch { let images = items .iter() .map(|item| Data::::from(item.image)) @@ -45,6 +45,6 @@ impl Batcher> for MNISTBatcher { let images = Tensor::cat(images, 0); let targets = Tensor::cat(targets, 0); - MNISTBatch { images, targets } + MnistBatch { images, targets } } } diff --git a/examples/mnist/src/model.rs b/examples/mnist/src/model.rs index ff254d4bc..e6090fd84 100644 --- a/examples/mnist/src/model.rs +++ b/examples/mnist/src/model.rs @@ -1,4 +1,4 @@ -use crate::data::MNISTBatch; +use crate::data::MnistBatch; use burn::{ module::Module, nn::{self, loss::CrossEntropyLossConfig, BatchNorm, PaddingConfig2d}, @@ -73,7 +73,7 @@ impl Model { self.fc2.forward(x) } - pub fn forward_classification(&self, item: MNISTBatch) -> ClassificationOutput { + pub fn forward_classification(&self, item: MnistBatch) -> ClassificationOutput { let targets = item.targets; let output = self.forward(item.images); let loss = CrossEntropyLossConfig::new() @@ -117,16 +117,16 @@ impl ConvBlock { } } -impl TrainStep, ClassificationOutput> for Model { - fn step(&self, item: MNISTBatch) -> TrainOutput> { +impl TrainStep, ClassificationOutput> for Model { + fn step(&self, item: MnistBatch) -> TrainOutput> { let item = self.forward_classification(item); TrainOutput::new(self, item.loss.backward(), item) } } -impl ValidStep, ClassificationOutput> for Model { - fn step(&self, item: MNISTBatch) -> ClassificationOutput { +impl ValidStep, ClassificationOutput> for Model { + fn step(&self, item: MnistBatch) -> ClassificationOutput { self.forward_classification(item) } } diff --git a/examples/mnist/src/training.rs b/examples/mnist/src/training.rs index c1c1c68ef..ace04d4f8 100644 --- a/examples/mnist/src/training.rs +++ b/examples/mnist/src/training.rs @@ -1,4 +1,4 @@ -use crate::data::MNISTBatcher; +use crate::data::MnistBatcher; use crate::model::Model; use burn::module::Module; @@ -10,7 +10,7 @@ use burn::train::metric::{CpuMemory, CpuTemperature, CpuUse}; use burn::train::{MetricEarlyStoppingStrategy, StoppingCondition}; use burn::{ config::Config, - data::{dataloader::DataLoaderBuilder, dataset::vision::MNISTDataset}, + data::{dataloader::DataLoaderBuilder, dataset::vision::MnistDataset}, tensor::backend::AutodiffBackend, train::{ metric::{AccuracyMetric, LossMetric}, @@ -44,19 +44,19 @@ pub fn run(device: B::Device) { B::seed(config.seed); // Data - let batcher_train = MNISTBatcher::::new(device.clone()); - let batcher_valid = MNISTBatcher::::new(device.clone()); + let batcher_train = MnistBatcher::::new(device.clone()); + let batcher_valid = MnistBatcher::::new(device.clone()); let dataloader_train = DataLoaderBuilder::new(batcher_train) .batch_size(config.batch_size) .shuffle(config.seed) .num_workers(config.num_workers) - .build(MNISTDataset::train()); + .build(MnistDataset::train()); let dataloader_test = DataLoaderBuilder::new(batcher_valid) .batch_size(config.batch_size) .shuffle(config.seed) 
.num_workers(config.num_workers) - .build(MNISTDataset::test()); + .build(MnistDataset::test()); // Model let learner = LearnerBuilder::new(ARTIFACT_DIR) diff --git a/examples/onnx-inference/src/bin/mnist_inference.rs b/examples/onnx-inference/src/bin/mnist_inference.rs index f4851beea..623f197c3 100644 --- a/examples/onnx-inference/src/bin/mnist_inference.rs +++ b/examples/onnx-inference/src/bin/mnist_inference.rs @@ -3,7 +3,7 @@ use std::env::args; use burn::backend::ndarray::NdArray; use burn::tensor::Tensor; -use burn::data::dataset::vision::MNISTDataset; +use burn::data::dataset::vision::MnistDataset; use burn::data::dataset::Dataset; use onnx_inference::mnist::Model; @@ -34,7 +34,7 @@ fn main() { let model: Model = Model::default(); // Load the MNIST dataset and get an item - let dataset = MNISTDataset::test(); + let dataset = MnistDataset::test(); let item = dataset.get(image_index).unwrap(); // Create a tensor from the image data diff --git a/examples/pytorch-import/src/main.rs b/examples/pytorch-import/src/main.rs index 3a7567ce9..1dafce410 100644 --- a/examples/pytorch-import/src/main.rs +++ b/examples/pytorch-import/src/main.rs @@ -5,7 +5,7 @@ use burn::backend::ndarray::NdArray; use burn::record::{FullPrecisionSettings, NamedMpkFileRecorder, Recorder}; use burn::tensor::Tensor; -use burn::data::dataset::vision::MNISTDataset; +use burn::data::dataset::vision::MnistDataset; use burn::data::dataset::Dataset; use model::Model; @@ -42,7 +42,7 @@ fn main() { let model: Model = Model::new_with(record); // Load the MNIST dataset and get an item - let dataset = MNISTDataset::test(); + let dataset = MnistDataset::test(); let item = dataset.get(image_index).unwrap(); // Create a tensor from the image data diff --git a/examples/simple-regression/src/model.rs b/examples/simple-regression/src/model.rs index 05ff944ee..244072401 100644 --- a/examples/simple-regression/src/model.rs +++ b/examples/simple-regression/src/model.rs @@ -1,10 +1,10 @@ use crate::dataset::DiabetesBatch; use burn::config::Config; use burn::nn::loss::Reduction::Mean; -use burn::nn::ReLU; +use burn::nn::Relu; use burn::{ module::Module, - nn::{loss::MSELoss, Linear, LinearConfig}, + nn::{loss::MseLoss, Linear, LinearConfig}, tensor::{ backend::{AutodiffBackend, Backend}, Tensor, @@ -16,7 +16,7 @@ use burn::{ pub struct RegressionModel { input_layer: Linear, output_layer: Linear, - activation: ReLU, + activation: Relu, } #[derive(Config)] @@ -39,7 +39,7 @@ impl RegressionModelConfig { RegressionModel { input_layer, output_layer, - activation: ReLU::new(), + activation: Relu::new(), } } } @@ -56,7 +56,7 @@ impl RegressionModel { let targets: Tensor = item.targets.unsqueeze(); let output: Tensor = self.forward(item.inputs); - let loss = MSELoss::new().forward(output.clone(), targets.clone(), Mean); + let loss = MseLoss::new().forward(output.clone(), targets.clone(), Mean); RegressionOutput { loss, diff --git a/examples/text-classification/src/training.rs b/examples/text-classification/src/training.rs index 8a101847b..f0a279693 100644 --- a/examples/text-classification/src/training.rs +++ b/examples/text-classification/src/training.rs @@ -19,7 +19,7 @@ use burn::{ record::{CompactRecorder, Recorder}, tensor::backend::AutodiffBackend, train::{ - metric::{AccuracyMetric, CUDAMetric, LearningRateMetric, LossMetric}, + metric::{AccuracyMetric, CudaMetric, LearningRateMetric, LossMetric}, LearnerBuilder, }, }; @@ -91,8 +91,8 @@ pub fn train( // Initialize learner let learner = LearnerBuilder::new(artifact_dir) - 
.metric_train(CUDAMetric::new()) - .metric_valid(CUDAMetric::new()) + .metric_train(CudaMetric::new()) + .metric_valid(CudaMetric::new()) .metric_train_numeric(AccuracyMetric::new()) .metric_valid_numeric(AccuracyMetric::new()) .metric_train_numeric(LossMetric::new()) diff --git a/examples/text-generation/src/training.rs b/examples/text-generation/src/training.rs index d437960ad..f7f444348 100644 --- a/examples/text-generation/src/training.rs +++ b/examples/text-generation/src/training.rs @@ -13,7 +13,7 @@ use burn::{ record::{CompactRecorder, DefaultRecorder, Recorder}, tensor::backend::AutodiffBackend, train::{ - metric::{AccuracyMetric, CUDAMetric, LearningRateMetric, LossMetric}, + metric::{AccuracyMetric, CudaMetric, LearningRateMetric, LossMetric}, LearnerBuilder, }, }; @@ -68,8 +68,8 @@ pub fn train + 'static>( .init(); let learner = LearnerBuilder::new(artifact_dir) - .metric_train(CUDAMetric::new()) - .metric_valid(CUDAMetric::new()) + .metric_train(CudaMetric::new()) + .metric_valid(CudaMetric::new()) .metric_train_numeric(AccuracyMetric::new().with_pad_token(tokenizer.pad_token())) .metric_valid_numeric(AccuracyMetric::new().with_pad_token(tokenizer.pad_token())) .metric_train(LossMetric::new())
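
The hunks above consistently apply Rust's `UpperCamelCase` convention to acronyms (`ReLU` -> `Relu`, `MSELoss` -> `MseLoss`, `MNISTDataset` -> `MnistDataset`, `CUDAMetric` -> `CudaMetric`, `RMSProp` -> `RmsProp`, `ONNXGraph` -> `OnnxGraph`). A minimal sketch of what downstream code looks like after the rename, assuming the constructors keep the signatures shown in the hunks and the `vision` feature is enabled (the function itself is hypothetical, not part of this patch): only the identifiers change, not the shape of the call sites.

```rust
use burn::data::dataset::{vision::MnistDataset, Dataset};
use burn::nn::{loss::MseLoss, Relu};
use burn::tensor::backend::Backend;

// Hypothetical smoke test exercising a few of the renamed items.
fn renamed_api_sketch<B: Backend>() {
    // `ReLU::new()` becomes `Relu::new()`; the module itself is unchanged.
    let _activation = Relu::new();

    // `MSELoss::new()` becomes `MseLoss::new()`; still generic over the backend.
    let _criterion = MseLoss::<B>::new();

    // `MNISTDataset`/`MNISTItem` become `MnistDataset`/`MnistItem`.
    let item = MnistDataset::test().get(42).unwrap();
    let _label = item.label;
}
```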