Add benchmarks for the candle-nn package (#1995)

* add benchmarks for the candle-nn package
* uncomment test
* format
2024-04-03 01:03:54 -04:00 · 2024-04-03 01:03:54 -04:00 · cd6b9e317c
parent 08c049def3
commit cd6b9e317c
5 changed files with 175 additions and 0 deletions
--- a/candle-nn/Cargo.toml
+++ b/candle-nn/Cargo.toml
@ -26,6 +26,7 @@ candle-metal-kernels = { workspace = true, optional = true }
 anyhow = { workspace = true }
 clap = { workspace = true }
 rand = { workspace = true }
+criterion = { workspace = true }

 [features]
 default = []
@ -33,3 +34,7 @@ accelerate = ["dep:accelerate-src", "candle/accelerate"]
 cuda = ["candle/cuda"]
 mkl = ["dep:intel-mkl-src", "candle/mkl"]
 metal = ["candle/metal", "dep:candle-metal-kernels", "dep:metal"]
+
+[[bench]]
+name = "bench_main"
+harness = false
--- a/candle-nn/benches/bench_main.rs
+++ b/candle-nn/benches/bench_main.rs
@ -0,0 +1,4 @@
+// Benchmark definitions live in the `benchmarks` module tree.
+mod benchmarks;
+
+use criterion::criterion_main;
+// Hands the `layer_norm` and `conv` benchmark groups to criterion's entry-point macro.
+criterion_main!(benchmarks::layer_norm::benches, benchmarks::conv::benches);
--- a/candle-nn/benches/benchmarks/conv.rs
+++ b/candle-nn/benches/benchmarks/conv.rs
@ -0,0 +1,54 @@
+use crate::benchmarks::{BenchDevice, BenchDeviceHandler};
+use candle::{DType, Device, Module, Tensor};
+use candle_nn::{Conv2d, Conv2dConfig};
+use criterion::{black_box, criterion_group, Criterion};
+use std::time::Instant;
+
+// Dimensions of the benchmark input tensor, which is created with shape (B, C, M, K).
+const B: usize = 1;
+const C: usize = 1;
+const M: usize = 128;
+const K: usize = 128;
+// Height and width of the convolution kernel.
+const K_SIZE: usize = 3;
+
+/// Builds a `Conv2d` layer from `weight`, `bias` and `config`, applies it to `input`
+/// and throws the output away.
+///
+/// Panics if the forward pass returns an error.
+fn run(input: Tensor, weight: Tensor, bias: Tensor, config: Conv2dConfig) {
+    let layer = Conv2d::new(weight, Some(bias), config);
+    let _output = layer.forward(&input).unwrap();
+}
+
+/// Registers one criterion group that times the `Conv2d` forward pass on `device`.
+///
+/// * `c` - criterion instance the group is added to.
+/// * `device` - device the tensors are created on; it is synced before the clock is read.
+/// * `dtype` - element type of the input, weight and bias tensors.
+/// * `name` - base name, turned into the group name by `bench_name`.
+///
+/// Panics if tensor creation, the forward pass or the device sync fails.
+fn run_conv2d_benchmark(c: &mut Criterion, device: &Device, dtype: DType, name: &str) {
+    // Number of filters. The bias needs exactly one entry per output channel, so both
+    // the weight and the bias are sized from this single constant; a longer bias would
+    // not match the single-filter weight.
+    const OUT_CHANNELS: usize = 1;
+
+    // `Tensor::ones` already produces `dtype`, so no further conversion is required.
+    let weight = Tensor::ones((OUT_CHANNELS, C, K_SIZE, K_SIZE), dtype, device).unwrap();
+    let bias = Tensor::zeros(OUT_CHANNELS, dtype, device).unwrap();
+    let input = Tensor::ones((B, C, M, K), dtype, device).unwrap();
+
+    let mut group = c.benchmark_group(device.bench_name(name));
+    group.bench_function("iter", move |b| {
+        b.iter_custom(|iters| {
+            let start = Instant::now();
+            for _i in 0..iters {
+                // `run` takes ownership of its arguments, hence the per-iteration clones.
+                run(
+                    black_box(input.clone()),
+                    black_box(weight.clone()),
+                    black_box(bias.clone()),
+                    Default::default(),
+                );
+            }
+            // Sync the device before reading the clock (a no-op for the CPU device).
+            device.sync().unwrap();
+            start.elapsed()
+        })
+    });
+    group.finish();
+}
+
+/// Runs the conv2d benchmark for every device reported by `BenchDeviceHandler`,
+/// first in f32 and then in f16.
+///
+/// Panics if the device handler cannot be created.
+fn criterion_benchmark(c: &mut Criterion) {
+    let handler = BenchDeviceHandler::new().unwrap();
+    for dev in &handler.devices {
+        for (dtype, label) in [(DType::F32, "conv2d_f32"), (DType::F16, "conv2d_f16")] {
+            run_conv2d_benchmark(c, dev, dtype, label);
+        }
+    }
+}
+
+// Exposes the function above as the `benches` group referenced from `bench_main.rs`.
+criterion_group!(benches, criterion_benchmark);
--- a/candle-nn/benches/benchmarks/layer_norm.rs
+++ b/candle-nn/benches/benchmarks/layer_norm.rs
@ -0,0 +1,48 @@
+use crate::benchmarks::{BenchDevice, BenchDeviceHandler};
+use candle::{DType, Device, Module, Tensor};
+use candle_nn::LayerNorm;
+use criterion::{black_box, criterion_group, Criterion};
+use std::time::Instant;
+
+/// Builds a `LayerNorm` (eps = 1e-5) from `weight` and `bias` and applies it to `input`.
+///
+/// The output is discarded, but a failed forward pass panics so that the benchmark
+/// can never silently time an error path.
+fn run(input: &Tensor, weight: &Tensor, bias: &Tensor) {
+    LayerNorm::new(weight.clone(), bias.clone(), 1e-5)
+        .forward(input)
+        .unwrap();
+}
+
+// Benchmark problem size: each benchmark tensor holds B * M * K elements.
+const B: usize = 1;
+const M: usize = 1024;
+const K: usize = 1024;
+
+/// Registers one criterion group that times the `LayerNorm` forward pass on `device`.
+///
+/// The weight is an `arange` over `B * M * K` values converted to `dtype`; the bias and
+/// the input are all-ones tensors shaped like the weight. `name` is turned into the
+/// group name by `bench_name`.
+///
+/// Panics if tensor creation or the device sync fails.
+fn run_layer_norm_benchmark(c: &mut Criterion, device: &Device, dtype: DType, name: &str) {
+    let elements = B * M * K;
+
+    let weight = Tensor::arange(0.0, elements as f32, device)
+        .and_then(|ramp| ramp.to_dtype(dtype))
+        .unwrap();
+    let bias = weight.ones_like().unwrap();
+    let input = weight.ones_like().unwrap();
+
+    let group_name = device.bench_name(name);
+    let mut bench_group = c.benchmark_group(group_name);
+    bench_group.bench_function("iter", move |bencher| {
+        bencher.iter_custom(|n_iters| {
+            let timer = Instant::now();
+            for _ in 0..n_iters {
+                run(black_box(&input), black_box(&weight), black_box(&bias));
+            }
+            // Sync the device before reading the clock (a no-op for the CPU device).
+            device.sync().unwrap();
+            timer.elapsed()
+        })
+    });
+    bench_group.finish();
+}
+
+/// Runs the layer-norm benchmark for every device reported by `BenchDeviceHandler`,
+/// in f32, bf16 and f16 (in that order).
+///
+/// Panics if the device handler cannot be created.
+fn criterion_benchmark(c: &mut Criterion) {
+    let handler = BenchDeviceHandler::new().unwrap();
+    for dev in &handler.devices {
+        let cases = [
+            (DType::F32, "layer_norm_f32"),
+            (DType::BF16, "layer_norm_bf16"),
+            (DType::F16, "layer_norm_f16"),
+        ];
+        for (dtype, label) in cases {
+            run_layer_norm_benchmark(c, dev, dtype, label);
+        }
+    }
+}
+
+// Exposes the function above as the `benches` group referenced from `bench_main.rs`.
+criterion_group!(benches, criterion_benchmark);
--- a/candle-nn/benches/benchmarks/mod.rs
+++ b/candle-nn/benches/benchmarks/mod.rs
@ -0,0 +1,64 @@
+pub(crate) mod conv;
+pub(crate) mod layer_norm;
+
+use candle::{Device, Result};
+
+/// Benchmark helpers that a device type must provide.
+pub(crate) trait BenchDevice {
+    /// Synchronises with the device; the benchmarks call this right before they stop
+    /// the timer.
+    fn sync(&self) -> Result<()>;
+
+    /// Builds the benchmark group name for this device from the base `name`.
+    fn bench_name<S>(&self, name: S) -> String
+    where
+        S: Into<String>;
+}
+
+impl BenchDevice for Device {
+    /// Synchronises with the device.
+    ///
+    /// The CPU device returns `Ok(())` immediately. Cuda and Metal devices delegate to
+    /// the backend call when the matching cargo feature is enabled and panic otherwise.
+    fn sync(&self) -> Result<()> {
+        match self {
+            Device::Cuda(device) => {
+                #[cfg(feature = "cuda")]
+                return Ok(device.synchronize()?);
+                #[cfg(not(feature = "cuda"))]
+                panic!("Cuda device without cuda feature enabled: {:?}", device)
+            }
+            Device::Metal(device) => {
+                #[cfg(feature = "metal")]
+                return Ok(device.wait_until_completed()?);
+                #[cfg(not(feature = "metal"))]
+                panic!("Metal device without metal feature enabled: {:?}", device)
+            }
+            Device::Cpu => Ok(()),
+        }
+    }
+
+    /// Prefixes `name` with the backend in use: `cuda_`, `metal_`, or for the CPU device
+    /// `accelerate_`, `mkl_` or `cpu_` depending on the enabled cargo features
+    /// (`accelerate` is checked before `mkl`).
+    fn bench_name<S: Into<String>>(&self, name: S) -> String {
+        let label: String = name.into();
+        match self {
+            Device::Cuda(_) => format!("cuda_{}", label),
+            Device::Metal(_) => format!("metal_{}", label),
+            Device::Cpu => {
+                let cpu_type = match (cfg!(feature = "accelerate"), cfg!(feature = "mkl")) {
+                    (true, _) => "accelerate",
+                    (false, true) => "mkl",
+                    (false, false) => "cpu",
+                };
+                format!("{}_{}", cpu_type, label)
+            }
+        }
+    }
+}
+
+/// Holds the devices every benchmark is run on.
+struct BenchDeviceHandler {
+    // Accelerator device first (when one is enabled), CPU device last.
+    devices: Vec<Device>,
+}
+
+impl BenchDeviceHandler {
+    /// Collects the benchmark devices.
+    ///
+    /// With the `metal` feature the Metal device 0 is added; otherwise, with the `cuda`
+    /// feature, the Cuda device 0 is added. The CPU device is always appended last.
+    /// Returns an error if the accelerator device cannot be created.
+    pub fn new() -> Result<Self> {
+        let accelerator = if cfg!(feature = "metal") {
+            Some(Device::new_metal(0)?)
+        } else if cfg!(feature = "cuda") {
+            Some(Device::new_cuda(0)?)
+        } else {
+            None
+        };
+        let devices = accelerator
+            .into_iter()
+            .chain(std::iter::once(Device::Cpu))
+            .collect();
+        Ok(Self { devices })
+    }
+}