Add benchmarks for the candle-nn package (#1995)
* add benchmarks for the candle-nn package * uncomment test * format
This commit is contained in:
parent
08c049def3
commit
cd6b9e317c
|
@ -26,6 +26,7 @@ candle-metal-kernels = { workspace = true, optional = true }
|
||||||
anyhow = { workspace = true }
|
anyhow = { workspace = true }
|
||||||
clap = { workspace = true }
|
clap = { workspace = true }
|
||||||
rand = { workspace = true }
|
rand = { workspace = true }
|
||||||
|
criterion = { workspace = true }
|
||||||
|
|
||||||
[features]
|
[features]
|
||||||
default = []
|
default = []
|
||||||
|
@ -33,3 +34,7 @@ accelerate = ["dep:accelerate-src", "candle/accelerate"]
|
||||||
cuda = ["candle/cuda"]
|
cuda = ["candle/cuda"]
|
||||||
mkl = ["dep:intel-mkl-src", "candle/mkl"]
|
mkl = ["dep:intel-mkl-src", "candle/mkl"]
|
||||||
metal = ["candle/metal", "dep:candle-metal-kernels", "dep:metal"]
|
metal = ["candle/metal", "dep:candle-metal-kernels", "dep:metal"]
|
||||||
|
|
||||||
|
[[bench]]
|
||||||
|
name = "bench_main"
|
||||||
|
harness = false
|
|
@ -0,0 +1,4 @@
|
||||||
|
mod benchmarks;
|
||||||
|
|
||||||
|
use criterion::criterion_main;
|
||||||
|
// Benchmark entry point: hands the `benches` groups exported by the layer_norm and conv
// benchmark modules to Criterion's `criterion_main!` macro.
criterion_main!(benchmarks::layer_norm::benches, benchmarks::conv::benches);
|
|
@ -0,0 +1,54 @@
|
||||||
|
use crate::benchmarks::{BenchDevice, BenchDeviceHandler};
|
||||||
|
use candle::{DType, Device, Module, Tensor};
|
||||||
|
use candle_nn::{Conv2d, Conv2dConfig};
|
||||||
|
use criterion::{black_box, criterion_group, Criterion};
|
||||||
|
use std::time::Instant;
|
||||||
|
|
||||||
|
// First dimension of the benchmark input tensor, built as (B, C, M, K).
// presumably the batch size — confirm against Conv2d's expected layout.
const B: usize = 1;
|
||||||
|
// Second dimension of the benchmark input tensor (presumably input channels — confirm).
const C: usize = 1;
|
||||||
|
// Third dimension of the benchmark input tensor (presumably height — confirm).
const M: usize = 128;
|
||||||
|
// Fourth dimension of the benchmark input tensor (presumably width — confirm).
// Also used as the length of the bias tensor.
const K: usize = 128;
|
||||||
|
// Side length of the square kernel: the weight tensor is (1, 1, K_SIZE, K_SIZE).
const K_SIZE: usize = 3;
|
||||||
|
|
||||||
|
fn run(input: Tensor, weight: Tensor, bias: Tensor, config: Conv2dConfig) {
|
||||||
|
Conv2d::new(weight, Some(bias), config)
|
||||||
|
.forward(&input)
|
||||||
|
.unwrap();
|
||||||
|
}
|
||||||
|
|
||||||
|
fn run_conv2d_benchmark(c: &mut Criterion, device: &Device, dtype: DType, name: &str) {
|
||||||
|
let weight = Tensor::ones((1, 1, K_SIZE, K_SIZE), dtype, device)
|
||||||
|
.unwrap()
|
||||||
|
.to_dtype(dtype)
|
||||||
|
.unwrap();
|
||||||
|
let bias = Tensor::zeros(K, dtype, device).unwrap();
|
||||||
|
let input = Tensor::ones((B, C, M, K), dtype, device).unwrap();
|
||||||
|
|
||||||
|
let mut group = c.benchmark_group(device.bench_name(name));
|
||||||
|
group.bench_function("iter", move |b| {
|
||||||
|
b.iter_custom(|iters| {
|
||||||
|
let start = Instant::now();
|
||||||
|
for _i in 0..iters {
|
||||||
|
run(
|
||||||
|
black_box(input.clone()),
|
||||||
|
black_box(weight.clone()),
|
||||||
|
black_box(bias.clone()),
|
||||||
|
Default::default(),
|
||||||
|
);
|
||||||
|
}
|
||||||
|
device.sync().unwrap();
|
||||||
|
start.elapsed()
|
||||||
|
})
|
||||||
|
});
|
||||||
|
group.finish();
|
||||||
|
}
|
||||||
|
|
||||||
|
fn criterion_benchmark(c: &mut Criterion) {
|
||||||
|
let device = BenchDeviceHandler::new().unwrap();
|
||||||
|
for d in device.devices {
|
||||||
|
run_conv2d_benchmark(c, &d, DType::F32, "conv2d_f32");
|
||||||
|
run_conv2d_benchmark(c, &d, DType::F16, "conv2d_f16");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Declares the `benches` group for this module, wrapping `criterion_benchmark`; the bench
// entry point refers to it as `benchmarks::conv::benches`.
criterion_group!(benches, criterion_benchmark);
|
|
@ -0,0 +1,48 @@
|
||||||
|
use crate::benchmarks::{BenchDevice, BenchDeviceHandler};
|
||||||
|
use candle::{DType, Device, Module, Tensor};
|
||||||
|
use candle_nn::LayerNorm;
|
||||||
|
use criterion::{black_box, criterion_group, Criterion};
|
||||||
|
use std::time::Instant;
|
||||||
|
|
||||||
|
fn run(input: &Tensor, weight: &Tensor, bias: &Tensor) {
|
||||||
|
let _ = LayerNorm::new(weight.clone(), bias.clone(), 1e-5).forward(&input);
|
||||||
|
}
|
||||||
|
|
||||||
|
// B, M and K are multiplied together (B * M * K) to obtain the element count of the
// benchmark tensors.
const B: usize = 1;
|
||||||
|
const M: usize = 1024;
|
||||||
|
const K: usize = 1024;
|
||||||
|
|
||||||
|
fn run_layer_norm_benchmark(c: &mut Criterion, device: &Device, dtype: DType, name: &str) {
|
||||||
|
let elements = B * M * K;
|
||||||
|
|
||||||
|
let weight = Tensor::arange(0.0, elements as f32, device)
|
||||||
|
.unwrap()
|
||||||
|
.to_dtype(dtype)
|
||||||
|
.unwrap();
|
||||||
|
let bias = weight.ones_like().unwrap();
|
||||||
|
let input = weight.ones_like().unwrap();
|
||||||
|
|
||||||
|
let mut group = c.benchmark_group(device.bench_name(name));
|
||||||
|
group.bench_function("iter", move |b| {
|
||||||
|
b.iter_custom(|iters| {
|
||||||
|
let start = Instant::now();
|
||||||
|
for _i in 0..iters {
|
||||||
|
run(black_box(&input), black_box(&weight), black_box(&bias));
|
||||||
|
}
|
||||||
|
device.sync().unwrap();
|
||||||
|
start.elapsed()
|
||||||
|
})
|
||||||
|
});
|
||||||
|
group.finish();
|
||||||
|
}
|
||||||
|
|
||||||
|
fn criterion_benchmark(c: &mut Criterion) {
|
||||||
|
let device = BenchDeviceHandler::new().unwrap();
|
||||||
|
for d in device.devices {
|
||||||
|
run_layer_norm_benchmark(c, &d, DType::F32, "layer_norm_f32");
|
||||||
|
run_layer_norm_benchmark(c, &d, DType::BF16, "layer_norm_bf16");
|
||||||
|
run_layer_norm_benchmark(c, &d, DType::F16, "layer_norm_f16");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Declares the `benches` group for this module, wrapping `criterion_benchmark`; the bench
// entry point refers to it as `benchmarks::layer_norm::benches`.
criterion_group!(benches, criterion_benchmark);
|
|
@ -0,0 +1,64 @@
|
||||||
|
pub(crate) mod conv;
|
||||||
|
pub(crate) mod layer_norm;
|
||||||
|
|
||||||
|
use candle::{Device, Result};
|
||||||
|
|
||||||
|
pub(crate) trait BenchDevice {
|
||||||
|
fn sync(&self) -> Result<()>;
|
||||||
|
|
||||||
|
fn bench_name<S: Into<String>>(&self, name: S) -> String;
|
||||||
|
}
|
||||||
|
|
||||||
|
impl BenchDevice for Device {
|
||||||
|
fn sync(&self) -> Result<()> {
|
||||||
|
match self {
|
||||||
|
Device::Cpu => Ok(()),
|
||||||
|
Device::Cuda(device) => {
|
||||||
|
#[cfg(feature = "cuda")]
|
||||||
|
return Ok(device.synchronize()?);
|
||||||
|
#[cfg(not(feature = "cuda"))]
|
||||||
|
panic!("Cuda device without cuda feature enabled: {:?}", device)
|
||||||
|
}
|
||||||
|
Device::Metal(device) => {
|
||||||
|
#[cfg(feature = "metal")]
|
||||||
|
return Ok(device.wait_until_completed()?);
|
||||||
|
#[cfg(not(feature = "metal"))]
|
||||||
|
panic!("Metal device without metal feature enabled: {:?}", device)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn bench_name<S: Into<String>>(&self, name: S) -> String {
|
||||||
|
match self {
|
||||||
|
Device::Cpu => {
|
||||||
|
let cpu_type = if cfg!(feature = "accelerate") {
|
||||||
|
"accelerate"
|
||||||
|
} else if cfg!(feature = "mkl") {
|
||||||
|
"mkl"
|
||||||
|
} else {
|
||||||
|
"cpu"
|
||||||
|
};
|
||||||
|
format!("{}_{}", cpu_type, name.into())
|
||||||
|
}
|
||||||
|
Device::Cuda(_) => format!("cuda_{}", name.into()),
|
||||||
|
Device::Metal(_) => format!("metal_{}", name.into()),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
struct BenchDeviceHandler {
|
||||||
|
devices: Vec<Device>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl BenchDeviceHandler {
|
||||||
|
pub fn new() -> Result<Self> {
|
||||||
|
let mut devices = Vec::new();
|
||||||
|
if cfg!(feature = "metal") {
|
||||||
|
devices.push(Device::new_metal(0)?);
|
||||||
|
} else if cfg!(feature = "cuda") {
|
||||||
|
devices.push(Device::new_cuda(0)?);
|
||||||
|
}
|
||||||
|
devices.push(Device::Cpu);
|
||||||
|
Ok(Self { devices })
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue