diff --git a/candle-examples/examples/moondream/main.rs b/candle-examples/examples/moondream/main.rs index dfd83037..c7500ed9 100644 --- a/candle-examples/examples/moondream/main.rs +++ b/candle-examples/examples/moondream/main.rs @@ -194,6 +194,10 @@ struct Args { #[arg(long)] quantized: bool, + /// Use f16 precision for all the computations rather than f32. + #[arg(long)] + f16: bool, + #[arg(long)] model_file: Option, @@ -283,7 +287,12 @@ async fn main() -> anyhow::Result<()> { let start = std::time::Instant::now(); let device = candle_examples::device(args.cpu)?; let config = moondream::Config::v2(); - let dtype = if device.is_cuda() && !args.quantized { + let dtype = if args.quantized { + if args.f16 { + anyhow::bail!("Quantized model does not support f16"); + } + DType::F32 + } else if device.is_cuda() || args.f16 { DType::F16 } else { DType::F32