Add flag to run Moondream in f16 precision (#2015)

* moondream implementation

* add moondream example

* change config default activation

* Add assets and integrate phi mixformer with example

* Make use of kv cache and fix seq_len bug; Clean up example code

* Add README link to example

* Remove pos_embed scaling; Remove assets; Add to README; Expand VisionConfig

* Delete image

* Use apply instead of forward

* Use latest release special token; Fix token/s accuracy; Use GeluPytorchTanh in VisionConfig v2

* Add flag to use f16

* Avoid breaking the quantized version on cuda.

---------

Co-authored-by: laurent <laurent.mazare@gmail.com>
Author: Santiago Medina (committed by GitHub)
Date: 2024-04-04 22:03:33 -07:00
Commit: ace282e5c2 (parent: c87381fc96)
1 changed file with 10 additions and 1 deletion


@@ -194,6 +194,10 @@ struct Args {
     #[arg(long)]
     quantized: bool,
 
+    /// Use f16 precision for all the computations rather than f32.
+    #[arg(long)]
+    f16: bool,
+
     #[arg(long)]
     model_file: Option<String>,
 
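With clap's derive API, `#[arg(long)]` on the new `f16: bool` field becomes a `--f16` switch that defaults to false, so existing invocations keep their previous behavior. A run might look like the line below; `--prompt` and `--image` are assumed from the rest of the example's Args struct, which this diff does not show:

cargo run --example moondream --release -- --f16 --prompt "Describe this image." --image tiger.jpg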
@@ -283,7 +287,12 @@ async fn main() -> anyhow::Result<()> {
     let start = std::time::Instant::now();
     let device = candle_examples::device(args.cpu)?;
     let config = moondream::Config::v2();
-    let dtype = if device.is_cuda() && !args.quantized {
+    let dtype = if args.quantized {
+        if args.f16 {
+            anyhow::bail!("Quantized model does not support f16");
+        }
+        DType::F32
+    } else if device.is_cuda() || args.f16 {
         DType::F16
     } else {
         DType::F32
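The rewritten dtype selection reads as three cases: the quantized model stays in f32 (and rejects --f16 outright), while the full-precision model uses f16 on CUDA or whenever --f16 is passed. Below is a minimal sketch of that logic as a standalone helper, assuming the published candle-core crate (imported as `candle` inside the repo); the function and its name are illustrative, not part of the commit.

use candle_core::{DType, Device};

/// Pick the compute dtype for the moondream example. Sketch only: the
/// standalone helper framing is illustrative, not part of the commit.
fn select_dtype(device: &Device, quantized: bool, f16: bool) -> anyhow::Result<DType> {
    if quantized {
        // Reject --f16 loudly instead of silently ignoring it: the
        // quantized model is not run in f16.
        if f16 {
            anyhow::bail!("Quantized model does not support f16");
        }
        Ok(DType::F32)
    } else if device.is_cuda() || f16 {
        // CUDA keeps its previous f16 default; --f16 opts in on other devices.
        Ok(DType::F16)
    } else {
        Ok(DType::F32)
    }
}

Note the branch order: checking args.quantized first is what lets --quantized --f16 fail fast, and it also preserves f32 for the quantized model on CUDA, which the old condition `device.is_cuda() && !args.quantized` handled with the `!args.quantized` guard.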