Document settings

This commit is contained in:
Louis Dureuil 2025-02-19 15:06:22 +01:00
parent 589bf30ec6
commit 14e1459bf5
No known key found for this signature in database
1 changed files with 205 additions and 0 deletions

View File

@ -20,58 +20,263 @@ pub struct EmbeddingSettings {
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
#[deserr(default)]
#[schema(value_type = Option<EmbedderSource>)]
/// The source used to provide the embeddings.
///
/// Which embedder parameters are available and mandatory is determined by the value of this setting.
///
/// # 🔄 Reindexing
///
/// - 🏗️ Changing the value of this parameter always regenerates embeddings.
///
/// # Defaults
///
/// - Defaults to `openAi`
pub source: Setting<EmbedderSource>,
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
#[deserr(default)]
#[schema(value_type = Option<String>)]
/// The name of the model to use.
///
/// # Mandatory
///
/// - This parameter is mandatory for source `ollama`
///
/// # Availability
///
/// - This parameter is available for sources `openAi`, `huggingFace`, `ollama`
///
/// # 🔄 Reindexing
///
/// - 🏗️ Changing the value of this parameter always regenerates embeddings.
///
/// # Defaults
///
/// - For source `openAi`, defaults to `text-embedding-3-small`
/// - For source `huggingFace`, defaults to `BAAI/bge-base-en-v1.5`
pub model: Setting<String>,
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
#[deserr(default)]
#[schema(value_type = Option<String>)]
/// The revision (commit SHA1) of the model to use.
///
/// If unspecified, Meilisearch picks the latest revision of the model.
///
/// # Availability
///
/// - This parameter is available for source `huggingFace`
///
/// # 🔄 Reindexing
///
/// - 🏗️ Changing the value of this parameter always regenerates embeddings
///
/// # Defaults
///
/// - When `model` is set to default, defaults to `617ca489d9e86b49b8167676d8220688b99db36e`
/// - Otherwise, defaults to `null`
pub revision: Setting<String>,
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
#[deserr(default)]
#[schema(value_type = Option<OverridePooling>)]
/// The pooling method to use.
///
/// # Availability
///
/// - This parameter is available for source `huggingFace`
///
/// # 🔄 Reindexing
///
/// - 🏗️ Changing the value of this parameter always regenerates embeddings
///
/// # Defaults
///
/// - Defaults to `useModel`
///
/// # Compatibility Note
///
/// - Embedders created before this parameter was available default to `forceMean` to preserve the existing behavior.
pub pooling: Setting<OverridePooling>,
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
#[deserr(default)]
#[schema(value_type = Option<String>)]
/// The API key to pass to the remote embedder while making requests.
///
/// # Availability
///
/// - This parameter is available for sources `openAi`, `ollama`, `rest`
///
/// # 🔄 Reindexing
///
/// - 🌱 Changing the value of this parameter never regenerates embeddings
///
/// # Defaults
///
/// - For source `openAi`, the key is read from `OPENAI_API_KEY`, then `MEILI_OPENAI_API_KEY`.
/// - For other sources, no bearer token is sent if this parameter is not set.
///
/// # Note
///
/// - This setting is partially hidden when returned by the settings
pub api_key: Setting<String>,
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
#[deserr(default)]
#[schema(value_type = Option<usize>)]
/// The expected dimensions of the embeddings produced by this embedder.
///
/// # Mandatory
///
/// - This parameter is mandatory for source `userProvided`
///
/// # Availability
///
/// - This parameter is available for sources `openAi`, `ollama`, `rest`, `userProvided`
///
/// # 🔄 Reindexing
///
/// - 🏗️ When the source is `openAi`, changing the value of this parameter always regenerates embeddings
/// - 🌱 For other sources, changing the value of this parameter never regenerates embeddings
///
/// # Defaults
///
/// - For source `openAi`, defaults to the maximum number of dimensions allowed by the model.
/// - For sources `ollama` and `rest`, the dimensions are inferred by embedding a sample text.
pub dimensions: Setting<usize>,
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
#[deserr(default)]
#[schema(value_type = Option<bool>)]
/// Whether to binary quantize the embeddings of this embedder.
///
/// Binary quantized embeddings are smaller than regular embeddings, which improves
/// disk usage and retrieval speed, at the cost of relevancy.
///
/// # Availability
///
/// - This parameter is available for all embedders
///
/// # 🔄 Reindexing
///
/// - 🏗️ When set to `true`, embeddings are not regenerated, but they are binary quantized, which takes time.
///
/// # Defaults
///
/// - Defaults to `false`
///
/// # Note
///
/// As binary quantization is a destructive operation, it is not possible to disable this setting again after
/// first enabling it. If you are unsure of whether the performance-relevancy tradeoff is right for you,
/// we recommend using this parameter on a test index first.
pub binary_quantized: Setting<bool>,
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
#[deserr(default)]
#[schema(value_type = Option<String>)]
/// A liquid template used to render documents to a text that can be embedded.
///
/// Meilisearch interpolates the template for each document and sends the resulting text to the embedder.
/// The embedder then generates document vectors based on this text.
///
/// # Availability
///
/// - This parameter is available for sources `openAi`, `huggingFace`, `ollama` and `rest`
///
/// # 🔄 Reindexing
///
/// - 🏗️ When modified, embeddings are regenerated for documents whose rendering through the template produces a different text.
pub document_template: Setting<String>,
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
#[deserr(default)]
#[schema(value_type = Option<usize>)]
/// Rendered texts are truncated to this size.
///
/// # Availability
///
/// - This parameter is available for sources `openAi`, `huggingFace`, `ollama` and `rest`
///
/// # 🔄 Reindexing
///
/// - 🏗️ When increased, embeddings are regenerated for documents whose rendering through the template produces a different text.
/// - 🌱 When decreased, embeddings are never regenerated
///
/// # Defaults
///
/// - Defaults to 400
pub document_template_max_bytes: Setting<usize>,
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
#[deserr(default)]
#[schema(value_type = Option<String>)]
/// URL to reach the remote embedder.
///
/// # Mandatory
///
/// - This parameter is mandatory for source `rest`
///
/// # Availability
///
/// - This parameter is available for sources `openAi`, `ollama` and `rest`
///
/// # 🔄 Reindexing
///
/// - 🌱 When modified for source `openAi`, embeddings are never regenerated
/// - 🏗️ When modified for sources `ollama` and `rest`, embeddings are always regenerated
pub url: Setting<String>,
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
#[deserr(default)]
#[schema(value_type = Option<serde_json::Value>)]
/// Template request to send to the remote embedder.
///
/// # Mandatory
///
/// - This parameter is mandatory for source `rest`
///
/// # Availability
///
/// - This parameter is available for source `rest`
///
/// # 🔄 Reindexing
///
/// - 🏗️ Changing the value of this parameter always regenerates embeddings
pub request: Setting<serde_json::Value>,
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
#[deserr(default)]
#[schema(value_type = Option<serde_json::Value>)]
/// Template response indicating how to find the embeddings in the response from the remote embedder.
///
/// # Mandatory
///
/// - This parameter is mandatory for source `rest`
///
/// # Availability
///
/// - This parameter is available for source `rest`
///
/// # 🔄 Reindexing
///
/// - 🏗️ Changing the value of this parameter always regenerates embeddings
pub response: Setting<serde_json::Value>,
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
#[deserr(default)]
#[schema(value_type = Option<BTreeMap<String, String>>)]
/// Additional headers to send to the remote embedder.
///
/// # Availability
///
/// - This parameter is available for source `rest`
///
/// # 🔄 Reindexing
///
/// - 🌱 Changing the value of this parameter never regenerates embeddings
pub headers: Setting<BTreeMap<String, String>>,
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
#[deserr(default)]
#[schema(value_type = Option<DistributionShift>)]
/// Affine transformation applied to the semantic score to make it more comparable to the ranking score.
///
/// # Availability
///
/// - This parameter is available for all embedders
///
/// # 🔄 Reindexing
///
/// - 🌱 Changing the value of this parameter never regenerates embeddings
pub distribution: Setting<DistributionShift>,
}