Add RoPE `init_with_frequency_scaling` (#2194)

* Add RoPE init_with_frequency_scaling * Fix clippy
2024-08-23 10:30:23 -04:00 · 2024-08-23 10:30:23 -04:00 · 4999421f6c
parent 17de832c6e
commit 4999421f6c
1 changed files with 138 additions and 2 deletions
--- a/crates/burn-core/src/nn/rope_encoding.rs
+++ b/crates/burn-core/src/nn/rope_encoding.rs
@ -31,6 +31,35 @@ impl RotaryEncodingConfig {
    /// Panics if the size of input embedding dimension is not even.
    /// Panics if the theta parameter is not positive.
    pub fn init<B: Backend>(&self, device: &B::Device) -> RotaryEncoding<B> {
        self.initialize(None, device)
    }
    /// Initialize a new [RotaryEncoding](RotaryEncoding) module with a custom frequency scaling function.
    /// This is useful to apply different RoPE extensions.
    ///
    /// # Panics
    ///
    /// Panics if the size of input embedding dimension is not even.
    /// Panics if the theta parameter is not positive.
    pub fn init_with_frequency_scaling<B: Backend>(
        &self,
        scaling: fn(Tensor<B, 1>) -> Tensor<B, 1>,
        device: &B::Device,
    ) -> RotaryEncoding<B> {
        self.initialize(Some(scaling), device)
    }
    /// Initialize a new [RotaryEncoding](RotaryEncoding) module.
    ///
    /// # Panics
    ///
    /// Panics if the size of input embedding dimension is not even.
    /// Panics if the theta parameter is not positive.
    fn initialize<B: Backend>(
        &self,
        scaling: Option<fn(Tensor<B, 1>) -> Tensor<B, 1>>,
        device: &B::Device,
    ) -> RotaryEncoding<B> {
        assert_eq!(
            self.d_model % 2,
            0,
@ -42,7 +71,7 @@ impl RotaryEncodingConfig {
        );
        // Calculate the rotation frequencies for positional embeddings based on the formula
-        // `theta_i = 1 / (10000 ^ (2i / d_model)) for i in [0..d_model/2]`
+        // `theta_i = 1 / (theta ^ (2i / d_model)) for i in [0..d_model/2]`
        let exponent = Tensor::<B, 1, Int>::arange_step(0..self.d_model as i64, 2, device)
            .float()
            .div_scalar(self.d_model as f32);
@ -50,7 +79,11 @@ impl RotaryEncodingConfig {
        // Calculate (10000 ^ (2i / d_model)) by using the log base property `exp(log(10000) * (2i / d_model))`
        // This is done since burn doesn't support exponentiation of scalar to tensor
        let theta_i = exponent.mul_scalar(self.theta.ln()).exp();
-        let theta_i = theta_i.powf_scalar(-1.0);
+        let mut theta_i = theta_i.powf_scalar(-1.0);
        if let Some(scaling) = scaling {
            theta_i = scaling(theta_i)
        }
        // Generate frequency values for positional embeddings
        let frequencies: Tensor<B, 2> =
@ -265,6 +298,109 @@ mod tests {
        let _output = pe.forward(input);
    }
    #[test]
    fn test_rotary_encoding_frequencies() {
        let device = Default::default();
        let rotary_encoding = RotaryEncodingConfig::new(2, 8).init::<TestBackend>(&device);
        let expected_freqs = Tensor::<TestBackend, 3>::from_floats(
            [
                [
                    [1.0000, 0.0000],
                    [1.0000, 0.0000],
                    [1.0000, 0.0000],
                    [1.0000, 0.0000],
                ],
                [
                    [5.4030e-01, 8.4147e-01],
                    [9.9500e-01, 9.9833e-02],
                    [9.9995e-01, 9.9998e-03],
                    [9.9999e-01, 9.9999e-04],
                ],
            ],
            &device,
        )
        .unsqueeze_dim::<4>(2)
        .repeat_dim(2, 2)
        .reshape([2, 8, 2]);
        rotary_encoding
            .freq_complex
            .to_data()
            .assert_approx_eq(&expected_freqs.to_data(), 4);
    }
    fn apply_freq_scaling_by_parts<B: Backend>(freqs: Tensor<B, 1>) -> Tensor<B, 1> {
        // Adapted from: https://github.com/meta-llama/llama-models/blob/main/models/llama3/reference_impl/model.py#L45
        let scale_factor = 8.;
        let low_freq_factor = 1.;
        let high_freq_factor = 4.;
        let old_context_len = 8192.;
        let low_freq_wavelen = old_context_len / low_freq_factor;
        let high_freq_wavelen = old_context_len / high_freq_factor;
        let wavelen = freqs.clone().recip().mul_scalar(2. * core::f32::consts::PI);
        // if wavelen >= high_freq_wavelen
        let cond = wavelen.clone().greater_equal_elem(high_freq_wavelen);
        let smooth = wavelen
            .clone()
            .recip()
            .mul_scalar(old_context_len)
            .sub_scalar(low_freq_factor)
            .div_scalar(high_freq_factor - low_freq_factor);
        // (1 - smooth) * freq / scale_factor + smooth * freq
        let new_freqs = smooth
            .clone()
            .neg()
            .add_scalar(1.)
            .mul(freqs.clone().div_scalar(scale_factor))
            .add(smooth.clone().mul(freqs.clone()));
        let new_freqs = freqs.clone().mask_where(cond, new_freqs);
        // if wavelen > low_freq_wavelen
        let cond = wavelen.clone().greater_elem(low_freq_wavelen);
        let new_freqs = new_freqs.mask_where(cond, freqs.clone().div_scalar(scale_factor));
        // if wavelen < high_freq_wavelen
        let cond = wavelen.lower_elem(high_freq_wavelen);
        new_freqs.mask_where(cond, freqs)
    }
    #[test]
    fn test_rotary_encoding_with_frequency_scaling() {
        let device = Default::default();
        let rotary_encoding = RotaryEncodingConfig::new(2, 8)
            .init_with_frequency_scaling::<TestBackend>(apply_freq_scaling_by_parts, &device);
        let expected_freqs = Tensor::<TestBackend, 3>::from_floats(
            [
                [
                    [1.0000, 0.0000],
                    [1.0000, 0.0000],
                    [1.0000, 0.0000],
                    [1.0000, 0.0000],
                ],
                [
                    [5.4030e-01, 8.4148e-01],
                    [9.9500e-01, 9.9833e-02],
                    [9.9995e-01, 9.9998e-03],
                    [1.0000, 2.1361e-04],
                ],
            ],
            &device,
        )
        .unsqueeze_dim::<4>(2)
        .repeat_dim(2, 2)
        .reshape([2, 8, 2]);
        rotary_encoding
            .freq_complex
            .to_data()
            .assert_approx_eq(&expected_freqs.to_data(), 4);
    }
    #[test]
    fn display() {
        let config = RotaryEncodingConfig::new(10, 4);