Don't rebuild raw strings when unescaping.

Raw strings don't have escape sequences, so for them "unescaping" just means checking for invalid chars like bare CR. Which means there is no need to rebuild them one char or byte at a time while escaping, because the unescaped version will be the same. This commit removes that rebuilding. Also, the commit changes things so that "unescaping" is unconditional for raw strings and raw byte strings. That's simpler and they're rare enough that the perf effect is negligible.
2023-12-11 16:35:43 +11:00 · 2023-12-11 16:35:43 +11:00 · 4acc5e6480
parent 028b6d152e
commit 4acc5e6480
1 changed files with 30 additions and 43 deletions
--- a/compiler/rustc_ast/src/util/literal.rs
+++ b/compiler/rustc_ast/src/util/literal.rs
@ -77,6 +77,8 @@ impl LitKind {
                // new symbol because the string in the LitKind is different to the
                // string in the token.
                let s = symbol.as_str();
+                // Vanilla strings are so common we optimize for the common case where no chars
+                // requiring special behaviour are present.
                let symbol = if s.contains(['\\', '\r']) {
                    let mut buf = String::with_capacity(s.len());
                    let mut error = Ok(());
@ -104,27 +106,20 @@ impl LitKind {
                LitKind::Str(symbol, ast::StrStyle::Cooked)
            }
            token::StrRaw(n) => {
-                // Ditto.
-                let s = symbol.as_str();
-                let symbol =
-                    if s.contains('\r') {
-                        let mut buf = String::with_capacity(s.len());
-                        let mut error = Ok(());
-                        unescape_literal(s, Mode::RawStr, &mut |_, unescaped_char| {
-                            match unescaped_char {
-                                Ok(c) => buf.push(c),
-                                Err(err) => {
-                                    if err.is_fatal() {
-                                        error = Err(LitError::LexerError);
-                                    }
-                                }
+                // Raw strings have no escapes, so we only need to check for invalid chars, and we
+                // can reuse the symbol on success.
+                let mut error = Ok(());
+                unescape_literal(symbol.as_str(), Mode::RawStr, &mut |_, unescaped_char| {
+                    match unescaped_char {
+                        Ok(_) => {}
+                        Err(err) => {
+                            if err.is_fatal() {
+                                error = Err(LitError::LexerError);
                            }
-                        });
-                        error?;
-                        Symbol::intern(&buf)
-                    } else {
-                        symbol
-                    };
+                        }
+                    }
+                });
+                error?;
                LitKind::Str(symbol, ast::StrStyle::Raw(n))
            }
            token::ByteStr => {
@ -143,25 +138,19 @@ impl LitKind {
                LitKind::ByteStr(buf.into(), StrStyle::Cooked)
            }
            token::ByteStrRaw(n) => {
+                // Raw strings have no escapes, so we only need to check for invalid chars, and we
+                // can convert the symbol directly to a `Lrc<u8>` on success.
                let s = symbol.as_str();
-                let bytes = if s.contains('\r') {
-                    let mut buf = Vec::with_capacity(s.len());
-                    let mut error = Ok(());
-                    unescape_literal(s, Mode::RawByteStr, &mut |_, c| match c {
-                        Ok(c) => buf.push(byte_from_char(c)),
-                        Err(err) => {
-                            if err.is_fatal() {
-                                error = Err(LitError::LexerError);
-                            }
+                let mut error = Ok(());
+                unescape_literal(s, Mode::RawByteStr, &mut |_, c| match c {
+                    Ok(_) => {}
+                    Err(err) => {
+                        if err.is_fatal() {
+                            error = Err(LitError::LexerError);
                        }
-                    });
-                    error?;
-                    buf
-                } else {
-                    symbol.to_string().into_bytes()
-                };
-
-                LitKind::ByteStr(bytes.into(), StrStyle::Raw(n))
+                    }
+                });
+                LitKind::ByteStr(s.to_owned().into_bytes().into(), StrStyle::Raw(n))
            }
            token::CStr => {
                let s = symbol.as_str();
@ -187,18 +176,15 @@ impl LitKind {
                LitKind::CStr(buf.into(), StrStyle::Cooked)
            }
            token::CStrRaw(n) => {
+                // Raw strings have no escapes, so we only need to check for invalid chars, and we
+                // can convert the symbol directly to a `Lrc<u8>` on success.
                let s = symbol.as_str();
-                let mut buf = Vec::with_capacity(s.len());
                let mut error = Ok(());
                unescape_c_string(s, Mode::RawCStr, &mut |span, c| match c {
                    Ok(CStrUnit::Byte(0) | CStrUnit::Char('\0')) => {
                        error = Err(LitError::NulInCStr(span));
                    }
-                    Ok(CStrUnit::Byte(b)) => buf.push(b),
-                    Ok(CStrUnit::Char(c)) if c.len_utf8() == 1 => buf.push(c as u8),
-                    Ok(CStrUnit::Char(c)) => {
-                        buf.extend_from_slice(c.encode_utf8(&mut [0; 4]).as_bytes())
-                    }
+                    Ok(_) => {}
                    Err(err) => {
                        if err.is_fatal() {
                            error = Err(LitError::LexerError);
@ -206,6 +192,7 @@ impl LitKind {
                    }
                });
                error?;
+                let mut buf = s.to_owned().into_bytes();
                buf.push(0);
                LitKind::CStr(buf.into(), StrStyle::Raw(n))
            }