From ef2fa3b4ca1fb712cd4287928879787b5ff850d5 Mon Sep 17 00:00:00 2001 From: fereidani Date: Fri, 5 Dec 2025 23:11:56 +0330 Subject: [PATCH 1/2] optimize lexer for faster lexing --- compiler/rustc_lexer/src/cursor.rs | 71 +++++++++++++++++++++-- compiler/rustc_lexer/src/lib.rs | 90 +++++++++++++++++------------- 2 files changed, 118 insertions(+), 43 deletions(-) diff --git a/compiler/rustc_lexer/src/cursor.rs b/compiler/rustc_lexer/src/cursor.rs index 165262b82c75d..51b5642a9c1d1 100644 --- a/compiler/rustc_lexer/src/cursor.rs +++ b/compiler/rustc_lexer/src/cursor.rs @@ -21,6 +21,7 @@ pub struct Cursor<'a> { pub(crate) const EOF_CHAR: char = '\0'; impl<'a> Cursor<'a> { + #[inline] pub fn new(input: &'a str, frontmatter_allowed: FrontmatterAllowed) -> Cursor<'a> { Cursor { len_remaining: input.len(), @@ -31,6 +32,7 @@ impl<'a> Cursor<'a> { } } + #[inline] pub fn as_str(&self) -> &'a str { self.chars.as_str() } @@ -53,12 +55,14 @@ impl<'a> Cursor<'a> { /// If requested position doesn't exist, `EOF_CHAR` is returned. /// However, getting `EOF_CHAR` doesn't always mean actual end of file, /// it should be checked with `is_eof` method. + #[inline] pub fn first(&self) -> char { // `.next()` optimizes better than `.nth(0)` self.chars.clone().next().unwrap_or(EOF_CHAR) } /// Peeks the second symbol from the input stream without consuming it. + #[inline] pub(crate) fn second(&self) -> char { // `.next()` optimizes better than `.nth(1)` let mut iter = self.chars.clone(); @@ -67,6 +71,7 @@ impl<'a> Cursor<'a> { } /// Peeks the third symbol from the input stream without consuming it. + #[inline] pub fn third(&self) -> char { // `.next()` optimizes better than `.nth(2)` let mut iter = self.chars.clone(); @@ -76,21 +81,25 @@ impl<'a> Cursor<'a> { } /// Checks if there is nothing more to consume. + #[inline] pub(crate) fn is_eof(&self) -> bool { self.chars.as_str().is_empty() } /// Returns amount of already consumed symbols. 
+ #[inline] pub(crate) fn pos_within_token(&self) -> u32 { (self.len_remaining - self.chars.as_str().len()) as u32 } /// Resets the number of bytes consumed to 0. + #[inline] pub(crate) fn reset_pos_within_token(&mut self) { self.len_remaining = self.chars.as_str().len(); } /// Moves to the next character. + #[inline] pub(crate) fn bump(&mut self) -> Option<char> { let c = self.chars.next()?; @@ -102,12 +111,38 @@ impl<'a> Cursor<'a> { Some(c) } + #[inline] + pub(crate) fn bump_if(&mut self, expected: char) -> bool { + let mut chars = self.chars.clone(); + if chars.next() == Some(expected) { + self.chars = chars; + true + } else { + false + } + } + + /// Bumps the cursor if the next character is either of the two expected characters. + #[inline] + pub(crate) fn bump_if2(&mut self, expected1: char, expected2: char) -> bool { + let mut chars = self.chars.clone(); + if let Some(c) = chars.next() + && (c == expected1 || c == expected2) + { + self.chars = chars; + return true; + } + false + } + /// Moves to a substring by a number of bytes. + #[inline] pub(crate) fn bump_bytes(&mut self, n: usize) { - self.chars = self.as_str()[n..].chars(); + self.chars = self.as_str().get(n..).unwrap_or("").chars(); } /// Eats symbols while predicate returns true or until the end of file is reached. + #[inline] pub(crate) fn eat_while(&mut self, mut predicate: impl FnMut(char) -> bool) { // It was tried making optimized version of this for eg. line comments, but // LLVM can inline all of this and compile it down to fast iteration over bytes. @@ -115,11 +150,37 @@ impl<'a> Cursor<'a> { self.bump(); } } + /// Eats characters until the given byte is found. + /// Returns true if the byte was found, false if end of file was reached. 
+ #[inline] + pub(crate) fn eat_until(&mut self, byte: u8) -> bool { + match memchr::memchr(byte, self.as_str().as_bytes()) { + Some(index) => { + self.bump_bytes(index); + true + } + None => { + self.chars = "".chars(); + false + } + } + } - pub(crate) fn eat_until(&mut self, byte: u8) { - self.chars = match memchr::memchr(byte, self.as_str().as_bytes()) { - Some(index) => self.as_str()[index..].chars(), - None => "".chars(), + /// Eats characters until any of the given bytes is found, then consumes past it. + /// Returns the found byte if any, or None if end of file was reached. + #[inline] + pub(crate) fn eat_past2(&mut self, byte1: u8, byte2: u8) -> Option<u8> { + let bytes = self.as_str().as_bytes(); + match memchr::memchr2(byte1, byte2, bytes) { + Some(index) => { + let found = bytes[index]; + self.bump_bytes(index + 1); + Some(found) + } + None => { + self.chars = "".chars(); + None + } } } } diff --git a/compiler/rustc_lexer/src/lib.rs b/compiler/rustc_lexer/src/lib.rs index f6790f7ed1e96..29350253aa9c3 100644 --- a/compiler/rustc_lexer/src/lib.rs +++ b/compiler/rustc_lexer/src/lib.rs @@ -563,11 +563,30 @@ impl Cursor<'_> { self.eat_while(|ch| ch != '\n' && is_horizontal_whitespace(ch)); let invalid_infostring = self.first() != '\n'; - let mut found = false; - let nl_fence_pattern = format!("\n{:-<1$}", "", length_opening as usize); - if let Some(closing) = self.as_str().find(&nl_fence_pattern) { + #[inline] + fn find_closing_fence(s: &str, dash_count: usize) -> Option<usize> { + let bytes = s.as_bytes(); + let mut i = 0; + while i < bytes.len() { + if let Some(newline_pos) = memchr::memchr(b'\n', &bytes[i..]) { + i += newline_pos + 1; + let start = i; + if start + dash_count <= bytes.len() { + let slice = &bytes[start..start + dash_count]; + if slice.iter().all(|&b| b == b'-') { + return Some(start + dash_count); + } + } + } else { + break; + } + } + None + } + + if let Some(closing) = find_closing_fence(self.as_str(), length_opening as usize) { // candidate found - 
self.bump_bytes(closing + nl_fence_pattern.len()); + self.bump_bytes(closing); // in case like // ---cargo // --- blahblah @@ -576,10 +595,7 @@ impl Cursor<'_> { // ---- // combine those stuff into this frontmatter token such that it gets detected later. self.eat_until(b'\n'); - found = true; - } - - if !found { + } else { // recovery strategy: a closing statement might have preceding whitespace/newline // but not have enough dashes to properly close. In this case, we eat until there, // and report a mismatch in the parser. @@ -656,23 +672,25 @@ impl Cursor<'_> { }; let mut depth = 1usize; - while let Some(c) = self.bump() { + while let Some(c) = self.eat_past2(b'/', b'*') { match c { - '/' if self.first() == '*' => { - self.bump(); - depth += 1; + b'/' => { + if self.bump_if('*') { + depth += 1; + } } - '*' if self.first() == '/' => { - self.bump(); - depth -= 1; - if depth == 0 { - // This block comment is closed, so for a construction like "/* */ */" - // there will be a successfully parsed block comment "/* */" - // and " */" will be processed separately. - break; + b'*' => { + if self.bump_if('/') { + depth -= 1; + if depth == 0 { + // This block comment is closed, so for a construction like "/* */ */" + // there will be a successfully parsed block comment "/* */" + // and " */" will be processed separately. + break; + } } } - _ => (), + _ => unreachable!(), } } @@ -935,19 +953,21 @@ impl Cursor<'_> { /// if string is terminated. fn double_quoted_string(&mut self) -> bool { debug_assert!(self.prev() == '"'); - while let Some(c) = self.bump() { + while let Some(c) = self.eat_past2(b'"', b'\\') { match c { - '"' => { + b'"' => { return true; } - '\\' if self.first() == '\\' || self.first() == '"' => { - // Bump again to skip escaped character. - self.bump(); + b'\\' => { + let first = self.first(); + if first == '\\' || first == '"' { + // Bump to skip escaped character. + self.bump(); + } } - _ => (), + _ => unreachable!(), } } - // End of file reached. 
false } @@ -963,9 +983,8 @@ impl Cursor<'_> { debug_assert!(self.prev() != '#'); let mut n_start_hashes: u32 = 0; - while self.first() == '#' { + while self.bump_if('#') { n_start_hashes += 1; - self.bump(); } if self.first() != '"' { @@ -1025,9 +1044,8 @@ impl Cursor<'_> { // Count opening '#' symbols. let mut eaten = 0; - while self.first() == '#' { + while self.bump_if('#') { eaten += 1; - self.bump(); } let n_start_hashes = eaten; @@ -1043,9 +1061,7 @@ impl Cursor<'_> { // Skip the string contents and on each '#' character met, check if this is // a raw string termination. loop { - self.eat_until(b'"'); - - if self.is_eof() { + if !self.eat_until(b'"') { return Err(RawStrError::NoTerminator { expected: n_start_hashes, found: max_hashes, @@ -1117,9 +1133,7 @@ impl Cursor<'_> { /// and returns false otherwise. fn eat_float_exponent(&mut self) -> bool { debug_assert!(self.prev() == 'e' || self.prev() == 'E'); - if self.first() == '-' || self.first() == '+' { - self.bump(); - } + self.bump_if2('-', '+'); self.eat_decimal_digits() } From 8ae8758d556e5a1a1cf5f12c2d3483a23567cece Mon Sep 17 00:00:00 2001 From: fereidani Date: Sun, 7 Dec 2025 19:14:50 +0330 Subject: [PATCH 2/2] refactor: remove unnecessary inline annotations and rename cursor helper function names for better readability --- compiler/rustc_lexer/src/cursor.rs | 27 +++++--------------- compiler/rustc_lexer/src/lib.rs | 40 ++++++------------------------ 2 files changed, 13 insertions(+), 54 deletions(-) diff --git a/compiler/rustc_lexer/src/cursor.rs b/compiler/rustc_lexer/src/cursor.rs index 51b5642a9c1d1..93352824696b5 100644 --- a/compiler/rustc_lexer/src/cursor.rs +++ b/compiler/rustc_lexer/src/cursor.rs @@ -21,7 +21,6 @@ pub struct Cursor<'a> { pub(crate) const EOF_CHAR: char = '\0'; impl<'a> Cursor<'a> { - #[inline] pub fn new(input: &'a str, frontmatter_allowed: FrontmatterAllowed) -> Cursor<'a> { Cursor { len_remaining: input.len(), @@ -32,7 +31,6 @@ impl<'a> Cursor<'a> { } } - #[inline] pub fn 
as_str(&self) -> &'a str { self.chars.as_str() } @@ -55,14 +53,12 @@ impl<'a> Cursor<'a> { /// If requested position doesn't exist, `EOF_CHAR` is returned. /// However, getting `EOF_CHAR` doesn't always mean actual end of file, /// it should be checked with `is_eof` method. - #[inline] pub fn first(&self) -> char { // `.next()` optimizes better than `.nth(0)` self.chars.clone().next().unwrap_or(EOF_CHAR) } /// Peeks the second symbol from the input stream without consuming it. - #[inline] pub(crate) fn second(&self) -> char { // `.next()` optimizes better than `.nth(1)` let mut iter = self.chars.clone(); @@ -71,7 +67,6 @@ impl<'a> Cursor<'a> { } /// Peeks the third symbol from the input stream without consuming it. - #[inline] pub fn third(&self) -> char { // `.next()` optimizes better than `.nth(2)` let mut iter = self.chars.clone(); @@ -81,25 +76,21 @@ impl<'a> Cursor<'a> { } /// Checks if there is nothing more to consume. - #[inline] pub(crate) fn is_eof(&self) -> bool { self.chars.as_str().is_empty() } /// Returns amount of already consumed symbols. - #[inline] pub(crate) fn pos_within_token(&self) -> u32 { (self.len_remaining - self.chars.as_str().len()) as u32 } /// Resets the number of bytes consumed to 0. - #[inline] pub(crate) fn reset_pos_within_token(&mut self) { self.len_remaining = self.chars.as_str().len(); } /// Moves to the next character. - #[inline] pub(crate) fn bump(&mut self) -> Option<char> { let c = self.chars.next()?; @@ -111,10 +102,9 @@ impl<'a> Cursor<'a> { Some(c) } - #[inline] - pub(crate) fn bump_if(&mut self, expected: char) -> bool { + pub(crate) fn bump_if(&mut self, byte: char) -> bool { let mut chars = self.chars.clone(); - if chars.next() == Some(expected) { + if chars.next() == Some(byte) { self.chars = chars; true } else { @@ -123,11 +113,10 @@ impl<'a> Cursor<'a> { } /// Bumps the cursor if the next character is either of the two expected characters. 
- #[inline] - pub(crate) fn bump_if2(&mut self, expected1: char, expected2: char) -> bool { + pub(crate) fn bump_if_either(&mut self, byte1: char, byte2: char) -> bool { let mut chars = self.chars.clone(); if let Some(c) = chars.next() - && (c == expected1 || c == expected2) + && (c == byte1 || c == byte2) { self.chars = chars; return true; @@ -136,13 +125,11 @@ impl<'a> Cursor<'a> { } /// Moves to a substring by a number of bytes. - #[inline] pub(crate) fn bump_bytes(&mut self, n: usize) { - self.chars = self.as_str().get(n..).unwrap_or("").chars(); + self.chars = self.as_str()[n..].chars(); } /// Eats symbols while predicate returns true or until the end of file is reached. - #[inline] pub(crate) fn eat_while(&mut self, mut predicate: impl FnMut(char) -> bool) { // It was tried making optimized version of this for eg. line comments, but // LLVM can inline all of this and compile it down to fast iteration over bytes. @@ -152,7 +139,6 @@ impl<'a> Cursor<'a> { } /// Eats characters until the given byte is found. /// Returns true if the byte was found, false if end of file was reached. - #[inline] pub(crate) fn eat_until(&mut self, byte: u8) -> bool { match memchr::memchr(byte, self.as_str().as_bytes()) { Some(index) => { @@ -168,8 +154,7 @@ impl<'a> Cursor<'a> { /// Eats characters until any of the given bytes is found, then consumes past it. /// Returns the found byte if any, or None if end of file was reached. 
- #[inline] - pub(crate) fn eat_past2(&mut self, byte1: u8, byte2: u8) -> Option<u8> { + pub(crate) fn eat_past_either(&mut self, byte1: u8, byte2: u8) -> Option<u8> { let bytes = self.as_str().as_bytes(); match memchr::memchr2(byte1, byte2, bytes) { Some(index) => { diff --git a/compiler/rustc_lexer/src/lib.rs b/compiler/rustc_lexer/src/lib.rs index 29350253aa9c3..ff39b95772c6a 100644 --- a/compiler/rustc_lexer/src/lib.rs +++ b/compiler/rustc_lexer/src/lib.rs @@ -563,30 +563,10 @@ impl Cursor<'_> { self.eat_while(|ch| ch != '\n' && is_horizontal_whitespace(ch)); let invalid_infostring = self.first() != '\n'; - #[inline] - fn find_closing_fence(s: &str, dash_count: usize) -> Option<usize> { - let bytes = s.as_bytes(); - let mut i = 0; - while i < bytes.len() { - if let Some(newline_pos) = memchr::memchr(b'\n', &bytes[i..]) { - i += newline_pos + 1; - let start = i; - if start + dash_count <= bytes.len() { - let slice = &bytes[start..start + dash_count]; - if slice.iter().all(|&b| b == b'-') { - return Some(start + dash_count); - } - } - } else { - break; - } - } - None - } - - if let Some(closing) = find_closing_fence(self.as_str(), length_opening as usize) { + let nl_fence_pattern = format!("\n{:-<1$}", "", length_opening as usize); + if let Some(closing) = self.as_str().find(&nl_fence_pattern) { // candidate found - self.bump_bytes(closing); + self.bump_bytes(closing + nl_fence_pattern.len()); // in case like // ---cargo // --- blahblah @@ -672,7 +652,7 @@ impl Cursor<'_> { }; let mut depth = 1usize; - while let Some(c) = self.eat_past2(b'/', b'*') { + while let Some(c) = self.eat_past_either(b'/', b'*') { match c { b'/' => { if self.bump_if('*') { @@ -953,18 +933,12 @@ impl Cursor<'_> { /// if string is terminated. 
fn double_quoted_string(&mut self) -> bool { debug_assert!(self.prev() == '"'); - while let Some(c) = self.eat_past2(b'"', b'\\') { + while let Some(c) = self.eat_past_either(b'"', b'\\') { match c { b'"' => { return true; } - b'\\' => { - let first = self.first(); - if first == '\\' || first == '"' { - // Bump to skip escaped character. - self.bump(); - } - } + b'\\' => _ = self.bump_if_either('\\', '"'), _ => unreachable!(), } } @@ -1133,7 +1107,7 @@ impl Cursor<'_> { /// and returns false otherwise. fn eat_float_exponent(&mut self) -> bool { debug_assert!(self.prev() == 'e' || self.prev() == 'E'); - self.bump_if2('-', '+'); + self.bump_if_either('-', '+'); self.eat_decimal_digits() }