-
Notifications
You must be signed in to change notification settings - Fork 14.1k
Improve lexer performance by 5-10% overall, improve string lexer performance 15% #149689
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -21,6 +21,7 @@ pub struct Cursor<'a> { | |
| pub(crate) const EOF_CHAR: char = '\0'; | ||
|
|
||
| impl<'a> Cursor<'a> { | ||
| #[inline] | ||
| pub fn new(input: &'a str, frontmatter_allowed: FrontmatterAllowed) -> Cursor<'a> { | ||
| Cursor { | ||
| len_remaining: input.len(), | ||
|
|
@@ -31,6 +32,7 @@ impl<'a> Cursor<'a> { | |
| } | ||
| } | ||
|
|
||
| /// Returns the part of the input that has not been consumed yet, as a string slice. | ||
| #[inline] | ||
| pub fn as_str(&self) -> &'a str { | ||
| self.chars.as_str() | ||
| } | ||
|
|
@@ -53,12 +55,14 @@ impl<'a> Cursor<'a> { | |
| /// If the requested position doesn't exist, `EOF_CHAR` is returned. | ||
| /// However, getting `EOF_CHAR` doesn't always mean the actual end of file has been reached, | ||
| /// so the result should be checked with the `is_eof` method. | ||
| #[inline] | ||
| pub fn first(&self) -> char { | ||
| // Peek on a clone so the cursor itself does not advance; `.next()` optimizes better than `.nth(0)`. | ||
| self.chars.clone().next().unwrap_or(EOF_CHAR) | ||
| } | ||
|
|
||
| /// Peeks the second symbol from the input stream without consuming it. | ||
| #[inline] | ||
| pub(crate) fn second(&self) -> char { | ||
| // `.next()` optimizes better than `.nth(1)` | ||
| let mut iter = self.chars.clone(); | ||
|
|
@@ -67,6 +71,7 @@ impl<'a> Cursor<'a> { | |
| } | ||
|
|
||
| /// Peeks the third symbol from the input stream without consuming it. | ||
| #[inline] | ||
| pub fn third(&self) -> char { | ||
| // `.next()` optimizes better than `.nth(2)` | ||
| let mut iter = self.chars.clone(); | ||
|
|
@@ -76,21 +81,25 @@ impl<'a> Cursor<'a> { | |
| } | ||
|
|
||
| /// Returns `true` when the remaining input is empty, i.e. there is nothing more to consume. | ||
| #[inline] | ||
| pub(crate) fn is_eof(&self) -> bool { | ||
| self.chars.as_str().is_empty() | ||
| } | ||
|
|
||
| /// Returns the number of bytes (not characters) consumed since `len_remaining` was last set: | ||
| /// the difference between `len_remaining` and the byte length of the remaining input, cast to `u32`. | ||
| #[inline] | ||
| pub(crate) fn pos_within_token(&self) -> u32 { | ||
| (self.len_remaining - self.chars.as_str().len()) as u32 | ||
| } | ||
|
|
||
| /// Resets the number of bytes consumed to 0, so that `pos_within_token` counts from the current position. | ||
| #[inline] | ||
| pub(crate) fn reset_pos_within_token(&mut self) { | ||
| self.len_remaining = self.chars.as_str().len(); | ||
| } | ||
|
|
||
| /// Moves to the next character. | ||
| #[inline] | ||
| pub(crate) fn bump(&mut self) -> Option<char> { | ||
| let c = self.chars.next()?; | ||
|
|
||
|
|
@@ -102,24 +111,76 @@ impl<'a> Cursor<'a> { | |
| Some(c) | ||
| } | ||
|
|
||
| /// Consumes the next character only if it equals `expected`. | ||
| /// Returns `true` if a character was consumed; otherwise the cursor is left untouched. | ||
| #[inline] | ||
| pub(crate) fn bump_if(&mut self, expected: char) -> bool { | ||
| let mut lookahead = self.chars.clone(); | ||
| let matched = lookahead.next() == Some(expected); | ||
| if matched { | ||
| // Commit the advanced iterator only on a match. | ||
| self.chars = lookahead; | ||
| } | ||
| matched | ||
| } | ||
|
|
||
| /// Consumes the next character if it is `expected1` or `expected2` (either one, not a sequence). | ||
| /// Returns `true` if a character was consumed; otherwise the cursor is left untouched. | ||
| #[inline] | ||
| pub(crate) fn bump_if2(&mut self, expected1: char, expected2: char) -> bool { | ||
| let mut lookahead = self.chars.clone(); | ||
| match lookahead.next() { | ||
| Some(c) if c == expected1 || c == expected2 => { | ||
| // Commit the advanced iterator only on a match. | ||
| self.chars = lookahead; | ||
| true | ||
| } | ||
| _ => false, | ||
| } | ||
| } | ||
|
|
||
| /// Moves to a substring by a number of bytes. | ||
| #[inline] | ||
| pub(crate) fn bump_bytes(&mut self, n: usize) { | ||
| self.chars = self.as_str()[n..].chars(); | ||
| self.chars = self.as_str().get(n..).unwrap_or("").chars(); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. What's the thinking behind this change?
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. It removes the panic-handling code generation and branching, which in my experiments is always faster, even when the panic never happens. If LLVM can prove that it will never panic, the `unwrap_or` will be optimized away just like the panic handling. |
||
| } | ||
|
|
||
| /// Eats symbols while `predicate` returns true or until the end of file is reached. | ||
| #[inline] | ||
| pub(crate) fn eat_while(&mut self, mut predicate: impl FnMut(char) -> bool) { | ||
| // An optimized version of this was tried for e.g. line comments, but | ||
| // LLVM can inline all of this and compile it down to fast iteration over bytes. | ||
| while predicate(self.first()) && !self.is_eof() { | ||
| self.bump(); | ||
| } | ||
| } | ||
| /// Searches the remaining input for `byte` and advances the cursor to its offset, | ||
| /// leaving the byte itself unconsumed. | ||
| /// Returns true if the byte was found; otherwise the rest of the input is consumed and false is returned. | ||
| #[inline] | ||
| pub(crate) fn eat_until(&mut self, byte: u8) -> bool { | ||
| let haystack = self.as_str().as_bytes(); | ||
| if let Some(offset) = memchr::memchr(byte, haystack) { | ||
| self.bump_bytes(offset); | ||
| return true; | ||
| } | ||
| // Not found: nothing is left to look at, so move to the end of input. | ||
| self.chars = "".chars(); | ||
| false | ||
| } | ||
|
|
||
| pub(crate) fn eat_until(&mut self, byte: u8) { | ||
| self.chars = match memchr::memchr(byte, self.as_str().as_bytes()) { | ||
| Some(index) => self.as_str()[index..].chars(), | ||
| None => "".chars(), | ||
| /// Searches the remaining input for the first occurrence of `byte1` or `byte2` | ||
| /// and advances the cursor to just past it. | ||
| /// Returns the byte that was found, or None (with the rest of the input consumed) if neither occurs. | ||
| #[inline] | ||
| pub(crate) fn eat_past2(&mut self, byte1: u8, byte2: u8) -> Option<u8> { | ||
| let haystack = self.as_str().as_bytes(); | ||
| let Some(offset) = memchr::memchr2(byte1, byte2, haystack) else { | ||
| // Neither byte occurs: move to the end of input. | ||
| self.chars = "".chars(); | ||
| return None; | ||
| }; | ||
| let hit = haystack[offset]; | ||
| // `+ 1` steps over the matched byte itself. | ||
| self.bump_bytes(offset + 1); | ||
| Some(hit) | ||
| } | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -563,11 +563,30 @@ impl Cursor<'_> { | |
| self.eat_while(|ch| ch != '\n' && is_horizontal_whitespace(ch)); | ||
| let invalid_infostring = self.first() != '\n'; | ||
|
|
||
| let mut found = false; | ||
| let nl_fence_pattern = format!("\n{:-<1$}", "", length_opening as usize); | ||
| if let Some(closing) = self.as_str().find(&nl_fence_pattern) { | ||
| #[inline] | ||
| fn find_closing_fence(s: &str, dash_count: usize) -> Option<usize> { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Micro-optimizing frontmatter lexing doesn't seem worthwhile. It's just a tiny fraction of general lexing. |
||
| let bytes = s.as_bytes(); | ||
| let mut i = 0; | ||
| while i < bytes.len() { | ||
| if let Some(newline_pos) = memchr::memchr(b'\n', &bytes[i..]) { | ||
| i += newline_pos + 1; | ||
| let start = i; | ||
| if start + dash_count <= bytes.len() { | ||
| let slice = &bytes[start..start + dash_count]; | ||
| if slice.iter().all(|&b| b == b'-') { | ||
| return Some(start + dash_count); | ||
| } | ||
| } | ||
| } else { | ||
| break; | ||
| } | ||
| } | ||
| None | ||
| } | ||
|
|
||
| if let Some(closing) = find_closing_fence(self.as_str(), length_opening as usize) { | ||
| // candidate found | ||
| self.bump_bytes(closing + nl_fence_pattern.len()); | ||
| self.bump_bytes(closing); | ||
| // in case like | ||
| // ---cargo | ||
| // --- blahblah | ||
|
|
@@ -576,10 +595,7 @@ impl Cursor<'_> { | |
| // ---- | ||
| // combine those stuff into this frontmatter token such that it gets detected later. | ||
| self.eat_until(b'\n'); | ||
| found = true; | ||
| } | ||
|
|
||
| if !found { | ||
| } else { | ||
| // recovery strategy: a closing statement might have preceding whitespace/newline | ||
| // but not have enough dashes to properly close. In this case, we eat until there, | ||
| // and report a mismatch in the parser. | ||
|
|
@@ -656,23 +672,25 @@ impl Cursor<'_> { | |
| }; | ||
|
|
||
| let mut depth = 1usize; | ||
| while let Some(c) = self.bump() { | ||
| while let Some(c) = self.eat_past2(b'/', b'*') { | ||
| match c { | ||
| '/' if self.first() == '*' => { | ||
| self.bump(); | ||
| depth += 1; | ||
| b'/' => { | ||
| if self.bump_if('*') { | ||
| depth += 1; | ||
| } | ||
| } | ||
| '*' if self.first() == '/' => { | ||
| self.bump(); | ||
| depth -= 1; | ||
| if depth == 0 { | ||
| // This block comment is closed, so for a construction like "/* */ */" | ||
| // there will be a successfully parsed block comment "/* */" | ||
| // and " */" will be processed separately. | ||
| break; | ||
| b'*' => { | ||
| if self.bump_if('/') { | ||
| depth -= 1; | ||
| if depth == 0 { | ||
| // This block comment is closed, so for a construction like "/* */ */" | ||
| // there will be a successfully parsed block comment "/* */" | ||
| // and " */" will be processed separately. | ||
| break; | ||
| } | ||
| } | ||
| } | ||
| _ => (), | ||
| _ => unreachable!(), | ||
| } | ||
| } | ||
|
|
||
|
|
@@ -935,19 +953,21 @@ impl Cursor<'_> { | |
| /// if string is terminated. | ||
| fn double_quoted_string(&mut self) -> bool { | ||
| debug_assert!(self.prev() == '"'); | ||
| while let Some(c) = self.bump() { | ||
| while let Some(c) = self.eat_past2(b'"', b'\\') { | ||
| match c { | ||
| '"' => { | ||
| b'"' => { | ||
| return true; | ||
| } | ||
| '\\' if self.first() == '\\' || self.first() == '"' => { | ||
| // Bump again to skip escaped character. | ||
| self.bump(); | ||
| b'\\' => { | ||
| let first = self.first(); | ||
| if first == '\\' || first == '"' { | ||
| // Bump to skip escaped character. | ||
| self.bump(); | ||
| } | ||
| } | ||
| _ => (), | ||
| _ => unreachable!(), | ||
| } | ||
| } | ||
| // End of file reached. | ||
| false | ||
| } | ||
|
|
||
|
|
@@ -963,9 +983,8 @@ impl Cursor<'_> { | |
| debug_assert!(self.prev() != '#'); | ||
|
|
||
| let mut n_start_hashes: u32 = 0; | ||
| while self.first() == '#' { | ||
| while self.bump_if('#') { | ||
| n_start_hashes += 1; | ||
| self.bump(); | ||
| } | ||
|
|
||
| if self.first() != '"' { | ||
|
|
@@ -1025,9 +1044,8 @@ impl Cursor<'_> { | |
|
|
||
| // Count opening '#' symbols. | ||
| let mut eaten = 0; | ||
| while self.first() == '#' { | ||
| while self.bump_if('#') { | ||
| eaten += 1; | ||
| self.bump(); | ||
| } | ||
| let n_start_hashes = eaten; | ||
|
|
||
|
|
@@ -1043,9 +1061,7 @@ impl Cursor<'_> { | |
| // Skip the string contents and on each '#' character met, check if this is | ||
| // a raw string termination. | ||
| loop { | ||
| self.eat_until(b'"'); | ||
|
|
||
| if self.is_eof() { | ||
| if !self.eat_until(b'"') { | ||
| return Err(RawStrError::NoTerminator { | ||
| expected: n_start_hashes, | ||
| found: max_hashes, | ||
|
|
@@ -1117,9 +1133,7 @@ impl Cursor<'_> { | |
| /// and returns false otherwise. | ||
| fn eat_float_exponent(&mut self) -> bool { | ||
| debug_assert!(self.prev() == 'e' || self.prev() == 'E'); | ||
| if self.first() == '-' || self.first() == '+' { | ||
| self.bump(); | ||
| } | ||
| self.bump_if2('-', '+'); | ||
| self.eat_decimal_digits() | ||
| } | ||
|
|
||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I would call this `bump_if_either`. `bump_if2` makes me think that `expected1` must be followed by `expected2`.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Sure, I can rename it.