Skip to content

Commit c27c2c7

Browse files
author
Альберт Скальт
committed
add iterator over tokens in Tokenizer
This patch adds the ability to iterate over recognized tokens by converting a `Tokenizer` into an iterator. This allows token mapping to be performed in a single pass, instead of mapping the resulting vector with an additional loop.
1 parent 0dbb9c9 commit c27c2c7

1 file changed

Lines changed: 104 additions & 17 deletions

File tree

src/tokenizer.rs

Lines changed: 104 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -812,6 +812,26 @@ pub struct Tokenizer<'a> {
812812
unescape: bool,
813813
}
814814

815+
/// Passed into [`Tokenizer::next_token`] as in some situations tokenization
/// is context dependent. The separate enum is used to be able to not clone
/// the previous token during [`TokenWithSpanIter`] iteration.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum PrevTokenKind {
    /// The previous token was `Token::Word(_)`.
    Word,
    /// The previous token was `Token::Period`.
    Period,
    /// The previous token was any other kind of token.
    Other,
}
824+
825+
impl From<&Token> for PrevTokenKind {
826+
fn from(value: &Token) -> Self {
827+
match value {
828+
Token::Word(_) => Self::Word,
829+
Token::Period => Self::Period,
830+
_ => Self::Other,
831+
}
832+
}
833+
}
834+
815835
impl<'a> Tokenizer<'a> {
816836
/// Create a new SQL tokenizer for the specified SQL statement
817837
///
@@ -872,6 +892,23 @@ impl<'a> Tokenizer<'a> {
872892
self
873893
}
874894

895+
/// Return an iterator over tokens
896+
pub fn iter(&mut self) -> TokenWithSpanIter<'a, '_> {
897+
let state = State {
898+
peekable: self.query.chars().peekable(),
899+
line: 1,
900+
col: 1,
901+
};
902+
903+
let location = state.location();
904+
TokenWithSpanIter {
905+
state,
906+
location,
907+
tokenizer: self,
908+
prev_token_kind: None,
909+
}
910+
}
911+
875912
/// Tokenize the statement and produce a vector of tokens
876913
pub fn tokenize(&mut self) -> Result<Vec<Token>, TokenizerError> {
877914
let twl = self.tokenize_with_location()?;
@@ -891,19 +928,8 @@ impl<'a> Tokenizer<'a> {
891928
&mut self,
892929
buf: &mut Vec<TokenWithSpan>,
893930
) -> Result<(), TokenizerError> {
894-
let mut state = State {
895-
peekable: self.query.chars().peekable(),
896-
line: 1,
897-
col: 1,
898-
};
899-
900-
let mut location = state.location();
901-
while let Some(token) = self.next_token(&mut state, buf.last().map(|t| &t.token))? {
902-
let span = location.span_to(state.location());
903-
904-
buf.push(TokenWithSpan { token, span });
905-
906-
location = state.location();
931+
for token in self.iter() {
932+
buf.push(token?);
907933
}
908934
Ok(())
909935
}
@@ -938,7 +964,7 @@ impl<'a> Tokenizer<'a> {
938964
fn next_token(
939965
&self,
940966
chars: &mut State,
941-
prev_token: Option<&Token>,
967+
prev_token_kind: Option<PrevTokenKind>,
942968
) -> Result<Option<Token>, TokenizerError> {
943969
match chars.peek() {
944970
Some(&ch) => match ch {
@@ -1196,7 +1222,7 @@ impl<'a> Tokenizer<'a> {
11961222
// if the prev token is not a word, then this is not a valid sql
11971223
// word or number.
11981224
if ch == '.' && chars.peekable.clone().nth(1) == Some('_') {
1199-
if let Some(Token::Word(_)) = prev_token {
1225+
if let Some(PrevTokenKind::Word) = prev_token_kind {
12001226
chars.next();
12011227
return Ok(Some(Token::Period));
12021228
}
@@ -1240,7 +1266,7 @@ impl<'a> Tokenizer<'a> {
12401266
// we should yield the dot as a dedicated token so compound identifiers
12411267
// starting with digits can be parsed correctly.
12421268
if s == "." && self.dialect.supports_numeric_prefix() {
1243-
if let Some(Token::Word(_)) = prev_token {
1269+
if let Some(PrevTokenKind::Word) = prev_token_kind {
12441270
return Ok(Some(Token::Period));
12451271
}
12461272
}
@@ -1299,7 +1325,7 @@ impl<'a> Tokenizer<'a> {
12991325
s += word.as_str();
13001326
return Ok(Some(Token::make_word(s.as_str(), None)));
13011327
}
1302-
} else if prev_token == Some(&Token::Period) {
1328+
} else if prev_token_kind == Some(PrevTokenKind::Period) {
13031329
// If the previous token was a period, thus not belonging to a number,
13041330
// the value we have is part of an identifier.
13051331
return Ok(Some(Token::make_word(s.as_str(), None)));
@@ -2173,6 +2199,34 @@ impl<'a> Tokenizer<'a> {
21732199
}
21742200
}
21752201

2202+
/// Iterator over tokens.
2203+
pub struct TokenWithSpanIter<'a, 'b> {
2204+
state: State<'a>,
2205+
location: Location,
2206+
tokenizer: &'b mut Tokenizer<'a>,
2207+
prev_token_kind: Option<PrevTokenKind>,
2208+
}
2209+
2210+
impl Iterator for TokenWithSpanIter<'_, '_> {
2211+
type Item = Result<TokenWithSpan, TokenizerError>;
2212+
2213+
fn next(&mut self) -> Option<Self::Item> {
2214+
let token = match self
2215+
.tokenizer
2216+
.next_token(&mut self.state, self.prev_token_kind)
2217+
.transpose()?
2218+
{
2219+
Err(err) => return Some(Err(err)),
2220+
Ok(token) => token,
2221+
};
2222+
self.prev_token_kind = Some(PrevTokenKind::from(&token));
2223+
let span = self.location.span_to(self.state.location());
2224+
self.location = self.state.location();
2225+
let token = TokenWithSpan { token, span };
2226+
Some(Ok(token))
2227+
}
2228+
}
2229+
21762230
/// Read from `chars` until `predicate` returns `false` or EOF is hit.
21772231
/// Return the characters read as String, and keep the first non-matching
21782232
/// char available as `chars.next()`.
@@ -2451,6 +2505,39 @@ mod tests {
24512505
compare(expected, tokens);
24522506
}
24532507

2508+
#[test]
fn tokenize_iterator_map() {
    // Rewrite anonymous `?` placeholders to numbered `$N` placeholders in
    // a single pass over the token iterator.
    let sql = String::from("SELECT ?");
    let dialect = GenericDialect {};
    let mut param_num = 1;

    let tokens = Tokenizer::new(&dialect, &sql)
        .iter()
        .map(|token| {
            let token = token?;
            let mapped = match token.token {
                Token::Placeholder(p) if p == "?" => {
                    let numbered = format!("${}", param_num);
                    param_num += 1;
                    Token::Placeholder(numbered)
                }
                other => other,
            };
            Ok(mapped)
        })
        .collect::<Result<Vec<_>, TokenizerError>>()
        .unwrap();

    let expected = vec![
        Token::make_keyword("SELECT"),
        Token::Whitespace(Whitespace::Space),
        Token::Placeholder("$1".to_string()),
    ];

    compare(expected, tokens);
}
2540+
24542541
#[test]
24552542
fn tokenize_select_float() {
24562543
let sql = String::from("SELECT .1");

0 commit comments

Comments
 (0)