@@ -812,6 +812,26 @@ pub struct Tokenizer<'a> {
812812 unescape : bool ,
813813}
814814
/// Coarse classification of the previously produced token.
///
/// Passed into [`Tokenizer::next_token`] because in some situations
/// tokenization is context dependent. A separate small `Copy` enum is used
/// (rather than the previous [`Token`] itself) so that [`TokenWithSpanIter`]
/// does not need to clone the previous token during iteration.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum PrevTokenKind {
    /// The previous token was a `Token::Word`.
    Word,
    /// The previous token was a `Token::Period` (`.`).
    Period,
    /// Any other kind of token.
    Other,
}
824+
825+ impl From < & Token > for PrevTokenKind {
826+ fn from ( value : & Token ) -> Self {
827+ match value {
828+ Token :: Word ( _) => Self :: Word ,
829+ Token :: Period => Self :: Period ,
830+ _ => Self :: Other ,
831+ }
832+ }
833+ }
834+
815835impl < ' a > Tokenizer < ' a > {
816836 /// Create a new SQL tokenizer for the specified SQL statement
817837 ///
@@ -872,6 +892,23 @@ impl<'a> Tokenizer<'a> {
872892 self
873893 }
874894
895+ /// Return an iterator over tokens
896+ pub fn iter ( & mut self ) -> TokenWithSpanIter < ' a , ' _ > {
897+ let state = State {
898+ peekable : self . query . chars ( ) . peekable ( ) ,
899+ line : 1 ,
900+ col : 1 ,
901+ } ;
902+
903+ let location = state. location ( ) ;
904+ TokenWithSpanIter {
905+ state,
906+ location,
907+ tokenizer : self ,
908+ prev_token_kind : None ,
909+ }
910+ }
911+
875912 /// Tokenize the statement and produce a vector of tokens
876913 pub fn tokenize ( & mut self ) -> Result < Vec < Token > , TokenizerError > {
877914 let twl = self . tokenize_with_location ( ) ?;
@@ -891,19 +928,8 @@ impl<'a> Tokenizer<'a> {
891928 & mut self ,
892929 buf : & mut Vec < TokenWithSpan > ,
893930 ) -> Result < ( ) , TokenizerError > {
894- let mut state = State {
895- peekable : self . query . chars ( ) . peekable ( ) ,
896- line : 1 ,
897- col : 1 ,
898- } ;
899-
900- let mut location = state. location ( ) ;
901- while let Some ( token) = self . next_token ( & mut state, buf. last ( ) . map ( |t| & t. token ) ) ? {
902- let span = location. span_to ( state. location ( ) ) ;
903-
904- buf. push ( TokenWithSpan { token, span } ) ;
905-
906- location = state. location ( ) ;
931+ for token in self . iter ( ) {
932+ buf. push ( token?) ;
907933 }
908934 Ok ( ( ) )
909935 }
@@ -938,7 +964,7 @@ impl<'a> Tokenizer<'a> {
938964 fn next_token (
939965 & self ,
940966 chars : & mut State ,
941- prev_token : Option < & Token > ,
967+ prev_token_kind : Option < PrevTokenKind > ,
942968 ) -> Result < Option < Token > , TokenizerError > {
943969 match chars. peek ( ) {
944970 Some ( & ch) => match ch {
@@ -1196,7 +1222,7 @@ impl<'a> Tokenizer<'a> {
11961222 // if the prev token is not a word, then this is not a valid sql
11971223 // word or number.
11981224 if ch == '.' && chars. peekable . clone ( ) . nth ( 1 ) == Some ( '_' ) {
1199- if let Some ( Token :: Word ( _ ) ) = prev_token {
1225+ if let Some ( PrevTokenKind :: Word ) = prev_token_kind {
12001226 chars. next ( ) ;
12011227 return Ok ( Some ( Token :: Period ) ) ;
12021228 }
@@ -1240,7 +1266,7 @@ impl<'a> Tokenizer<'a> {
12401266 // we should yield the dot as a dedicated token so compound identifiers
12411267 // starting with digits can be parsed correctly.
12421268 if s == "." && self . dialect . supports_numeric_prefix ( ) {
1243- if let Some ( Token :: Word ( _ ) ) = prev_token {
1269+ if let Some ( PrevTokenKind :: Word ) = prev_token_kind {
12441270 return Ok ( Some ( Token :: Period ) ) ;
12451271 }
12461272 }
@@ -1299,7 +1325,7 @@ impl<'a> Tokenizer<'a> {
12991325 s += word. as_str ( ) ;
13001326 return Ok ( Some ( Token :: make_word ( s. as_str ( ) , None ) ) ) ;
13011327 }
1302- } else if prev_token == Some ( & Token :: Period ) {
1328+ } else if prev_token_kind == Some ( PrevTokenKind :: Period ) {
13031329 // If the previous token was a period, thus not belonging to a number,
13041330 // the value we have is part of an identifier.
13051331 return Ok ( Some ( Token :: make_word ( s. as_str ( ) , None ) ) ) ;
@@ -2173,6 +2199,34 @@ impl<'a> Tokenizer<'a> {
21732199 }
21742200}
21752201
2202+ /// Iterator over tokens.
2203+ pub struct TokenWithSpanIter < ' a , ' b > {
2204+ state : State < ' a > ,
2205+ location : Location ,
2206+ tokenizer : & ' b mut Tokenizer < ' a > ,
2207+ prev_token_kind : Option < PrevTokenKind > ,
2208+ }
2209+
2210+ impl Iterator for TokenWithSpanIter < ' _ , ' _ > {
2211+ type Item = Result < TokenWithSpan , TokenizerError > ;
2212+
2213+ fn next ( & mut self ) -> Option < Self :: Item > {
2214+ let token = match self
2215+ . tokenizer
2216+ . next_token ( & mut self . state , self . prev_token_kind )
2217+ . transpose ( ) ?
2218+ {
2219+ Err ( err) => return Some ( Err ( err) ) ,
2220+ Ok ( token) => token,
2221+ } ;
2222+ self . prev_token_kind = Some ( PrevTokenKind :: from ( & token) ) ;
2223+ let span = self . location . span_to ( self . state . location ( ) ) ;
2224+ self . location = self . state . location ( ) ;
2225+ let token = TokenWithSpan { token, span } ;
2226+ Some ( Ok ( token) )
2227+ }
2228+ }
2229+
21762230/// Read from `chars` until `predicate` returns `false` or EOF is hit.
21772231/// Return the characters read as String, and keep the first non-matching
21782232/// char available as `chars.next()`.
@@ -2451,6 +2505,39 @@ mod tests {
24512505 compare ( expected, tokens) ;
24522506 }
24532507
2508+ #[ test]
2509+ fn tokenize_iterator_map ( ) {
2510+ let sql = String :: from ( "SELECT ?" ) ;
2511+ let dialect = GenericDialect { } ;
2512+ let mut param_num = 1 ;
2513+
2514+ let tokens = Tokenizer :: new ( & dialect, & sql)
2515+ . iter ( )
2516+ . map ( |token| {
2517+ let token = token?;
2518+ Ok ( match token. token {
2519+ Token :: Placeholder ( n) => Token :: Placeholder ( if n == "?" {
2520+ let ret = format ! ( "${}" , param_num) ;
2521+ param_num += 1 ;
2522+ ret
2523+ } else {
2524+ n
2525+ } ) ,
2526+ _ => token. token ,
2527+ } )
2528+ } )
2529+ . collect :: < Result < Vec < _ > , TokenizerError > > ( )
2530+ . unwrap ( ) ;
2531+
2532+ let expected = vec ! [
2533+ Token :: make_keyword( "SELECT" ) ,
2534+ Token :: Whitespace ( Whitespace :: Space ) ,
2535+ Token :: Placeholder ( "$1" . to_string( ) ) ,
2536+ ] ;
2537+
2538+ compare ( expected, tokens) ;
2539+ }
2540+
24542541 #[ test]
24552542 fn tokenize_select_float ( ) {
24562543 let sql = String :: from ( "SELECT .1" ) ;
0 commit comments