-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathlexer.jai
More file actions
377 lines (304 loc) · 12.3 KB
/
lexer.jai
File metadata and controls
377 lines (304 loc) · 12.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
/*
The Lexer implementation here is extremely basic.
On the one hand, it's probably good to keep it simple since the scripts themselves are very simple,
but on the other I wonder if I shouldn't try to beef it up a little and learn from the Jai Lexer.
Oh well, maybe I'll do that one day if there's a good reason to do so.
We don't really need a lot of context here, so you can basically only look at one token at a time.
There's no buffer of previous tokens to refer to.
The Lexer always looks one token ahead of the current token that is returned to the user.
get_token() will return 'next_token', and will lex another token to replace 'next_token'.
peek_token() will return 'next_token', but will not consume the token or lex another token.
The general usage pattern is just to peek_token() to look ahead, and then to commit consumption of the token use get_token() after we know we want to pick up the token.
I really only use get_token() on its own when I know for sure that the next token must be of a certain type.
There is also expect_token_type(), which is useful when one doesn't care about anything other than the token type,
for example when expecting a closing `)` or `]` token.
TODO: consider adding user-level lexing overrides, maybe using negative values for Token_Type to signify user tokens
then user could add new operators by creating user-level tokens and adding those to the operator table
*/
// Token classification.
// Single-character punctuation tokens use their ASCII codepoint as the enum value,
// which lets lex_next_token cast the raw input byte straight to a Token_Type
// (see `make_token(xx file[0], 1)`). Named multi-character tokens therefore start
// at 256, above any single byte value. EOF is 0 so a zero-initialized Token reads
// as end-of-input.
Token_Type :: enum {
EOF :: 0;
ERROR :: 1;
OPEN_PAREN :: #char "(";
CLOSE_PAREN :: #char ")";
OPEN_BRACE :: #char "{";
CLOSE_BRACE :: #char "}";
OPEN_BRACKET :: #char "[";
CLOSE_BRACKET :: #char "]";
QUESTION_MARK :: #char "?";
DOT :: #char ".";
COMMA :: #char ",";
COLON :: #char ":";
SEMICOLON :: #char ";";
// ASSIGN_EQUAL :: #char "=";
// BINARY_OR :: #char "|";
// BINARY_AND :: #char "&";
// BINARY_NOT :: #char "~";
// BINARY_XOR :: #char "^";
// LESS_THAN :: #char "<";
// GREATER_THAN :: #char ">";
// ADD :: #char "+";
// SUB :: #char "-";
// MUL :: #char "*";
// DIV :: #char "/";
// DOLLAR :: #char "$";
IDENTIFIER :: 256; // also produced by backtick-quoted identifiers, e.g. `name`
TRUE;
FALSE;
STRING;
NUMBER;
OPERATOR; // matched against script.operator_table (longest match wins)
// COMPARE_EQUAL;
// LESS_THAN_OR_EQUAL_TO;
// GREATER_THAN_OR_EQUAL_TO;
// LOGICAL_AND;
// LOGICAL_OR;
// PLUS_EQUALS;
// MINUS_EQUALS;
// AND_EQUALS;
// OR_EQUALS;
// XOR_EQUALS;
// DOUBLE_QUESTION_MARK;
ARROW; // ->
SPREAD; // ..
DIRECTIVE; // #name
IF;
WHILE;
FOR;
FOREACH;
}
// A single lexed token. `text` and `trivia` are slices into the source text,
// not copies, so they remain valid only as long as the source string does.
Token :: struct {
type: Token_Type;
text: string; // the token's source text; for ERROR tokens, the offending character(s)
location: Source_Code_Location; // position of the token's first character
trivia: string; // whitespace/comments preceding the token; for ERROR tokens, reused as the error message (see make_error_token)
error_type: Error_Type; // TODO: maybe unionize with normal result? Seems easier than making the lexer return some Result object...
}
/*
Lexer can be used either in a sort of active mode where tokens are only lexed as requested,
or in a pre-lexed mode where all tokens are lexed up front.
If input was pre-lexed, then Lexer will return sequential tokens from `tokens`,
otherwise, it will use `token_buffer` as a ring buffer for the next few tokens.
The interface is exactly the same either way, but we support both so that there are options
about how things get allocated, since you will probably only want to pre-lex an entire
input string if those tokens will be stored in the script's arena.
It's sort of like how we have an option to either allocate nodes in the script's arena or
using the temporary allocator, depending on the use case.
*/
Lexer :: struct {
// Scanner is declared elsewhere; from usage below it provides `file`
// (the remaining unlexed input), `location`, and advance() -- confirm.
using #as scanner: Scanner;
lexer_proc := lex_next_token; // procedure get_token uses to refill the lookahead buffer
TOKEN_BUFFER_COUNT :: 4;
token_buffer: [TOKEN_BUFFER_COUNT] Token; // ring buffer, indexed modulo TOKEN_BUFFER_COUNT
next_token_index: int; // ring-buffer slot holding the next token to hand out
}
// Resets the script's lexer to the start of `file` (the source text to lex)
// and pre-fills the lookahead ring buffer with the first few tokens.
init_lexer :: (using script: *Script, file := "", location := Source_Code_Location.{ "", 1, 1 }) {
lexer = .{};
lexer.file = file;
lexer.location = location;
// lex first few tokens into token buffer
for 0..lexer.TOKEN_BUFFER_COUNT-1 {
lexer.token_buffer[it] = lex_next_token(script);
// Stop filling on ERROR/EOF. Remaining slots stay zero-initialized, and a
// zeroed Token has type EOF (0), so peeking past the end is still benign.
if lexer.token_buffer[it].type == .ERROR
|| lexer.token_buffer[it].type == .EOF {
break;
}
}
}
// Consumes and returns the next token.
// ERROR and EOF are sticky: once one is reached the buffer is no longer
// refilled and every subsequent call returns that same token.
get_token :: (script: *Script) -> Token {
    using script.lexer;
    token := token_buffer[next_token_index];
    if token.type != .ERROR && token.type != .EOF {
        // Refill the slot we just consumed with a freshly lexed token,
        // then advance to the next slot in the ring.
        token_buffer[next_token_index] = lexer_proc(script);
        next_token_index = (next_token_index + 1) % TOKEN_BUFFER_COUNT;
    }
    return token;
}
// Peeks the token `peek_index` positions ahead without consuming anything.
// peek_index 0 is the token the next get_token() call would return.
peek_token :: (script: *Script, peek_index := 0) -> Token {
using script.lexer;
// NOTE(review): the upper bound here is TOKEN_BUFFER_COUNT-1 rather than
// TOKEN_BUFFER_COUNT, even though init_lexer/get_token keep all slots
// filled -- presumably a deliberate safety margin; confirm before relying
// on the deepest lookahead slot.
assert(peek_index >= 0 && peek_index < TOKEN_BUFFER_COUNT-1);
index := (next_token_index + peek_index) % TOKEN_BUFFER_COUNT;
return token_buffer[index];
}
// If the next token has the given type, consumes it and returns true;
// otherwise the token is only peeked and remains available, and we return false.
expect_token_type :: (script: *Script, type: Token_Type) -> bool {
    if peek_token(script).type != type  return false;
    get_token(script);  // commit: consume the matched token
    return true;
}
// Lexes and returns the next token from the remaining input (`file`).
// Recognition order: leading whitespace/comment trivia, EOF, two-character
// punctuation, single-character punctuation, numbers, operators (longest match
// against script.operator_table), identifiers/keywords, #directives, then
// strings and backtick-quoted identifiers. Anything else is an ERROR token.
lex_next_token :: (script: *Script) -> Token {
using script.lexer;
// Skipped whitespace/comments are kept and attached to the token as trivia.
trivia, ok := skip_whitespace_and_comments(*script.lexer);
if !ok return make_error_token(location, .UNEXPECTED_EOF, message = "in the middle of a comment");
if !file return .{ type = .EOF, location = location };
// we grab cursor location at top so that it points to start of token
token_location := script.lexer.location;
// convenience macro for short tokens
make_token :: (type: Token_Type, len: int) -> Token #expand {
// Token text is sliced from the input before the deferred advance runs.
defer advance(*script.lexer, len);
return .{
type = type,
text = slice(file, 0, len),
location = token_location,
trivia = trivia,
};
}
// 2-character tokens (excluding operators)
if file.count >= 2 {
if slice(file, 0, 2) == {
case ".."; return make_token(.SPREAD, 2);
case "->"; return make_token(.ARROW, 2);
}
}
// single-character tokens (excluding operators)
// These Token_Type members reuse their ASCII value, so the byte casts directly.
if is_any(file[0], "()[]{},.:;?") {
return make_token(xx file[0], 1);
}
// NOTE: this is being done before operators, presumably because a number may start with a minus sign
// this is probably fine, but be aware of this while doing operator refactoring
token_text := try_lex_number(*script.lexer);
if token_text return .{ .NUMBER, token_text, token_location, trivia, .NO_ERROR };
// operator tokens
{
// Longest-match scan over the whole table so that a longer operator wins
// over any shorter operator that is a prefix of it.
best_match, best_match_len := -1;
for script.operator_table {
if it.token_text.count > best_match_len
&& begins_with(file, it.token_text) {
best_match = it_index;
best_match_len = it.token_text.count;
}
}
if best_match >= 0 {
_operator := *script.operator_table[best_match];
return make_token(.OPERATOR, _operator.token_text.count);
}
}
// identifiers and keywords
token_text = try_lex_identifier(*script.lexer);
if token_text {
type := Token_Type.IDENTIFIER;
// Keywords are ordinary identifiers promoted by exact text match.
if token_text == {
case "true"; type = .TRUE;
case "false"; type = .FALSE;
case "if"; type = .IF;
case "for"; type = .FOR;
case "foreach"; type = .FOREACH;
case "while"; type = .WHILE;
}
return .{ type, token_text, token_location, trivia, .NO_ERROR };
}
// TODO: perhaps this should not be a token in itself, and should be handled on a higher level by the parser
// parse a directive
// NOTE(review): if "#" is the last character of the input, try_lex_identifier
// is called with an empty `file` -- confirm it tolerates that.
if file[0] == #char "#" {
advance(*script.lexer);
token_text = try_lex_identifier(*script.lexer);
if token_text return .{ .DIRECTIVE, token_text, token_location, trivia, .NO_ERROR };
return make_error_token(token_location, .UNEXPECTED_CHARACTER, string.{1,file.data}, "while attempting to parse a directive name.");
}
// parse string or backticked identifier
if file[0] == #char "\"" || file[0] == #char "`" {
quote_char := file[0];
if !advance(*script.lexer) return make_error_token(token_location, .UNEXPECTED_EOF, message = "while parsing string.");
// Token text is the raw span between the quotes; escape sequences are left
// unexpanded in the text (presumably decoded at a later stage -- confirm).
token_text := string.{ 0, file.data };
while file[0] != quote_char {
if file[0] == #char "\\" {
_, ok := parse_escape_sequence(*script.lexer);
if !ok return make_error_token(token_location, .UNEXPECTED_CHARACTER, "invalid escape sequence encountered while parsing string");
} else {
if !advance(*script.lexer) return make_error_token(token_location, .UNEXPECTED_EOF, message = "while parsing string.");
}
}
token_text.count = script.lexer.file.data - token_text.data;
advance(*script.lexer); // consume the closing quote
is_identifier := quote_char == #char "`";
return .{ ifx is_identifier then .IDENTIFIER else .STRING, token_text, token_location, trivia, .NO_ERROR };
}
return make_error_token(token_location, .UNEXPECTED_CHARACTER, string.{1,file.data});
}
// Builds an ERROR token at `location`.
// `text` typically holds the offending source character(s).
// NOTE: we use the trivia string to attach some additional context to error tokens,
// so `message` rides in the token's `trivia` field.
make_error_token :: (location: Source_Code_Location, error_type: Error_Type, text := "", message := "") -> Token {
assert(error_type & .CLASS_MASK == .LEXER_ERROR); // only lexer-class errors may be produced here
return .{ type = .ERROR, location = location, error_type = error_type, text = text, trivia = message };
}
// An identifier must begin with a letter or underscore...
begins_identifier :: (char: u8) -> bool { return is_alpha(char) || char == #char "_"; }
// ...and continues with alphanumeric characters.
// NOTE(review): if is_alnum (defined elsewhere) does not accept '_', then
// "foo_bar" would lex as two identifiers -- confirm is_alnum's definition.
continues_identifier :: is_alnum;
// Attempts to lex an identifier at the current position.
// Returns the identifier text (a slice of the input) on success, or "" if the
// input is exhausted or the current character cannot begin an identifier.
// Note: the identifier is consumed as far as it goes, even if what follows
// makes it invalid in context -- this does not roll back on its own.
try_lex_identifier :: (using lexer: *Lexer) -> string {
    // Guard: callers may reach us at end of input (e.g. lex_next_token calls
    // this right after consuming "#", which may be the final character), and
    // file[0] on an empty string would read out of bounds.
    if !file  return "";
    if !begins_identifier(file[0])  return "";
    str := string.{ 1, *file[0] };
    advance(lexer);
    while file && continues_identifier(file[0]) {
        str.count += 1;
        advance(lexer);
    }
    return str;
}
// True when `str` is non-empty, begins with a legal identifier-start
// character, and every character is a legal identifier character.
is_legal_identifier :: (str: string) -> bool {
    if !str                         return false;
    if !begins_identifier(str[0])   return false;
    for char: str {
        if !continues_identifier(char)  return false;
    }
    return true;
}
// Pre-lexes the entire input into a dynamic array of tokens.
// Returns the array plus a success flag; on lexing failure everything
// collected so far is freed and an empty array is returned.
tokenize :: (file: string) -> ([..] Token, bool) {
lexer: Lexer;
// NOTE(review): init_lexer and get_token as defined above take a *Script,
// but here they are called with *Lexer / Lexer. Presumably overloads exist
// elsewhere in the module -- confirm this path compiles and is in use.
init_lexer(*lexer, file);
success := true;
tokens: [..] Token;
// On failure, release the collected tokens and zero the array header so the
// caller cannot accidentally use the freed buffer.
defer if !success {
array_free(tokens);
memset(*tokens, 0, size_of(type_of(tokens)));
}
while lexer.file {
token := get_token(lexer);
array_add(*tokens, token);
if token.type == {
case .EOF; break;
case .ERROR; success = false; break;
}
}
return tokens, success;
}
// cycles between skipping whitespace and comments until next character is neither
// Returns the skipped span as `trivia` (a slice of the input); ok is false only
// when the input ends inside an unterminated /* block comment.
skip_whitespace_and_comments :: (using lexer: *Lexer) -> (trivia: string, ok: bool) {
if file.count == 0 return "", true;
trivia := string.{ 0, file.data };
ok := true;
while loop := file.count > 0 {
// plain whitespace
while is_whitespace(file[0]) {
if !advance(lexer) break;
}
// line comment: runs to end of line; hitting EOF inside one is fine
if begins_with(file, "//") {
advance(lexer, 2);
// NOTE(review): if the input ends immediately after "//", this condition
// reads file[0] on an empty string -- confirm Scanner bounds behavior.
while file[0] != #char "\n" {
if !advance(lexer) break loop;
}
advance(lexer); // skip the newline itself
}
// block comment: EOF before the closing "*/" is an error
else if begins_with(file, "/*") {
advance(lexer, 2);
while !begins_with(file, "*/") {
if !advance(lexer) {
ok = false;
break loop;
}
}
advance(lexer, 2); // skip the closing "*/"
}
else break;
}
// trivia covers everything consumed since entry
trivia.count = file.data - trivia.data;
return trivia, ok;
}
// True for the four whitespace characters this lexer recognizes:
// space, horizontal tab, carriage return, and line feed.
is_whitespace :: inline (char: u8) -> bool {
    if char == #char " "   return true;
    if char == #char "\t"  return true;
    if char == #char "\r"  return true;
    if char == #char "\n"  return true;
    return false;
}