From 160d64567c43307e6e0bb5f094b6c2efb5d29c75 Mon Sep 17 00:00:00 2001 From: Charlie Gordon Date: Wed, 3 Dec 2025 22:22:59 +0100 Subject: [PATCH] c2cat: simplify parsing and output, check consistency * output tokens with the original spelling * use Style enum and select on token kind * add `--color` and `--nocolor` to force/disable color in output * add `-c` and `--check` to suppress output and just check syntax * show syntax errors to stderr * command line options accepted before and after filenames * non zero exit status on errors --- tools/c2cat.c2 | 503 ++++++++++++++++++++++++++++++------------------- 1 file changed, 310 insertions(+), 193 deletions(-) diff --git a/tools/c2cat.c2 b/tools/c2cat.c2 index 796f197cd..37430d2fc 100644 --- a/tools/c2cat.c2 +++ b/tools/c2cat.c2 @@ -20,8 +20,8 @@ import attr local; import color local; import file_utils; import keywords; +import src_loc local; import string_buffer; -import string_list; import string_map; import string_pool; import number_radix; @@ -31,34 +31,109 @@ import ctype; import stdio local; import stdlib local; import string local; +import unistd; -Color col_keyword = Byellow; -Color col_type = Green; -Color col_feature = Blue; -Color col_attr = Blue; -Color col_identifier = Normal; -Color col_integer = Magenta; -Color col_float = Magenta; -Color col_charconst = Magenta; -Color col_string = Magenta; -Color col_comment = Bcyan; -Color col_invalid = Bred; -Color col_error = Bred; -Color col_normal = Normal; +type Style enum u8 (const char* const name @(unused), const Color default_color) { + Normal : { "normal", Normal }, + Identifier : { "identifier", None }, + Integer : { "integer", Magenta }, + Float : { "float", Magenta }, + Charconst : { "charconst", Magenta }, + String : { "string", Magenta }, + Operator : { "operator", None }, + Type : { "type", Green }, + Keyword : { "keyword", Byellow }, + Function : { "function", White }, + Attr : { "attr", Blue }, + Feature : { "feature", Blue }, + Invalid : { "invalid", Bred }, + Comment : { "comment", Bcyan }, + Warning : { "warning", Bred }, + Error : { "error", Bred }, +} fn void usage(const char* me) { - printf("Usage: %s file.c2 ...\n", me); - exit(1); + printf("Usage: %s [options] file.c2 ...\n" + " -c --check only check token syntax, no content output\n" + " --color force colorized output\n" + " --nocolor disable colorized output\n" + , me); } type C2cat struct { string_pool.Pool* pool; string_buffer.Buf* out; + string_buffer.Buf* out2; c2_tokenizer.Tokenizer* tokenizer; AttrRegistry attr_registry; + const char* filename; const char* input; u32 offset; - u32 in_attributes; // 0 no, 1 seen @, 2 (, ) -> 0 + u32 length; + u8 in_attributes; // 0 no, 1 seen @, 2 (, ) -> 0 + bool check_only; + bool has_error; + c2_tokenizer.ErrorLevel error_level; + Style[Kind] token_style; + Color[Style] style_color; +} + +fn bool C2cat.init_colors(C2cat* ctx) { + for (Kind k = Kind.min; k <= Kind.max; k++) { + Style style = Normal; + switch (k) { + case None: + style = Normal; + break; + case Identifier: + style = Identifier; + break; + case IntegerLiteral: + style = Integer; + break; + case FloatLiteral: + style = Float; + break; + case CharLiteral: + style = Charconst; + break; + case StringLiteral: + style = String; + break; + case LParen ... GreaterGreaterEqual: + style = Operator; + break; + case KW_bool ... KW_unsigned: + style = Type; + break; + case KW_typedef ... KW_while: + if (k.isQualifier()) style = Type; + else style = Keyword; + break; + case Feat_if ... Feat_warning: + style = Feature; + break; + case Invalid: + style = Invalid; + break; + case LineComment: + case BlockComment: + style = Comment; + break; + case Eof: + style = Normal; + break; + case Error: + style = Error; + break; + } + ctx.token_style[k] = style; + } + for (Style s = Style.min; s <= Style.max; s++) { + ctx.style_color[s] = s.default_color; + } + // TODO: customize colors from environment variable, config file or command line argument + return true; } fn bool C2cat.is_attribute(C2cat* ctx, u32 name_idx) { @@ -81,165 +156,131 @@ fn void C2cat.update_state(C2cat* ctx, const Token* tok) { fn void C2cat.print_token(C2cat* ctx, const Token* tok) { string_buffer.Buf* out = ctx.out; + u32 pos = tok.loc - ctx.tokenizer.loc_start; // token start offset + u32 tok_len = tok.len; // token length in bytes - if (ctx.offset != 0) { - // copy stuff from file to out (from end of last token to start of current) - if (tok.done) return; - if (ctx.offset <= tok.loc) { - u32 len = tok.loc - ctx.offset; - if (len) out.add2(ctx.input + ctx.offset, len); - } else { - out.add1('\n'); - out.color(col_error); - out.print("error: offset=%d tok.loc=%d", ctx.offset, tok.loc); - out.color(col_normal); - out.add1('\n'); + if (pos < ctx.offset) { + // token starts before end of previous token, this is an error + ctx.print_error(Error, tok.loc, "invalid token position (offset=%d pos=%d)", ctx.offset, pos); + ctx.offset = pos; + } + if (pos > ctx.offset) { + if (!ctx.check_only) { + // copy stuff from file to out (from end of last token to start of current) + out.add2(ctx.input + ctx.offset, pos - ctx.offset); } + // check for whitespace only + for (u32 i = ctx.offset; i < pos; i++) { + if (!ctype.isspace(ctx.input[i])) { + ctx.print_error(Error, ctx.tokenizer.loc_start + i, + "non space character '\\x%2x' between tokens", + ctx.input[i] & 0xFF); + break; + } + } + ctx.offset = pos; } - if (tok.kind >= LParen && tok.kind < KW_bool) { - const char* str = tok.kind.name; - out.add(str); - ctx.offset = tok.loc + (u32)strlen(str); - return; - } - if (tok.kind.isTypeKeyword()) { - const char* str = tok.kind.name; - out.color(col_type); - out.add(str); - out.color(col_normal); - ctx.offset = tok.loc + (u32)strlen(str); - return; + Style s = ctx.token_style[tok.kind]; + if (tok.kind == Kind.Identifier) { + if (ctx.in_attributes) { + s = ctx.is_attribute(tok.name_idx) ? Attr : Invalid; + } else + if (ctx.input[ctx.offset + tok_len] == '(') { + s = Function; + } } - if (tok.kind.isQualifier()) { - const char* str = tok.kind.name; - out.color(col_type); - out.add(str); - out.color(col_normal); - ctx.offset = tok.loc + (u32)strlen(str); - return; + + if (!ctx.check_only) { + if (s && ctx.style_color[s]) out.color(ctx.style_color[s]); + out.add2(ctx.input + ctx.offset, tok_len); + if (s && ctx.style_color[s]) out.color(ctx.style_color[Normal]); } - if (tok.kind.isKeyword()) { - const char* str = tok.kind.name; - out.color(col_keyword); - out.add(str); - out.color(col_normal); - ctx.offset = tok.loc + (u32)strlen(str); - return; + ctx.offset += tok_len; +} + +type Location struct { + u32 line, column, line_start, line_length; +} + +fn bool Location.init(Location* loc, const char* src, u32 offset) { + u32 line = 0; + u32 line_start = 0; + u32 i; + for (i = 0; src[i]; i++) { + if (src[i] == '\n') { + if (i >= offset) + break; + line++; + line_start = i + 1; + } } - if (tok.kind >= Feat_if && tok.kind <= Feat_endif) { - const char* str = tok.kind.name; - out.color(col_feature); - out.add(str); - out.color(col_normal); - ctx.offset = tok.loc + (u32)strlen(str); - return; + if (i >= offset) { + loc.line = line + 1; + loc.column = offset - line_start + 1; + loc.line_start = line_start; + loc.line_length = i - line_start; + return true; } - switch (tok.kind) { - case Identifier: - const char* str = ctx.pool.idx2str(tok.name_idx); - Color col = col_identifier; + loc.line = 0; + loc.column = 0; + loc.line_start = 0; + loc.line_length = 0; + return false; +} - if (ctx.in_attributes) - col = ctx.is_attribute(tok.name_idx) ? col_attr : col_invalid; - out.color(col); - out.add(str); - out.color(col_normal); - ctx.offset = tok.loc + (u32)strlen(str); - return; - case IntegerLiteral: - out.color(col_integer); - char[64] tmp; - i32 len; - switch (tok.getRadix()) { - case Hex: - len = snprintf(tmp, elemsof(tmp), "0x%x", tok.int_value); - break; - default: - len = snprintf(tmp, elemsof(tmp), "%d", tok.int_value); - break; - } - out.add(tmp); - ctx.offset = tok.loc + len; - break; - case FloatLiteral: - out.color(col_float); - char[64] tmp; - i32 len; - switch (tok.getRadix()) { - case Hex: - len = snprintf(tmp, elemsof(tmp), "%a", tok.float_value); - break; - default: - len = snprintf(tmp, elemsof(tmp), "%#.16g", tok.float_value); - break; - } - out.add(tmp); - if (tok.suffix_F) out.add1('F'); - ctx.offset = tok.loc + len; - break; - case CharLiteral: - out.color(col_charconst); - char[64] tmp; - i32 len = 0; - switch (tok.getRadix()) { - case Hex: - len = snprintf(tmp, elemsof(tmp), "'\\x%02x'", tok.char_value); - break; - case Octal: - len = snprintf(tmp, elemsof(tmp), "'\\%o'", tok.char_value); - break; - default: - if (ctype.isprint(tok.char_value)) { - len = snprintf(tmp, elemsof(tmp), "'%c'", tok.char_value); - } else { - tmp[0] = 0; - // TODO print nicely (eg \n etc) - } - break; - } - out.add(tmp); - ctx.offset = tok.loc + len; - break; - case StringLiteral: - out.color(col_string); - u32 len = out.encodeBytes(ctx.pool.idx2str(tok.text_idx), tok.text_len, '"'); - ctx.offset = tok.loc + len; - break; - case LineComment: - out.color(col_comment); - const char* str = ctx.pool.idx2str(tok.text_idx); - out.print("//%s", str); - ctx.offset = tok.loc + (u32)strlen(str) + 2; - break; - case BlockComment: - out.color(col_comment); - const char* str = ctx.pool.idx2str(tok.text_idx); - out.print("/*%s*/", str); - ctx.offset = tok.loc + (u32)strlen(str) + 4; - break; - case Invalid: - out.color(col_invalid); - out.print("%s", tok.invalid); - ctx.offset = tok.loc + (u32)strlen(tok.invalid); +fn void C2cat.print_error(C2cat* ctx, + c2_tokenizer.ErrorLevel level, + SrcLoc loc, + const char* format @(printf_format), ...) +{ + string_buffer.Buf* out2 = ctx.out2; + if (!ctx.check_only && !ctx.out.endsWith('\n')) out2.add1('\n'); + out2.color(ctx.style_color[Error]); + Location sloc.init(ctx.input, loc - ctx.tokenizer.loc_start); + if (sloc.line) { + out2.print("%s:%d:%d: ", ctx.filename, sloc.line, sloc.column); + } else { + out2.print("%s: ", ctx.filename); + } + switch (ctx.error_level = level) { + case Note: + out2.add("note: "); break; - case Error: - out.add1('\n'); - out.color(col_error); - out.print("error: %s", ctx.tokenizer.error_msg); - out.color(col_normal); - out.add1('\n'); + case Warning: + out2.add("warning: "); break; default: - out.color(col_error); - out.print("token %s\n", tok.kind.name); - ctx.offset = tok.loc + 1; + out2.add("error: "); break; } - out.color(col_normal); + va_list args; + va_start(args, format); + out2.vprintf(format, args); + out2.color(ctx.style_color[Normal]); + out2.add1('\n'); + va_end(args); + if (sloc.line) { + out2.print("%.*s\n", (i32)sloc.line_length, ctx.input + sloc.line_start); + out2.print("%*s^\n", (i32)sloc.column - 1, ""); + } + if (!ctx.check_only) { + fputs(ctx.out.data(), stdout); + fflush(stdout); + ctx.out.clear(); + } + fputs(ctx.out2.data(), stderr); + fflush(stderr); + ctx.out2.clear(); + ctx.has_error = true; } -public fn i32 c2cat(const char* filename, bool use_color) +fn void C2cat.on_tokenizer_error(void* arg, c2_tokenizer.ErrorLevel level, SrcLoc loc, const char* msg) { + C2cat* ctx = arg; + ctx.print_error(level, loc, "%s", msg); +} + +fn i32 c2cat_file(const char* filename, bool use_color, bool use_color2, bool check_only) { file_utils.File file.init(filename); if (!file.load()) { @@ -247,62 +288,138 @@ public fn i32 c2cat(const char* filename, bool use_color) return -1; } - C2cat ctx = { } - ctx.pool = string_pool.create(16*1024, 1024); - ctx.out = string_buffer.create(16*1024, use_color, 2); - ctx.offset = 0; - ctx.input = file.data(); - ctx.in_attributes = 0; + string_pool.Pool* pool = string_pool.create(16*1024, 1024); + string_buffer.Buf* out = string_buffer.create(16*1024, use_color, 2); + string_buffer.Buf* out2 = string_buffer.create(1024, use_color2, 2); + string_buffer.Buf* buf = string_buffer.create(1024, false, 0); + const char* input = file.data(); u32 file_size = file.data_size(); + keywords.Info kwinfo.init(pool); + string_map.Map features.init(pool); - string_map.Map features.init(ctx.pool); - string_buffer.Buf* buf = string_buffer.create(1024, false, 0); - keywords.Info kwinfo.init(ctx.pool); - c2_tokenizer.Tokenizer tokenizer.init(ctx.pool, buf, ctx.input, 0, &kwinfo, &features, nil, nil, true); + C2cat ctx = { + .pool = pool, + .out = out, + .out2 = out2, + .filename = filename, + .input = input, + .offset = 0, + .length = file_size, + .in_attributes = 0, + .check_only = check_only, + } + + ctx.init_colors(); + + c2_tokenizer.Tokenizer tokenizer.init(pool, buf, input, 1, &kwinfo, &features, + C2cat.on_tokenizer_error, &ctx, true); ctx.tokenizer = &tokenizer; ctx.attr_registry.init(ctx.pool); - Token tok; - tok.init(); + Token tok.init(); while (!tok.done) { tokenizer.lex(&tok); //printf("%4d %s\n", tok.loc, tok.kind.name); - + if (ctx.error_level) { + if (ctx.error_level == FatalError) goto done; + ctx.error_level = (c2_tokenizer.ErrorLevel)0; + } ctx.update_state(&tok); - ctx.print_token(&tok); } - if (ctx.offset <= file_size) { - u32 len = file_size - ctx.offset; - if (len) ctx.out.add2(ctx.input + ctx.offset, len); - } else { - ctx.out.add1('\n'); - ctx.out.color(col_error); - ctx.out.print("error: offset=%d file_size=%d", ctx.offset, file_size); - ctx.out.color(col_normal); - ctx.out.add1('\n'); + if (ctx.offset < ctx.length) { + // EOF token occurs before end of file, this is an error + ctx.print_error(Error, tok.loc, "invalid EOF token position (offset=%d length=%d)", + ctx.offset, ctx.length); + } + if (ctx.offset > ctx.length) { + if (!ctx.check_only) { + u32 len = ctx.length - ctx.offset; + out.color(ctx.style_color[Error]); + out.add2(ctx.input + ctx.offset, len); + out.color(ctx.style_color[Normal]); + } + // EOF token should have ctx.offset == ctx.length + ctx.print_error(Warning, tok.loc, "trailing bytes after end of file token"); + } + if (!ctx.check_only) { + fputs(out.data(), stdout); + fflush(stdout); } - fputs(ctx.out.data(), stdout); - fflush(stdout); - ctx.pool.free(); - ctx.out.free(); +done: + out.free(); + out2.free(); buf.free(); + pool.free(); file.close(); - return 0; + return ctx.has_error; } public fn i32 main(i32 argc, const char** argv) { bool use_color = color.useColor(); - if (argc == 1) usage(argv[0]); + bool use_color2 = unistd.isatty(2); + bool check_only = false; + i32 status = 0; + + // parse all command line options and count files + u32 nfiles = 0; + bool done_options = false; for (i32 i = 1; i < argc; i++) { - if (argc > 2) - printf("==> %s <==\n", argv[i]); - c2cat(argv[i], use_color); + const char* arg = argv[i]; + if (*arg == '-' && !done_options) { + switch (arg) { + case "--": + done_options = true; + break; + case "-c": + case "--check": + check_only = true; + break; + case "--color": + use_color = true; + break; + case "--nocolor": + use_color = false; + use_color2 = false; + break; + case "-?": + case "-h": + case "--help": + usage(argv[0]); + exit(EXIT_SUCCESS); + default: + fprintf(stderr, "c2cat: unknown option %s\n", arg); + exit(EXIT_FAILURE); + } + } else { + nfiles++; + } + } + + if (!nfiles) { + usage(argv[0]); + exit(EXIT_FAILURE); + } + + done_options = false; + u32 filenum = 0; + for (i32 i = 1; i < argc; i++) { + const char* arg = argv[i]; + if (*arg == '-' && !done_options) { + done_options = !strcmp(arg, "--"); + continue; + } + if (nfiles > 1 && !check_only) { + if (filenum++) printf("\n"); + printf("==> %s <==\n", arg); + } + if (c2cat_file(arg, use_color, use_color2, check_only)) + status = EXIT_FAILURE; } - return 0; + return status; }