From 92d6fbe35efde8232804defda94089684f77337e Mon Sep 17 00:00:00 2001 From: xnacly <47723417+xNaCly@users.noreply.github.com> Date: Wed, 18 Feb 2026 14:03:36 +0100 Subject: [PATCH 01/15] git: add pprof to gitignore --- .gitignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 57edc61..1a4e801 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,3 @@ test/*.json test/test -test/*.pprof +*.pprof From 1b952152a49d04587239b9e91022cacbb7000d3b Mon Sep 17 00:00:00 2001 From: xnacly <47723417+xNaCly@users.noreply.github.com> Date: Wed, 18 Feb 2026 14:03:57 +0100 Subject: [PATCH 02/15] go: update to g1.26 --- go.mod | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/go.mod b/go.mod index 3d85827..2724904 100644 --- a/go.mod +++ b/go.mod @@ -1,6 +1,6 @@ module github.com/xnacly/libjson -go 1.23.0 +go 1.26.0 require github.com/stretchr/testify v1.9.0 From c669f22b7d82f5f5dd8e9d0258544364d13cbfc7 Mon Sep 17 00:00:00 2001 From: xnacly <47723417+xNaCly@users.noreply.github.com> Date: Wed, 18 Feb 2026 14:26:15 +0100 Subject: [PATCH 03/15] cmd/lj: rework the cli with options and flags --- cmd/lj.go | 61 ++++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 54 insertions(+), 7 deletions(-) diff --git a/cmd/lj.go b/cmd/lj.go index f87b538..7922dc8 100644 --- a/cmd/lj.go +++ b/cmd/lj.go @@ -1,9 +1,14 @@ package main import ( + "encoding/json" + "flag" "fmt" "log" "os" + "path/filepath" + "runtime/debug" + "runtime/pprof" "github.com/xnacly/libjson" ) @@ -16,17 +21,59 @@ func Must[T any](t T, err error) T { } func main() { - args := os.Args + noGc := flag.Bool("nogc", false, "disable the go garbage collector") + useLibjson := flag.Bool("libjson", true, "use libjson, if false use encoding/json") + usePprof := flag.Bool("pprof", false, "use pprof cpu tracing") + query := flag.String("q", ".", "query the parsed json") + silent := flag.Bool("s", false, "no stdoutput") + flag.Parse() + + if *noGc { + debug.SetGCPercent(-1) + } + + args := flag.Args() + + var filePath string var file *os.File if info, err := os.Stdin.Stat(); err != nil || info.Mode()&os.ModeCharDevice != 0 { // we are in a pipe - if len(args) == 1 { - log.Fatalln("Wanted a file as first argument, got nothing, exiting") + if len(args) == 0 { + log.Fatalln("Wanted a file as an argument, got nothing, exiting") } - file = Must(os.Open(args[1])) + filePath = args[0] + file = Must(os.Open(filePath)) } else { file = os.Stdin + filePath = "stdin" + } + + if *usePprof { + f, err := os.Create(filepath.Base(filePath) + ".pprof") + if err != nil { + panic(err) + } + pprof.StartCPUProfile(f) + defer pprof.StopCPUProfile() + } + + if *useLibjson { + out := Must(libjson.NewReader(file)) + if !*silent { + fmt.Printf("%+#v\n", Must(libjson.Get[any](&out, *query))) + } + } else { + if *query != "." { + panic("With -libjson=false, there is no support for querying the json") + } + + decoder := json.NewDecoder(file) + var a any + if err := decoder.Decode(&a); err != nil { + panic(err) + } + + if !*silent { + fmt.Printf("%+#v\n", a) + } } - query := os.Args[len(os.Args)-1] - json := Must(libjson.NewReader(file)) - fmt.Printf("%+#v\n", Must(libjson.Get[any](&json, query))) } From 7410c209396338266a5f1562fdb85c595a1b63dd Mon Sep 17 00:00:00 2001 From: xnacly <47723417+xNaCly@users.noreply.github.com> Date: Wed, 18 Feb 2026 14:26:42 +0100 Subject: [PATCH 04/15] test: replace test binary with cmd/lj --- test/bench.sh | 8 ++++---- test/test.go | 49 ------------------------------------------------- 2 files changed, 4 insertions(+), 53 deletions(-) delete mode 100644 test/test.go diff --git a/test/bench.sh b/test/bench.sh index 9585625..9162698 100755 --- a/test/bench.sh +++ b/test/bench.sh @@ -4,8 +4,8 @@ python3 gen.py echo "building executable" rm ./test -go build ./test.go +go build -o ./test ../cmd/lj.go -hyperfine "./test ./1MB.json" "./test -libjson=false ./1MB.json" -hyperfine "./test ./5MB.json" "./test -libjson=false ./5MB.json" -hyperfine "./test ./10MB.json" "./test -libjson=false ./10MB.json" +hyperfine "./test -s ./1MB.json" "./test -s -libjson=false ./1MB.json" +hyperfine "./test -s ./5MB.json" "./test -s -libjson=false ./5MB.json" +hyperfine "./test -s ./10MB.json" "./test -s -libjson=false ./10MB.json" diff --git a/test/test.go b/test/test.go deleted file mode 100644 index 3a6dfe0..0000000 --- a/test/test.go +++ /dev/null @@ -1,49 +0,0 @@ -package main - -import ( - "encoding/json" - "flag" - "log" - "os" - - // "runtime/pprof" - - "github.com/xnacly/libjson" -) - -func main() { - // f, err := os.Create("cpu.pprof") - // if err != nil { - // panic(err) - // } - // pprof.StartCPUProfile(f) - // defer pprof.StopCPUProfile() - lj := flag.Bool("libjson", true, "benchmark libjson or gojson") - flag.Parse() - args := flag.Args() - if len(args) == 0 { - log.Fatalln("Wanted a file as first argument, got nothing, exiting") - } - file, err := os.Open(args[0]) - if err != nil { - log.Fatalln(err) - } - if *lj { - _, err := libjson.NewReader(file) - if err != nil { - log.Fatalln(err) - } - } else { - v := []struct { - Key1 string - Array []any - Obj any - AtomArray []any - }{} - d := json.NewDecoder(file) - err := d.Decode(&v) - if err != nil { - log.Fatalln(err) - } - } -} From 815f8815ea18877b49c99379351f8e38d39dda5a Mon Sep 17 00:00:00 2001 From: xnacly <47723417+xNaCly@users.noreply.github.com> Date: Wed, 18 Feb 2026 16:16:29 +0100 Subject: [PATCH 05/15] lexer+parser: support ecma404 escape characters --- cmd/lj.go | 4 +-- lexer.go | 19 +++++++++++- parser.go | 93 ++++++++++++++++++++++++++++++++++++++++++++++++++++++- types.go | 25 ++++++++------- 4 files changed, 125 insertions(+), 16 deletions(-) diff --git a/cmd/lj.go b/cmd/lj.go index 7922dc8..8689011 100644 --- a/cmd/lj.go +++ b/cmd/lj.go @@ -59,7 +59,7 @@ func main() { if *useLibjson { out := Must(libjson.NewReader(file)) if !*silent { - fmt.Printf("%+#v\n", Must(libjson.Get[any](&out, *query))) + fmt.Printf("%#+v\n", Must(libjson.Get[any](&out, *query))) } } else { if *query != "." { @@ -73,7 +73,7 @@ func main() { } if !*silent { - fmt.Printf("%+#v\n", a) + fmt.Printf("%#+v\n", a) } } } diff --git a/lexer.go b/lexer.go index e3888d7..e4715b9 100644 --- a/lexer.go +++ b/lexer.go @@ -58,10 +58,27 @@ func (l *lexer) next() (token, error) { case '"': start := l.pos for i := start; i < len(l.data); i++ { - if l.data[i] == '"' { + switch l.data[i] { + case '"': t := token{Type: t_string, Start: start, End: i} l.pos = i + 1 return t, nil + case '\\': // OH NO ITS ESCAPING :O + i++ + if i >= len(l.data) { + return empty, errors.New("Unterminated string escape") + } + switch l.data[i] { + case '"', '\\', '/', 'b', 'f', 'n', 'r', 't': + // we simply skip the escaped char, the parser has to + case 'u': + if i+4 > len(l.data) { + return empty, errors.New("Unterminated string") + } + i += 4 + default: + return empty, fmt.Errorf("Invalid escape %q", l.data[i]) + } } } return empty, errors.New("Unterminated string") diff --git a/parser.go b/parser.go index 98faa54..49ea53f 100644 --- a/parser.go +++ b/parser.go @@ -1,7 +1,9 @@ package libjson import ( + "errors" "fmt" + "unicode/utf8" "unsafe" ) @@ -161,17 +163,106 @@ func (p *parser) array() ([]any, error) { return a, p.advance() } +func hex4(b []byte) (r rune, err error) { + r = 0 + for _, c := range b { + r <<= 4 + switch { + case '0' <= c && c <= '9': + r += rune(c - '0') + case 'a' <= c && c <= 'f': + r += rune(c - 'a' + 10) + case 'A' <= c && c <= 'F': + r += rune(c - 'A' + 10) + default: + return 0, fmt.Errorf("invalid hex %q", c) + } + } + return r, nil +} + +// unescapes escapes in a buffer, returns the end of the in place escaped +// buffer so the caller can resize to the new, smaller buffer size +func unescapeInPlace(in []byte) (int, error) { + curEnd := 0 + for i := 0; i < len(in); i++ { + b := in[i] + if b != '\\' { + in[curEnd] = b + curEnd++ + continue + } + + i++ // skip \ + + switch in[i] { + case '"', '\\', '/': + in[curEnd] = in[i] + curEnd++ + case 'b': + in[curEnd] = '\b' + curEnd++ + case 'f': + in[curEnd] = '\f' + curEnd++ + case 'n': + in[curEnd] = '\n' + curEnd++ + case 'r': + in[curEnd] = '\r' + curEnd++ + case 't': + in[curEnd] = '\t' + curEnd++ + case 'u': // \uXXXX + + // From ECMA-404: + // + // However, whether a processor of JSON texts interprets such a surrogate pair + // as a single code point or as an explicit surrogate pair is a semantic + // decision that is determined by the specific processor. + // + // meaning we dont merge unicode points, firstly because fuck + // utf16, and secondly because its simpler to just keep two unicode + // points separate compared to increasing the complexity of this + // decoding + + i++ // skip u + + if i+4 > len(in) { + return 0, errors.New("unterminated unicode escape") + } + + r, err := hex4(in[i : i+4]) + if err != nil { + return 0, err + } + + n := utf8.EncodeRune(in[curEnd:], r) + curEnd += n + i += 4 + } + } + + return curEnd, nil +} + func (p *parser) atom() (any, error) { var r any switch p.cur_tok.Type { case t_string: in := p.input[p.cur_tok.Start:p.cur_tok.End] + end, err := unescapeInPlace(in) + if err != nil { + return nil, err + } + in = in[:end] r = *(*string)(unsafe.Pointer(&in)) case t_number: raw := p.input[p.cur_tok.Start:p.cur_tok.End] number, err := parseFloat(raw) if err != nil { - return empty, fmt.Errorf("Invalid floating point number %q: %w", string(raw), err) + return nil, fmt.Errorf("Invalid floating point number %q: %w", string(raw), err) } r = number case t_true: diff --git a/types.go b/types.go index 86ff403..9034629 100644 --- a/types.go +++ b/types.go @@ -13,18 +13,19 @@ type token struct { var empty = token{Type: t_eof} const ( - t_string t_json = iota // anything between "" - t_number // floating point, hex, etc - t_true // true - t_false // false - t_null // null - t_left_curly // { - t_right_curly // } - t_left_braket // [ - t_right_braket // ] - t_comma // , - t_colon // : - t_eof // for any non structure characters outside of strings and numbers + t_string t_json = iota // anything between "" + t_string_escaped // t_string but contains an escape char + t_number // floating point, hex, etc + t_true // true + t_false // false + t_null // null + t_left_curly // { + t_right_curly // } + t_left_braket // [ + t_right_braket // ] + t_comma // , + t_colon // : + t_eof // for any non structure characters outside of strings and numbers ) var tokennames = map[t_json]string{ From 86709d848d8a0723c42d7b869ac0f50411b243ba Mon Sep 17 00:00:00 2001 From: xnacly <47723417+xNaCly@users.noreply.github.com> Date: Wed, 18 Feb 2026 16:20:17 +0100 Subject: [PATCH 06/15] docs: change ecma404 ref --- README.md | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 173bc53..b913b35 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,5 @@ # libjson -> WARNING: libjson is currently a work in progress :) - Fast and minimal JSON parser written in and for Go with a JIT query language ```go @@ -13,16 +11,16 @@ import ( func main() { input := `{ "hello": {"world": ["hi"] } }` - jsonObj, _ := New(input) // or libjson.NewReader(r io.Reader) + jsonObj, _ := libjson.New([]byte(input)) // or libjson.NewReader(r io.Reader) // accessing values - fmt.Println(Get[string](jsonObj, ".hello.world.0")) // hi, nil + fmt.Println(libjson.Get[string](jsonObj, ".hello.world.0")) // hi, nil } ``` ## Features -- [ECMA 404](https://ecma-international.org/wp-content/uploads/ECMA-404_2nd_edition_december_2017.pdf) +- [ECMA 404](https://ecma-international.org/publications-and-standards/standards/ecma-404/) and [rfc8259](https://www.rfc-editor.org/rfc/rfc8259) compliant - tests against [JSONTestSuite](https://github.com/nst/JSONTestSuite), see [Parsing JSON is a Minefield From 74c795ccc7f26afb4fd3433701d82741ac008168 Mon Sep 17 00:00:00 2001 From: xnacly <47723417+xNaCly@users.noreply.github.com> Date: Wed, 18 Feb 2026 16:23:22 +0100 Subject: [PATCH 07/15] types: remove t_string_escaped --- types.go | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/types.go b/types.go index 9034629..86ff403 100644 --- a/types.go +++ b/types.go @@ -13,19 +13,18 @@ type token struct { var empty = token{Type: t_eof} const ( - t_string t_json = iota // anything between "" - t_string_escaped // t_string but contains an escape char - t_number // floating point, hex, etc - t_true // true - t_false // false - t_null // null - t_left_curly // { - t_right_curly // } - t_left_braket // [ - t_right_braket // ] - t_comma // , - t_colon // : - t_eof // for any non structure characters outside of strings and numbers + t_string t_json = iota // anything between "" + t_number // floating point, hex, etc + t_true // true + t_false // false + t_null // null + t_left_curly // { + t_right_curly // } + t_left_braket // [ + t_right_braket // ] + t_comma // , + t_colon // : + t_eof // for any non structure characters outside of strings and numbers ) var tokennames = map[t_json]string{ From 12878d75c59c0070e26e9e0da85ab087abbbb98c Mon Sep 17 00:00:00 2001 From: xnacly <47723417+xNaCly@users.noreply.github.com> Date: Wed, 18 Feb 2026 16:26:25 +0100 Subject: [PATCH 08/15] parser: fix off by one error in unescapeInPlace unicode handling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Before this change "\uD834\uDD1E" would result in "�DD1E" but should have resulted in "��", due to both being unmerged surrogates. --- parser.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parser.go b/parser.go index 49ea53f..34e1a02 100644 --- a/parser.go +++ b/parser.go @@ -240,7 +240,7 @@ func unescapeInPlace(in []byte) (int, error) { n := utf8.EncodeRune(in[curEnd:], r) curEnd += n - i += 4 + i += 3 } } From df358157ef0d705239f119a5ed7d264e0456833d Mon Sep 17 00:00:00 2001 From: xnacly <47723417+xNaCly@users.noreply.github.com> Date: Fri, 20 Feb 2026 10:25:05 +0100 Subject: [PATCH 09/15] parser: update benchmarking input --- README.md | 10 ++++++ cmd/lj.go | 18 ++++++++--- json.go | 5 +-- json_test.go | 71 ++++++++++++++++++++++++++++++++++++------- lexer.go | 29 ++++++++++-------- parser_test.go | 10 +++--- types.go => tokens.go | 0 7 files changed, 109 insertions(+), 34 deletions(-) rename types.go => tokens.go (100%) diff --git a/README.md b/README.md index b913b35..382c944 100644 --- a/README.md +++ b/README.md @@ -20,6 +20,7 @@ func main() { ## Features +- Parser consumes and mutates the input to make most operations zero copy and zero alloc - [ECMA 404](https://ecma-international.org/publications-and-standards/standards/ecma-404/) and [rfc8259](https://www.rfc-editor.org/rfc/rfc8259) compliant - tests against [JSONTestSuite](https://github.com/nst/JSONTestSuite), see @@ -33,6 +34,15 @@ func main() { - caching of queries with `libjson.Compile`, just in time caching of queries - serialisation via `json.Marshal` +## Why is it faster than encoding/json? + +- zero-copy strings +- mutate input for string escaping instead of allocating a new one +- no allocations for strings, views into the original input +- no reflection +- no copies for map keys +- very simple lexer and parser + ## Benchmarks ![libjson-vs-encodingjson](https://github.com/user-attachments/assets/b11bcce4-e7db-4c45-ab42-45a2042e2a51) diff --git a/cmd/lj.go b/cmd/lj.go index 8689011..c65f6fc 100644 --- a/cmd/lj.go +++ b/cmd/lj.go @@ -26,6 +26,7 @@ func main() { usePprof := flag.Bool("pprof", false, "use pprof cpu tracing") query := flag.String("q", ".", "query the parsed json") silent := flag.Bool("s", false, "no stdoutput") + escape := flag.Bool("e", false, "escapes input with Gos '%#+v'") flag.Parse() if *noGc { @@ -59,7 +60,12 @@ func main() { if *useLibjson { out := Must(libjson.NewReader(file)) if !*silent { - fmt.Printf("%#+v\n", Must(libjson.Get[any](&out, *query))) + out := Must(libjson.Get[any](&out, *query)) + if *escape { + fmt.Printf("%#+v\n", out) + } else { + fmt.Println(out) + } } } else { if *query != "." { @@ -67,13 +73,17 @@ func main() { } decoder := json.NewDecoder(file) - var a any - if err := decoder.Decode(&a); err != nil { + var out any + if err := decoder.Decode(&out); err != nil { panic(err) } if !*silent { - fmt.Printf("%#+v\n", a) + if *escape { + fmt.Printf("%#+v\n", out) + } else { + fmt.Println(out) + } } } } diff --git a/json.go b/json.go index 8477a8c..c433ffb 100644 --- a/json.go +++ b/json.go @@ -9,7 +9,7 @@ func NewReader(r io.Reader) (JSON, error) { if err != nil { return JSON{}, err } - p := parser{l: lexer{data: data}} + p := parser{l: lexer{data: data, len: len(data)}} obj, err := p.parse(data) if err != nil { return JSON{}, err @@ -17,8 +17,9 @@ func NewReader(r io.Reader) (JSON, error) { return JSON{obj}, nil } +// data is consumed and possibly mutated, DO NOT REUSE func New(data []byte) (JSON, error) { - p := parser{l: lexer{data: data}} + p := parser{l: lexer{data: data, len: len(data)}} obj, err := p.parse(data) if err != nil { return JSON{}, err diff --git a/json_test.go b/json_test.go index f7af6c3..37dcef9 100644 --- a/json_test.go +++ b/json_test.go @@ -9,31 +9,80 @@ import ( ) const amount = 50_000 +const naiveInput = `{"key1":"value","array":[],"obj":{},"atomArray":[11201,1e112,true,false,null,"str"]},` +const escapedInput = `{"text":"line1\nline2\nline3","quote":"\"hello\"","path":"C:\\\\Users\\\\name","unicode":"\u0041\u0042\u0043","mix":"abc\\ndef\"ghi\u263A"},` +const hardInput = `{ + "id":12345, + "name":"very_long_string_with_no_escapes_but_large_payload_abcdefghijklmnopqrstuvwxyz_0123456789", + "description":"This string contains\nmultiple\nlines\nand \"quotes\" and unicode \u2764\u2764\u2764", + "nested":{ + "level1":{ + "level2":{ + "array":[ + "short", + "string_with_escape\\n", + "another\\tvalue", + "unicode\u2603", + 1234567890, + -1.2345e67, + true, + false, + null + ] + } + } + } +},` -func BenchmarkLibJson(b *testing.B) { - data := strings.Repeat(`{"key1": "value","array": [],"obj": {},"atomArray": [11201,1e112,true,false,null,"str"]},`, amount) +func benchmarkWithInput(b *testing.B, input string) { + data := strings.Repeat(input, amount) d := []byte("[" + data[:len(data)-1] + "]") + b.ResetTimer() for i := 0; i < b.N; i++ { - _, err := New(d) + buf := make([]byte, len(d)) + copy(buf, d) + b.StartTimer() + _, err := New(buf) + b.StopTimer() assert.NoError(b, err) } b.ReportAllocs() } -func BenchmarkEncodingJson(b *testing.B) { - data := strings.Repeat(`{"key1": "value","array": [],"obj": {},"atomArray": [11201,1e112,true,false,null,"str"]},`, amount) +func benchmarkEncodingJsonWithInput(b *testing.B, input string) { + data := strings.Repeat(input, amount) d := []byte("[" + data[:len(data)-1] + "]") + b.ResetTimer() for i := 0; i < b.N; i++ { - v := []struct { - Key1 string - Array []any - Obj any - AtomArray []any - }{} + var v any err := json.Unmarshal(d, &v) assert.NoError(b, err) } b.ReportAllocs() } + +func BenchmarkLibJson_Naive(b *testing.B) { + benchmarkWithInput(b, naiveInput) +} + +func BenchmarkLibJson_Escaped(b *testing.B) { + benchmarkWithInput(b, escapedInput) +} + +func BenchmarkLibJson_Hard(b *testing.B) { + benchmarkWithInput(b, hardInput) +} + +func BenchmarkEncodingJson_Naive(b *testing.B) { + benchmarkEncodingJsonWithInput(b, naiveInput) +} + +func BenchmarkEncodingJson_Escaped(b *testing.B) { + benchmarkEncodingJsonWithInput(b, escapedInput) +} + +func BenchmarkEncodingJson_Hard(b *testing.B) { + benchmarkEncodingJsonWithInput(b, hardInput) +} diff --git a/lexer.go b/lexer.go index e4715b9..7ae9a3d 100644 --- a/lexer.go +++ b/lexer.go @@ -9,6 +9,7 @@ import ( type lexer struct { data []byte pos int + len int } var numChar [256]bool @@ -25,7 +26,7 @@ func init() { } func (l *lexer) next() (token, error) { - for l.pos < len(l.data) { + for l.pos < l.len { cc := l.data[l.pos] if cc == ' ' || cc == '\n' || cc == '\t' || cc == '\r' { l.pos++ @@ -34,7 +35,7 @@ func (l *lexer) next() (token, error) { } } - if l.pos >= len(l.data) { + if l.pos >= l.len { return empty, nil } @@ -57,22 +58,25 @@ func (l *lexer) next() (token, error) { tt = t_colon case '"': start := l.pos - for i := start; i < len(l.data); i++ { - switch l.data[i] { - case '"': + for i := start; i < l.len; i++ { + if c := l.data[i]; c == '"' { t := token{Type: t_string, Start: start, End: i} + // if hasEscaped { + // t.Type = t_string_escaped + // } l.pos = i + 1 return t, nil - case '\\': // OH NO ITS ESCAPING :O + } else if c == '\\' { // OH NO ITS ESCAPING :O i++ - if i >= len(l.data) { + if i >= l.len { return empty, errors.New("Unterminated string escape") } + switch l.data[i] { case '"', '\\', '/', 'b', 'f', 'n', 'r', 't': // we simply skip the escaped char, the parser has to case 'u': - if i+4 > len(l.data) { + if i+4 > l.len { return empty, errors.New("Unterminated string") } i += 4 @@ -83,7 +87,7 @@ func (l *lexer) next() (token, error) { } return empty, errors.New("Unterminated string") case 't': // this should always be the 'true' atom and is therefore optimised here - if l.pos+3 > len(l.data) { + if l.pos+3 > l.len { return empty, errors.New("Failed to read the expected 'true' atom") } if !(l.data[l.pos] == 'r' && l.data[l.pos+1] == 'u' && l.data[l.pos+2] == 'e') { @@ -92,7 +96,7 @@ func (l *lexer) next() (token, error) { l.pos += 3 tt = t_true case 'f': // this should always be the 'false' atom and is therefore optimised here - if l.pos+4 > len(l.data) { + if l.pos+4 > l.len { return empty, errors.New("Failed to read the expected 'false' atom") } if !(l.data[l.pos] == 'a' && l.data[l.pos+1] == 'l' && l.data[l.pos+2] == 's' && l.data[l.pos+3] == 'e') { @@ -101,7 +105,7 @@ func (l *lexer) next() (token, error) { l.pos += 4 tt = t_false case 'n': // this should always be the 'null' atom and is therefore optimised here - if l.pos+3 > len(l.data) { + if l.pos+3 > l.len { return empty, errors.New("Failed to read the expected 'null' atom") } if !(l.data[l.pos] == 'u' && l.data[l.pos+1] == 'l' && l.data[l.pos+2] == 'l') { @@ -112,7 +116,7 @@ func (l *lexer) next() (token, error) { default: if cc == '-' || (cc >= '0' && cc <= '9') { start := l.pos - 1 - for l.pos < len(l.data) && numChar[l.data[l.pos]] { + for l.pos < l.len && numChar[l.data[l.pos]] { l.pos++ } @@ -132,6 +136,7 @@ func (l *lexer) lex(r io.Reader) ([]token, error) { if err != nil { return nil, err } + l.len = len(l.data) toks := make([]token, 0, len(l.data)/2) for { diff --git a/parser_test.go b/parser_test.go index 907473b..30731f5 100644 --- a/parser_test.go +++ b/parser_test.go @@ -30,7 +30,7 @@ func TestParserAtoms(t *testing.T) { for i, in := range input { t.Run(in, func(t *testing.T) { in := []byte(in) - p := &parser{l: lexer{data: in}} + p := &parser{l: lexer{data: in, len: len(in)}} out, err := p.parse(in) assert.NoError(t, err) assert.EqualValues(t, wanted[i], out) @@ -54,7 +54,7 @@ func TestParserArray(t *testing.T) { for i, in := range input { t.Run(in, func(t *testing.T) { in := []byte(in) - p := &parser{l: lexer{data: in}} + p := &parser{l: lexer{data: in, len: len(in)}} out, err := p.parse(in) assert.NoError(t, err) assert.EqualValues(t, wanted[i], out) @@ -82,7 +82,7 @@ func TestParserObject(t *testing.T) { for i, in := range input { t.Run(in, func(t *testing.T) { in := []byte(in) - p := &parser{l: lexer{data: in}} + p := &parser{l: lexer{data: in, len: len(in)}} out, err := p.parse(in) assert.NoError(t, err) assert.EqualValues(t, wanted[i], out) @@ -110,7 +110,7 @@ func TestParserEdge(t *testing.T) { for i, in := range input { t.Run(in, func(t *testing.T) { in := []byte(in) - p := &parser{l: lexer{data: in}} + p := &parser{l: lexer{data: in, len: len(in)}} out, err := p.parse(in) assert.NoError(t, err) assert.EqualValues(t, wanted[i], out) @@ -145,7 +145,7 @@ func TestParserFail(t *testing.T) { for _, in := range input { t.Run(in, func(t *testing.T) { in := []byte(in) - p := &parser{l: lexer{data: in}} + p := &parser{l: lexer{data: in, len: len(in)}} out, err := p.parse(in) assert.Error(t, err) assert.Nil(t, out) diff --git a/types.go b/tokens.go similarity index 100% rename from types.go rename to tokens.go From 7ff2cb2b9fca5b8a053bf954c4f6987a3315f4ed Mon Sep 17 00:00:00 2001 From: xnacly <47723417+xNaCly@users.noreply.github.com> Date: Fri, 20 Feb 2026 11:48:44 +0100 Subject: [PATCH 10/15] benchmarks: use heavier input in benchmarking (1-100MB) --- .gitignore | 4 ++-- benchmarks/bench.sh | 15 +++++++++++++++ benchmarks/gen.py | 29 +++++++++++++++++++++++++++++ parser.go | 4 ++-- test/bench.sh | 11 ----------- test/gen.py | 22 ---------------------- 6 files changed, 48 insertions(+), 37 deletions(-) create mode 100755 benchmarks/bench.sh create mode 100644 benchmarks/gen.py delete mode 100755 test/bench.sh delete mode 100644 test/gen.py diff --git a/.gitignore b/.gitignore index 1a4e801..ead7226 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,3 @@ -test/*.json -test/test +benchmarks/*.json +benchmarks/test *.pprof diff --git a/benchmarks/bench.sh b/benchmarks/bench.sh new file mode 100755 index 0000000..0787939 --- /dev/null +++ b/benchmarks/bench.sh @@ -0,0 +1,15 @@ +#!/bin/bash +echo "generating example data" +python3 gen.py + +echo "building executable" +rm ./test +go build -o ./test ../cmd/lj.go + +for SIZE in 1MB 5MB 10MB 100MB; do + hyperfine \ + --warmup 1 \ + --runs 10 \ + "./test -s ./${SIZE}.json" \ + "./test -s -libjson=false ./${SIZE}.json" +done diff --git a/benchmarks/gen.py b/benchmarks/gen.py new file mode 100644 index 0000000..d540015 --- /dev/null +++ b/benchmarks/gen.py @@ -0,0 +1,29 @@ +from os.path import exists +import math +import json + +sizes =[1,5,10,100] + +line = json.dumps({ + "id": 12345, + "name": "very_long_string_with_no_escapes_but_large_payload_abcdefghijklmnopqrstuvwxyz_0123456789", + "description": "This string contains\nmultiple\nlines\nand \"quotes\" and unicode ❤❤❤", + "nested": { + "level1": { + "level2": { + "array": ["short", "string_with_escape\n", "another\tvalue", "unicode\u2603", 1234567890, -1.2345e67, True, False, None] + } + } + } +}) + +def write_data(size: int): + name = f"{size}MB.json" + if not exists(name): + with open(name, mode="w", encoding="utf8") as f: + f.write("[\n") + size = math.floor((size*1000000)/len(line)) + f.write(",\n".join([line for _ in range(0, size)])) + f.write("\n]") + +[write_data(size) for size in sizes] diff --git a/parser.go b/parser.go index 34e1a02..e78e802 100644 --- a/parser.go +++ b/parser.go @@ -59,7 +59,7 @@ func (p *parser) object() (map[string]any, error) { return nil, err } - m := make(map[string]any, 4) + m := make(map[string]any) if p.cur_tok.Type == t_right_curly { err := p.advance() @@ -137,7 +137,7 @@ func (p *parser) array() ([]any, error) { return []any{}, p.advance() } - a := make([]any, 0, 8) + a := make([]any, 0) for p.cur_tok.Type != t_eof && p.cur_tok.Type != t_right_braket { if len(a) > 0 { diff --git a/test/bench.sh b/test/bench.sh deleted file mode 100755 index 9162698..0000000 --- a/test/bench.sh +++ /dev/null @@ -1,11 +0,0 @@ -#!/bin/bash -echo "generating example data" -python3 gen.py - -echo "building executable" -rm ./test -go build -o ./test ../cmd/lj.go - -hyperfine "./test -s ./1MB.json" "./test -s -libjson=false ./1MB.json" -hyperfine "./test -s ./5MB.json" "./test -s -libjson=false ./5MB.json" -hyperfine "./test -s ./10MB.json" "./test -s -libjson=false ./10MB.json" diff --git a/test/gen.py b/test/gen.py deleted file mode 100644 index 50d2bcb..0000000 --- a/test/gen.py +++ /dev/null @@ -1,22 +0,0 @@ -from os.path import exists -import math - -sizes =[1,5,10] - -line = """\t{ - "key1": "value", - "array": [], - "obj": {}, - "atomArray": [11201,1e112,true,false,null,"str"] - }""" - -def write_data(size: int): - name = f"{size}MB.json" - if not exists(name): - with open(name, mode="w", encoding="utf8") as f: - f.write("[\n") - size = math.floor((size*1000000)/len(line)) - f.write(",\n".join([line for _ in range(0, size)])) - f.write("\n]") - -[write_data(size) for size in sizes] From 12bf614fcadd6c90ee70f7ea7574fb325fe01d22 Mon Sep 17 00:00:00 2001 From: xnacly <47723417+xNaCly@users.noreply.github.com> Date: Fri, 20 Feb 2026 13:35:13 +0100 Subject: [PATCH 11/15] parser: remove bounds checks in unescapeInPlace Previously time spent for parsing 100MB JSON input (600ms) took 60ms in a number of unnecessary bound checks: CALL runtime.panicBounds(SB), now reduced to 20ms by moving explicit bound checks before indizes, reusing indexed slots and merging manual out of loop increments. --- lexer.go | 3 --- parser.go | 29 ++++++++++++++++++----------- 2 files changed, 18 insertions(+), 14 deletions(-) diff --git a/lexer.go b/lexer.go index 7ae9a3d..d49434b 100644 --- a/lexer.go +++ b/lexer.go @@ -61,9 +61,6 @@ func (l *lexer) next() (token, error) { for i := start; i < l.len; i++ { if c := l.data[i]; c == '"' { t := token{Type: t_string, Start: start, End: i} - // if hasEscaped { - // t.Type = t_string_escaped - // } l.pos = i + 1 return t, nil } else if c == '\\' { // OH NO ITS ESCAPING :O diff --git a/parser.go b/parser.go index e78e802..f759e60 100644 --- a/parser.go +++ b/parser.go @@ -181,8 +181,13 @@ func hex4(b []byte) (r rune, err error) { return r, nil } -// unescapes escapes in a buffer, returns the end of the in place escaped -// buffer so the caller can resize to the new, smaller buffer size +// unescapes JSON escapes in a buffer into their non-JSON representation +// +// Returns the end of the in place escaped buffer so the caller can resize to +// the new, smaller buffer size +// +// The implementation may look weird, but is optimised to have the least +// possible branches func unescapeInPlace(in []byte) (int, error) { curEnd := 0 for i := 0; i < len(in); i++ { @@ -193,11 +198,16 @@ func unescapeInPlace(in []byte) (int, error) { continue } + // check if there’s at least 1 more byte for the escape + if i+1 >= len(in) { + return 0, errors.New("unterminated escape") + } i++ // skip \ + b = in[i] - switch in[i] { + switch b { case '"', '\\', '/': - in[curEnd] = in[i] + in[curEnd] = b curEnd++ case 'b': in[curEnd] = '\b' @@ -227,21 +237,18 @@ func unescapeInPlace(in []byte) (int, error) { // points separate compared to increasing the complexity of this // decoding - i++ // skip u - - if i+4 > len(in) { + if i+4 >= len(in) { return 0, errors.New("unterminated unicode escape") } - r, err := hex4(in[i : i+4]) + r, err := hex4(in[i+1 : i+5]) if err != nil { return 0, err } - n := utf8.EncodeRune(in[curEnd:], r) curEnd += n - i += 3 - } + i += 4 + } // we dont need a default case since we check all possible escapes in the lexer } return curEnd, nil From 0f98f84c93dbdc2dc06b36f19bb23fe4fe4ef3c2 Mon Sep 17 00:00:00 2001 From: xnacly <47723417+xNaCly@users.noreply.github.com> Date: Fri, 20 Feb 2026 13:49:43 +0100 Subject: [PATCH 12/15] benchmarks: deeper nested and more escapes in benchmark --- benchmarks/gen.py | 22 ++++++++++++++++++++-- json_test.go | 48 +++++++++++++++++++++++++++-------------------- 2 files changed, 48 insertions(+), 22 deletions(-) diff --git a/benchmarks/gen.py b/benchmarks/gen.py index d540015..4afb15a 100644 --- a/benchmarks/gen.py +++ b/benchmarks/gen.py @@ -6,12 +6,30 @@ line = json.dumps({ "id": 12345, - "name": "very_long_string_with_no_escapes_but_large_payload_abcdefghijklmnopqrstuvwxyz_0123456789", + "name": "very_long_string_with_escapes_and_unicode_abcdefghijklmnopqrstuvwxyz_0123456789", "description": "This string contains\nmultiple\nlines\nand \"quotes\" and unicode ❤❤❤", "nested": { "level1": { "level2": { - "array": ["short", "string_with_escape\n", "another\tvalue", "unicode\u2603", 1234567890, -1.2345e67, True, False, None] + "level3": { + "level4": { + "array": [ + "short", + "string_with_escape\\n", + "another\\tvalue", + "unicode\u2603", + "escaped_quote_\"_and_backslash_\\", + 1234567890, + -1.2345e67, + 3.1415926535897932384626433832795028841971, + True, + False, + None, + "\u0041\u0042\u0043\u00A9\u20AC", + "mix\\n\\t\\r\\\\\\\"end" + ] + } + } } } } diff --git a/json_test.go b/json_test.go index 37dcef9..85ccdd2 100644 --- a/json_test.go +++ b/json_test.go @@ -12,26 +12,34 @@ const amount = 50_000 const naiveInput = `{"key1":"value","array":[],"obj":{},"atomArray":[11201,1e112,true,false,null,"str"]},` const escapedInput = `{"text":"line1\nline2\nline3","quote":"\"hello\"","path":"C:\\\\Users\\\\name","unicode":"\u0041\u0042\u0043","mix":"abc\\ndef\"ghi\u263A"},` const hardInput = `{ - "id":12345, - "name":"very_long_string_with_no_escapes_but_large_payload_abcdefghijklmnopqrstuvwxyz_0123456789", - "description":"This string contains\nmultiple\nlines\nand \"quotes\" and unicode \u2764\u2764\u2764", - "nested":{ - "level1":{ - "level2":{ - "array":[ - "short", - "string_with_escape\\n", - "another\\tvalue", - "unicode\u2603", - 1234567890, - -1.2345e67, - true, - false, - null - ] - } - } - } + "id": 12345, + "name": "very_long_string_with_escapes_and_unicode_abcdefghijklmnopqrstuvwxyz_0123456789", + "description": "This string contains\nmultiple\nlines\nand \"quotes\" and unicode \u2764\u2764\u2764", + "nested": { + "level1": { + "level2": { + "level3": { + "level4": { + "array": [ + "short", + "string_with_escape\\n", + "another\\tvalue", + "unicode\u2603", + "escaped_quote_\"_and_backslash_\\", + 1234567890, + -1.2345e67, + 3.141592653589793, + true, + false, + null, + "ABC\u00a9\u20ac", + "mix\\n\\t\\r\\\\\\\"end" + ] + } + } + } + } + } },` func benchmarkWithInput(b *testing.B, input string) { From e4b2043bbdcacf005f1e3d7bf5e151030d6ff1f6 Mon Sep 17 00:00:00 2001 From: xnacly <47723417+xNaCly@users.noreply.github.com> Date: Fri, 20 Feb 2026 13:58:26 +0100 Subject: [PATCH 13/15] parser: ripped hex out and make it table driven Reduced time taken in unescapeInPlace by 30ms (from 5.75% to 3.41%) --- benchmarks/gen.py | 6 +++--- hex.go | 54 +++++++++++++++++++++++++++++++++++++++++++++++ parser.go | 22 +++---------------- 3 files changed, 60 insertions(+), 22 deletions(-) create mode 100644 hex.go diff --git a/benchmarks/gen.py b/benchmarks/gen.py index 4afb15a..f169beb 100644 --- a/benchmarks/gen.py +++ b/benchmarks/gen.py @@ -7,7 +7,7 @@ line = json.dumps({ "id": 12345, "name": "very_long_string_with_escapes_and_unicode_abcdefghijklmnopqrstuvwxyz_0123456789", - "description": "This string contains\nmultiple\nlines\nand \"quotes\" and unicode ❤❤❤", + "description": "This string contains\nmultiple\nlines\nand \"quotes\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"", "nested": { "level1": { "level2": { @@ -19,13 +19,13 @@ "another\\tvalue", "unicode\u2603", "escaped_quote_\"_and_backslash_\\", - 1234567890, + 11234567890,1234567890,1234567890,1234567890,1234567890,1234567890,1234567890,1234567890,1234567890,1234567890,1234567890,1234567890,234567890, -1.2345e67, 3.1415926535897932384626433832795028841971, True, False, None, - "\u0041\u0042\u0043\u00A9\u20AC", + "\u0041\u0042\u0043\u00A9\u20AC\u0041\u0042\u0043\u00A9\u20AC\u0041\u0042\u0043\u00A9\u20AC\u0041\u0042\u0043\u00A9\u20AC\u0041\u0042\u0043\u00A9\u20AC\u0041\u0042\u0043\u00A9\u20AC\u0041\u0042\u0043\u00A9\u20AC\u0041\u0042\u0043\u00A9\u20AC\u0041\u0042\u0043\u00A9\u20AC\u0041\u0042\u0043\u00A9\u20AC\u0041\u0042\u0043\u00A9\u20AC", "mix\\n\\t\\r\\\\\\\"end" ] } diff --git a/hex.go b/hex.go new file mode 100644 index 0000000..8fa4a59 --- /dev/null +++ b/hex.go @@ -0,0 +1,54 @@ +package libjson + +import "errors" + +var invalid_hex_err = errors.New("invalid hex") + +var hexTable [256]byte + +func init() { + for i := 0; i < 256; i++ { + hexTable[i] = 0xFF + } + for i := byte('0'); i <= '9'; i++ { + hexTable[i] = i - '0' + } + for i := byte('a'); i <= 'f'; i++ { + hexTable[i] = i - 'a' + 10 + } + for i := byte('A'); i <= 'F'; i++ { + hexTable[i] = i - 'A' + 10 + } +} + +// hex4 converts 4 ASCII hex bytes to a rune. +// Returns an error if any byte is invalid. +func hex4(b []byte) (r rune, err error) { + var v byte + + v = hexTable[b[0]] + if v == 0xFF { + return 0, invalid_hex_err + } + r = rune(v) << 12 + + v = hexTable[b[1]] + if v == 0xFF { + return 0, invalid_hex_err + } + r |= rune(v) << 8 + + v = hexTable[b[2]] + if v == 0xFF { + return 0, invalid_hex_err + } + r |= rune(v) << 4 + + v = hexTable[b[3]] + if v == 0xFF { + return 0, invalid_hex_err + } + r |= rune(v) + + return r, nil +} diff --git a/parser.go b/parser.go index f759e60..ce0a750 100644 --- a/parser.go +++ b/parser.go @@ -163,23 +163,7 @@ func (p *parser) array() ([]any, error) { return a, p.advance() } -func hex4(b []byte) (r rune, err error) { - r = 0 - for _, c := range b { - r <<= 4 - switch { - case '0' <= c && c <= '9': - r += rune(c - '0') - case 'a' <= c && c <= 'f': - r += rune(c - 'a' + 10) - case 'A' <= c && c <= 'F': - r += rune(c - 'A' + 10) - default: - return 0, fmt.Errorf("invalid hex %q", c) - } - } - return r, nil -} +var badEscapeErr = errors.New("bad escape") // unescapes JSON escapes in a buffer into their non-JSON representation // @@ -200,7 +184,7 @@ func unescapeInPlace(in []byte) (int, error) { // check if there’s at least 1 more byte for the escape if i+1 >= len(in) { - return 0, errors.New("unterminated escape") + return 0, badEscapeErr } i++ // skip \ b = in[i] @@ -238,7 +222,7 @@ func unescapeInPlace(in []byte) (int, error) { // decoding if i+4 >= len(in) { - return 0, errors.New("unterminated unicode escape") + return 0, badEscapeErr } r, err := hex4(in[i+1 : i+5]) From 79124c1badd70edb3790f05739004f95298ac2f8 Mon Sep 17 00:00:00 2001 From: xnacly <47723417+xNaCly@users.noreply.github.com> Date: Fri, 20 Feb 2026 15:16:16 +0100 Subject: [PATCH 14/15] parser: replace parseFloat with strconv.ParseFloat due to it being as fast but more correct --- parser.go | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/parser.go b/parser.go index ce0a750..a247c1d 100644 --- a/parser.go +++ b/parser.go @@ -3,6 +3,7 @@ package libjson import ( "errors" "fmt" + "strconv" "unicode/utf8" "unsafe" ) @@ -59,16 +60,16 @@ func (p *parser) object() (map[string]any, error) { return nil, err } - m := make(map[string]any) - if p.cur_tok.Type == t_right_curly { err := p.advance() if err != nil { return nil, err } - return m, nil + return make(map[string]any, 0), nil } + m := make(map[string]any, 8) + for p.cur_tok.Type != t_eof && p.cur_tok.Type != t_right_curly { if len(m) > 0 { if p.cur_tok.Type != t_comma { @@ -137,7 +138,7 @@ func (p *parser) array() ([]any, error) { return []any{}, p.advance() } - a := make([]any, 0) + a := make([]any, 0, 8) for p.cur_tok.Type != t_eof && p.cur_tok.Type != t_right_braket { if len(a) > 0 { @@ -251,7 +252,7 @@ func (p *parser) atom() (any, error) { r = *(*string)(unsafe.Pointer(&in)) case t_number: raw := p.input[p.cur_tok.Start:p.cur_tok.End] - number, err := parseFloat(raw) + number, err := strconv.ParseFloat(*(*string)(unsafe.Pointer(&raw)), 64) if err != nil { return nil, fmt.Errorf("Invalid floating point number %q: %w", string(raw), err) } From f456fe22a81d619a117646bd419f186c3c51dec7 Mon Sep 17 00:00:00 2001 From: xnacly <47723417+xNaCly@users.noreply.github.com> Date: Fri, 20 Feb 2026 16:10:01 +0100 Subject: [PATCH 15/15] parser+object: change internal JSON object representation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This commit replaces the need for hashing json object keys at parse time by replacing the previously used map[string]any with the new obj struct: | Benchmark | LibJson B/op | EncodingJson B/op | LibJson x Less Memory | LibJson Allocs | EncodingJson Allocs | LibJson x Fewer Allocs | | --------- | ------------ | ----------------- | --------------------- | -------------- | ------------------- | ---------------------- | | Naive | 29,632,671 | 42,744,497 | 1.44x | 450,023 | 1,050,031 | 2.33x | | Escaped | 22,471,438 | 37,544,412 | 1.67x | 350,023 | 1,100,030 | 3.14x | | Hard | 121,444,318 | 173,944,500 | 1.43x | 1,400,023 | 3,000,032 | 2.14x | These changes result in a ~10-15% speedup and allows libjson to hit the ~2x faster than encoding/json milestone. For instance with 1MB, 5MB, 10MB and 100MB sized files filled with: { "id": 12345, "name": "very_long_string_with_escapes_and_unicode_abcdefghijklmnopqrstuvwxyz_0123456789", "description": "This string contains\nmultiple\nlines\nand \"quotes\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"", "nested": { "level1": { "level2": { "level3": { "level4": { "array": [ "short", "string_with_escape\\n", "another\\tvalue", "unicode\u2603", "escaped_quote_\"_and_backslash_\\", 11234567890,1234567890,1234567890,1234567890,1234567890,1234567890,1234567890,1234567890,1234567890,1234567890,1234567890,1234567890,234567890, -1.2345e67, 3.1415926535897932384626433832795028841971, True, False, None, "\u0041\u0042\u0043\u00A9\u20AC\u0041\u0042\u0043\u00A9\u20AC\u0041\u0042\u0043\u00A9\u20AC\u0041\u0042\u0043\u00A9\u20AC\u0041\u0042\u0043\u00A9\u20AC\u0041\u0042\u0043\u00A9\u20AC\u0041\u0042\u0043\u00A9\u20AC\u0041\u0042\u0043\u00A9\u20AC\u0041\u0042\u0043\u00A9\u20AC\u0041\u0042\u0043\u00A9\u20AC\u0041\u0042\u0043\u00A9\u20AC", "mix\\n\\t\\r\\\\\\\"end" ] } } } } } } libjson now outperforms encoding/json: $ cd ./benchmarks $ ./bench.sh | rg "faster" 1.72 ± 0.15 times faster than ./test -s -libjson=false ./1MB.json 1.89 ± 0.11 times faster than ./test -s -libjson=false ./5MB.json 1.90 ± 0.06 times faster than ./test -s -libjson=false ./10MB.json 1.95 ± 0.05 times faster than ./test -s -libjson=false ./100MB.json --- float.go | 92 -------------------------------------------------- object.go | 50 +++++++++++++++++++++++---- parser.go | 61 ++++++++++++++++----------------- parser_test.go | 10 +++--- tokens.go | 2 +- 5 files changed, 79 insertions(+), 136 deletions(-) delete mode 100644 float.go diff --git a/float.go b/float.go deleted file mode 100644 index 81f1be5..0000000 --- a/float.go +++ /dev/null @@ -1,92 +0,0 @@ -package libjson - -import ( - "errors" -) - -func pow10(exp int) float64 { - res := 1.0 - if exp > 0 { - for i := 0; i < exp; i++ { - res *= 10 - } - } else { - for i := 0; i < -exp; i++ { - res /= 10 - } - } - return res -} - -// non allocating float parsing -func parseFloat(input []byte) (float64, error) { - if len(input) == 0 { - return 0, errors.New("empty input") - } - - pos := 0 - neg := false - if input[pos] == '-' { - neg = true - pos++ - } - - mantissa := uint64(0) - exponent := 0 - seenDot := false - - for pos < len(input) { - c := input[pos] - if c >= '0' && c <= '9' { - mantissa = mantissa*10 + uint64(c-'0') - if seenDot { - exponent-- - } - pos++ - } else if c == '.' { - if seenDot { - return 0, errors.New("multiple dots in number") - } - seenDot = true - pos++ - } else { - break - } - } - - // weird eE+- handling - if pos < len(input) && (input[pos] == 'e' || input[pos] == 'E') { - pos++ - expNeg := false - if pos < len(input) && input[pos] == '-' { - expNeg = true - pos++ - } else if pos < len(input) && input[pos] == '+' { - pos++ - } - - if pos >= len(input) || input[pos] < '0' || input[pos] > '9' { - return 0, errors.New("missing digits in exponent") - } - - expVal := 0 - for pos < len(input) && input[pos] >= '0' && input[pos] <= '9' { - expVal = expVal*10 + int(input[pos]-'0') - pos++ - } - if expNeg { - expVal = -expVal - } - exponent += expVal - } - - if mantissa == 0 { - return 0, nil - } - - result := float64(mantissa) * pow10(exponent) - if neg { - result = -result - } - return result, nil -} diff --git a/object.go b/object.go index 4e59837..83ee480 100644 --- a/object.go +++ b/object.go @@ -8,7 +8,28 @@ import ( ) type JSON struct { - obj any + inner any +} + +// takes a JSON.inner value and converts it to Go, for instance merges the obj +// fields into a map +func toGo(json any) any { + switch v := json.(type) { + case obj: + m := make(map[string]any, len(v.Fields)) + for _, f := range v.Fields { + m[f.Key] = toGo(f.Value) + } + return m + case []any: + arr := make([]any, len(v)) + for i, el := range v { + arr[i] = toGo(el) + } + return arr + default: + return v + } } func Get[T any](obj *JSON, path string) (T, error) { @@ -17,6 +38,15 @@ func Get[T any](obj *JSON, path string) (T, error) { var e T return e, err } + + // normalise inner json representation into something Go can deal with + val = toGo(val) + + if val == nil { + var e T + return e, nil + } + if castVal, ok := val.(T); !ok { var e T return e, fmt.Errorf("Expected value of type %T, got type %T", e, val) @@ -42,14 +72,22 @@ func indexByKey(data any, key any) (any, error) { } else { return v[k], nil } - case map[string]any: - if len(v) == 0 { + case obj: + if len(v.Fields) == 0 { return nil, nil } + if k, ok := key.(string); !ok { return nil, fmt.Errorf("Can not use %T::%v to index into %T::%v", key, key, data, data) } else { - return v[k], nil + i := 0 + for ; i < len(v.Fields); i++ { + cur := v.Fields[i] + if cur.Key == k { + return cur.Value, nil + } + } + return nil, nil } default: return nil, fmt.Errorf("Unsupported %T, can not index", data) @@ -107,9 +145,9 @@ func (j *JSON) get(path string) (any, error) { if err != nil { return nil, fmt.Errorf("%w: %q", errors.ErrUnsupported, path) } - return f(j.obj) + return f(j.inner) } func (j *JSON) MarshalJSON() ([]byte, error) { - return json.Marshal(j.obj) + return json.Marshal(toGo(j.inner)) } diff --git a/parser.go b/parser.go index a247c1d..cf0c955 100644 --- a/parser.go +++ b/parser.go @@ -51,78 +51,75 @@ func (p *parser) expression() (any, error) { } } -func (p *parser) object() (map[string]any, error) { +type field struct { + Key string + Value any +} + +type obj struct { + Fields []field +} + +var emptyObj = obj{} + +func (p *parser) object() (obj, error) { if p.cur_tok.Type != t_left_curly { - return nil, fmt.Errorf("Unexpected %q at this position, expected %q", tokennames[p.cur_tok.Type], tokennames[t_left_curly]) + return emptyObj, fmt.Errorf("Unexpected %q at this position, expected %q", tokennames[p.cur_tok.Type], tokennames[t_left_curly]) } err := p.advance() if err != nil { - return nil, err + return emptyObj, err } if p.cur_tok.Type == t_right_curly { - err := p.advance() - if err != nil { - return nil, err - } - return make(map[string]any, 0), nil + return emptyObj, p.advance() } - m := make(map[string]any, 8) + m := obj{ + Fields: make([]field, 0, 8), + } for p.cur_tok.Type != t_eof && p.cur_tok.Type != t_right_curly { - if len(m) > 0 { + if len(m.Fields) > 0 { if p.cur_tok.Type != t_comma { - return nil, fmt.Errorf("Unexpected %q at this position, expected %q", tokennames[p.cur_tok.Type], tokennames[t_comma]) + return emptyObj, fmt.Errorf("Unexpected %q at this position, expected %q", tokennames[p.cur_tok.Type], tokennames[t_comma]) } err := p.advance() if err != nil { - return nil, err + return emptyObj, err } } if p.cur_tok.Type != t_string { - return nil, fmt.Errorf("Unexpected %q at this position, expected %q", tokennames[p.cur_tok.Type], tokennames[t_string]) + return emptyObj, fmt.Errorf("Unexpected %q at this position, expected %q", tokennames[p.cur_tok.Type], tokennames[t_string]) } in := p.input[p.cur_tok.Start:p.cur_tok.End] key := *(*string)(unsafe.Pointer(&in)) err := p.advance() if err != nil { - return nil, err + return emptyObj, err } if p.cur_tok.Type != t_colon { - return nil, fmt.Errorf("Unexpected %q at this position, expected %q", tokennames[p.cur_tok.Type], tokennames[t_colon]) + return emptyObj, fmt.Errorf("Unexpected %q at this position, expected %q", tokennames[p.cur_tok.Type], tokennames[t_colon]) } err = p.advance() if err != nil { - return nil, err + return emptyObj, err } val, err := p.expression() if err != nil { - return nil, err + return emptyObj, err } - // TODO: think about activating a uniqueness check for object keys, - // would add an other hashing and a branch for each object key parsed. - // - // if _, ok := m[key]; ok { - // return nil, fmt.Errorf("Key %q is already set in this object", key) - // } - - m[key] = val + m.Fields = append(m.Fields, field{key, val}) } if p.cur_tok.Type != t_right_curly { - return nil, fmt.Errorf("Unexpected %q at this position, expected %q", tokennames[p.cur_tok.Type], tokennames[t_right_curly]) + return emptyObj, fmt.Errorf("Unexpected %q at this position, expected %q", tokennames[p.cur_tok.Type], tokennames[t_right_curly]) } - err = p.advance() - if err != nil { - return nil, err - } - - return m, nil + return m, p.advance() } func (p *parser) array() ([]any, error) { diff --git a/parser_test.go b/parser_test.go index 30731f5..9a84840 100644 --- a/parser_test.go +++ b/parser_test.go @@ -33,7 +33,7 @@ func TestParserAtoms(t *testing.T) { p := &parser{l: lexer{data: in, len: len(in)}} out, err := p.parse(in) assert.NoError(t, err) - assert.EqualValues(t, wanted[i], out) + assert.EqualValues(t, wanted[i], toGo(out)) }) } } @@ -57,7 +57,7 @@ func TestParserArray(t *testing.T) { p := &parser{l: lexer{data: in, len: len(in)}} out, err := p.parse(in) assert.NoError(t, err) - assert.EqualValues(t, wanted[i], out) + assert.EqualValues(t, wanted[i], toGo(out)) }) } } @@ -85,7 +85,7 @@ func TestParserObject(t *testing.T) { p := &parser{l: lexer{data: in, len: len(in)}} out, err := p.parse(in) assert.NoError(t, err) - assert.EqualValues(t, wanted[i], out) + assert.EqualValues(t, wanted[i], toGo(out)) }) } } @@ -113,7 +113,7 @@ func TestParserEdge(t *testing.T) { p := &parser{l: lexer{data: in, len: len(in)}} out, err := p.parse(in) assert.NoError(t, err) - assert.EqualValues(t, wanted[i], out) + assert.EqualValues(t, wanted[i], toGo(out)) }) } } @@ -148,7 +148,7 @@ func TestParserFail(t *testing.T) { p := &parser{l: lexer{data: in, len: len(in)}} out, err := p.parse(in) assert.Error(t, err) - assert.Nil(t, out) + assert.Nil(t, toGo(out)) }) } } diff --git a/tokens.go b/tokens.go index 86ff403..5a5f70d 100644 --- a/tokens.go +++ b/tokens.go @@ -1,7 +1,7 @@ package libjson // json type -type t_json int32 +type t_json int8 type token struct { Type t_json