diff --git a/.gitignore b/.gitignore index 57edc61..ead7226 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,3 @@ -test/*.json -test/test -test/*.pprof +benchmarks/*.json +benchmarks/test +*.pprof diff --git a/README.md b/README.md index 173bc53..382c944 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,5 @@ # libjson -> WARNING: libjson is currently a work in progress :) - Fast and minimal JSON parser written in and for Go with a JIT query language ```go @@ -13,16 +11,17 @@ import ( func main() { input := `{ "hello": {"world": ["hi"] } }` - jsonObj, _ := New(input) // or libjson.NewReader(r io.Reader) + jsonObj, _ := libjson.New([]byte(input)) // or libjson.NewReader(r io.Reader) // accessing values - fmt.Println(Get[string](jsonObj, ".hello.world.0")) // hi, nil + fmt.Println(libjson.Get[string](jsonObj, ".hello.world.0")) // hi, nil } ``` ## Features -- [ECMA 404](https://ecma-international.org/wp-content/uploads/ECMA-404_2nd_edition_december_2017.pdf) +- Parser consumes and mutates the input to make most operations zero copy and zero alloc +- [ECMA 404](https://ecma-international.org/publications-and-standards/standards/ecma-404/) and [rfc8259](https://www.rfc-editor.org/rfc/rfc8259) compliant - tests against [JSONTestSuite](https://github.com/nst/JSONTestSuite), see [Parsing JSON is a Minefield @@ -35,6 +34,15 @@ func main() { - caching of queries with `libjson.Compile`, just in time caching of queries - serialisation via `json.Marshal` +## Why is it faster than encoding/json? + +- zero-copy strings +- mutate input for string escaping instead of allocating a new one +- no allocations for strings, views into the original input +- no reflection +- no copies for map keys +- very simple lexer and parser + ## Benchmarks ![libjson-vs-encodingjson](https://github.com/user-attachments/assets/b11bcce4-e7db-4c45-ab42-45a2042e2a51) diff --git a/benchmarks/bench.sh b/benchmarks/bench.sh new file mode 100755 index 0000000..0787939 --- /dev/null +++ b/benchmarks/bench.sh @@ -0,0 +1,15 @@ +#!/bin/bash +echo "generating example data" +python3 gen.py + +echo "building executable" +rm ./test +go build -o ./test ../cmd/lj.go + +for SIZE in 1MB 5MB 10MB 100MB; do + hyperfine \ + --warmup 1 \ + --runs 10 \ + "./test -s ./${SIZE}.json" \ + "./test -s -libjson=false ./${SIZE}.json" +done diff --git a/benchmarks/gen.py b/benchmarks/gen.py new file mode 100644 index 0000000..f169beb --- /dev/null +++ b/benchmarks/gen.py @@ -0,0 +1,47 @@ +from os.path import exists +import math +import json + +sizes =[1,5,10,100] + +line = json.dumps({ + "id": 12345, + "name": "very_long_string_with_escapes_and_unicode_abcdefghijklmnopqrstuvwxyz_0123456789", + "description": "This string contains\nmultiple\nlines\nand \"quotes\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"", + "nested": { + "level1": { + "level2": { + "level3": { + "level4": { + "array": [ + "short", + "string_with_escape\\n", + "another\\tvalue", + "unicode\u2603", + "escaped_quote_\"_and_backslash_\\", + 11234567890,1234567890,1234567890,1234567890,1234567890,1234567890,1234567890,1234567890,1234567890,1234567890,1234567890,1234567890,234567890, + -1.2345e67, + 3.1415926535897932384626433832795028841971, + True, + False, + None, + "\u0041\u0042\u0043\u00A9\u20AC\u0041\u0042\u0043\u00A9\u20AC\u0041\u0042\u0043\u00A9\u20AC\u0041\u0042\u0043\u00A9\u20AC\u0041\u0042\u0043\u00A9\u20AC\u0041\u0042\u0043\u00A9\u20AC\u0041\u0042\u0043\u00A9\u20AC\u0041\u0042\u0043\u00A9\u20AC\u0041\u0042\u0043\u00A9\u20AC\u0041\u0042\u0043\u00A9\u20AC\u0041\u0042\u0043\u00A9\u20AC", + "mix\\n\\t\\r\\\\\\\"end" + ] + } + } + } + } + } +}) + +def write_data(size: int): + name = f"{size}MB.json" + if not exists(name): + with open(name, mode="w", encoding="utf8") as f: + f.write("[\n") + size = math.floor((size*1000000)/len(line)) + f.write(",\n".join([line for _ in range(0, size)])) + f.write("\n]") + +[write_data(size) for size in sizes] diff --git a/cmd/lj.go b/cmd/lj.go index f87b538..c65f6fc 100644 --- a/cmd/lj.go +++ b/cmd/lj.go @@ -1,9 +1,14 @@ package main import ( + "encoding/json" + "flag" "fmt" "log" "os" + "path/filepath" + "runtime/debug" + "runtime/pprof" "github.com/xnacly/libjson" ) @@ -16,17 +21,69 @@ func Must[T any](t T, err error) T { } func main() { - args := os.Args + noGc := flag.Bool("nogc", false, "disable the go garbage collector") + useLibjson := flag.Bool("libjson", true, "use libjson, if false use encoding/json") + usePprof := flag.Bool("pprof", false, "use pprof cpu tracing") + query := flag.String("q", ".", "query the parsed json") + silent := flag.Bool("s", false, "no stdoutput") + escape := flag.Bool("e", false, "escapes input with Gos '%#+v'") + flag.Parse() + + if *noGc { + debug.SetGCPercent(-1) + } + + args := flag.Args() + + var filePath string var file *os.File if info, err := os.Stdin.Stat(); err != nil || info.Mode()&os.ModeCharDevice != 0 { // we are in a pipe - if len(args) == 1 { - log.Fatalln("Wanted a file as first argument, got nothing, exiting") + if len(args) == 0 { + log.Fatalln("Wanted a file as an argument, got nothing, exiting") } - file = Must(os.Open(args[1])) + filePath = args[0] + file = Must(os.Open(filePath)) } else { file = os.Stdin + filePath = "stdin" + } + + if *usePprof { + f, err := os.Create(filepath.Base(filePath) + ".pprof") + if err != nil { + panic(err) + } + pprof.StartCPUProfile(f) + defer pprof.StopCPUProfile() + } + + if *useLibjson { + out := Must(libjson.NewReader(file)) + if !*silent { + out := Must(libjson.Get[any](&out, *query)) + if *escape { + fmt.Printf("%#+v\n", out) + } else { + fmt.Println(out) + } + } + } else { + if *query != "." { + panic("With -libjson=false, there is no support for querying the json") + } + + decoder := json.NewDecoder(file) + var out any + if err := decoder.Decode(&out); err != nil { + panic(err) + } + + if !*silent { + if *escape { + fmt.Printf("%#+v\n", out) + } else { + fmt.Println(out) + } + } } - query := os.Args[len(os.Args)-1] - json := Must(libjson.NewReader(file)) - fmt.Printf("%+#v\n", Must(libjson.Get[any](&json, query))) } diff --git a/float.go b/float.go deleted file mode 100644 index 81f1be5..0000000 --- a/float.go +++ /dev/null @@ -1,92 +0,0 @@ -package libjson - -import ( - "errors" -) - -func pow10(exp int) float64 { - res := 1.0 - if exp > 0 { - for i := 0; i < exp; i++ { - res *= 10 - } - } else { - for i := 0; i < -exp; i++ { - res /= 10 - } - } - return res -} - -// non allocating float parsing -func parseFloat(input []byte) (float64, error) { - if len(input) == 0 { - return 0, errors.New("empty input") - } - - pos := 0 - neg := false - if input[pos] == '-' { - neg = true - pos++ - } - - mantissa := uint64(0) - exponent := 0 - seenDot := false - - for pos < len(input) { - c := input[pos] - if c >= '0' && c <= '9' { - mantissa = mantissa*10 + uint64(c-'0') - if seenDot { - exponent-- - } - pos++ - } else if c == '.' { - if seenDot { - return 0, errors.New("multiple dots in number") - } - seenDot = true - pos++ - } else { - break - } - } - - // weird eE+- handling - if pos < len(input) && (input[pos] == 'e' || input[pos] == 'E') { - pos++ - expNeg := false - if pos < len(input) && input[pos] == '-' { - expNeg = true - pos++ - } else if pos < len(input) && input[pos] == '+' { - pos++ - } - - if pos >= len(input) || input[pos] < '0' || input[pos] > '9' { - return 0, errors.New("missing digits in exponent") - } - - expVal := 0 - for pos < len(input) && input[pos] >= '0' && input[pos] <= '9' { - expVal = expVal*10 + int(input[pos]-'0') - pos++ - } - if expNeg { - expVal = -expVal - } - exponent += expVal - } - - if mantissa == 0 { - return 0, nil - } - - result := float64(mantissa) * pow10(exponent) - if neg { - result = -result - } - return result, nil -} diff --git a/go.mod b/go.mod index 3d85827..2724904 100644 --- a/go.mod +++ b/go.mod @@ -1,6 +1,6 @@ module github.com/xnacly/libjson -go 1.23.0 +go 1.26.0 require github.com/stretchr/testify v1.9.0 diff --git a/hex.go b/hex.go new file mode 100644 index 0000000..8fa4a59 --- /dev/null +++ b/hex.go @@ -0,0 +1,54 @@ +package libjson + +import "errors" + +var invalid_hex_err = errors.New("invalid hex") + +var hexTable [256]byte + +func init() { + for i := 0; i < 256; i++ { + hexTable[i] = 0xFF + } + for i := byte('0'); i <= '9'; i++ { + hexTable[i] = i - '0' + } + for i := byte('a'); i <= 'f'; i++ { + hexTable[i] = i - 'a' + 10 + } + for i := byte('A'); i <= 'F'; i++ { + hexTable[i] = i - 'A' + 10 + } +} + +// hex4 converts 4 ASCII hex bytes to a rune. +// Returns an error if any byte is invalid. +func hex4(b []byte) (r rune, err error) { + var v byte + + v = hexTable[b[0]] + if v == 0xFF { + return 0, invalid_hex_err + } + r = rune(v) << 12 + + v = hexTable[b[1]] + if v == 0xFF { + return 0, invalid_hex_err + } + r |= rune(v) << 8 + + v = hexTable[b[2]] + if v == 0xFF { + return 0, invalid_hex_err + } + r |= rune(v) << 4 + + v = hexTable[b[3]] + if v == 0xFF { + return 0, invalid_hex_err + } + r |= rune(v) + + return r, nil +} diff --git a/json.go b/json.go index 8477a8c..c433ffb 100644 --- a/json.go +++ b/json.go @@ -9,7 +9,7 @@ func NewReader(r io.Reader) (JSON, error) { if err != nil { return JSON{}, err } - p := parser{l: lexer{data: data}} + p := parser{l: lexer{data: data, len: len(data)}} obj, err := p.parse(data) if err != nil { return JSON{}, err @@ -17,8 +17,9 @@ func NewReader(r io.Reader) (JSON, error) { return JSON{obj}, nil } +// data is consumed and possibly mutated, DO NOT REUSE func New(data []byte) (JSON, error) { - p := parser{l: lexer{data: data}} + p := parser{l: lexer{data: data, len: len(data)}} obj, err := p.parse(data) if err != nil { return JSON{}, err diff --git a/json_test.go b/json_test.go index f7af6c3..85ccdd2 100644 --- a/json_test.go +++ b/json_test.go @@ -9,31 +9,88 @@ import ( ) const amount = 50_000 +const naiveInput = `{"key1":"value","array":[],"obj":{},"atomArray":[11201,1e112,true,false,null,"str"]},` +const escapedInput = `{"text":"line1\nline2\nline3","quote":"\"hello\"","path":"C:\\\\Users\\\\name","unicode":"\u0041\u0042\u0043","mix":"abc\\ndef\"ghi\u263A"},` +const hardInput = `{ + "id": 12345, + "name": "very_long_string_with_escapes_and_unicode_abcdefghijklmnopqrstuvwxyz_0123456789", + "description": "This string contains\nmultiple\nlines\nand \"quotes\" and unicode \u2764\u2764\u2764", + "nested": { + "level1": { + "level2": { + "level3": { + "level4": { + "array": [ + "short", + "string_with_escape\\n", + "another\\tvalue", + "unicode\u2603", + "escaped_quote_\"_and_backslash_\\", + 1234567890, + -1.2345e67, + 3.141592653589793, + true, + false, + null, + "ABC\u00a9\u20ac", + "mix\\n\\t\\r\\\\\\\"end" + ] + } + } + } + } + } +},` -func BenchmarkLibJson(b *testing.B) { - data := strings.Repeat(`{"key1": "value","array": [],"obj": {},"atomArray": [11201,1e112,true,false,null,"str"]},`, amount) +func benchmarkWithInput(b *testing.B, input string) { + data := strings.Repeat(input, amount) d := []byte("[" + data[:len(data)-1] + "]") + b.ResetTimer() for i := 0; i < b.N; i++ { - _, err := New(d) + buf := make([]byte, len(d)) + copy(buf, d) + b.StartTimer() + _, err := New(buf) + b.StopTimer() assert.NoError(b, err) } b.ReportAllocs() } -func BenchmarkEncodingJson(b *testing.B) { - data := strings.Repeat(`{"key1": "value","array": [],"obj": {},"atomArray": [11201,1e112,true,false,null,"str"]},`, amount) +func benchmarkEncodingJsonWithInput(b *testing.B, input string) { + data := strings.Repeat(input, amount) d := []byte("[" + data[:len(data)-1] + "]") + b.ResetTimer() for i := 0; i < b.N; i++ { - v := []struct { - Key1 string - Array []any - Obj any - AtomArray []any - }{} + var v any err := json.Unmarshal(d, &v) assert.NoError(b, err) } b.ReportAllocs() } + +func BenchmarkLibJson_Naive(b *testing.B) { + benchmarkWithInput(b, naiveInput) +} + +func BenchmarkLibJson_Escaped(b *testing.B) { + benchmarkWithInput(b, escapedInput) +} + +func BenchmarkLibJson_Hard(b *testing.B) { + benchmarkWithInput(b, hardInput) +} + +func BenchmarkEncodingJson_Naive(b *testing.B) { + benchmarkEncodingJsonWithInput(b, naiveInput) +} + +func BenchmarkEncodingJson_Escaped(b *testing.B) { + benchmarkEncodingJsonWithInput(b, escapedInput) +} + +func BenchmarkEncodingJson_Hard(b *testing.B) { + benchmarkEncodingJsonWithInput(b, hardInput) +} diff --git a/lexer.go b/lexer.go index e3888d7..d49434b 100644 --- a/lexer.go +++ b/lexer.go @@ -9,6 +9,7 @@ import ( type lexer struct { data []byte pos int + len int } var numChar [256]bool @@ -25,7 +26,7 @@ func init() { } func (l *lexer) next() (token, error) { - for l.pos < len(l.data) { + for l.pos < l.len { cc := l.data[l.pos] if cc == ' ' || cc == '\n' || cc == '\t' || cc == '\r' { l.pos++ @@ -34,7 +35,7 @@ func (l *lexer) next() (token, error) { } } - if l.pos >= len(l.data) { + if l.pos >= l.len { return empty, nil } @@ -57,16 +58,33 @@ func (l *lexer) next() (token, error) { tt = t_colon case '"': start := l.pos - for i := start; i < len(l.data); i++ { - if l.data[i] == '"' { + for i := start; i < l.len; i++ { + if c := l.data[i]; c == '"' { t := token{Type: t_string, Start: start, End: i} l.pos = i + 1 return t, nil + } else if c == '\\' { // OH NO ITS ESCAPING :O + i++ + if i >= l.len { + return empty, errors.New("Unterminated string escape") + } + + switch l.data[i] { + case '"', '\\', '/', 'b', 'f', 'n', 'r', 't': + // we simply skip the escaped char, the parser has to + case 'u': + if i+4 > l.len { + return empty, errors.New("Unterminated string") + } + i += 4 + default: + return empty, fmt.Errorf("Invalid escape %q", l.data[i]) + } } } return empty, errors.New("Unterminated string") case 't': // this should always be the 'true' atom and is therefore optimised here - if l.pos+3 > len(l.data) { + if l.pos+3 > l.len { return empty, errors.New("Failed to read the expected 'true' atom") } if !(l.data[l.pos] == 'r' && l.data[l.pos+1] == 'u' && l.data[l.pos+2] == 'e') { @@ -75,7 +93,7 @@ func (l *lexer) next() (token, error) { l.pos += 3 tt = t_true case 'f': // this should always be the 'false' atom and is therefore optimised here - if l.pos+4 > len(l.data) { + if l.pos+4 > l.len { return empty, errors.New("Failed to read the expected 'false' atom") } if !(l.data[l.pos] == 'a' && l.data[l.pos+1] == 'l' && l.data[l.pos+2] == 's' && l.data[l.pos+3] == 'e') { @@ -84,7 +102,7 @@ func (l *lexer) next() (token, error) { l.pos += 4 tt = t_false case 'n': // this should always be the 'null' atom and is therefore optimised here - if l.pos+3 > len(l.data) { + if l.pos+3 > l.len { return empty, errors.New("Failed to read the expected 'null' atom") } if !(l.data[l.pos] == 'u' && l.data[l.pos+1] == 'l' && l.data[l.pos+2] == 'l') { @@ -95,7 +113,7 @@ func (l *lexer) next() (token, error) { default: if cc == '-' || (cc >= '0' && cc <= '9') { start := l.pos - 1 - for l.pos < len(l.data) && numChar[l.data[l.pos]] { + for l.pos < l.len && numChar[l.data[l.pos]] { l.pos++ } @@ -115,6 +133,7 @@ func (l *lexer) lex(r io.Reader) ([]token, error) { if err != nil { return nil, err } + l.len = len(l.data) toks := make([]token, 0, len(l.data)/2) for { diff --git a/object.go b/object.go index 4e59837..83ee480 100644 --- a/object.go +++ b/object.go @@ -8,7 +8,28 @@ import ( ) type JSON struct { - obj any + inner any +} + +// takes a JSON.inner value and converts it to Go, for instance merges the obj +// fields into a map +func toGo(json any) any { + switch v := json.(type) { + case obj: + m := make(map[string]any, len(v.Fields)) + for _, f := range v.Fields { + m[f.Key] = toGo(f.Value) + } + return m + case []any: + arr := make([]any, len(v)) + for i, el := range v { + arr[i] = toGo(el) + } + return arr + default: + return v + } } func Get[T any](obj *JSON, path string) (T, error) { @@ -17,6 +38,15 @@ func Get[T any](obj *JSON, path string) (T, error) { var e T return e, err } + + // normalise inner json representation into something Go can deal with + val = toGo(val) + + if val == nil { + var e T + return e, nil + } + if castVal, ok := val.(T); !ok { var e T return e, fmt.Errorf("Expected value of type %T, got type %T", e, val) @@ -42,14 +72,22 @@ func indexByKey(data any, key any) (any, error) { } else { return v[k], nil } - case map[string]any: - if len(v) == 0 { + case obj: + if len(v.Fields) == 0 { return nil, nil } + if k, ok := key.(string); !ok { return nil, fmt.Errorf("Can not use %T::%v to index into %T::%v", key, key, data, data) } else { - return v[k], nil + i := 0 + for ; i < len(v.Fields); i++ { + cur := v.Fields[i] + if cur.Key == k { + return cur.Value, nil + } + } + return nil, nil } default: return nil, fmt.Errorf("Unsupported %T, can not index", data) @@ -107,9 +145,9 @@ func (j *JSON) get(path string) (any, error) { if err != nil { return nil, fmt.Errorf("%w: %q", errors.ErrUnsupported, path) } - return f(j.obj) + return f(j.inner) } func (j *JSON) MarshalJSON() ([]byte, error) { - return json.Marshal(j.obj) + return json.Marshal(toGo(j.inner)) } diff --git a/parser.go b/parser.go index 98faa54..cf0c955 100644 --- a/parser.go +++ b/parser.go @@ -1,7 +1,10 @@ package libjson import ( + "errors" "fmt" + "strconv" + "unicode/utf8" "unsafe" ) @@ -48,78 +51,75 @@ func (p *parser) expression() (any, error) { } } -func (p *parser) object() (map[string]any, error) { +type field struct { + Key string + Value any +} + +type obj struct { + Fields []field +} + +var emptyObj = obj{} + +func (p *parser) object() (obj, error) { if p.cur_tok.Type != t_left_curly { - return nil, fmt.Errorf("Unexpected %q at this position, expected %q", tokennames[p.cur_tok.Type], tokennames[t_left_curly]) + return emptyObj, fmt.Errorf("Unexpected %q at this position, expected %q", tokennames[p.cur_tok.Type], tokennames[t_left_curly]) } err := p.advance() if err != nil { - return nil, err + return emptyObj, err } - m := make(map[string]any, 4) - if p.cur_tok.Type == t_right_curly { - err := p.advance() - if err != nil { - return nil, err - } - return m, nil + return emptyObj, p.advance() + } + + m := obj{ + Fields: make([]field, 0, 8), } for p.cur_tok.Type != t_eof && p.cur_tok.Type != t_right_curly { - if len(m) > 0 { + if len(m.Fields) > 0 { if p.cur_tok.Type != t_comma { - return nil, fmt.Errorf("Unexpected %q at this position, expected %q", tokennames[p.cur_tok.Type], tokennames[t_comma]) + return emptyObj, fmt.Errorf("Unexpected %q at this position, expected %q", tokennames[p.cur_tok.Type], tokennames[t_comma]) } err := p.advance() if err != nil { - return nil, err + return emptyObj, err } } if p.cur_tok.Type != t_string { - return nil, fmt.Errorf("Unexpected %q at this position, expected %q", tokennames[p.cur_tok.Type], tokennames[t_string]) + return emptyObj, fmt.Errorf("Unexpected %q at this position, expected %q", tokennames[p.cur_tok.Type], tokennames[t_string]) } in := p.input[p.cur_tok.Start:p.cur_tok.End] key := *(*string)(unsafe.Pointer(&in)) err := p.advance() if err != nil { - return nil, err + return emptyObj, err } if p.cur_tok.Type != t_colon { - return nil, fmt.Errorf("Unexpected %q at this position, expected %q", tokennames[p.cur_tok.Type], tokennames[t_colon]) + return emptyObj, fmt.Errorf("Unexpected %q at this position, expected %q", tokennames[p.cur_tok.Type], tokennames[t_colon]) } err = p.advance() if err != nil { - return nil, err + return emptyObj, err } val, err := p.expression() if err != nil { - return nil, err + return emptyObj, err } - // TODO: think about activating a uniqueness check for object keys, - // would add an other hashing and a branch for each object key parsed. - // - // if _, ok := m[key]; ok { - // return nil, fmt.Errorf("Key %q is already set in this object", key) - // } - - m[key] = val + m.Fields = append(m.Fields, field{key, val}) } if p.cur_tok.Type != t_right_curly { - return nil, fmt.Errorf("Unexpected %q at this position, expected %q", tokennames[p.cur_tok.Type], tokennames[t_right_curly]) + return emptyObj, fmt.Errorf("Unexpected %q at this position, expected %q", tokennames[p.cur_tok.Type], tokennames[t_right_curly]) } - err = p.advance() - if err != nil { - return nil, err - } - - return m, nil + return m, p.advance() } func (p *parser) array() ([]any, error) { @@ -161,17 +161,97 @@ func (p *parser) array() ([]any, error) { return a, p.advance() } +var badEscapeErr = errors.New("bad escape") + +// unescapes JSON escapes in a buffer into their non-JSON representation +// +// Returns the end of the in place escaped buffer so the caller can resize to +// the new, smaller buffer size +// +// The implementation may look weird, but is optimised to have the least +// possible branches +func unescapeInPlace(in []byte) (int, error) { + curEnd := 0 + for i := 0; i < len(in); i++ { + b := in[i] + if b != '\\' { + in[curEnd] = b + curEnd++ + continue + } + + // check if there’s at least 1 more byte for the escape + if i+1 >= len(in) { + return 0, badEscapeErr + } + i++ // skip \ + b = in[i] + + switch b { + case '"', '\\', '/': + in[curEnd] = b + curEnd++ + case 'b': + in[curEnd] = '\b' + curEnd++ + case 'f': + in[curEnd] = '\f' + curEnd++ + case 'n': + in[curEnd] = '\n' + curEnd++ + case 'r': + in[curEnd] = '\r' + curEnd++ + case 't': + in[curEnd] = '\t' + curEnd++ + case 'u': // \uXXXX + + // From ECMA-404: + // + // However, whether a processor of JSON texts interprets such a surrogate pair + // as a single code point or as an explicit surrogate pair is a semantic + // decision that is determined by the specific processor. + // + // meaning we dont merge unicode points, firstly because fuck + // utf16, and secondly because its simpler to just keep two unicode + // points separate compared to increasing the complexity of this + // decoding + + if i+4 >= len(in) { + return 0, badEscapeErr + } + + r, err := hex4(in[i+1 : i+5]) + if err != nil { + return 0, err + } + n := utf8.EncodeRune(in[curEnd:], r) + curEnd += n + i += 4 + } // we dont need a default case since we check all possible escapes in the lexer + } + + return curEnd, nil +} + func (p *parser) atom() (any, error) { var r any switch p.cur_tok.Type { case t_string: in := p.input[p.cur_tok.Start:p.cur_tok.End] + end, err := unescapeInPlace(in) + if err != nil { + return nil, err + } + in = in[:end] r = *(*string)(unsafe.Pointer(&in)) case t_number: raw := p.input[p.cur_tok.Start:p.cur_tok.End] - number, err := parseFloat(raw) + number, err := strconv.ParseFloat(*(*string)(unsafe.Pointer(&raw)), 64) if err != nil { - return empty, fmt.Errorf("Invalid floating point number %q: %w", string(raw), err) + return nil, fmt.Errorf("Invalid floating point number %q: %w", string(raw), err) } r = number case t_true: diff --git a/parser_test.go b/parser_test.go index 907473b..9a84840 100644 --- a/parser_test.go +++ b/parser_test.go @@ -30,10 +30,10 @@ func TestParserAtoms(t *testing.T) { for i, in := range input { t.Run(in, func(t *testing.T) { in := []byte(in) - p := &parser{l: lexer{data: in}} + p := &parser{l: lexer{data: in, len: len(in)}} out, err := p.parse(in) assert.NoError(t, err) - assert.EqualValues(t, wanted[i], out) + assert.EqualValues(t, wanted[i], toGo(out)) }) } } @@ -54,10 +54,10 @@ func TestParserArray(t *testing.T) { for i, in := range input { t.Run(in, func(t *testing.T) { in := []byte(in) - p := &parser{l: lexer{data: in}} + p := &parser{l: lexer{data: in, len: len(in)}} out, err := p.parse(in) assert.NoError(t, err) - assert.EqualValues(t, wanted[i], out) + assert.EqualValues(t, wanted[i], toGo(out)) }) } } @@ -82,10 +82,10 @@ func TestParserObject(t *testing.T) { for i, in := range input { t.Run(in, func(t *testing.T) { in := []byte(in) - p := &parser{l: lexer{data: in}} + p := &parser{l: lexer{data: in, len: len(in)}} out, err := p.parse(in) assert.NoError(t, err) - assert.EqualValues(t, wanted[i], out) + assert.EqualValues(t, wanted[i], toGo(out)) }) } } @@ -110,10 +110,10 @@ func TestParserEdge(t *testing.T) { for i, in := range input { t.Run(in, func(t *testing.T) { in := []byte(in) - p := &parser{l: lexer{data: in}} + p := &parser{l: lexer{data: in, len: len(in)}} out, err := p.parse(in) assert.NoError(t, err) - assert.EqualValues(t, wanted[i], out) + assert.EqualValues(t, wanted[i], toGo(out)) }) } } @@ -145,10 +145,10 @@ func TestParserFail(t *testing.T) { for _, in := range input { t.Run(in, func(t *testing.T) { in := []byte(in) - p := &parser{l: lexer{data: in}} + p := &parser{l: lexer{data: in, len: len(in)}} out, err := p.parse(in) assert.Error(t, err) - assert.Nil(t, out) + assert.Nil(t, toGo(out)) }) } } diff --git a/test/bench.sh b/test/bench.sh deleted file mode 100755 index 9585625..0000000 --- a/test/bench.sh +++ /dev/null @@ -1,11 +0,0 @@ -#!/bin/bash -echo "generating example data" -python3 gen.py - -echo "building executable" -rm ./test -go build ./test.go - -hyperfine "./test ./1MB.json" "./test -libjson=false ./1MB.json" -hyperfine "./test ./5MB.json" "./test -libjson=false ./5MB.json" -hyperfine "./test ./10MB.json" "./test -libjson=false ./10MB.json" diff --git a/test/gen.py b/test/gen.py deleted file mode 100644 index 50d2bcb..0000000 --- a/test/gen.py +++ /dev/null @@ -1,22 +0,0 @@ -from os.path import exists -import math - -sizes =[1,5,10] - -line = """\t{ - "key1": "value", - "array": [], - "obj": {}, - "atomArray": [11201,1e112,true,false,null,"str"] - }""" - -def write_data(size: int): - name = f"{size}MB.json" - if not exists(name): - with open(name, mode="w", encoding="utf8") as f: - f.write("[\n") - size = math.floor((size*1000000)/len(line)) - f.write(",\n".join([line for _ in range(0, size)])) - f.write("\n]") - -[write_data(size) for size in sizes] diff --git a/test/test.go b/test/test.go deleted file mode 100644 index 3a6dfe0..0000000 --- a/test/test.go +++ /dev/null @@ -1,49 +0,0 @@ -package main - -import ( - "encoding/json" - "flag" - "log" - "os" - - // "runtime/pprof" - - "github.com/xnacly/libjson" -) - -func main() { - // f, err := os.Create("cpu.pprof") - // if err != nil { - // panic(err) - // } - // pprof.StartCPUProfile(f) - // defer pprof.StopCPUProfile() - lj := flag.Bool("libjson", true, "benchmark libjson or gojson") - flag.Parse() - args := flag.Args() - if len(args) == 0 { - log.Fatalln("Wanted a file as first argument, got nothing, exiting") - } - file, err := os.Open(args[0]) - if err != nil { - log.Fatalln(err) - } - if *lj { - _, err := libjson.NewReader(file) - if err != nil { - log.Fatalln(err) - } - } else { - v := []struct { - Key1 string - Array []any - Obj any - AtomArray []any - }{} - d := json.NewDecoder(file) - err := d.Decode(&v) - if err != nil { - log.Fatalln(err) - } - } -} diff --git a/types.go b/tokens.go similarity index 98% rename from types.go rename to tokens.go index 86ff403..5a5f70d 100644 --- a/types.go +++ b/tokens.go @@ -1,7 +1,7 @@ package libjson // json type -type t_json int32 +type t_json int8 type token struct { Type t_json