24 commits
11e6ed7
JSON.dump: write directly into the provided IO
byroot Nov 5, 2024
31fa596
Add changes for v2.7.6
y-yagi Nov 6, 2024
74b979a
Categorize deprecated warning
nobu Nov 6, 2024
6b9a694
Fix right shift warnings
nobu Nov 6, 2024
f08790e
parser.rl: parse_string implement a fast path
byroot Nov 6, 2024
56f080b
Implement a fast path for integer parsing
byroot Nov 6, 2024
fa55cf3
Update benchmark annotations
byroot Nov 6, 2024
d91483e
Release 2.8.0
byroot Nov 6, 2024
0c3d4d0
Fix gemspec to include .jar files
byroot Nov 6, 2024
c26c49b
Release 2.8.1
byroot Nov 6, 2024
f1858cf
CI: ci.yml - update to actions/checkout@v4
MSP-Greg Nov 6, 2024
ce00d95
Added automated update workflow for GitHub Actions
hsbt Nov 7, 2024
0ba3b65
Mark parser.c and Parser.java as generated files
byroot Nov 7, 2024
650a5a5
Reduce comparisons when parsing numbers
tenderlove Nov 7, 2024
8b5c94e
Rename parse_float into parse_number
byroot Nov 7, 2024
1f346b5
JSON.load_file: explictly load the file as UTF-8
byroot Nov 7, 2024
0400251
Benchmark `Oj::Parser` in a thread safe way
byroot Nov 8, 2024
972752e
Fix $VERBOSE restore
tompng Nov 11, 2024
4b8fbdb
Only use the key cache if the Hash is in an Array
byroot Nov 13, 2024
566eaac
Fix redundant to_str call
YuheiNakasaka Nov 14, 2024
36913ae
Release 2.8.2
byroot Nov 14, 2024
cd182ae
Fix the BEWARE documentation in `load` and `unsafe_load`.
byroot Nov 16, 2024
4581b1e
Plumb OutputStream through generators
headius Nov 20, 2024
2ebe42c
Connect OutputStream-based generator to Ruby bits
headius Nov 20, 2024
2 changes: 2 additions & 0 deletions .gitattributes
@@ -0,0 +1,2 @@
ext/json/ext/parser/parser.c linguist-generated=true
java/src/json/ext/Parser.java linguist-generated=true
6 changes: 6 additions & 0 deletions .github/dependabot.yml
@@ -0,0 +1,6 @@
version: 2
updates:
- package-ecosystem: 'github-actions'
directory: '/'
schedule:
interval: 'daily'
4 changes: 2 additions & 2 deletions .github/workflows/ci.yml
@@ -35,7 +35,7 @@ jobs:
- { os: windows-latest, ruby: jruby-head }

steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4

- name: Set up Ruby
uses: ruby/setup-ruby-pkgs@v1
@@ -64,7 +64,7 @@ jobs:
fail-fast: false

steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4

- name: Set up Ruby
uses: ruby/setup-ruby-pkgs@v1
21 changes: 18 additions & 3 deletions CHANGES.md
@@ -1,12 +1,27 @@
# Changes

### UNRELEASED
### 2024-11-14 (2.8.2)

* `JSON.load_file` explicitly reads the file as UTF-8.

### 2024-11-06 (2.8.1)

* Fix the java packages to include the extension.

### 2024-11-06 (2.8.0)

* Emit a deprecation warning when `JSON.load` creates custom types without the `create_additions` option being explicitly enabled.
* Prefer to use `JSON.unsafe_load(string)` or `JSON.load(string, create_additions: true)`.
* Emit a deprecation warning when serializing valid UTF-8 strings encoded in `ASCII_8BIT` aka `BINARY`.
* Bump required_ruby_version to 2.7.
* More performance improvements to `JSON.dump` and `JSON.generate`.
* Bump required Ruby version to 2.7.
* Add support for optionally parsing trailing commas, via `allow_trailing_comma: true`, which, in conjunction with the
  pre-existing support for comments, makes it suitable for parsing `jsonc` documents.
* Many performance improvements to `JSON.parse` and `JSON.load`, up to `1.7x` faster on real world documents.
* Some minor performance improvements to `JSON.dump` and `JSON.generate`.

### 2024-11-04 (2.7.6)

* Fix a regression in `JSON.generate` when dealing with Hash keys that are string subclasses: call `to_json` on them.

### 2024-10-25 (2.7.5)

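A small usage sketch of the 2.8.0 behaviour described in the changelog above. The option and method names (`allow_trailing_comma`, `create_additions`, `JSON.unsafe_load`) are taken from the changelog; the call sites and sample documents are illustrative assumptions, not code from this PR.

```ruby
require "json"

# Trailing commas, which together with the pre-existing comment support is
# what makes parsing JSONC-style documents feasible (call site assumed).
JSON.parse('{"a": 1, "b": 2,}', allow_trailing_comma: true)
# => {"a"=>1, "b"=>2}

# Round-tripping custom types: rather than relying on JSON.load's implicit
# behaviour (which now emits a deprecation warning), opt in explicitly as
# the changelog suggests.
json_string = '{"name":"ruby"}'
JSON.unsafe_load(json_string)                    # explicit opt-in variant
JSON.parse(json_string, create_additions: true)  # or enable additions per call
```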
4 changes: 2 additions & 2 deletions Rakefile
@@ -161,7 +161,7 @@ if defined?(RUBY_ENGINE) and RUBY_ENGINE == 'jruby'
file JRUBY_PARSER_JAR => :compile do
cd 'java/src' do
parser_classes = FileList[
"json/ext/ByteListTranscoder*.class",
"json/ext/ByteList*.class",
"json/ext/OptionsReader*.class",
"json/ext/Parser*.class",
"json/ext/RuntimeInfo*.class",
@@ -179,7 +179,7 @@ if defined?(RUBY_ENGINE) and RUBY_ENGINE == 'jruby'
file JRUBY_GENERATOR_JAR => :compile do
cd 'java/src' do
generator_classes = FileList[
"json/ext/ByteListTranscoder*.class",
"json/ext/ByteList*.class",
"json/ext/OptionsReader*.class",
"json/ext/Generator*.class",
"json/ext/RuntimeInfo*.class",
26 changes: 11 additions & 15 deletions benchmark/encoder.rb
@@ -17,7 +17,6 @@
def implementations(ruby_obj)
state = JSON::State.new(JSON.dump_default_options)
{
json_state: ["json (reuse)", proc { state.generate(ruby_obj) }],
json: ["json", proc { JSON.generate(ruby_obj) }],
oj: ["oj", proc { Oj.dump(ruby_obj) }],
}
@@ -58,27 +57,24 @@ def benchmark_encoding(benchmark_name, ruby_obj, check_expected: true, except: [
# NB: Notes are based on ruby 3.3.4 (2024-07-09 revision be1089c8ec) +YJIT [arm64-darwin23]

# On the first two micro-benchmarks, the limiting factor is the fixed cost of initializing the
# generator state. Since `JSON.generate` now lazily allocates the `State` object we're now ~10% faster
# generator state. Since `JSON.generate` now lazily allocates the `State` object we're now ~10-20% faster
# than `Oj.dump`.
benchmark_encoding "small mixed", [1, "string", { a: 1, b: 2 }, [3, 4, 5]]
benchmark_encoding "small nested array", [[1,2,3,4,5]]*10

# On small hash specifically, we're just on par with `Oj.dump`. It would be worth investigating why
# Hash serialization doesn't perform as well as other types.
benchmark_encoding "small hash", { "username" => "jhawthorn", "id" => 123, "event" => "wrote json serializer" }

# On string encoding we're ~20% faster when dealing with mostly ASCII, but ~10% slower when dealing
# with mostly multi-byte characters. This is a tradeoff.
benchmark_encoding "mixed utf8", ([("a" * 5000) + "€" + ("a" * 5000)] * 500), except: %i(json_state)
benchmark_encoding "mostly utf8", ([("€" * 3333)] * 500), except: %i(json_state)
# On string encoding we're ~20% faster when dealing with mostly ASCII, but ~50% slower when dealing
# with mostly multi-byte characters. There are likely some gains left to be had in multi-byte handling.
benchmark_encoding "mixed utf8", ([("a" * 5000) + "€" + ("a" * 5000)] * 500)
benchmark_encoding "mostly utf8", ([("€" * 3333)] * 500)

# On these benchmarks we perform well; we're on par or better.
benchmark_encoding "integers", (1_000_000..1_001_000).to_a, except: %i(json_state)
benchmark_encoding "activitypub.json", JSON.load_file("#{__dir__}/data/activitypub.json"), except: %i(json_state)
benchmark_encoding "citm_catalog.json", JSON.load_file("#{__dir__}/data/citm_catalog.json"), except: %i(json_state)
benchmark_encoding "activitypub.json", JSON.load_file("#{__dir__}/data/activitypub.json")
benchmark_encoding "citm_catalog.json", JSON.load_file("#{__dir__}/data/citm_catalog.json")

# On twitter.json we're still about 10% slower, this is worth investigating.
benchmark_encoding "twitter.json", JSON.load_file("#{__dir__}/data/twitter.json"), except: %i(json_state)
# On twitter.json we're still about 6% slower; this is worth investigating.
benchmark_encoding "twitter.json", JSON.load_file("#{__dir__}/data/twitter.json")

# This benchmark spent the overwhelming majority of its time in `ruby_dtoa`. We rely on Ruby's implementation
# which uses a relatively old version of dtoa.c from David M. Gay.
@@ -89,8 +85,8 @@ def benchmark_encoding(benchmark_name, ruby_obj, check_expected: true, except: [
# but all these are implemented in C++11 or newer, making it hard if not impossible to include them.
# Short of a pure C99 implementation of these newer algorithms, there isn't much that can be done to match
# Oj speed without losing precision.
benchmark_encoding "canada.json", JSON.load_file("#{__dir__}/data/canada.json"), check_expected: false, except: %i(json_state)
benchmark_encoding "canada.json", JSON.load_file("#{__dir__}/data/canada.json"), check_expected: false

# We're about 10% faster when `to_json` calls are involved, but this wasn't particularly optimized; there might be
# opportunities here.
benchmark_encoding "many #to_json calls", [{object: Object.new, int: 12, float: 54.3, class: Float, time: Time.now, date: Date.today}] * 20, except: %i(json_state)
benchmark_encoding "many #to_json calls", [{object: Object.new, int: 12, float: 54.3, class: Float, time: Time.now, date: Date.today}] * 20
21 changes: 8 additions & 13 deletions benchmark/parser.rb
@@ -19,7 +19,7 @@ def benchmark_parsing(name, json_output)
Benchmark.ips do |x|
x.report("json") { JSON.parse(json_output) } if RUN[:json]
x.report("oj") { Oj.load(json_output) } if RUN[:oj]
x.report("Oj::Parser") { Oj::Parser.usual.parse(json_output) } if RUN[:oj]
x.report("Oj::Parser") { Oj::Parser.new(:usual).parse(json_output) } if RUN[:oj]
x.report("rapidjson") { RapidJSON.parse(json_output) } if RUN[:rapidjson]
x.compare!(order: :baseline)
end
@@ -28,27 +28,22 @@ def benchmark_parsing(name, json_output)

# NB: Notes are based on ruby 3.3.4 (2024-07-09 revision be1089c8ec) +YJIT [arm64-darwin23]

# Oj::Parser is very significantly faster (1.80x) on the nested array benchmark.
benchmark_parsing "small nested array", JSON.dump([[1,2,3,4,5]]*10)

# Oj::Parser is significantly faster (~1.5x) on the next 4 benchmarks, in large part because its
# cache is persisted across calls. That's not something we can do with the current API; we'd
# need to expose a stateful API as well, but that's not really desirable.
# Other than that we're faster than regular `Oj.load` by a good margin.
benchmark_parsing "small hash", JSON.dump({ "username" => "jhawthorn", "id" => 123, "event" => "wrote json serializer" })

benchmark_parsing "test from oj", <<JSON
{"a":"Alpha","b":true,"c":12345,"d":[true,[false,[-123456789,null],3.9676,["Something else.",false],null]],"e":{"zero":null,"one":1,"two":2,"three":[3],"four":[0,1,2,3,4]},"f":null,"h":{"a":{"b":{"c":{"d":{"e":{"f":{"g":null}}}}}}},"i":[[[[[[[null]]]]]]]}
{"a":"Alpha","b":true,"c":12345,"d":[true,[false,[-123456789,null],3.9676,["Something else.",false],null]],
"e":{"zero":null,"one":1,"two":2,"three":[3],"four":[0,1,2,3,4]},"f":null,
"h":{"a":{"b":{"c":{"d":{"e":{"f":{"g":null}}}}}}},"i":[[[[[[[null]]]]]]]}
JSON

# On these macro-benchmarks, we're on par with `Oj::Parser` and significantly
# faster than `Oj.load`.
# On these macro-benchmarks, we're on par with `Oj::Parser`, except on `twitter.json` where we're `1.14x` faster,
# and between 1.3x and 1.5x faster than `Oj.load`.
benchmark_parsing "activitypub.json", File.read("#{__dir__}/data/activitypub.json")
benchmark_parsing "twitter.json", File.read("#{__dir__}/data/twitter.json")
benchmark_parsing "citm_catalog.json", File.read("#{__dir__}/data/citm_catalog.json")

# rapidjson is 8x faster thanks to it's much more performant float parser.
# rapidjson is 8x faster thanks to its much more performant float parser.
# Unfortunately, there aren't many existing fast float parsers in pure C,
# and including C++ is problematic.
# Aside from that, we're much faster than other alternatives here.
# Aside from that, we're close to the alternatives here.
benchmark_parsing "float parsing", File.read("#{__dir__}/data/canada.json")
33 changes: 26 additions & 7 deletions ext/json/ext/fbuffer/fbuffer.h
@@ -46,6 +46,7 @@ typedef struct FBufferStruct {
unsigned long len;
unsigned long capa;
char *ptr;
VALUE io;
} FBuffer;

#define FBUFFER_STACK_SIZE 512
@@ -66,7 +67,7 @@ static void fbuffer_append_long(FBuffer *fb, long number);
#endif
static inline void fbuffer_append_char(FBuffer *fb, char newchr);
#ifdef JSON_GENERATOR
static VALUE fbuffer_to_s(FBuffer *fb);
static VALUE fbuffer_finalize(FBuffer *fb);
#endif

static void fbuffer_stack_init(FBuffer *fb, unsigned long initial_length, char *stack_buffer, long stack_buffer_size)
@@ -86,15 +87,26 @@ static void fbuffer_free(FBuffer *fb)
}
}

#ifndef JSON_GENERATOR
static void fbuffer_clear(FBuffer *fb)
{
fb->len = 0;
}
#endif

static void fbuffer_flush(FBuffer *fb)
{
rb_io_write(fb->io, rb_utf8_str_new(fb->ptr, fb->len));
fbuffer_clear(fb);
}

static void fbuffer_do_inc_capa(FBuffer *fb, unsigned long requested)
{
if (RB_UNLIKELY(fb->io)) {
fbuffer_flush(fb);
if (RB_LIKELY(requested < fb->capa)) {
return;
}
}

unsigned long required;

if (RB_UNLIKELY(!fb->ptr)) {
@@ -174,11 +186,18 @@ static void fbuffer_append_long(FBuffer *fb, long number)
fbuffer_append(fb, buffer_end - len, len);
}

static VALUE fbuffer_to_s(FBuffer *fb)
static VALUE fbuffer_finalize(FBuffer *fb)
{
VALUE result = rb_utf8_str_new(FBUFFER_PTR(fb), FBUFFER_LEN(fb));
fbuffer_free(fb);
return result;
if (fb->io) {
fbuffer_flush(fb);
fbuffer_free(fb);
rb_io_flush(fb->io);
return fb->io;
} else {
VALUE result = rb_utf8_str_new(FBUFFER_PTR(fb), FBUFFER_LEN(fb));
fbuffer_free(fb);
return result;
}
}
#endif
#endif
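The new `io` field, `fbuffer_flush`, and `fbuffer_finalize` above are the pieces that let the generator write straight into an IO (the "JSON.dump: write directly into the provided IO" commit). A usage-level sketch of what that enables; `StringIO` and the sample payload are illustrative assumptions:

```ruby
require "json"
require "stringio"

payload = { "status" => "ok", "items" => (1..5).to_a }

# JSON.dump has long accepted an IO-like second argument; per the commits in
# this PR, the generated JSON can now be flushed into that IO as the buffer
# fills instead of always being assembled as one big Ruby string first.
io = StringIO.new
JSON.dump(payload, io)
io.string  # => '{"status":"ok","items":[1,2,3,4,5]}'
```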