diff --git a/README.md b/README.md index c74ba40..1aa59f6 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,7 @@ It's based on [maxminddb-rust](https://github.com/oschwald/maxminddb-rust) imple You must create a copy if you wish to continue using the string when the database is closed. You'll need [MaxMind-DB/test-data](https://github.com/maxmind/MaxMind-DB/tree/main/test-data) -to run tests/examples and `GeoLite2-City.mmdb` to run the benchmark. +to run tests/examples and `GeoLite2-City.mmdb` to run the benchmarks. ```sh $ git submodule update --init @@ -45,9 +45,18 @@ See [examples](./examples/). ## Suggestions +Build the IPv4 index to speed up lookups with `.ipv4_index_first_n_bits` if you have a long-lived `Reader`. +The recommended value is 16 (~320KB fits L2 cache, ~1-4ms to build when warm +and ~10ms-120ms due to page faults) or 12 (~20KB) for constrained devices. + +```zig +var db = try maxminddb.Reader.mmap(allocator, db_path, .{ .ipv4_index_first_n_bits = 16 }); +defer db.close(); +``` + Use `ArenaAllocator` for best performance, see [benchmarks](./benchmarks/). -If you don't need all the fields, use `Options.only` to decode only the top-level fields you want. +If you don't need all the fields, use `.only` to decode only the top-level fields you want. ```zig const fields = &.{ "city", "country" }; @@ -78,13 +87,14 @@ if (result) |r| { } ``` -Here are reference results on Apple M2 Pro (1M random IPv4 lookups against GeoLite2-City): +Here are reference results on Apple M2 Pro (1M random IPv4 lookups against GeoLite2-City +with `ipv4_index_first_n_bits = 16`): | Benchmark | All fields | Filtered (city) | |--- |--- |--- | -| `geolite2.City` | ~1,189,000 | ~1,245,000 | -| `MyCity` | ~1,228,000 | — | -| `any.Value` | ~1,150,000 | ~1,234,000 | +| `geolite2.City` | ~1,284,000 | ~1,348,000 | +| `MyCity` | ~1,383,000 | — | +| `any.Value` | ~1,254,000 | ~1,349,000 |
@@ -103,27 +113,51 @@ $ for i in $(seq 1 10); do 2>&1 | grep 'Lookups Per Second' done -Lookups Per Second (avg):939020.9936331962 -Lookups Per Second (avg):1202068.1587479531 -Lookups Per Second (avg):1226191.8873913633 -Lookups Per Second (avg):1190260.5152708234 -Lookups Per Second (avg):1187237.1418382763 -Lookups Per Second (avg):1180139.664667138 -Lookups Per Second (avg):1184298.3951793911 -Lookups Per Second (avg):1172927.7709424824 -Lookups Per Second (avg):1192207.8482477544 -Lookups Per Second (avg):1182672.4879777646 +Lookups Per Second (avg):1181277.2875127245 +Lookups Per Second (avg):1298229.636700173 +Lookups Per Second (avg):1284580.6443966748 +Lookups Per Second (avg):1293284.3402910086 +Lookups Per Second (avg):1285891.7841541092 +Lookups Per Second (avg):1283654.9587741245 +Lookups Per Second (avg):1287798.220295312 +Lookups Per Second (avg):1291991.2632139924 +Lookups Per Second (avg):1282363.8582417285 +Lookups Per Second (avg):1246191.3914272592 --- -Lookups Per Second (avg):1255008.2012150432 -Lookups Per Second (avg):1244663.9575842023 -Lookups Per Second (avg):1255868.10809833 -Lookups Per Second (avg):1244955.1445213587 -Lookups Per Second (avg):1221882.1368531892 -Lookups Per Second (avg):1255099.9559031925 -Lookups Per Second (avg):1251926.597665689 -Lookups Per Second (avg):1221997.1083589145 -Lookups Per Second (avg):1186516.0167055523 -Lookups Per Second (avg):1226974.481844842 +Lookups Per Second (avg):1323980.8070552205 +Lookups Per Second (avg):1351732.5910886768 +Lookups Per Second (avg):1351039.987754606 +Lookups Per Second (avg):1348480.894738865 +Lookups Per Second (avg):1357111.6649975393 +Lookups Per Second (avg):1348661.0150208646 +Lookups Per Second (avg):1357781.4722981465 +Lookups Per Second (avg):1356498.714039219 +Lookups Per Second (avg):1346452.11429767 +Lookups Per Second (avg):1315870.3443053183 +``` + +
+ +
+ +MyCity + +```sh +$ for i in $(seq 1 10); do + zig build benchmark_mycity -Doptimize=ReleaseFast -- GeoLite2-City.mmdb 1000000 \ + 2>&1 | grep 'Lookups Per Second' + done + +Lookups Per Second (avg):1405912.7999428671 +Lookups Per Second (avg):1376923.8357458028 +Lookups Per Second (avg):1372073.1321839818 +Lookups Per Second (avg):1378707.359082014 +Lookups Per Second (avg):1395492.1172529764 +Lookups Per Second (avg):1394880.1743390427 +Lookups Per Second (avg):1390645.867575583 +Lookups Per Second (avg):1373588.0075019994 +Lookups Per Second (avg):1372678.8857965483 +Lookups Per Second (avg):1387958.9236387985 ```
@@ -145,27 +179,27 @@ $ for i in $(seq 1 10); do 2>&1 | grep 'Lookups Per Second' done -Lookups Per Second (avg):975677.3396010846 -Lookups Per Second (avg):1140100.8142809793 -Lookups Per Second (avg):1148647.9154542664 -Lookups Per Second (avg):1159945.4593645008 -Lookups Per Second (avg):1146155.6701547962 -Lookups Per Second (avg):1152253.0540916577 -Lookups Per Second (avg):1168908.0392599553 -Lookups Per Second (avg):1138716.2824329527 -Lookups Per Second (avg):1150480.114967662 -Lookups Per Second (avg):1161504.7700823087 +Lookups Per Second (avg):1249814.6118740842 +Lookups Per Second (avg):1225988.817449499 +Lookups Per Second (avg):1264197.1313154744 +Lookups Per Second (avg):1270859.3015692532 +Lookups Per Second (avg):1261325.321815331 +Lookups Per Second (avg):1269464.4605490116 +Lookups Per Second (avg):1260642.9131866288 +Lookups Per Second (avg):1248199.6670115339 +Lookups Per Second (avg):1259984.7888336368 +Lookups Per Second (avg):1227344.2469651096 --- -Lookups Per Second (avg):1232606.0656379322 -Lookups Per Second (avg):1234686.4799143772 -Lookups Per Second (avg):1081398.2429103954 -Lookups Per Second (avg):1243047.4800630722 -Lookups Per Second (avg):1217435.2550309 -Lookups Per Second (avg):1237809.9577944186 -Lookups Per Second (avg):1232356.3798965935 -Lookups Per Second (avg):1242459.8219555076 -Lookups Per Second (avg):1213491.9682358333 -Lookups Per Second (avg):1241524.1410712942 +Lookups Per Second (avg):1366697.6894286321 +Lookups Per Second (avg):1359936.8717304142 +Lookups Per Second (avg):1350500.9773859177 +Lookups Per Second (avg):1345155.3802565804 +Lookups Per Second (avg):1354979.4314596548 +Lookups Per Second (avg):1363058.6900699302 +Lookups Per Second (avg):1351386.2025057953 +Lookups Per Second (avg):1360068.193819238 +Lookups Per Second (avg):1342324.820976454 +Lookups Per Second (avg):1315986.2950186788 ``` diff --git a/benchmarks/inspect.zig b/benchmarks/inspect.zig index bb07aec..597778d 100644 --- a/benchmarks/inspect.zig +++ b/benchmarks/inspect.zig @@ -35,8 +35,8 @@ pub fn main() !void { std.debug.print("Opening database...\n", .{}); var open_timer = try std.time.Timer.start(); - var db = try maxminddb.Reader.mmap(allocator, db_path); - defer db.unmap(); + var db = try maxminddb.Reader.mmap(allocator, db_path, .{ .ipv4_index_first_n_bits = 16 }); + defer db.close(); const open_time_ms = @as(f64, @floatFromInt(open_timer.read())) / @as(f64, @floatFromInt(std.time.ns_per_ms)); std.debug.print("Database opened successfully in {d} ms. Type: {s}\n", .{ diff --git a/benchmarks/lookup.zig b/benchmarks/lookup.zig index 64ebf4d..f260157 100644 --- a/benchmarks/lookup.zig +++ b/benchmarks/lookup.zig @@ -35,8 +35,8 @@ pub fn main() !void { std.debug.print("Opening database...\n", .{}); var open_timer = try std.time.Timer.start(); - var db = try maxminddb.Reader.mmap(allocator, db_path); - defer db.unmap(); + var db = try maxminddb.Reader.mmap(allocator, db_path, .{ .ipv4_index_first_n_bits = 16 }); + defer db.close(); const open_time_ms = @as(f64, @floatFromInt(open_timer.read())) / @as(f64, @floatFromInt(std.time.ns_per_ms)); std.debug.print("Database opened successfully in {d} ms. Type: {s}\n", .{ diff --git a/benchmarks/mycity.zig b/benchmarks/mycity.zig new file mode 100644 index 0000000..558ce7a --- /dev/null +++ b/benchmarks/mycity.zig @@ -0,0 +1,89 @@ +const std = @import("std"); +const maxminddb = @import("maxminddb"); + +const default_db_path: []const u8 = "GeoLite2-City.mmdb"; +const default_num_lookups: u64 = 1_000_000; + +const MyCity = struct { + city: struct { + names: struct { + en: []const u8 = "", + } = .{}, + } = .{}, +}; + +pub fn main() !void { + const allocator = std.heap.smp_allocator; + + const args = try std.process.argsAlloc(allocator); + defer std.process.argsFree(allocator, args); + + var db_path: []const u8 = default_db_path; + var num_lookups = default_num_lookups; + if (args.len > 1) db_path = args[1]; + if (args.len > 2) num_lookups = try std.fmt.parseUnsigned(u64, args[2], 10); + + std.debug.print("Benchmarking with:\n", .{}); + std.debug.print(" Database: {s}\n", .{db_path}); + std.debug.print(" Lookups: {d}\n", .{num_lookups}); + std.debug.print("Opening database...\n", .{}); + + var open_timer = try std.time.Timer.start(); + var db = try maxminddb.Reader.mmap(allocator, db_path, .{ .ipv4_index_first_n_bits = 16 }); + defer db.close(); + const open_time_ms = @as(f64, @floatFromInt(open_timer.read())) / + @as(f64, @floatFromInt(std.time.ns_per_ms)); + std.debug.print("Database opened successfully in {d} ms. Type: {s}\n", .{ + open_time_ms, + db.metadata.database_type, + }); + + var arena = std.heap.ArenaAllocator.init(allocator); + defer arena.deinit(); + const arena_allocator = arena.allocator(); + + std.debug.print("Starting benchmark...\n", .{}); + var timer = try std.time.Timer.start(); + var not_found_count: u64 = 0; + var lookup_errors: u64 = 0; + var ip_bytes: [4]u8 = undefined; + + for (0..num_lookups) |_| { + std.crypto.random.bytes(&ip_bytes); + const ip = std.net.Address.initIp4(ip_bytes, 0); + + const result = db.lookup( + arena_allocator, + MyCity, + ip, + .{}, + ) catch |err| { + std.debug.print("! Lookup error for IP {any}: {any}\n", .{ ip, err }); + lookup_errors += 1; + continue; + }; + if (result == null) { + not_found_count += 1; + continue; + } + + _ = arena.reset(.retain_capacity); + } + + const elapsed_ns = timer.read(); + const elapsed_s = @as(f64, @floatFromInt(elapsed_ns)) / + @as(f64, @floatFromInt(std.time.ns_per_s)); + const lookups_per_second = if (elapsed_s > 0) + @as(f64, @floatFromInt(num_lookups)) / elapsed_s + else + 0.0; + const successful_lookups = num_lookups - not_found_count - lookup_errors; + + std.debug.print("\n--- Benchmark Finished ---\n", .{}); + std.debug.print("Total Lookups Attempted: {d}\n", .{num_lookups}); + std.debug.print("Successful Lookups: {d}\n", .{successful_lookups}); + std.debug.print("IPs Not Found: {d}\n", .{not_found_count}); + std.debug.print("Lookup Errors: {d}\n", .{lookup_errors}); + std.debug.print("Elapsed Time: {d} s\n", .{elapsed_s}); + std.debug.print("Lookups Per Second (avg):{d}\n", .{lookups_per_second}); +} diff --git a/build.zig b/build.zig index 04cd66b..fc3a1dd 100644 --- a/build.zig +++ b/build.zig @@ -30,6 +30,7 @@ pub fn build(b: *std.Build) void { .{ .file = "examples/within.zig", .name = "example_within" }, .{ .file = "examples/inspect.zig", .name = "example_inspect" }, .{ .file = "benchmarks/lookup.zig", .name = "benchmark_lookup" }, + .{ .file = "benchmarks/mycity.zig", .name = "benchmark_mycity" }, .{ .file = "benchmarks/inspect.zig", .name = "benchmark_inspect" }, }; diff --git a/examples/inspect.zig b/examples/inspect.zig index 502e2ff..8d2f970 100644 --- a/examples/inspect.zig +++ b/examples/inspect.zig @@ -11,8 +11,8 @@ pub fn main() !void { const db_path = if (args.len > 1) args[1] else "test-data/test-data/GeoIP2-City-Test.mmdb"; const ip = if (args.len > 2) args[2] else "89.160.20.128"; - var db = try maxminddb.Reader.mmap(allocator, db_path); - defer db.unmap(); + var db = try maxminddb.Reader.mmap(allocator, db_path, .{}); + defer db.close(); const result = try db.lookup( allocator, diff --git a/examples/lookup.zig b/examples/lookup.zig index 39d04c3..a36c155 100644 --- a/examples/lookup.zig +++ b/examples/lookup.zig @@ -2,16 +2,14 @@ const std = @import("std"); const maxminddb = @import("maxminddb"); const db_path = "test-data/test-data/GeoIP2-City-Test.mmdb"; -// We expect a DB file not larger than 1 GB. -const max_db_size: usize = 1024 * 1024 * 1024; pub fn main() !void { var gpa: std.heap.DebugAllocator(.{}) = .init; const allocator = gpa.allocator(); defer _ = gpa.detectLeaks(); - var db = try maxminddb.Reader.open(allocator, db_path, max_db_size); - defer db.close(allocator); + var db = try maxminddb.Reader.open(allocator, db_path, .{}); + defer db.close(); // Note, for better performance use arena allocator and reset it after calling lookup(). // You won't need to call city.deinit() in that case. diff --git a/examples/within.zig b/examples/within.zig index 77fa9a2..42c3815 100644 --- a/examples/within.zig +++ b/examples/within.zig @@ -8,8 +8,8 @@ pub fn main() !void { const allocator = gpa.allocator(); defer _ = gpa.detectLeaks(); - var db = try maxminddb.Reader.mmap(allocator, db_path); - defer db.unmap(); + var db = try maxminddb.Reader.mmap(allocator, db_path, .{}); + defer db.close(); const network = if (db.metadata.ip_version == 4) maxminddb.Network.all_ipv4 @@ -22,7 +22,6 @@ pub fn main() !void { // The iterator owns the values; each next() call invalidates the previous item. var n: usize = 0; while (try it.next()) |item| { - const continent = item.value.continent.code; const country = item.value.country.iso_code; var city: []const u8 = ""; diff --git a/src/maxminddb.zig b/src/maxminddb.zig index 83f773e..8db5363 100644 --- a/src/maxminddb.zig +++ b/src/maxminddb.zig @@ -15,6 +15,7 @@ pub const Result = reader.Result; pub const Metadata = reader.Metadata; pub const Iterator = reader.Iterator; pub const Network = net.Network; +pub const Options = reader.Options; pub const LookupOptions = reader.LookupOptions; pub const WithinOptions = reader.WithinOptions; pub const Map = collection.Map; @@ -138,8 +139,9 @@ test "GeoLite2 Country" { var db = try Reader.mmap( allocator, "test-data/test-data/GeoLite2-Country-Test.mmdb", + .{}, ); - defer db.unmap(); + defer db.close(); try expectEqual(DatabaseType.geolite_country, DatabaseType.new(db.metadata.database_type)); @@ -191,8 +193,9 @@ test "GeoLite2 City" { var db = try Reader.mmap( allocator, "test-data/test-data/GeoLite2-City-Test.mmdb", + .{}, ); - defer db.unmap(); + defer db.close(); try expectEqual(DatabaseType.geolite_city, DatabaseType.new(db.metadata.database_type)); @@ -262,8 +265,9 @@ test "GeoLite2 ASN" { var db = try Reader.mmap( allocator, "test-data/test-data/GeoLite2-ASN-Test.mmdb", + .{}, ); - defer db.unmap(); + defer db.close(); try expectEqual(DatabaseType.geolite_asn, DatabaseType.new(db.metadata.database_type)); @@ -286,8 +290,9 @@ test "GeoIP2 Country" { var db = try Reader.mmap( allocator, "test-data/test-data/GeoIP2-Country-Test.mmdb", + .{}, ); - defer db.unmap(); + defer db.close(); try expectEqual(DatabaseType.geoip_country, DatabaseType.new(db.metadata.database_type)); @@ -341,8 +346,9 @@ test "GeoIP2 Country RepresentedCountry" { var db = try Reader.mmap( allocator, "test-data/test-data/GeoIP2-Country-Test.mmdb", + .{}, ); - defer db.unmap(); + defer db.close(); const ip = try std.net.Address.parseIp("202.196.224.0", 0); const got = (try db.lookup(allocator, geoip2.Country, ip, .{})).?; @@ -366,8 +372,9 @@ test "GeoIP2 City" { var db = try Reader.mmap( allocator, "test-data/test-data/GeoIP2-City-Test.mmdb", + .{}, ); - defer db.unmap(); + defer db.close(); try expectEqual(DatabaseType.geoip_city, DatabaseType.new(db.metadata.database_type)); @@ -450,8 +457,9 @@ test "GeoIP2 Enterprise" { var db = try Reader.mmap( allocator, "test-data/test-data/GeoIP2-Enterprise-Test.mmdb", + .{}, ); - defer db.unmap(); + defer db.close(); try expectEqual(DatabaseType.geoip_enterprise, DatabaseType.new(db.metadata.database_type)); @@ -549,8 +557,9 @@ test "GeoIP2 ISP" { var db = try Reader.mmap( allocator, "test-data/test-data/GeoIP2-ISP-Test.mmdb", + .{}, ); - defer db.unmap(); + defer db.close(); try expectEqual(DatabaseType.geoip_isp, DatabaseType.new(db.metadata.database_type)); @@ -573,8 +582,9 @@ test "GeoIP2 Connection-Type" { var db = try Reader.mmap( allocator, "test-data/test-data/GeoIP2-Connection-Type-Test.mmdb", + .{}, ); - defer db.unmap(); + defer db.close(); try expectEqual(DatabaseType.geoip_connection_type, DatabaseType.new(db.metadata.database_type)); @@ -592,8 +602,9 @@ test "GeoIP2 Anonymous-IP" { var db = try Reader.mmap( allocator, "test-data/test-data/GeoIP2-Anonymous-IP-Test.mmdb", + .{}, ); - defer db.unmap(); + defer db.close(); try expectEqual(DatabaseType.geoip_anonymous_ip, DatabaseType.new(db.metadata.database_type)); @@ -616,8 +627,9 @@ test "GeoIP Anonymous-Plus" { var db = try Reader.mmap( allocator, "test-data/test-data/GeoIP-Anonymous-Plus-Test.mmdb", + .{}, ); - defer db.unmap(); + defer db.close(); try expectEqual(DatabaseType.geoip_anonymous_plus, DatabaseType.new(db.metadata.database_type)); @@ -639,8 +651,9 @@ test "GeoIP2 DensityIncome" { var db = try Reader.mmap( allocator, "test-data/test-data/GeoIP2-DensityIncome-Test.mmdb", + .{}, ); - defer db.unmap(); + defer db.close(); try expectEqual(DatabaseType.geoip_densityincome, DatabaseType.new(db.metadata.database_type)); @@ -659,8 +672,9 @@ test "GeoIP2 Domain" { var db = try Reader.mmap( allocator, "test-data/test-data/GeoIP2-Domain-Test.mmdb", + .{}, ); - defer db.unmap(); + defer db.close(); try expectEqual(DatabaseType.geoip_domain, DatabaseType.new(db.metadata.database_type)); @@ -678,8 +692,9 @@ test "GeoIP2 IP-Risk" { var db = try Reader.mmap( allocator, "test-data/test-data/GeoIP2-IP-Risk-Test.mmdb", + .{}, ); - defer db.unmap(); + defer db.close(); try expectEqual(DatabaseType.geoip_ip_risk, DatabaseType.new(db.metadata.database_type)); @@ -715,8 +730,9 @@ test "GeoIP2 Static-IP-Score" { var db = try Reader.mmap( allocator, "test-data/test-data/GeoIP2-Static-IP-Score-Test.mmdb", + .{}, ); - defer db.unmap(); + defer db.close(); try expectEqual(DatabaseType.geoip_static_ip_score, DatabaseType.new(db.metadata.database_type)); @@ -734,8 +750,9 @@ test "GeoIP2 User-Count" { var db = try Reader.mmap( allocator, "test-data/test-data/GeoIP2-User-Count-Test.mmdb", + .{}, ); - defer db.unmap(); + defer db.close(); try expectEqual(DatabaseType.geoip_user_count, DatabaseType.new(db.metadata.database_type)); @@ -754,8 +771,9 @@ test "lookup with field name filtering" { var db = try Reader.mmap( allocator, "test-data/test-data/GeoLite2-City-Test.mmdb", + .{}, ); - defer db.unmap(); + defer db.close(); const ip = try std.net.Address.parseIp("89.160.20.128", 0); @@ -783,8 +801,9 @@ test "lookup with custom record" { var db = try Reader.mmap( allocator, "test-data/test-data/GeoLite2-City-Test.mmdb", + .{}, ); - defer db.unmap(); + defer db.close(); const MyCity = struct { city: struct { @@ -807,8 +826,9 @@ test "lookup with any.Value" { var db = try Reader.mmap( allocator, "test-data/test-data/GeoLite2-City-Test.mmdb", + .{}, ); - defer db.unmap(); + defer db.close(); const ip = try std.net.Address.parseIp("89.160.20.128", 0); const got = (try db.lookup(allocator, any.Value, ip, .{})).?; @@ -829,8 +849,9 @@ test "lookup with any.Value and field name filtering" { var db = try Reader.mmap( allocator, "test-data/test-data/GeoLite2-City-Test.mmdb", + .{}, ); - defer db.unmap(); + defer db.close(); const ip = try std.net.Address.parseIp("89.160.20.128", 0); const got = (try db.lookup( @@ -857,8 +878,9 @@ test "within returns all networks" { var db = try Reader.mmap( allocator, "test-data/test-data/GeoLite2-City-Test.mmdb", + .{}, ); - defer db.unmap(); + defer db.close(); var it = try db.within(allocator, geolite2.City, net.Network.all_ipv6, .{}); defer it.deinit(); @@ -873,8 +895,9 @@ test "within yields record when query prefix is narrower than record network" { var db = try Reader.mmap( allocator, "test-data/test-data/GeoLite2-ASN-Test.mmdb", + .{}, ); - defer db.unmap(); + defer db.close(); // 89.160.20.0/24 is inside the /17 record. // The iterator must still yield it even though the data record is found @@ -900,8 +923,9 @@ test "within yields record when start node is a data pointer" { var db = try Reader.mmap( allocator, "test-data/test-data/MaxMind-DB-no-ipv4-search-tree.mmdb", + .{}, ); - defer db.unmap(); + defer db.close(); const network = try net.Network.parse("0.0.0.0/0"); var it = try db.within(allocator, any.Value, network, .{}); @@ -919,8 +943,9 @@ test "reject IPv6 on IPv4-only database" { var db = try Reader.mmap( allocator, "test-data/test-data/MaxMind-DB-test-ipv4-32.mmdb", + .{}, ); - defer db.unmap(); + defer db.close(); const network = try net.Network.parse("::/0"); const it = db.within(allocator, any.Value, network, .{}); @@ -935,8 +960,9 @@ test "within skips empty records" { var db = try Reader.mmap( allocator, "test-data/test-data/GeoIP2-Anonymous-IP-Test.mmdb", + .{}, ); - defer db.unmap(); + defer db.close(); // All records including empty. { diff --git a/src/net.zig b/src/net.zig index df4967b..4e50f05 100644 --- a/src/net.zig +++ b/src/net.zig @@ -150,22 +150,22 @@ pub const IP = union(enum) { .v4 => |b| { // Combines IP bytes into a big-endian u32, e.g., // 89.160.20.128 = 89 << 24 | 160 << 16 | 20 << 8 | 128 - const ipAsNumber = std.mem.readInt(u32, &b, .big); + const ip_as_number = std.mem.readInt(u32, &b, .big); const ones: u32 = std.math.maxInt(u32); const bitmask = if (prefix_len == 0) 0 else ones << @intCast(32 - prefix_len); var out: [4]u8 = undefined; - std.mem.writeInt(u32, &out, ipAsNumber & bitmask, .big); + std.mem.writeInt(u32, &out, ip_as_number & bitmask, .big); return .{ .v4 = out }; }, .v6 => |b| { - const ipAsNumber = std.mem.readInt(u128, &b, .big); + const ip_as_number = std.mem.readInt(u128, &b, .big); const ones: u128 = std.math.maxInt(u128); const bitmask = if (prefix_len == 0) 0 else ones << @intCast(128 - prefix_len); var out: [16]u8 = undefined; - std.mem.writeInt(u128, &out, ipAsNumber & bitmask, .big); + std.mem.writeInt(u128, &out, ip_as_number & bitmask, .big); return .{ .v6 = out }; }, diff --git a/src/reader.zig b/src/reader.zig index 2b2e400..3abd49b 100644 --- a/src/reader.zig +++ b/src/reader.zig @@ -11,6 +11,7 @@ pub const ReadError = error{ CorruptedTree, UnknownRecordSize, InvalidPrefixLen, + IndexAlreadyBuilt, IPv6AddressInIPv4Database, }; @@ -32,6 +33,31 @@ pub const Metadata = struct { const data_section_separator_size = 16; +// Maximum db size for Reader.open(). +// 64-bit: 20GB covers ~2.3B nodes (record_size=32) with ~2GB data section. +// 32-bit: 2GB matches the user-space address limit. +const max_db_size: usize = if (@sizeOf(usize) >= 8) + 20 * 1024 * 1024 * 1024 +else + 2 * 1024 * 1024 * 1024; + +pub const Options = struct { + /// Builds an index of the first N bits of IPv4 addresses to speed up lookups, + /// but not the within() iterator. + /// + /// It adds a one-time build cost of ~1-4ms and uses memory proportional to 2^N. + /// The first open is slower (~10-120ms) because page faults load the tree from disk. + /// Best suited for long-lived Readers with many lookups. + /// + /// Sparse databases such as Anonymous-IP or ISP benefit more (~70%-140%) + /// because tree traversal dominates whereas dense databases (City, Enterprise) + /// benefit less (~12%-18%) because record decoding is the bottleneck. + /// + /// The recommended value is 16 (~320KB, fits L2 cache), or 12 (~20KB) for constrained devices. + /// The valid range is between 0 and 24 where 0 disables the index. + ipv4_index_first_n_bits: u8 = 0, +}; + pub const LookupOptions = struct { only: ?[]const []const u8 = null, }; @@ -42,23 +68,31 @@ pub const WithinOptions = struct { }; pub const Reader = struct { + metadata: Metadata, src: []const u8, offset: usize, ipv4_start: usize, - metadata: Metadata, - metadata_arena: std.heap.ArenaAllocator, - - // Loads a MaxMind DB file into memory. - pub fn open(allocator: std.mem.Allocator, path: []const u8, max_db_size: usize) !Reader { - var f = try std.fs.cwd().openFile(path, .{}); - defer f.close(); - - const src = try f.readToEndAlloc(allocator, max_db_size); - errdefer allocator.free(src); - - var metadata_arena = std.heap.ArenaAllocator.init(allocator); - errdefer metadata_arena.deinit(); - const metadata = try decodeMetadata(metadata_arena.allocator(), src); + // ipv4_index is a flat array of tree node IDs and data offsets + // for fast lookup of IPv4 addresses by their first N bits. + // Instead of traversing the tree bit by bit from the root, + // the first N levels are pre-computed into a direct-access array. + ipv4_index_first_n_bits: u8, + ipv4_index: ?[]u32, + // ipv4_index_prefix_len stores the prefix length at which + // each terminal was reached during the index construction. + // This lets us return the correct prefix length + // without re-traversing the tree for terminal nodes in the index. + ipv4_index_prefix_len: ?[]u8, + is_mapped: bool, + arena: *std.heap.ArenaAllocator, + + fn init(arena: *std.heap.ArenaAllocator, src: []const u8, options: Options) !Reader { + const metadata = try decodeMetadata(arena.allocator(), src); + + switch (metadata.record_size) { + 24, 28, 32 => {}, + else => return ReadError.UnknownRecordSize, + } const search_tree_size = try std.math.mul( usize, @@ -71,68 +105,76 @@ pub const Reader = struct { } var r = Reader{ + .metadata = metadata, .src = src, .offset = data_offset, .ipv4_start = 0, - .metadata = metadata, - .metadata_arena = metadata_arena, + .ipv4_index_first_n_bits = options.ipv4_index_first_n_bits, + .ipv4_index = null, + .ipv4_index_prefix_len = null, + .is_mapped = false, + .arena = arena, }; - r.ipv4_start = try r.findIPv4Start(); + r.setIPv4Start(); + + if (r.ipv4_index_first_n_bits > 0) { + try r.buildIPv4Index(); + } return r; } - // Frees the memory occupied by the DB file. - // From this point all the DB records are unusable because their fields were backed by the same memory. - // Note, the records still have to be deinited since they might contain arrays or maps. - pub fn close(self: *Reader, allocator: std.mem.Allocator) void { - self.metadata_arena.deinit(); - allocator.free(self.src); + /// Loads a MaxMind DB file into memory. + pub fn open(allocator: std.mem.Allocator, path: []const u8, options: Options) !Reader { + var f = try std.fs.cwd().openFile(path, .{}); + defer f.close(); + + const arena = try allocator.create(std.heap.ArenaAllocator); + errdefer { + arena.deinit(); + allocator.destroy(arena); + } + arena.* = std.heap.ArenaAllocator.init(allocator); + + const src = try f.readToEndAlloc(arena.allocator(), max_db_size); + + return try init(arena, src, options); } - // Maps a MaxMind DB file into memory. - pub fn mmap(allocator: std.mem.Allocator, path: []const u8) !Reader { + /// Maps a MaxMind DB file into memory. + pub fn mmap(allocator: std.mem.Allocator, path: []const u8, options: Options) !Reader { const src = try memorymap.map(path); errdefer memorymap.unmap(src); - var metadata_arena = std.heap.ArenaAllocator.init(allocator); - errdefer metadata_arena.deinit(); - const metadata = try decodeMetadata(metadata_arena.allocator(), src); - - const search_tree_size = try std.math.mul( - usize, - metadata.node_count, - metadata.record_size / 4, - ); - const data_offset = search_tree_size + data_section_separator_size; - if (data_offset > src.len) { - return ReadError.CorruptedTree; + const arena = try allocator.create(std.heap.ArenaAllocator); + errdefer { + arena.deinit(); + allocator.destroy(arena); } + arena.* = std.heap.ArenaAllocator.init(allocator); - var r = Reader{ - .src = src, - .offset = data_offset, - .ipv4_start = 0, - .metadata = metadata, - .metadata_arena = metadata_arena, - }; - - r.ipv4_start = try r.findIPv4Start(); + var r = try init(arena, src, options); + r.is_mapped = true; return r; } - // Unmaps the DB file. - // From this point all the DB records are unusable because their fields were backed by the same memory. - // Note, the records still have to be deinited since they might contain arrays or maps. - pub fn unmap(self: *Reader) void { - self.metadata_arena.deinit(); - memorymap.unmap(self.src); + /// Frees the memory occupied by the DB file. + /// From this point all the DB records are unusable because their fields were backed by the same memory. + /// Note, the records still have to be deinited since they might contain arrays or maps. + pub fn close(self: *Reader) void { + const allocator = self.arena.child_allocator; + self.arena.deinit(); + allocator.destroy(self.arena); + + if (self.is_mapped) { + memorymap.unmap(self.src); + } } - // Looks up a value by an IP address. - // The returned Result owns an arena with all decoded allocations. + /// Looks up a value by an IP address. + /// The returned Result owns an arena with all decoded allocations. pub fn lookup( self: *Reader, allocator: std.mem.Allocator, @@ -145,13 +187,22 @@ pub const Reader = struct { return ReadError.IPv6AddressInIPv4Database; } - const pointer, const prefix_len = try self.findAddressInTree(ip); + var pointer: usize = 0; + var prefix_len: usize = 0; + if (self.ipv4_index != null and ip == .v4) { + pointer, prefix_len = try self.findAddressInTreeWithIndex(ip); + } else { + const start_node = self.startNode(ip.bitCount()); + pointer, prefix_len = try self.findAddressInTree(ip, start_node, 0); + } + if (pointer == 0) { return null; } var arena = std.heap.ArenaAllocator.init(allocator); errdefer arena.deinit(); + const value = try self.resolveDataPointerAndDecode( arena.allocator(), T, @@ -166,7 +217,7 @@ pub const Reader = struct { }; } - // Iterates over blocks of IP networks. + /// Iterates over blocks of IP networks. pub fn within( self: *Reader, allocator: std.mem.Allocator, @@ -197,7 +248,7 @@ pub const Reader = struct { var depth: usize = 0; if (node < node_count) { while (depth < prefix_len) { - node = try self.readNode(node, ip_bytes.bitAt(depth)); + node = self.readNode(node, ip_bytes.bitAt(depth)); depth += 1; if (node >= node_count) { break; @@ -240,6 +291,66 @@ pub const Reader = struct { return try d.decodeRecord(allocator, Metadata, null); } + fn buildIPv4Index(self: *Reader) !void { + if (self.ipv4_index_first_n_bits > 24) { + return ReadError.InvalidPrefixLen; + } + if (self.ipv4_index != null) { + return ReadError.IndexAlreadyBuilt; + } + + const index_size = std.math.shl(usize, 1, self.ipv4_index_first_n_bits); + self.ipv4_index = try self.arena.allocator().alloc(u32, index_size); + errdefer self.ipv4_index = null; + + self.ipv4_index_prefix_len = try self.arena.allocator().alloc(u8, index_size); + errdefer self.ipv4_index_prefix_len = null; + + self.populateIndex(self.ipv4_start, 0, index_size, 0); + } + + // Recursively traverses the first N levels of the search tree and fills the flat index array. + // Each index slot corresponds to an N-bit prefix, for example, + // slot 0000 covers all IPs starting with 0000. + // + // The range [start, start+count) tracks which slots belong to the current subtree. + // At each level we split in half: left child (0-bit) gets the lower half, + // right child (1-bit) gets the upper half. + // + // This works because the array is indexed by the N-bit prefix as a binary number: + // prefixes starting with 0 occupy the lower half of any range, + // prefixes starting with 1 occupy the upper half. + // + // When a node is terminal (data pointer or not-found) before depth N, + // we fill all remaining slots in the range with that node because + // every IP prefix in that range resolves to the same record. + fn populateIndex( + self: *Reader, + node: usize, + start: usize, + count: usize, + bit_depth: usize, + ) void { + // If the node is terminal or we've reached the max index depth, + // fill the range with this node. + if (count == 1 or node >= self.metadata.node_count) { + const node_u32: u32 = @intCast(node); + const prefix_len: u8 = @intCast(bit_depth); + + @memset(self.ipv4_index.?[start..][0..count], node_u32); + @memset(self.ipv4_index_prefix_len.?[start..][0..count], prefix_len); + + return; + } + + const half = count / 2; + const left_node = self.readNode(node, 0); + self.populateIndex(left_node, start, half, bit_depth + 1); + + const right_node = self.readNode(node, 1); + self.populateIndex(right_node, start + half, half, bit_depth + 1); + } + fn resolveDataPointerAndDecode( self: *Reader, allocator: std.mem.Allocator, @@ -282,20 +393,47 @@ pub const Reader = struct { return d.isEmptyMap(); } - fn findAddressInTree(self: *Reader, ip: net.IP) !struct { usize, usize } { - const bit_count = ip.bitCount(); - var node = self.startNode(bit_count); + // Uses the IPv4 index for fast lookups. + // The index covers the first N bits of the IPv4 address, allowing us to + // skip directly to the node at depth N instead of traversing bit by bit. + fn findAddressInTreeWithIndex(self: *Reader, ip: net.IP) !struct { usize, usize } { + const ip_int = std.mem.readInt(u32, &ip.v4, .big); + const index_pos = std.math.shr(usize, ip_int, 32 - self.ipv4_index_first_n_bits); + + const node: usize = self.ipv4_index.?[index_pos]; + + // If we hit a terminal at or before bit N of IPv4, return the prefix length + // that was stored during index construction. + if (node >= self.metadata.node_count) { + const prefix_len: usize = self.ipv4_index_prefix_len.?[index_pos]; + if (node == self.metadata.node_count) { + return .{ 0, prefix_len }; + } + return .{ node, prefix_len }; + } + // Continue traversal from where the index ends (bit N of IPv4 portion). + return try self.findAddressInTree(ip, node, self.ipv4_index_first_n_bits); + } + + fn findAddressInTree( + self: *Reader, + ip: net.IP, + start_node: usize, + start_bit: usize, + ) !struct { usize, usize } { + const stop_bit = ip.bitCount(); const node_count: usize = self.metadata.node_count; - var prefix_len = bit_count; - for (0..bit_count) |i| { + var node = start_node; + var prefix_len = stop_bit; + for (start_bit..stop_bit) |i| { if (node >= node_count) { prefix_len = i; break; } - node = try self.readNode(node, ip.bitAt(i)); + node = self.readNode(node, ip.bitAt(i)); } if (node == node_count) { @@ -313,26 +451,25 @@ pub const Reader = struct { return if (length == 128) 0 else self.ipv4_start; } - fn findIPv4Start(self: *Reader) !usize { + fn setIPv4Start(self: *Reader) void { if (self.metadata.ip_version != 6) { - return 0; + return; } + const node_count: usize = self.metadata.node_count; + // We are looking up an IPv4 address in an IPv6 tree. // Skip over the first 96 nodes. var node: usize = 0; - for (0..96) |_| { - if (node >= self.metadata.node_count) { - break; - } - - node = try self.readNode(node, 0); + var i: usize = 0; + while (i < 96 and node < node_count) : (i += 1) { + node = self.readNode(node, 0); } - return node; + self.ipv4_start = node; } - fn readNode(self: *Reader, node_number: usize, index: usize) !usize { + fn readNode(self: *Reader, node_number: usize, index: usize) usize { const src = self.src; const base_offset: usize = node_number * self.metadata.record_size / 4; @@ -356,7 +493,7 @@ pub const Reader = struct { const offset = base_offset + index * 4; return decoder.toUsize(src[offset .. offset + 4], 0); }, - else => ReadError.UnknownRecordSize, + else => unreachable, }; } @@ -530,7 +667,7 @@ pub fn Iterator(T: type) type { }; } else if (current.node < self.node_count) { // In order traversal of the children on the right (1-bit). - var node = try reader.readNode(current.node, 1); + var node = reader.readNode(current.node, 1); var right_ip_bytes = current.ip_bytes; if (current.prefix_len < bit_count) { @@ -548,7 +685,7 @@ pub fn Iterator(T: type) type { }); // In order traversal of the children on the left (0-bit). - node = try reader.readNode(current.node, 0); + node = reader.readNode(current.node, 0); self.stack.appendAssumeCapacity(WithinNode{ .node = node, .ip_bytes = current.ip_bytes,