From 09689145cd57ebc368b234b5bc028ffa3c02ae3e Mon Sep 17 00:00:00 2001
From: Sam Williams
Date: Thu, 15 Jan 2026 14:29:52 -0500
Subject: [PATCH 01/29] wip: Arweave TXID->offset indexing in `copycat@1.0`

---
 src/dev_arweave.erl         | 28 +++++++++++++++++++-
 src/dev_copycat_arweave.erl | 51 +++++++++++++++++++++++++++++++++++--
 src/hb_opts.erl             |  6 +++++
 src/hb_store_arweave.erl    | 43 +++++++++++++++++++++++++++++++
 4 files changed, 125 insertions(+), 3 deletions(-)
 create mode 100644 src/hb_store_arweave.erl

diff --git a/src/dev_arweave.erl b/src/dev_arweave.erl
index f40a188d2..c5dd4e38d 100644
--- a/src/dev_arweave.erl
+++ b/src/dev_arweave.erl
@@ -9,6 +9,8 @@
 -include("include/hb.hrl").
 -include_lib("eunit/include/eunit.hrl").
 
+-define(CHUNK_SIZE, 256 * 1024). % Arweave's default chunk size in bytes.
+
 %% @doc Proxy the `/info' endpoint from the Arweave node.
 status(_Base, _Request, Opts) ->
     request(<<"GET">>, <<"/info">>, Opts).
@@ -113,7 +115,7 @@ get_tx(Base, Request, Opts) ->
 chunk(Base, Request, Opts) ->
     case hb_maps:get(<<"method">>, Request, <<"GET">>, Opts) of
         <<"POST">> -> post_chunk(Base, Request, Opts);
-        <<"GET">> -> {error, not_implemented}
+        <<"GET">> -> get_chunk(Base, Request, Opts)
     end.
 
 post_chunk(_Base, Request, Opts) ->
@@ -128,6 +130,30 @@ post_chunk(_Base, Request, Opts) ->
         Opts
     ).
 
+get_chunk(_Base, Request, Opts) ->
+    Offset = hb_util:int(hb_ao:get(<<"offset">>, Request, Opts)),
+    Length = hb_util:int(hb_ao:get(<<"length">>, Request, ?CHUNK_SIZE, Opts)),
+    Res =
+        hb_http:get(
+            hb_opts:get(gateway, not_found, Opts),
+            #{
+                <<"path">> =>
+                    <<
+                        "/chunk/",
+                        (hb_util:bin(Offset))/binary,
+                        "/",
+                        (hb_util:bin(Length))/binary
+                    >>
+            },
+            Opts
+        ),
+    case Res of
+        {ok, #{ <<"body">> := Body }} ->
+            {ok, Body};
+        {error, Reason} ->
+            {error, Reason}
+    end.
+
 add_data(TXID, TXHeader, Opts) ->
     case data(TXID, Opts) of
         {ok, Data} ->
diff --git a/src/dev_copycat_arweave.erl b/src/dev_copycat_arweave.erl
index 6ff8f822f..a953a65ad 100644
--- a/src/dev_copycat_arweave.erl
+++ b/src/dev_copycat_arweave.erl
@@ -9,6 +9,9 @@
 -include_lib("eunit/include/eunit.hrl").
 
 -define(ARWEAVE_DEVICE, <<"~arweave@2.9-pre">>).
+-define(ARWEAVE_INDEX_PATH, <<"~arweave@2.9-pre/offset">>).
+
+% GET /~cron@1.0/once&cron-path=~copycat@1.0/arweave
 
 %% @doc Fetch blocks from an Arweave node between a given range, or from the
 %% latest known block towards the Genesis block. If no range is provided, we
@@ -56,9 +59,10 @@ fetch_blocks(Req, Current, To, Opts) ->
     fetch_blocks(Req, Current - 1, To, Opts).
 
 %% @doc Process a block.
-process_block(BlockRes, _Req, Current, To, _Opts) ->
+process_block(BlockRes, _Req, Current, To, Opts) ->
     case BlockRes of
-        {ok, _} ->
+        {ok, Block} ->
+            maybe_index_ids(Block, Opts),
             ?event(
                 copycat_short,
                 {arweave_block_cached,
@@ -74,4 +78,47 @@ process_block(BlockRes, _Req, Current, To, _Opts) ->
                 {target, To}
             }
         )
+    end.
+
+%% @doc Index the IDs of all transactions in the block if configured to do so.
+maybe_index_ids(Block, Opts) ->
+    case hb_opts:get(arweave_index_ids, false, Opts) of
+        false -> ok;
+        true ->
+            IndexStore = hb_opts:get(arweave_index_store, no_store, Opts),
+            BlockOffset = hb_maps:get(<<"weave_size">>, Block, 0, Opts),
+            lists:foreach(
+                fun(TXID) ->
+                    TX =
+                        hb_ao:get(
+                            <<
+                                ?ARWEAVE_DEVICE/binary,
+                                "/tx=",
+                                (hb_util:bin(TXID))/binary
+                            >>,
+                            Opts
+                        ),
+                    TXOffset = hb_maps:get(<<"offset">>, TX, 0, Opts),
+                    case is_bundle_tx(TX, Opts) of
+                        false -> ok;
+                        true ->
+                            {ok, BundleIndex} = download_bundle_header(TXID, Opts),
+                            hb_maps:map(
+                                fun(ItemID, #{ <<"offset">> := BundleOffset, <<"length">> := Length}) ->
+                                    Offset = hb_util:bin(BundleOffset + TXOffset + BlockOffset),
+                                    hb_store_arweave:write_offset(
+                                        IndexStore,
+                                        ItemID,
+                                        Offset,
+                                        Length
+                                    )
+                                end,
+                                BundleIndex,
+                                Opts
+                            )
+                    end
+                end,
+                hb_maps:get(<<"txs">>, Block, #{}, Opts),
+                Opts
+            )
+    end.
\ No newline at end of file
diff --git a/src/hb_opts.erl b/src/hb_opts.erl
index 5536ad523..60e665a04 100644
--- a/src/hb_opts.erl
+++ b/src/hb_opts.erl
@@ -314,6 +314,12 @@ default_message() ->
                 <<"store-module">> => hb_store_fs,
                 <<"name">> => <<"cache-mainnet">>
             },
+            #{
+                <<"store-module">> => hb_store_arweave,
+                <<"name">> => <<"cache-arweave">>,
+                <<"index-store">> => [?DEFAULT_PRIMARY_STORE],
+                <<"arweave-node">> => <<"https://arweave.net">>
+            },
             #{
                 <<"store-module">> => hb_store_gateway,
                 <<"subindex">> => [
diff --git a/src/hb_store_arweave.erl b/src/hb_store_arweave.erl
new file mode 100644
index 000000000..49e03cce2
--- /dev/null
+++ b/src/hb_store_arweave.erl
@@ -0,0 +1,43 @@
+%%% @doc A store implementation that relays to an Arweave node, using an
+%%% intermediate cache of offsets as an ID->ArweaveLocation mapping.
+-module(hb_store_arweave).
+%%% Store API:
+-export([read/2]).
+%%% Indexing API:
+-export([write_offset/4]).
+
+-define(ARWEAVE_INDEX_PATH, <<"~arweave@2.9-pre/offset">>).
+
+read(StoreOpts = #{ <<"arweave-index-store">> := IndexStore }, ID) ->
+    Path =
+        <<
+            ?ARWEAVE_INDEX_PATH/binary,
+            "/",
+            (hb_util:bin(ID))/binary
+        >>,
+    case hb_store:read(IndexStore, Path) of
+        {ok, Binary} ->
+            [Offset, Length] = binary:split(Binary, <<":">>, [global]),
+            SerializedItem =
+                hb_ao:get(
+                    <<
+                        "~arweave@2.9-pre/chunk&offset=",
+                        Offset/binary,
+                        "&length=",
+                        Length/binary
+                    >>,
+                    StoreOpts
+                ),
+            {ok, ar_bundles:deserialize(SerializedItem)};
+        {error, not_found} ->
+            {error, not_found}
+    end.
+
+write_offset(Store, ID, Offset, Length) ->
+    hb_store:write(
+        Store,
+        <<
+            ?ARWEAVE_INDEX_PATH/binary, "/",
+            (hb_util:bin(ID))/binary
+        >>,
+        <<(hb_util:bin(Offset))/binary, ":", (hb_util:bin(Length))/binary>>
+    ).
\ No newline at end of file

From 69fe4cf9b2e53d0b86d555ea2b3dd15fbe899aca Mon Sep 17 00:00:00 2001
From: James Piechota
Date: Fri, 16 Jan 2026 10:55:28 -0500
Subject: [PATCH 02/29] fix: only treat path segments as ids if they can't be
 further segmented

---
 src/hb_singleton.erl | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

diff --git a/src/hb_singleton.erl b/src/hb_singleton.erl
index 7b086afc8..a38cf957c 100644
--- a/src/hb_singleton.erl
+++ b/src/hb_singleton.erl
@@ -335,19 +335,20 @@ do_build(I, [Msg | Rest], ScopedKeys, Opts) ->
 %% 2. Part subpath resolutions
 %% 3. Inlined key-value pairs
 %% 4.
Device specifier -parse_part(ID, _Opts) when ?IS_ID(ID) -> ID; parse_part(Part, Opts) -> case maybe_subpath(Part, Opts) of {resolve, Subpath} -> {resolve, Subpath}; Part -> case part([$&, $~, $+, $ , $=], Part) of + {no_match, PartKey, <<>>} when ?IS_ID(PartKey) -> + PartKey; {no_match, PartKey, <<>>} -> #{ <<"path">> => PartKey }; {Sep, PartKey, PartModBin} -> parse_part_mods( << Sep:8/integer, PartModBin/binary >>, #{ <<"path">> => PartKey }, - Opts + Opts ) end end. @@ -772,6 +773,20 @@ inlined_keys_test() -> ?assertEqual(not_found, hb_maps:get(<<"k1">>, Base, not_found)), ?assertEqual(not_found, hb_maps:get(<<"k2">>, Msg2, not_found)). +inlined_keys_long_segment_test() -> + Req = #{ + <<"path">> => + <<"/chunk&offset=377813969707255&length=262144">> + }, + Msgs = from(Req, #{}), + ?assertEqual(2, length(Msgs)), + [Base, Msg] = Msgs, + ?assertEqual(<<"chunk">>, hb_maps:get(<<"path">>, Msg)), + ?assertEqual(<<"377813969707255">>, hb_maps:get(<<"offset">>, Msg)), + ?assertEqual(<<"262144">>, hb_maps:get(<<"length">>, Msg)), + ?assertEqual(not_found, hb_maps:get(<<"offset">>, Base, not_found)), + ?assertEqual(not_found, hb_maps:get(<<"length">>, Base, not_found)). + inlined_quoted_key_test() -> Req = #{ <<"method">> => <<"POST">>, From f6532ed036a337537b8b60204c5cca7a01347a12 Mon Sep 17 00:00:00 2001 From: James Piechota Date: Fri, 16 Jan 2026 16:49:59 -0500 Subject: [PATCH 03/29] wip: implement ~arweave@2.9-pre/chunk --- src/dev_arweave.erl | 112 +++++++++++++++++++++++++++++------- src/dev_copycat_arweave.erl | 89 ++++++++++++++-------------- src/hb_opts.erl | 2 +- src/hb_store_arweave.erl | 100 ++++++++++++++++++++++++-------- src/hb_util.erl | 6 +- 5 files changed, 220 insertions(+), 89 deletions(-) diff --git a/src/dev_arweave.erl b/src/dev_arweave.erl index c5dd4e38d..dd6145c33 100644 --- a/src/dev_arweave.erl +++ b/src/dev_arweave.erl @@ -9,8 +9,6 @@ -include("include/hb.hrl"). -include_lib("eunit/include/eunit.hrl"). --define(CHUNK_SIZE, 256 * 1024). % Arweave's default chunk size in bytes. - %% @doc Proxy the `/info' endpoint from the Arweave node. status(_Base, _Request, Opts) -> request(<<"GET">>, <<"/info">>, Opts). @@ -113,9 +111,10 @@ get_tx(Base, Request, Opts) -> end. chunk(Base, Request, Opts) -> + ?event(debug_test, {chunk, {base, Base}, {request, Request}, {opts, Opts}}), case hb_maps:get(<<"method">>, Request, <<"GET">>, Opts) of <<"POST">> -> post_chunk(Base, Request, Opts); - <<"GET">> -> get_chunk(Base, Request, Opts) + <<"GET">> -> get_chunk_range(Base, Request, Opts) end. post_chunk(_Base, Request, Opts) -> @@ -130,26 +129,46 @@ post_chunk(_Base, Request, Opts) -> Opts ). -get_chunk(_Base, Request, Opts) -> +get_chunk_range(_Base, Request, Opts) -> Offset = hb_util:int(hb_ao:get(<<"offset">>, Request, Opts)), - Length = hb_util:int(hb_ao:get(<<"length">>, Request, ?CHUNK_SIZE, Opts)), - Res = - hb_http:get( - hb_opts:get(gateway, not_found, Opts), - #{ - <<"path">> => - << - "/chunk/", - (hb_util:bin(Offset))/binary, - "/", - (hb_util:bin(Length))/binary - >> - }, - Opts - ), + Length = hb_util:int(hb_ao:get(<<"length">>, Request, ?DATA_CHUNK_SIZE, Opts)), + case get_chunk_range(Offset, Length, Opts, [], 0) of + {ok, Chunks} -> + Data = iolist_to_binary(Chunks), + Truncated = + case byte_size(Data) > Length of + true -> binary:part(Data, 0, Length); + false -> Data + end, + {ok, Truncated}; + {error, Reason} -> + {error, Reason} + end. 
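+
+%% A usage sketch (hedged, not part of the original patch): with the
+%% hb_singleton fix from PATCH 02, this endpoint can be addressed via the
+%% inlined-key path syntax exercised by inlined_keys_long_segment_test:
+%%
+%%   GET /~arweave@2.9-pre/chunk&offset=377813969707255&length=262144
+%%
+%% which parses into a request equivalent to:
+%%
+%%   #{ <<"path">> => <<"chunk">>,
+%%      <<"offset">> => <<"377813969707255">>,
+%%      <<"length">> => <<"262144">> }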
+ +get_chunk_range(_Offset, Length, _Opts, Chunks, Size) + when Size >= Length -> + {ok, lists:reverse(Chunks)}; +get_chunk_range(Offset, Length, Opts, Chunks, Size) -> + case get_chunk(Offset, Opts) of + {ok, Chunk} -> + get_chunk_range( + Offset + byte_size(Chunk), + Length, + Opts, + [Chunk | Chunks], + Size + byte_size(Chunk) + ); + {error, Reason} -> + {error, Reason} + end. + +get_chunk(Offset, Opts) -> + Path = <<"/chunk/", (hb_util:bin(Offset))/binary>>, + Res = request(<<"GET">>, Path, Opts), case Res of - {ok, #{ <<"body">> := Body }} -> - {ok, Body}; + {ok, JSON } -> + Chunk = hb_util:decode(maps:get(<<"chunk">>, JSON)), + {ok, Chunk}; {error, Reason} -> {error, Reason} end. @@ -514,3 +533,54 @@ serialize_data_item_test_disabled() -> ?assertEqual(length(DataItem#tx.tags), length(VerifiedItem#tx.tags)), ?assert(ar_bundles:verify_item(VerifiedItem)), ok. + +get_partial_chunk_test() -> + Offset = 377813969707255, + ExpectedHash = hb_util:decode(<<"9hCmSqNi2sRo45SBtlY8JUePrkh4v2ZSTTuVga0FMSs">>), + ExpectedLength = 1000, + {ok, Data} = hb_ao:resolve( + #{ <<"device">> => <<"arweave@2.9-pre">> }, + #{ + <<"path">> => <<"chunk">>, + <<"offset">> => Offset, + <<"length">> => ExpectedLength + }, + #{} + ), + ?assertEqual(ExpectedLength, byte_size(Data)), + ?assertEqual(ExpectedHash, crypto:hash(sha256, Data)), + ok. + +get_full_chunk_test() -> + Offset = 377813969707255, + ExpectedHash = hb_util:decode(<<"y3HKeRa2hyEnmwJUd45A2qrtYJcvpC7tCAZgD5qyJ5I">>), + ExpectedLength = ?DATA_CHUNK_SIZE, + {ok, Data} = hb_ao:resolve( + #{ <<"device">> => <<"arweave@2.9-pre">> }, + #{ + <<"path">> => <<"chunk">>, + <<"offset">> => Offset, + <<"length">> => ExpectedLength + }, + #{} + ), + ?assertEqual(ExpectedLength, byte_size(Data)), + ?assertEqual(ExpectedHash, crypto:hash(sha256, Data)), + ok. + +get_multi_chunk_test() -> + Offset = 377813969707255, + ExpectedHash = hb_util:decode(<<"G2EFuA2NGQYOszQZbsEL0QG/lyUzQAxM09/gpBHf88E=">>), + ExpectedLength = 526685, + {ok, Data} = hb_ao:resolve( + #{ <<"device">> => <<"arweave@2.9-pre">> }, + #{ + <<"path">> => <<"chunk">>, + <<"offset">> => Offset, + <<"length">> => ExpectedLength + }, + #{} + ), + ?assertEqual(ExpectedLength, byte_size(Data)), + ?assertEqual(ExpectedHash, crypto:hash(sha256, Data)), + ok. \ No newline at end of file diff --git a/src/dev_copycat_arweave.erl b/src/dev_copycat_arweave.erl index a953a65ad..a476994b7 100644 --- a/src/dev_copycat_arweave.erl +++ b/src/dev_copycat_arweave.erl @@ -62,7 +62,7 @@ fetch_blocks(Req, Current, To, Opts) -> process_block(BlockRes, _Req, Current, To, Opts) -> case BlockRes of {ok, Block} -> - maybe_index_ids(Block, Opts), + % maybe_index_ids(Block, Opts), ?event( copycat_short, {arweave_block_cached, @@ -81,44 +81,49 @@ process_block(BlockRes, _Req, Current, To, Opts) -> end. %% @doc Index the IDs of all transactions in the block if configured to do so. 
-maybe_index_ids(Block, Opts) -> - case hb_opts:get(arweave_index_ids, false, Opts) of - false -> ok; - true -> - IndexStore = hb_opts:get(arweave_index_store, no_store, Opts), - BlockOffset = hb_maps:get(<<"weave_size">>, Block, 0, Opts), - lists:foreach( - fun(TXID) -> - TX = - hb_ao:get( - << - ?ARWEAVE_DEVICE/binary, - "/tx=", - (hb_util:bin(TXID))/binary - >>, - Opts - ), - TXOffset = hb_maps:get(<<"offset">>, TX, 0, Opts), - case is_bundle_tx(TX, Opts) of - false -> ok; - true -> - {ok, BundleIndex} = download_bundle_header(TXID, Opts), - hb_maps:map( - fun(ItemID, #{ <<"offset">> := BundleOffset, <<"length">> := Length}) -> - Offset = hb_util:bin(BundleOffset + TXOffset + BlockOffset), - hb_store_arweave:write_offset( - IndexStore, - ItemID, - Offset, - Length - ) - end, - BundleIndex, - Opts - ) - end - end, - hb_maps:get(<<"txs">>, Block, #{}, Opts), - Opts - ) - end. \ No newline at end of file +% maybe_index_ids(Block, Opts) -> +% case hb_opts:get(arweave_index_ids, false, Opts) of +% false -> ok; +% true -> +% IndexStore = hb_opts:get(arweave_index_store, no_store, Opts), +% BlockOffset = hb_maps:get(<<"weave_size">>, Block, 0, Opts), +% lists:foreach( +% fun(TXID) -> +% TX = +% hb_ao:get( +% << +% ?ARWEAVE_DEVICE/binary, +% "/tx=", +% (hb_util:bin(TXID))/binary +% >>, +% Opts +% ), +% TXOffset = hb_maps:get(<<"offset">>, TX, 0, Opts), +% case is_bundle_tx(TX, Opts) of +% false -> ok; +% true -> +% {ok, BundleIndex} = download_bundle_header(TXID, Opts), +% hb_maps:map( +% fun(ItemID, #{ <<"offset">> := BundleOffset, <<"length">> := Length}) -> +% Offset = hb_util:bin(BundleOffset + TXOffset + BlockOffset), +% hb_store_arweave:write_offset( +% IndexStore, +% ItemID, +% Offset, +% Length +% ) +% end, +% BundleIndex, +% Opts +% ) +% end +% end, +% hb_maps:get(<<"txs">>, Block, #{}, Opts), +% Opts +% ) +% end. +% ok. + + +%%% Tests + diff --git a/src/hb_opts.erl b/src/hb_opts.erl index 60e665a04..7d458b157 100644 --- a/src/hb_opts.erl +++ b/src/hb_opts.erl @@ -293,7 +293,7 @@ default_message() -> <<"node">> => #{ <<"match">> => <<"^/arweave">>, - <<"with">> => <<"https://arweave.net">>, + <<"with">> => <<"http://tip-3.arweave.xyz:1984">>, <<"opts">> => #{ http_client => httpc, protocol => http2 } } }, diff --git a/src/hb_store_arweave.erl b/src/hb_store_arweave.erl index 49e03cce2..0b9e224d0 100644 --- a/src/hb_store_arweave.erl +++ b/src/hb_store_arweave.erl @@ -4,40 +4,92 @@ %%% Store API: -export([read/2]). %%% Indexing API: --export([write_offset/4]). +-export([write_offset/5]). +-include("include/hb.hrl"). +-include_lib("eunit/include/eunit.hrl"). -define(ARWEAVE_INDEX_PATH, <<"~arweave@2.9-pre/offset">>). read(StoreOpts = #{ <<"arweave-index-store">> := IndexStore }, ID) -> - Path = - << - ?ARWEAVE_INDEX_PATH/binary, - "/", - (hb_util:bin(ID))/binary - >>, - case hb_store:read(IndexStore, Path) of + case hb_store:read(IndexStore, path(ID)) of {ok, Binary} -> - [Offset, Length] = binary:split(Binary, <<":">>, [global]), - SerializedItem = - hb_ao:get( - << - "~arweave@2.9-pre/chunk&offset=", - Offset/binary, - "&length=", - Length/binary - >>, - StoreOpts - ), - {ok, ar_bundles:deserialize(SerializedItem)}; + [IsTX, Offset, Length] = binary:split(Binary, <<":">>, [global]), + ?event( + debug_test, + {reading_offset, {is_tx, IsTX}, {offset, Offset}, {length, Length}} + ), + case IsTX of + <<"1">> -> + load_bundle(ID, Offset, Length, StoreOpts); + <<"0">> -> + load_item(Offset, Length, StoreOpts) + end; {error, not_found} -> {error, not_found} end. 
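+
+%% The index record parsed above is the colon-separated triple written by
+%% write_offset/5 below: IsTX flag, absolute weave offset, then length.
+%% For the bundle TX used in write_read_tx_test/0 the stored binary would
+%% be <<"1:363524457275639:8387">>.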
-write_offset(Store, ID, Offset, Length) -> +load_item(Offset, Length, Opts) -> + case read_chunks(Offset, Length, Opts) of + {ok, SerializedItem} -> + ar_bundles:deserialize(SerializedItem); + {error, Reason} -> + {error, Reason} + end. + +load_bundle(ID, Offset, Length, Opts) -> + + case read_chunks(Offset, Length, Opts) of + {ok, SerializedItem} -> + ar_bundles:deserialize(SerializedItem); + {error, Reason} -> + {error, Reason} + end. + +read_chunks(Offset, Length, Opts) -> + hb_ao:resolve( + #{ <<"device">> => <<"arweave@2.9-pre">> }, + #{ + <<"path">> => <<"chunk">>, + <<"offset">> => Offset, + <<"length">> => Length + }, + Opts + ). + +write_offset(Store, ID, IsTX, Offset, Length) -> + IsTxInt = hb_util:bool_int(IsTX), hb_store:write( Store, + path(ID), << - ?ARWEAVE_INDEX_PATH/binary, "/", - (hb_util:bin(ID))/binary>>, <> - ). \ No newline at end of file + ). + +path(ID) -> + << + ?ARWEAVE_INDEX_PATH/binary, + "/", + (hb_util:bin(ID))/binary + >>. + + +%%% Tests + +write_read_tx_test() -> + Store = [hb_test_utils:test_store()], + ?event(debug_test, {store, Store}), + Opts = #{ + <<"arweave-index-store">> => Store + }, + ID = <<"bndIwac23-s0K11TLC1N7z472sLGAkiOdhds87ZywoE">>, + Offset = 363524457275639, + Length = 8387, + ok = write_offset(Store, ID, true, Offset, Length), + {ok, Item} = read(Opts, ID), + ?event(debug_test, {item, Item}), + ok. \ No newline at end of file diff --git a/src/hb_util.erl b/src/hb_util.erl index 5a3502014..c920cd699 100644 --- a/src/hb_util.erl +++ b/src/hb_util.erl @@ -1,6 +1,6 @@ %% @doc A collection of utility functions for building with HyperBEAM. -module(hb_util). --export([int/1, float/1, atom/1, bin/1, list/1, map/1]). +-export([int/1, float/1, atom/1, bin/1, list/1, map/1, bool_int/1]). -export([safe_int/1]). -export([ceil_int/2, floor_int/2]). -export([id/1, id/2, native_id/1, human_id/1, human_int/1, to_hex/1]). @@ -81,6 +81,10 @@ bin(Value) when is_list(Value) -> bin(Value) when is_binary(Value) -> Value. +%% @doc Coerce a boolean to 1 or 0. +bool_int(true) -> 1; +bool_int(false) -> 0. + %% @doc Coerce a value to a string list. list(Value) when is_binary(Value) -> binary_to_list(Value); From d767535c694802db79e60d8c7aa2d847ccfd688e Mon Sep 17 00:00:00 2001 From: James Piechota Date: Fri, 16 Jan 2026 21:21:37 -0500 Subject: [PATCH 04/29] test: make dev_arweave tests more reliable --- src/dev_arweave.erl | 149 +++++++++++++++++++++++++++++++------------- src/hb_opts.erl | 2 +- 2 files changed, 108 insertions(+), 43 deletions(-) diff --git a/src/dev_arweave.erl b/src/dev_arweave.erl index dd6145c33..a2dc36e50 100644 --- a/src/dev_arweave.erl +++ b/src/dev_arweave.erl @@ -383,6 +383,37 @@ to_message(Path, {ok, #{ <<"body">> := Body }}, Opts) -> %%% Tests +start_mock_gateway(ChunkData) -> + Endpoints = [ + {"/chunk/:offset", chunk, + fun(Req) -> + Path = maps:get(<<"path">>, Req, <<>>), + Segments = binary:split(Path, <<"/">>, [global, trim_all]), + OffsetValue = hb_util:int(lists:last(Segments)), + case maps:get(OffsetValue, ChunkData, undefined) of + undefined -> + {404, <<"not found">>}; + Data -> + Body = hb_json:encode(#{ <<"chunk">> => hb_util:encode(Data) }), + {200, Body} + end + end} + ], + {ok, MockServer, ServerHandle} = hb_mock_server:start(Endpoints), + Opts = #{ + routes => [ + #{ + <<"template">> => <<"/arweave">>, + <<"node">> => #{ + <<"match">> => <<"^/arweave">>, + <<"with">> => MockServer, + <<"opts">> => #{ http_client => httpc, protocol => http2 } + } + } + ] + }, + {ServerHandle, Opts}. 
+ post_ans104_tx_test() -> ServerOpts = #{ store => [hb_test_utils:test_store()] }, Server = hb_http_server:start_node(ServerOpts), @@ -429,9 +460,14 @@ post_ans104_tx_test() -> ok. get_tx_basic_data_test() -> - Node = hb_http_server:start_node(), - Path = <<"/~arweave@2.9-pre/tx=ptBC0UwDmrUTBQX3MqZ1lB57ex20ygwzkjjCrQjIx3o">>, - {ok, Structured} = hb_http:get(Node, Path, #{}), + {ok, Structured} = hb_ao:resolve( + #{ <<"device">> => <<"arweave@2.9-pre">> }, + #{ + <<"path">> => <<"tx">>, + <<"tx">> => <<"ptBC0UwDmrUTBQX3MqZ1lB57ex20ygwzkjjCrQjIx3o">> + }, + #{} + ), ?event(debug_test, {structured_tx, Structured}), ?assert(hb_message:verify(Structured, all, #{})), % Hash the data to make it easier to match @@ -534,53 +570,82 @@ serialize_data_item_test_disabled() -> ?assert(ar_bundles:verify_item(VerifiedItem)), ok. +%% Note: the following tests use a mock gateway since https://arweave.net/chunk +%% is not reliable. The current implementation has it delegating to a random +%% backing node, and depending on which packing node is selected the request +%% may return valid data or 404. To make the tests reliable, we instead use +%% a mock server. get_partial_chunk_test() -> Offset = 377813969707255, - ExpectedHash = hb_util:decode(<<"9hCmSqNi2sRo45SBtlY8JUePrkh4v2ZSTTuVga0FMSs">>), ExpectedLength = 1000, - {ok, Data} = hb_ao:resolve( - #{ <<"device">> => <<"arweave@2.9-pre">> }, - #{ - <<"path">> => <<"chunk">>, - <<"offset">> => Offset, - <<"length">> => ExpectedLength - }, - #{} - ), - ?assertEqual(ExpectedLength, byte_size(Data)), - ?assertEqual(ExpectedHash, crypto:hash(sha256, Data)), - ok. + Chunk0 = crypto:strong_rand_bytes(?DATA_CHUNK_SIZE), + ChunkData = #{ Offset => Chunk0 }, + {ServerHandle, Opts} = start_mock_gateway(ChunkData), + try + ExpectedData = binary:part(Chunk0, 0, ExpectedLength), + {ok, Data} = hb_ao:resolve( + #{ <<"device">> => <<"arweave@2.9-pre">> }, + #{ + <<"path">> => <<"chunk">>, + <<"offset">> => Offset, + <<"length">> => ExpectedLength + }, + Opts + ), + ?assertEqual(ExpectedData, Data), + ok + after + hb_mock_server:stop(ServerHandle) + end. get_full_chunk_test() -> Offset = 377813969707255, - ExpectedHash = hb_util:decode(<<"y3HKeRa2hyEnmwJUd45A2qrtYJcvpC7tCAZgD5qyJ5I">>), ExpectedLength = ?DATA_CHUNK_SIZE, - {ok, Data} = hb_ao:resolve( - #{ <<"device">> => <<"arweave@2.9-pre">> }, - #{ - <<"path">> => <<"chunk">>, - <<"offset">> => Offset, - <<"length">> => ExpectedLength - }, - #{} - ), - ?assertEqual(ExpectedLength, byte_size(Data)), - ?assertEqual(ExpectedHash, crypto:hash(sha256, Data)), - ok. + Chunk0 = crypto:strong_rand_bytes(?DATA_CHUNK_SIZE), + ChunkData = #{ Offset => Chunk0 }, + {ServerHandle, Opts} = start_mock_gateway(ChunkData), + try + {ok, Data} = hb_ao:resolve( + #{ <<"device">> => <<"arweave@2.9-pre">> }, + #{ + <<"path">> => <<"chunk">>, + <<"offset">> => Offset, + <<"length">> => ExpectedLength + }, + Opts + ), + ?assertEqual(Chunk0, Data), + ok + after + hb_mock_server:stop(ServerHandle) + end. get_multi_chunk_test() -> Offset = 377813969707255, - ExpectedHash = hb_util:decode(<<"G2EFuA2NGQYOszQZbsEL0QG/lyUzQAxM09/gpBHf88E=">>), ExpectedLength = 526685, - {ok, Data} = hb_ao:resolve( - #{ <<"device">> => <<"arweave@2.9-pre">> }, - #{ - <<"path">> => <<"chunk">>, - <<"offset">> => Offset, - <<"length">> => ExpectedLength - }, - #{} - ), - ?assertEqual(ExpectedLength, byte_size(Data)), - ?assertEqual(ExpectedHash, crypto:hash(sha256, Data)), - ok. 
\ No newline at end of file
+    Chunk0 = crypto:strong_rand_bytes(?DATA_CHUNK_SIZE),
+    Chunk1 = crypto:strong_rand_bytes(132270),
+    Chunk2 = crypto:strong_rand_bytes(132271),
+    ChunkData = #{
+        Offset => Chunk0,
+        Offset + ?DATA_CHUNK_SIZE => Chunk1,
+        Offset + ?DATA_CHUNK_SIZE + 132270 => Chunk2
+    },
+    {ServerHandle, Opts} = start_mock_gateway(ChunkData),
+    try
+        ExpectedData = binary:part(
+            iolist_to_binary([Chunk0, Chunk1, Chunk2]), 0, ExpectedLength),
+        {ok, Data} = hb_ao:resolve(
+            #{ <<"device">> => <<"arweave@2.9-pre">> },
+            #{
+                <<"path">> => <<"chunk">>,
+                <<"offset">> => Offset,
+                <<"length">> => ExpectedLength
+            },
+            Opts
+        ),
+        ?assertEqual(ExpectedData, Data),
+        ok
+    after
+        hb_mock_server:stop(ServerHandle)
+    end.
\ No newline at end of file
diff --git a/src/hb_opts.erl b/src/hb_opts.erl
index 7d458b157..60e665a04 100644
--- a/src/hb_opts.erl
+++ b/src/hb_opts.erl
@@ -293,7 +293,7 @@ default_message() ->
                 <<"node">> =>
                     #{
                         <<"match">> => <<"^/arweave">>,
-                        <<"with">> => <<"http://tip-3.arweave.xyz:1984">>,
+                        <<"with">> => <<"https://arweave.net">>,
                         <<"opts">> => #{ http_client => httpc, protocol => http2 }
                     }
             },

From bb450221308d2a4952ee90aaaf88b2f254917394 Mon Sep 17 00:00:00 2001
From: James Piechota
Date: Mon, 19 Jan 2026 08:35:26 -0500
Subject: [PATCH 05/29] wip: support L1 TX messages that have
 data_size/data_root but no data (i.e. true TX headers)

Specify exclude-data=1 to exclude the data
---
 src/ar_bundles.erl         |   6 +-
 src/ar_format.erl          |   2 +-
 src/dev_arweave.erl        | 135 ++++++++++++++++++++++------------
 src/dev_arweave_common.erl |   3 +-
 src/dev_codec_tx.erl       |  97 ++++++++++++++----------
 src/dev_codec_tx_from.erl  |  17 ++++-
 src/dev_codec_tx_to.erl    |  50 +++++++++++++-
 src/hb_util.erl            |  16 ++++-
 src/include/ar.hrl         |   3 +-
 9 files changed, 237 insertions(+), 92 deletions(-)

diff --git a/src/ar_bundles.erl b/src/ar_bundles.erl
index 3f070b54a..fc838ebaa 100644
--- a/src/ar_bundles.erl
+++ b/src/ar_bundles.erl
@@ -430,14 +430,14 @@ maybe_unbundle(Item) ->
 
 unbundle_list(Item) ->
     case unbundle(Item#tx.data) of
-        detached -> Item#tx{data = detached};
+        ?DEFAULT_DATA -> Item#tx{data = ?DEFAULT_DATA};
         Items -> Item#tx{data = hb_util:list_to_numbered_message(Items)}
     end.
 
 unbundle_map(Item) ->
     MapTXID = dev_arweave_common:tagfind(<<"bundle-map">>, Item#tx.tags, <<>>),
     case unbundle(Item#tx.data) of
-        detached -> Item#tx{data = detached};
+        ?DEFAULT_DATA -> Item#tx{data = ?DEFAULT_DATA};
         Items ->
             MapItem = find_single_layer(hb_util:decode(MapTXID), Items),
             Map = hb_json:decode(MapItem#tx.data),
@@ -469,7 +469,7 @@ find_single_layer(UnsignedID, Items) ->
 unbundle(<<Count:256/integer, Content/binary>>) ->
     {ItemsBin, Items} = decode_bundle_header(Count, Content),
     decode_bundle_items(Items, ItemsBin);
-unbundle(<<>>) -> detached.
+unbundle(?DEFAULT_DATA) -> ?DEFAULT_DATA.
 
diff --git a/src/ar_format.erl b/src/ar_format.erl
index 8ee18d8bb..5950e8f4a 100644
--- a/src/ar_format.erl
+++ b/src/ar_format.erl
@@ -65,7 +65,7 @@ format(TX, Indent, Opts) when is_record(TX, tx) ->
             [hb_util:safe_encode(ar_bundles:signer(TX))],
             Indent + 1),
         format_line("Signature: ~s",
-            [hb_format:binary(TX#tx.signature)],
+            [hb_format:binary(TX#tx.signature, Opts)],
             Indent + 1);
        false -> []
    end ++
diff --git a/src/dev_arweave.erl b/src/dev_arweave.erl
index a2dc36e50..4e931f1a5 100644
--- a/src/dev_arweave.erl
+++ b/src/dev_arweave.erl
@@ -4,7 +4,7 @@
 %%% The node(s) that are used to query data may be configured by altering the
 %%% `/arweave` route in the node's configuration message.
 -module(dev_arweave).
--export([tx/3, chunk/3, block/3, current/3, status/3, price/3, tx_anchor/3]). +-export([tx/3, raw/3, chunk/3, block/3, current/3, status/3, price/3, tx_anchor/3]). -export([post_tx/3, post_tx/4, post_binary_ans104/2]). -include("include/hb.hrl"). -include_lib("eunit/include/eunit.hrl"). @@ -13,8 +13,9 @@ status(_Base, _Request, Opts) -> request(<<"GET">>, <<"/info">>, Opts). -%% @doc Returns the given transaction, if known to the client node(s), as an -%% AO-Core message. +%% @doc Returns the given transaction as an AO-Core message. By default, this +%% embeds the `/raw` payload. Set `exclude-data` to true to return just the +%% header. tx(Base, Request, Opts) -> case hb_maps:get(<<"method">>, Request, <<"GET">>, Opts) of <<"POST">> -> post_tx(Base, Request, Opts); @@ -98,18 +99,33 @@ post_binary_ans104(SerializedTX, Opts) -> } ). -%% @doc Get a transaction ID from the Arweave node, as indicated by the `tx` key -%% in the request or base message. If the `data' key is present and set to -%% `false', the data is not retrieved and added to the response. If the `data' -%% key is set to `always', transactions for which the header is available but -%% the data is not will lead to an error. Otherwise, just the header will be -%% returned. +%% @doc Get a transaction from the Arweave node, as indicated by the +%% `tx` key in the request or base message. By default, this embeds the data +%% payload. Set `exclude_data` to true to return just the header. get_tx(Base, Request, Opts) -> case find_txid(Base, Request, Opts) of not_found -> {error, not_found}; - TXID -> request(<<"GET">>, <<"/tx/", TXID/binary>>, Opts) + TXID -> + request( + <<"GET">>, + <<"/tx/", TXID/binary>>, + Opts#{ exclude_data => exclude_data(Base, Request, Opts) } + ) + end. + +%% @doc Get raw transaction data from the Arweave node, as indicated by the +%% `tx` key in the request or base message. +raw(Base, Request, Opts) -> + case find_txid(Base, Request, Opts) of + not_found -> {error, not_found}; + TXID -> data(TXID, Opts) end. +%% @doc Retrieve the data of a transaction from Arweave. +data(TXID, Opts) -> + ?event({retrieving_tx_data, {tx, TXID}}), + request(<<"GET">>, <<"/raw/", TXID/binary>>, Opts). + chunk(Base, Request, Opts) -> ?event(debug_test, {chunk, {base, Base}, {request, Request}, {opts, Opts}}), case hb_maps:get(<<"method">>, Request, <<"GET">>, Opts) of @@ -173,33 +189,6 @@ get_chunk(Offset, Opts) -> {error, Reason} end. -add_data(TXID, TXHeader, Opts) -> - case data(TXID, Opts) of - {ok, Data} -> - TX = TXHeader#tx{ data = Data }, - ?event( - {retrieved_tx_with_data, - {id, TXID}, - {data_size, byte_size(Data)}, - {tx, TX} - } - ), - {ok, TX}; - {error, Reason} -> - ?event( - {data_retrieval_failed_after_header, - {id, TXID}, - {error, Reason} - } - ), - {error, Reason} - end. - -%% @doc Retrieve the data of a transaction from Arweave. -data(TXID, Opts) -> - ?event({retrieving_tx_data, {tx, TXID}}), - request(<<"GET">>, <<"/raw/", TXID/binary>>, Opts). - %% @doc Retrieve (and cache) block information from Arweave. If the `block' key %% is present, it is used to look up the associated block. If it is of Arweave %% block hash length (43 characters), it is used as an ID. If it is parsable as @@ -289,6 +278,18 @@ find_txid(Base, Request, Opts) -> Opts ). +exclude_data(Base, Request, Opts) -> + RawValue = + hb_ao:get_first( + [ + {Request, <<"exclude-data">>}, + {Base, <<"exclude-data">>} + ], + false, + Opts + ), + hb_util:bool(RawValue). 
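+
+%% exclude_data/3 accepts any truthy encoding understood by hb_util:bool/1
+%% (true, <<"true">>, <<"1">>, 1), so a header-only fetch can be requested
+%% over HTTP as, e.g.:
+%%
+%%   GET /~arweave@2.9-pre/tx=<txid>&exclude-data=1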
+ %% @doc Make a request to the Arweave node and parse the response into an %% AO-Core message. Most Arweave API responses are in JSON format, but without %% a `content-type' header. Subsequently, we parse the response manually and @@ -322,16 +323,20 @@ to_message(Path = <<"/tx/", TXID/binary>>, {ok, #{ <<"body">> := Body }}, Opts) {tx, TXHeader} } ), - {ok, TX} = add_data(TXID, TXHeader, Opts), - { - ok, - hb_message:convert( - TX, - <<"structured@1.0">>, - <<"tx@1.0">>, - Opts - ) - }; + case hb_opts:get(exclude_data, false, Opts) of + true -> + {ok, hb_message:convert(TXHeader, <<"structured@1.0">>, <<"tx@1.0">>, Opts)}; + false -> + case data(TXID, Opts) of + {ok, RawData} -> + TX = TXHeader#tx{ data = RawData }, + {ok, hb_message:convert(TX, <<"structured@1.0">>, <<"tx@1.0">>, Opts)}; + {error, not_found} -> + {ok, hb_message:convert(TXHeader, <<"structured@1.0">>, <<"tx@1.0">>, Opts)}; + Error -> + Error + end + end; to_message(Path = <<"/raw/", _/binary>>, {ok, #{ <<"body">> := Body }}, _Opts) -> ?event( {arweave_raw_response, @@ -464,7 +469,8 @@ get_tx_basic_data_test() -> #{ <<"device">> => <<"arweave@2.9-pre">> }, #{ <<"path">> => <<"tx">>, - <<"tx">> => <<"ptBC0UwDmrUTBQX3MqZ1lB57ex20ygwzkjjCrQjIx3o">> + <<"tx">> => <<"ptBC0UwDmrUTBQX3MqZ1lB57ex20ygwzkjjCrQjIx3o">>, + <<"exclude-data">> => false }, #{} ), @@ -485,6 +491,39 @@ get_tx_basic_data_test() -> ?assert(hb_message:match(ExpectedMsg, StructuredWithHash, only_present)), ok. +get_tx_basic_data_exclude_data_test() -> + {ok, Structured} = hb_ao:resolve( + #{ <<"device">> => <<"arweave@2.9-pre">> }, + #{ + <<"path">> => <<"tx">>, + <<"tx">> => <<"ptBC0UwDmrUTBQX3MqZ1lB57ex20ygwzkjjCrQjIx3o">>, + <<"exclude-data">> => true + }, + #{} + ), + ?event(debug_test, {structured_tx, Structured}), + ?assert(hb_message:verify(Structured, all, #{})), + ?assertEqual(false, maps:is_key(<<"data">>, Structured)), + ExpectedMsg = #{ + <<"reward">> => <<"482143296">>, + <<"anchor">> => <<"XTzaU2_m_hRYDLiXkcleOC4zf5MVTXIeFWBOsJSRrtEZ8kM6Oz7EKLhZY7fTAvKq">>, + <<"content-type">> => <<"application/json">> + }, + ?assert(hb_message:match(ExpectedMsg, Structured, only_present)), + {ok, Data} = hb_ao:resolve( + #{ <<"device">> => <<"arweave@2.9-pre">> }, + #{ + <<"path">> => <<"raw">>, + <<"tx">> => <<"ptBC0UwDmrUTBQX3MqZ1lB57ex20ygwzkjjCrQjIx3o">> + }, + #{} + ), + StructuredWithData = Structured#{ <<"data">> => Data }, + ?assert(hb_message:verify(StructuredWithData, all, #{})), + DataHash = hb_util:encode(crypto:hash(sha256, Data)), + ?assertEqual(<<"PEShWA1ER2jq7CatAPpOZ30TeLrjOSpaf_Po7_hKPo4">>, DataHash), + ok. + get_tx_rsa_nested_bundle_test() -> Node = hb_http_server:start_node(), Path = <<"/~arweave@2.9-pre/tx=bndIwac23-s0K11TLC1N7z472sLGAkiOdhds87ZywoE">>, diff --git a/src/dev_arweave_common.erl b/src/dev_arweave_common.erl index 808b41dfc..7ae54889d 100644 --- a/src/dev_arweave_common.erl +++ b/src/dev_arweave_common.erl @@ -178,7 +178,8 @@ maybe_add_bundle_tags(BundleType, TX) -> TX#tx{tags = FilteredBundleTags ++ TX#tx.tags }. %% @doc Reset the data size of a data item. Assumes that the data is already normalized. -normalize_data_size(Item = #tx{data = Bin}) when is_binary(Bin) -> +normalize_data_size(Item = #tx{data = Bin}) + when is_binary(Bin) andalso Bin =/= ?DEFAULT_DATA -> Item#tx{data_size = byte_size(Bin)}; normalize_data_size(Item) -> Item. 
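A minimal illustration of why the guard above skips the ?DEFAULT_DATA case
(values hypothetical); without it, normalizing a header-only TX would clobber
the declared size:

    %% Header-only TX: data is ?DEFAULT_DATA (<<>>), but the header still
    %% declares the payload's size and Merkle root.
    TX = #tx{ data = ?DEFAULT_DATA, data_size = 8387, data_root = Root },
    %% normalize_data_size/1 now leaves data_size = 8387 rather than
    %% resetting it to byte_size(<<>>) = 0.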
diff --git a/src/dev_codec_tx.erl b/src/dev_codec_tx.erl index 3363969d2..1a0cf51c5 100644 --- a/src/dev_codec_tx.erl +++ b/src/dev_codec_tx.erl @@ -6,7 +6,8 @@ -include_lib("eunit/include/eunit.hrl"). -define(BASE_FIELDS, [ - <<"anchor">>, <<"format">>, <<"quantity">>, <<"reward">>, <<"target">> ]). + <<"anchor">>, <<"format">>, <<"quantity">>, <<"reward">>, <<"target">>, + <<"data_root">>, <<"data_size">> ]). %% @doc Sign a message using the `priv_wallet' key in the options. Supports both %% the `hmac-sha256' and `rsa-pss-sha256' algorithms, offering unsigned and @@ -169,10 +170,8 @@ to(Other, _Req, _Opts) -> %% those are checked as well (e.g. format is 1 or 2). %% 3. Unsupported fields are set to their default values. %% -%% Of note: for now we require that the `data` field be set on an L1 TX if -%% there is data. In other words we do not allow `data_root` and `data_size` to -%% be set if `data` is *not* set. This differs from the Arweave protocol which -%% explicitly allows TX headers to be validated in the absence of data. +%% Of note: `data_root`/`data_size` are optional for value transfers and should +%% be preserved when present on the header, even if `data` is missing. %% %% When support is added for new fields (e.g. when we add support for ECDSA signatures), %% this function will have to be updated. @@ -262,28 +261,7 @@ enforce_valid_tx(TX) -> throw({invalid_field, tag, InvalidTagForm}) end, TX#tx.tags - ), - enforce_valid_tx_data(TX). - -%% @doc For now we require that the `data` field be set on an L1 TX if -%% there is data. In other words we do not allow `data_root` and `data_size` to -%% be set if `data` is *not* set. This differs from the Arweave protocol which -%% explicitly allows TX headers to be validated in the absence of data. -enforce_valid_tx_data(TX) when TX#tx.data == ?DEFAULT_DATA -> - case TX#tx.data_root =/= ?DEFAULT_DATA_ROOT of - true -> - throw({invalid_field, data_root, TX#tx.data_root}); - false -> - ok - end, - case TX#tx.data_size > 0 of - true -> - throw({invalid_field, data_size, TX#tx.data_size}); - false -> - ok - end; -enforce_valid_tx_data(TX) -> - ok. + ). %%%=================================================================== %%% Tests. @@ -293,7 +271,6 @@ enforce_valid_tx_test() -> BaseTX = #tx{ format = 2 }, InvalidUnsignedID = crypto:strong_rand_bytes(1), - GoodID = crypto:strong_rand_bytes(32), BadID31 = crypto:strong_rand_bytes(31), BadID33 = crypto:strong_rand_bytes(33), BadOwnerSize = crypto:strong_rand_bytes(byte_size(?DEFAULT_OWNER) - 1), @@ -340,9 +317,7 @@ enforce_valid_tx_test() -> {tag_value_not_binary, BaseTX#tx{tags = [{<<"key">>, not_binary}]}, {invalid_field, tag_value, not_binary}}, {tag_value_too_long, BaseTX#tx{tags = [{<<"key">>, TooLongTagValue}]}, {invalid_field, tag_value, TooLongTagValue}}, {invalid_tag_form_atom, BaseTX#tx{tags = [not_a_tuple]}, {invalid_field, tag, not_a_tuple}}, - {invalid_tag_form_list, BaseTX#tx{tags = [[<<"name">>, <<"value">>]]}, {invalid_field, tag, [<<"name">>, <<"value">>]} }, - {data_root_without_data, BaseTX#tx{data_root = GoodID}, {invalid_field, data_root, GoodID}}, - {data_size_without_data, BaseTX#tx{data_size = 1}, {invalid_field, data_size, 1}} + {invalid_tag_form_list, BaseTX#tx{tags = [[<<"name">>, <<"value">>]]}, {invalid_field, tag, [<<"name">>, <<"value">>]} } ], lists:foreach( @@ -396,6 +371,57 @@ happy_tx_test() -> }, do_tx_roundtrips(TX, UnsignedTABM, SignedCommitment). 
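+
+%% The next test exercises the new `data_root`/`data_size` base fields added
+%% to ?BASE_FIELDS above: a format=2 header that carries a data root and
+%% size, but no data payload, must round-trip through the codec and remain
+%% verifiable.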
+data_header_but_no_data_test() ->
+    Anchor = crypto:strong_rand_bytes(32),
+    Target = crypto:strong_rand_bytes(32),
+    Data = <<"test-data">>,
+    DataRoot = ar_tx:data_root(Data),
+    DataSize = byte_size(Data),
+    UnsignedTX = #tx{
+        format = 2,
+        anchor = Anchor,
+        tags = [
+            {<<"tag1">>, <<"value1">>}
+        ],
+        target = Target,
+        quantity = 1000,
+        data_size = DataSize,
+        data_root = DataRoot,
+        reward = 2000
+    },
+    UnsignedTABM = #{
+        <<"anchor">> => hb_util:encode(Anchor),
+        <<"target">> => hb_util:encode(Target),
+        <<"quantity">> => <<"1000">>,
+        <<"reward">> => <<"2000">>,
+        <<"data_root">> => hb_util:encode(DataRoot),
+        <<"data_size">> => integer_to_binary(DataSize),
+        <<"tag1">> => <<"value1">>
+    },
+    SignedCommitment = #{
+        <<"commitment-device">> => <<"tx@1.0">>,
+        <<"committed">> => [
+            <<"tag1">>, <<"anchor">>, <<"quantity">>, <<"reward">>,
+            <<"target">>, <<"data_root">>, <<"data_size">>],
+        <<"type">> => <<"rsa-pss-sha256">>,
+        <<"bundle">> => <<"false">>,
+        <<"field-target">> => hb_util:encode(Target),
+        <<"field-anchor">> => hb_util:encode(Anchor),
+        <<"field-quantity">> => <<"1000">>,
+        <<"field-reward">> => <<"2000">>,
+        <<"field-data_root">> => hb_util:encode(DataRoot),
+        <<"field-data_size">> => integer_to_binary(DataSize)
+    },
+    do_tx_roundtrips(
+        UnsignedTX,
+        UnsignedTABM,
+        SignedCommitment,
+        #{
+            <<"bundle">> => false,
+            <<"exclude-data">> => true
+        }
+    ).
+
 tag_name_case_test() ->
     TX = #tx{
         format = 2,
@@ -844,10 +870,13 @@ flatten_items(_) ->
 do_tx_roundtrips(UnsignedTX, UnsignedTABM, Commitment) ->
     % For tests which don't care about bundling, just use false.
     do_tx_roundtrips(UnsignedTX, UnsignedTABM, Commitment, false).
-do_tx_roundtrips(UnsignedTX, UnsignedTABM, Commitment, Bundle) ->
-    Req = #{ <<"bundle">> => Bundle },
+do_tx_roundtrips(UnsignedTX, UnsignedTABM, Commitment, Req) when is_map(Req) ->
     do_unsigned_tx_roundtrip(UnsignedTX, UnsignedTABM, Req),
-    do_signed_tx_roundtrip(UnsignedTX, UnsignedTABM, Commitment, Req).
+    do_signed_tx_roundtrip(UnsignedTX, UnsignedTABM, Commitment, Req);
+do_tx_roundtrips(UnsignedTX, UnsignedTABM, Commitment, Bundle)
+        when is_boolean(Bundle) ->
+    Req = #{ <<"bundle">> => Bundle },
+    do_tx_roundtrips(UnsignedTX, UnsignedTABM, Commitment, Req).
 
 do_unsigned_tx_roundtrip(UnsignedTX, UnsignedTABM, Req) ->
     % Serialize -> Deserialize
diff --git a/src/dev_codec_tx_from.erl b/src/dev_codec_tx_from.erl
index f21664217..fbe817f2e 100644
--- a/src/dev_codec_tx_from.erl
+++ b/src/dev_codec_tx_from.erl
@@ -14,7 +14,9 @@ fields(TX, Prefix, Opts) ->
             target_field(TX, Prefix, Opts),
             anchor_field(TX, Prefix, Opts),
             quantity_field(TX, Prefix, Opts),
-            reward_field(TX, Prefix, Opts)
+            reward_field(TX, Prefix, Opts),
+            data_root_field(TX, Prefix, Opts),
+            data_size_field(TX, Prefix, Opts)
         ]
     ).
 
@@ -58,3 +60,16 @@ reward_field(TX, Prefix, _Opts) ->
             }
     end.
 
+data_root_field(#tx{data = ?DEFAULT_DATA, data_root = ?DEFAULT_DATA_ROOT}, _Prefix, _Opts) ->
+    #{};
+data_root_field(#tx{data = ?DEFAULT_DATA, data_root = DataRoot}, Prefix, _Opts) ->
+    #{<<Prefix/binary, "data_root">> => hb_util:encode(DataRoot)};
+data_root_field(_TX, _Prefix, _Opts) ->
+    #{}.
+
+data_size_field(#tx{data = ?DEFAULT_DATA, data_size = ?DEFAULT_DATA_SIZE}, _Prefix, _Opts) ->
+    #{};
+data_size_field(#tx{data = ?DEFAULT_DATA, data_size = DataSize}, Prefix, _Opts) ->
+    #{<<Prefix/binary, "data_size">> => integer_to_binary(DataSize)};
+data_size_field(_TX, _Prefix, _Opts) ->
+    #{}.
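A hedged sketch of the mapping the two clauses above implement, using the
values from data_header_but_no_data_test (Prefix is supplied by the caller,
e.g. <<"field-">> when building commitments, <<>> for the plain TABM):

    %% #tx{ data = ?DEFAULT_DATA, data_root = Root, data_size = 9 }
    %% contributes the map
    %% #{ <<Prefix/binary, "data_root">> => hb_util:encode(Root),
    %%    <<Prefix/binary, "data_size">> => <<"9">> }
    %% while a TX carrying inline data contributes nothing here.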
diff --git a/src/dev_codec_tx_to.erl b/src/dev_codec_tx_to.erl
index f0fb6f586..da7094985 100644
--- a/src/dev_codec_tx_to.erl
+++ b/src/dev_codec_tx_to.erl
@@ -9,7 +9,9 @@ fields_to_tx(TX, Prefix, Map, Opts) ->
         target = target_field(Prefix, Map, Opts),
         anchor = anchor_field(Prefix, Map, Opts),
         quantity = quantity_field(Prefix, Map, Opts),
-        reward = reward_field(Prefix, Map, Opts)
+        reward = reward_field(Prefix, Map, Opts),
+        data_root = data_root_field(Prefix, Map, Opts),
+        data_size = data_size_field(Prefix, Map, Opts)
     }.
 
 format_field(Prefix, Map, Opts) ->
@@ -62,11 +64,43 @@ reward_field(Prefix, Map, Opts) ->
         error -> ?DEFAULT_REWARD
     end.
 
+data_root_field(Prefix, Map, Opts) ->
+    case hb_maps:get(<<"data">>, Map, ?DEFAULT_DATA, Opts) of
+        ?DEFAULT_DATA ->
+            case hb_maps:find(<<Prefix/binary, "data_root">>, Map, Opts) of
+                {ok, EncodedDataRoot} ->
+                    case hb_util:safe_decode(EncodedDataRoot) of
+                        {ok, DataRoot} when ?IS_ID(DataRoot) -> DataRoot;
+                        _ -> ?DEFAULT_DATA_ROOT
+                    end;
+                error -> ?DEFAULT_DATA_ROOT
+            end;
+        _ ->
+            ?DEFAULT_DATA_ROOT
+    end.
+
+data_size_field(Prefix, Map, Opts) ->
+    case hb_maps:get(<<"data">>, Map, ?DEFAULT_DATA, Opts) of
+        ?DEFAULT_DATA ->
+            case hb_maps:find(<<Prefix/binary, "data_size">>, Map, Opts) of
+                {ok, EncodedDataSize} ->
+                    case hb_util:safe_int(EncodedDataSize) of
+                        {ok, DataSize} -> DataSize;
+                        _ -> ?DEFAULT_DATA_SIZE
+                    end;
+                error -> ?DEFAULT_DATA_SIZE
+            end;
+        _ ->
+            ?DEFAULT_DATA_SIZE
+    end.
+
 excluded_tags(TX, TABM, Opts) ->
     exclude_target_tag(TX, TABM, Opts) ++
         exclude_anchor_tag(TX, TABM, Opts) ++
         exclude_quantity_tag(TX, TABM, Opts) ++
-        exclude_reward_tag(TX, TABM, Opts).
+        exclude_reward_tag(TX, TABM, Opts) ++
+        exclude_data_root_tag(TX) ++
+        exclude_data_size_tag(TX).
 
 exclude_target_tag(TX, TABM, Opts) ->
     case {TX#tx.target, hb_maps:get(<<"target">>, TABM, undefined, Opts)} of
@@ -98,4 +132,16 @@ exclude_reward_tag(TX, TABM, Opts) ->
         {FieldReward, TagReward} when FieldReward =/= TagReward ->
             [<<"reward">>];
         _ -> []
+    end.
+
+exclude_data_root_tag(TX) ->
+    case TX#tx.data_root of
+        ?DEFAULT_DATA_ROOT -> [];
+        _ -> [<<"data_root">>]
+    end.
+
+exclude_data_size_tag(TX) ->
+    case TX#tx.data_size of
+        ?DEFAULT_DATA_SIZE -> [];
+        _ -> [<<"data_size">>]
     end.
\ No newline at end of file
diff --git a/src/hb_util.erl b/src/hb_util.erl
index c920cd699..a6669715a 100644
--- a/src/hb_util.erl
+++ b/src/hb_util.erl
@@ -1,6 +1,6 @@
 %% @doc A collection of utility functions for building with HyperBEAM.
 -module(hb_util).
--export([int/1, float/1, atom/1, bin/1, list/1, map/1, bool_int/1]).
+-export([int/1, float/1, atom/1, bin/1, list/1, map/1, bool/1, bool_int/1]).
 -export([safe_int/1]).
 -export([ceil_int/2, floor_int/2]).
 -export([id/1, id/2, native_id/1, human_id/1, human_int/1, to_hex/1]).
@@ -81,6 +81,20 @@ bin(Value) when is_list(Value) ->
 bin(Value) when is_binary(Value) ->
     Value.
 
+%% @doc Coerce a value to a boolean.
+bool(Value) ->
+    case Value of
+        true -> true;
+        false -> false;
+        <<"true">> -> true;
+        <<"false">> -> false;
+        <<"1">> -> true;
+        <<"0">> -> false;
+        1 -> true;
+        0 -> false;
+        _ -> false
+    end.
+
 %% @doc Coerce a boolean to 1 or 0.
 bool_int(true) -> 1;
 bool_int(false) -> 0.
 
diff --git a/src/include/ar.hrl b/src/include/ar.hrl
index 751acef95..6406b6f51 100644
--- a/src/include/ar.hrl
+++ b/src/include/ar.hrl
@@ -14,6 +14,7 @@
 -define(DEFAULT_ANCHOR, <<>>).
 -define(DEFAULT_TARGET, <<>>).
 -define(DEFAULT_DATA_ROOT, <<>>).
+-define(DEFAULT_DATA_SIZE, 0).
 -define(DEFAULT_QUANTITY, 0).
 -define(DEFAULT_REWARD, 0).
@@ -54,7 +55,7 @@
     data = ?DEFAULT_DATA,
     manifest = undefined,
     %% Size in bytes of the transaction data.
-    data_size = 0,
+    data_size = ?DEFAULT_DATA_SIZE,
     %% Deprecated. Not used, not gossiped.
     data_tree = [],
     %% The Merkle root of the Merkle tree of data chunks.

From 7d6137b59829b0ca31f1768f1ec68d7eafc1f3a8 Mon Sep 17 00:00:00 2001
From: James Piechota
Date: Mon, 19 Jan 2026 13:20:59 -0500
Subject: [PATCH 06/29] wip: write and read TX-bundle to hb_store_arweave

---
 src/ar_format.erl        |  2 +-
 src/ar_tx.erl            |  2 +-
 src/dev_arweave.erl      |  3 --
 src/hb_store_arweave.erl | 59 ++++++++++++++++++++++++++++++++++------
 4 files changed, 52 insertions(+), 14 deletions(-)

diff --git a/src/ar_format.erl b/src/ar_format.erl
index 5950e8f4a..e32e18eb5 100644
--- a/src/ar_format.erl
+++ b/src/ar_format.erl
@@ -18,7 +18,7 @@ format(TX, Indent, Opts) when is_list(TX); is_map(TX) ->
 format(TX, Indent, Opts) when is_record(TX, tx) ->
     MustVerify = hb_opts:get(debug_ids, true, Opts),
     Valid =
-        if MustVerify -> verify(TX);
+        if MustVerify -> verify(dev_arweave_common:normalize(TX));
            true -> true
         end,
     UnsignedID =
diff --git a/src/ar_tx.erl b/src/ar_tx.erl
index 205bb85a8..2e444810b 100644
--- a/src/ar_tx.erl
+++ b/src/ar_tx.erl
@@ -206,7 +206,7 @@ verify_hash(#tx{ id = ID } = TX) ->
     ID == dev_arweave_common:generate_id(TX, signed).
 
 %% @doc On Arweave we don't have data on format=2 transactions, and so
-%% traditionally just verify the transcation based on data_rot and data_size.
+%% traditionally just verify the transaction based on data_root and data_size.
 %% However in HyperBEAM we will often populate the data field. Adding this
 %% check to verify that `data_root`, `data_size`, and `data` are consistent.
 verify_v2_data(#tx{ format = 2, data = ?DEFAULT_DATA }) ->
diff --git a/src/dev_arweave.erl b/src/dev_arweave.erl
index 4e931f1a5..2090263f4 100644
--- a/src/dev_arweave.erl
+++ b/src/dev_arweave.erl
@@ -530,12 +530,10 @@ get_tx_rsa_nested_bundle_test() ->
     {ok, Root} = hb_http:get(Node, Path, #{}),
     ?event(debug_test, {root, Root}),
     ?assert(hb_message:verify(Root, all, #{})),
-
     ChildPath = <<Path/binary, "/1/2">>,
     {ok, Child} = hb_http:get(Node, ChildPath, #{}),
     ?event(debug_test, {child, Child}),
     ?assert(hb_message:verify(Child, all, #{})),
-
     {ok, ExpectedChild} =
         hb_ao:resolve(
            Root,
            <<"1/2">>,
            #{}
        ),
     ?assert(hb_message:match(ExpectedChild, Child, only_present)),
-
     ManualChild = #{
         <<"data">> => <<"{\"totalTickedRewardsDistributed\":0,\"distributedEpochIndexes\":[],\"newDemandFactors\":[],\"newEpochIndexes\":[],\"tickedRewardDistributions\":[],\"newPruneGatewaysResults\":[{\"delegateStakeReturned\":0,\"stakeSlashed\":0,\"gatewayStakeReturned\":0,\"delegateStakeWithdrawing\":0,\"prunedGateways\":[],\"slashedGateways\":[],\"gatewayStakeWithdrawing\":0}]}">>,
         <<"data-protocol">> => <<"ao">>,
diff --git a/src/hb_store_arweave.erl b/src/hb_store_arweave.erl
index 0b9e224d0..937d16549 100644
--- a/src/hb_store_arweave.erl
+++ b/src/hb_store_arweave.erl
@@ -18,10 +18,10 @@ read(StoreOpts = #{ <<"arweave-index-store">> := IndexStore }, ID) ->
                 debug_test,
                 {reading_offset, {is_tx, IsTX}, {offset, Offset}, {length, Length}}
             ),
-            case IsTX of
-                <<"1">> ->
+            case hb_util:bool(IsTX) of
+                true ->
                     load_bundle(ID, Offset, Length, StoreOpts);
-                <<"0">> ->
+                false ->
                     load_item(Offset, Length, StoreOpts)
             end;
         {error, not_found} ->
             {error, not_found}
     end.
 
@@ -31,16 +31,27 @@ read(StoreOpts = #{ <<"arweave-index-store">> := IndexStore }, ID) ->
 load_item(Offset, Length, Opts) ->
     case read_chunks(Offset, Length, Opts) of
         {ok, 
SerializedItem} -> - ar_bundles:deserialize(SerializedItem); + to_message(ar_bundles:deserialize(SerializedItem), Opts); {error, Reason} -> {error, Reason} end. load_bundle(ID, Offset, Length, Opts) -> - + {ok, StructuredTXHeader} = hb_ao:resolve( + #{ <<"device">> => <<"arweave@2.9-pre">> }, + #{ <<"path">> => <<"tx">>, <<"tx">> => ID, <<"exclude-data">> => true }, + Opts + ), + TXHeader = hb_message:convert( + StructuredTXHeader, + <<"tx@1.0">>, + <<"structured@1.0">>, + Opts), case read_chunks(Offset, Length, Opts) of {ok, SerializedItem} -> - ar_bundles:deserialize(SerializedItem); + to_message( + ar_bundles:deserialize(TXHeader#tx{ data = SerializedItem }), + Opts); {error, Reason} -> {error, Reason} end. @@ -56,6 +67,17 @@ read_chunks(Offset, Length, Opts) -> Opts ). +to_message(TX, Opts) -> + { + ok, + hb_message:convert( + TX, + <<"structured@1.0">>, + <<"tx@1.0">>, + Opts + ) + }. + write_offset(Store, ID, IsTX, Offset, Length) -> IsTxInt = hb_util:bool_int(IsTX), hb_store:write( @@ -90,6 +112,25 @@ write_read_tx_test() -> Offset = 363524457275639, Length = 8387, ok = write_offset(Store, ID, true, Offset, Length), - {ok, Item} = read(Opts, ID), - ?event(debug_test, {item, Item}), - ok. \ No newline at end of file + {ok, Bundle} = read(Opts, ID), + ?assert(hb_message:verify(Bundle, all, #{})), + {ok, Child} = + hb_ao:resolve( + Bundle, + <<"1/2">>, + #{} + ), + ?assert(hb_message:verify(Child, all, #{})), + ExpectedChild = #{ + <<"data">> => <<"{\"totalTickedRewardsDistributed\":0,\"distributedEpochIndexes\":[],\"newDemandFactors\":[],\"newEpochIndexes\":[],\"tickedRewardDistributions\":[],\"newPruneGatewaysResults\":[{\"delegateStakeReturned\":0,\"stakeSlashed\":0,\"gatewayStakeReturned\":0,\"delegateStakeWithdrawing\":0,\"prunedGateways\":[],\"slashedGateways\":[],\"gatewayStakeWithdrawing\":0}]}">>, + <<"data-protocol">> => <<"ao">>, + <<"from-module">> => <<"cbn0KKrBZH7hdNkNokuXLtGryrWM--PjSTBqIzw9Kkk">>, + <<"from-process">> => <<"agYcCFJtrMG6cqMuZfskIkFTGvUPddICmtQSBIoPdiA">>, + <<"anchor">> => <<"MDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAyODAxODg">>, + <<"reference">> => <<"280188">>, + <<"target">> => <<"1R5QEtX53Z_RRQJwzFWf40oXiPW2FibErT_h02pu8MU">>, + <<"type">> => <<"Message">>, + <<"variant">> => <<"ao.TN.1">> + }, + ?assert(hb_message:match(ExpectedChild, Child, only_present)), + ok. From 3438a06f8aa95f2a1c2992f48a2ba71b6feb22d0 Mon Sep 17 00:00:00 2001 From: James Piechota Date: Wed, 21 Jan 2026 17:26:49 -0500 Subject: [PATCH 07/29] wip: working on getting a single data item from a bundle to load --- src/ar_block.erl | 59 +++++++++++ src/ar_bundles.erl | 21 ++-- src/ar_fork.erl | 173 ++++++++++++++++++++++++++++++++ src/ar_poa.erl | 18 ++++ src/ar_tx.erl | 18 ++++ src/dev_arweave.erl | 98 +++++++++++------- src/dev_copycat_arweave.erl | 194 ++++++++++++++++++++++++++++-------- src/hb_store_arweave.erl | 99 ++++++++++-------- src/include/ar.hrl | 5 + 9 files changed, 557 insertions(+), 128 deletions(-) create mode 100644 src/ar_block.erl create mode 100644 src/ar_fork.erl create mode 100644 src/ar_poa.erl diff --git a/src/ar_block.erl b/src/ar_block.erl new file mode 100644 index 000000000..c36229b9c --- /dev/null +++ b/src/ar_block.erl @@ -0,0 +1,59 @@ +%%% @doc Copied and adapted from the arweave codebase. +%%% Should track: https://github.com/ArweaveTeam/arweave/blob/master/apps/arweave/src/ar_block.erl +-module(ar_block). + +-export([strict_data_split_threshold/0, get_chunk_padded_offset/1, generate_size_tagged_list_from_txs/2]). + +-include("include/ar.hrl"). 
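+
+%% A worked example of the padding rule implemented below (T stands for
+%% ar_block:strict_data_split_threshold(), ?DATA_CHUNK_SIZE is 256 KiB):
+%% get_chunk_padded_offset(T) =:= T, while get_chunk_padded_offset(T + 1)
+%% =:= T + 262144, i.e. every post-threshold offset is rounded up to the
+%% next 256 KiB boundary counted from T.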
+
+%%%===================================================================
+%%% Public interface.
+%%%===================================================================
+
+strict_data_split_threshold() -> ?STRICT_DATA_SPLIT_THRESHOLD.
+
+%% @doc Return Offset if it is smaller than or equal to ar_block:strict_data_split_threshold().
+%% Otherwise, return the offset of the last byte of the chunk + the size of the padding.
+-spec get_chunk_padded_offset(Offset :: non_neg_integer()) -> non_neg_integer().
+get_chunk_padded_offset(Offset) ->
+    case Offset > ar_block:strict_data_split_threshold() of
+        true ->
+            ar_poa:get_padded_offset(Offset, ar_block:strict_data_split_threshold());
+        false ->
+            Offset
+    end.
+
+generate_size_tagged_list_from_txs(TXs, Height) ->
+    lists:reverse(
+        element(2,
+            lists:foldl(
+                fun(TX, {Pos, List}) ->
+                    DataSize = TX#tx.data_size,
+                    End = Pos + DataSize,
+                    case Height >= ar_fork:height_2_5() of
+                        true ->
+                            Padding = ar_tx:get_weave_size_increase(DataSize, Height)
+                                    - DataSize,
+                            %% Encode the padding information in the Merkle tree.
+                            case Padding > 0 of
+                                true ->
+                                    PaddingRoot = <<>>,
+                                    {End + Padding, [{{padding, PaddingRoot}, End + Padding},
+                                            {{TX, get_tx_data_root(TX)}, End} | List]};
+                                false ->
+                                    {End, [{{TX, get_tx_data_root(TX)}, End} | List]}
+                            end;
+                        false ->
+                            {End, [{{TX, get_tx_data_root(TX)}, End} | List]}
+                    end
+                end,
+                {0, []},
+                lists:sort(TXs)
+            )
+        )
+    ).
+
+get_tx_data_root(#tx{ format = 2, data_root = DataRoot }) ->
+    DataRoot;
+get_tx_data_root(TX) ->
+    (ar_tx:generate_chunk_tree(TX))#tx.data_root.
\ No newline at end of file
diff --git a/src/ar_bundles.erl b/src/ar_bundles.erl
index fc838ebaa..a692e15dd 100644
--- a/src/ar_bundles.erl
+++ b/src/ar_bundles.erl
@@ -5,6 +5,7 @@
 -export([encode_tags/1, decode_tags/1]).
 -export([serialize/1, deserialize/1, serialize_bundle/3]).
 -export([data_item_signature_data/1]).
+-export([decode_bundle_header/1]).
 -include("include/hb.hrl").
 -include_lib("eunit/include/eunit.hrl").
 
@@ -467,7 +468,7 @@ find_single_layer(UnsignedID, Items) ->
     end.
 
 unbundle(<<Count:256/integer, Content/binary>>) ->
-    {ItemsBin, Items} = decode_bundle_header(Count, Content),
+    {ItemsBin, Items, _HeaderSize} = decode_bundle_header(Count, Content),
     decode_bundle_items(Items, ItemsBin);
 unbundle(?DEFAULT_DATA) -> ?DEFAULT_DATA.
 
@@ -487,11 +488,19 @@ decode_bundle_items([{_ID, Size} | RestItems], ItemsBin) ->
         )
     ].
 
+decode_bundle_header(<<Count:256/integer, Content/binary>>) ->
+    decode_bundle_header(Count, Content).
+
-decode_bundle_header(Count, Bin) -> decode_bundle_header(Count, Bin, []).
-decode_bundle_header(0, ItemsBin, Header) ->
-    {ItemsBin, lists:reverse(Header)};
-decode_bundle_header(Count, <<Size:256/integer, ID:32/binary, Rest/binary>>, Header) ->
-    decode_bundle_header(Count - 1, Rest, [{ID, Size} | Header]).
+decode_bundle_header(Count, Bin) -> decode_bundle_header(Count, Bin, 32, []).
+decode_bundle_header(0, ItemsBin, HeaderSize, Header) ->
+    {ItemsBin, lists:reverse(Header), HeaderSize};
+decode_bundle_header(
+    Count,
+    <<Size:256/integer, ID:32/binary, Rest/binary>>,
+    HeaderSize,
+    Header
+) ->
+    decode_bundle_header(Count - 1, Rest, HeaderSize + 64, [{ID, Size} | Header]).
 
 %% @doc Decode the signature from a binary format. Only RSA 4096 is currently supported.
 %% Note: the signature type '1' corresponds to RSA 4096 - but it is is written in
diff --git a/src/ar_fork.erl b/src/ar_fork.erl
new file mode 100644
index 000000000..0209cae57
--- /dev/null
+++ b/src/ar_fork.erl
@@ -0,0 +1,173 @@
+%%%
+%%% @doc The module defines Arweave hard forks' heights.
+%%%
+
+-module(ar_fork).
+ +-export([height_1_6/0, height_1_7/0, height_1_8/0, height_1_9/0, height_2_0/0, height_2_2/0, + height_2_3/0, height_2_4/0, height_2_5/0, height_2_6/0, height_2_6_8/0, + height_2_7/0, height_2_7_1/0, height_2_7_2/0, + height_2_8/0, height_2_9/0]). + +-ifdef(FORKS_RESET). +height_1_6() -> + 0. +-else. +height_1_6() -> + 95000. +-endif. + +-ifdef(FORKS_RESET). +height_1_7() -> + 0. +-else. +height_1_7() -> + 235200. % Targeting 2019-07-08 UTC +-endif. + +-ifdef(FORKS_RESET). +height_1_8() -> + 0. +-else. +height_1_8() -> + 269510. % Targeting 2019-08-29 UTC +-endif. + +-ifdef(FORKS_RESET). +height_1_9() -> + 0. +-else. +height_1_9() -> + 315700. % Targeting 2019-11-04 UTC +-endif. + +-ifdef(FORKS_RESET). +height_2_0() -> + 0. +-else. +height_2_0() -> + 422250. % Targeting 2020-04-09 10:00 UTC +-endif. + +-ifdef(FORKS_RESET). +height_2_2() -> + 0. +-else. +height_2_2() -> + 552180. % Targeting 2020-10-21 13:00 UTC +-endif. + +-ifdef(FORKS_RESET). +height_2_3() -> + 0. +-else. +height_2_3() -> + 591140. % Targeting 2020-12-21 11:00 UTC +-endif. + +-ifdef(FORKS_RESET). +height_2_4() -> + 0. +-else. +height_2_4() -> + 633720. % Targeting 2021-02-24 11:50 UTC +-endif. + +-ifdef(FORKS_RESET). +height_2_5() -> + 0. +-else. +height_2_5() -> + 812970. +-endif. + +-ifdef(FORK_2_6_HEIGHT). +height_2_6() -> + ?FORK_2_6_HEIGHT. +-else. + -ifdef(FORKS_RESET). + height_2_6() -> + 0. + -else. + height_2_6() -> + 1132210. % Targeting 2023-03-06 14:00 UTC + -endif. +-endif. + +-ifdef(FORK_2_6_8_HEIGHT). +height_2_6_8() -> + ?FORK_2_6_8_HEIGHT. +-else. + -ifdef(FORKS_RESET). + height_2_6_8() -> + 0. + -else. + height_2_6_8() -> + 1189560. % Targeting 2023-05-30 16:00 UTC + -endif. +-endif. + +-ifdef(FORK_2_7_HEIGHT). +height_2_7() -> + ?FORK_2_7_HEIGHT. +-else. + -ifdef(FORKS_RESET). + height_2_7() -> + 0. + -else. + height_2_7() -> + 1275480. % Targeting 2023-10-04 14:00 UTC + -endif. +-endif. + +-ifdef(FORK_2_7_1_HEIGHT). +height_2_7_1() -> + ?FORK_2_7_1_HEIGHT. +-else. + -ifdef(FORKS_RESET). + height_2_7_1() -> + 0. + -else. + height_2_7_1() -> + 1316410. % Targeting 2023-12-05 14:00 UTC + -endif. +-endif. + +-ifdef(FORK_2_7_2_HEIGHT). +height_2_7_2() -> + ?FORK_2_7_2_HEIGHT. +-else. + -ifdef(FORKS_RESET). + height_2_7_2() -> + 0. + -else. + height_2_7_2() -> + 1391330. % Targeting 2024-03-26 14:00 UTC + -endif. +-endif. + +-ifdef(FORK_2_8_HEIGHT). +height_2_8() -> + ?FORK_2_8_HEIGHT. +-else. + -ifdef(FORKS_RESET). + height_2_8() -> + 0. + -else. + height_2_8() -> + 1547120. % Targeting 2024-11-13 14:00 UTC + -endif. +-endif. + +-ifdef(FORK_2_9_HEIGHT). +height_2_9() -> + ?FORK_2_9_HEIGHT. +-else. + -ifdef(FORKS_RESET). + height_2_9() -> + 0. + -else. + height_2_9() -> + 1602350. % Targeting 2025-02-03 14:00 UTC + -endif. +-endif. diff --git a/src/ar_poa.erl b/src/ar_poa.erl new file mode 100644 index 000000000..8a88d8e90 --- /dev/null +++ b/src/ar_poa.erl @@ -0,0 +1,18 @@ +%%% @doc This module implements all mechanisms required to validate a proof of access +%%% for a chunk of data received from the network. +-module(ar_poa). + +-export([get_padded_offset/1, get_padded_offset/2]). + +-include("include/ar.hrl"). + +%% @doc Return the smallest multiple of 256 KiB >= Offset +%% counting from ar_block:strict_data_split_threshold(). +get_padded_offset(Offset) -> + get_padded_offset(Offset, ar_block:strict_data_split_threshold()). + +%% @doc Return the smallest multiple of 256 KiB >= Offset +%% counting from StrictDataSplitThreshold. 
+get_padded_offset(Offset, StrictDataSplitThreshold) -> + Diff = Offset - StrictDataSplitThreshold, + StrictDataSplitThreshold + ((Diff - 1) div (?DATA_CHUNK_SIZE) + 1) * (?DATA_CHUNK_SIZE). diff --git a/src/ar_tx.erl b/src/ar_tx.erl index 2e444810b..aa3ae8bc1 100644 --- a/src/ar_tx.erl +++ b/src/ar_tx.erl @@ -5,7 +5,9 @@ -export([id/1, id/2, get_owner_address/1, data_root/1]). -export([generate_signature_data_segment/1, generate_chunk_id/1]). -export([json_struct_to_tx/1, tx_to_json_struct/1]). +-export([generate_chunk_tree/1, generate_chunk_tree/2]). -export([chunk_binary/2, chunks_to_size_tagged_chunks/1, sized_chunks_to_sized_chunk_ids/1]). +-export([get_weave_size_increase/2]). -include("include/hb.hrl"). -include_lib("eunit/include/eunit.hrl"). @@ -405,6 +407,22 @@ chunks_to_size_tagged_chunks(Chunks) -> sized_chunks_to_sized_chunk_ids(SizedChunks) -> [{generate_chunk_id(Chunk), Size} || {Chunk, Size} <- SizedChunks]. +%% @doc Return the number of bytes the weave is increased by when the given transaction +%% is included. +get_weave_size_increase(#tx{ data_size = DataSize }, Height) -> + get_weave_size_increase(DataSize, Height); + +get_weave_size_increase(0, _Height) -> + 0; +get_weave_size_increase(DataSize, Height) -> + case Height >= ar_fork:height_2_5() of + true -> + %% The smallest multiple of ?DATA_CHUNK_SIZE larger than or equal to data_size. + ar_poa:get_padded_offset(DataSize, 0); + false -> + DataSize + end. + %%%=================================================================== %%% Tests. %%%=================================================================== diff --git a/src/dev_arweave.erl b/src/dev_arweave.erl index 2090263f4..c7fe07d1b 100644 --- a/src/dev_arweave.erl +++ b/src/dev_arweave.erl @@ -9,6 +9,9 @@ -include("include/hb.hrl"). -include_lib("eunit/include/eunit.hrl"). +-define(IS_BLOCK_ID(X), (is_binary(X) andalso byte_size(X) == 64)). + + %% @doc Proxy the `/info' endpoint from the Arweave node. status(_Base, _Request, Opts) -> request(<<"GET">>, <<"/info">>, Opts). @@ -147,15 +150,23 @@ post_chunk(_Base, Request, Opts) -> get_chunk_range(_Base, Request, Opts) -> Offset = hb_util:int(hb_ao:get(<<"offset">>, Request, Opts)), + % get_chunk_range will attempt to query Length *or more* bytes, it will + % accumulate all the bytes that are returned by the gateway which in + % some cases will exceed the requested length. 
Length = hb_util:int(hb_ao:get(<<"length">>, Request, ?DATA_CHUNK_SIZE, Opts)), case get_chunk_range(Offset, Length, Opts, [], 0) of {ok, Chunks} -> Data = iolist_to_binary(Chunks), - Truncated = - case byte_size(Data) > Length of - true -> binary:part(Data, 0, Length); - false -> Data - end, + StartOffset = ar_block:get_chunk_padded_offset(Offset) - ?DATA_CHUNK_SIZE + 1, + StartGap = Offset - StartOffset, + TruncatedLength = min(Length, byte_size(Data) - StartGap), + ?event(debug_test, { + get_chunk_range, + {offset, Offset}, {length, Length}, {data_size, byte_size(Data)}, + {start_offset, StartOffset}, {start_gap, StartGap}, + {truncated_length, TruncatedLength} + }), + Truncated = binary:part(Data, StartGap, TruncatedLength), {ok, Truncated}; {error, Reason} -> {error, Reason} @@ -168,11 +179,11 @@ get_chunk_range(Offset, Length, Opts, Chunks, Size) -> case get_chunk(Offset, Opts) of {ok, Chunk} -> get_chunk_range( - Offset + byte_size(Chunk), + Offset + ?DATA_CHUNK_SIZE, Length, Opts, [Chunk | Chunks], - Size + byte_size(Chunk) + Size + ?DATA_CHUNK_SIZE ); {error, Reason} -> {error, Reason} @@ -181,6 +192,7 @@ get_chunk_range(Offset, Length, Opts, Chunks, Size) -> get_chunk(Offset, Opts) -> Path = <<"/chunk/", (hb_util:bin(Offset))/binary>>, Res = request(<<"GET">>, Path, Opts), + ?event(debug_test, {{path, Path}, {get_chunk_response, Res}}), case Res of {ok, JSON } -> Chunk = hb_util:decode(maps:get(<<"chunk">>, JSON)), @@ -207,10 +219,10 @@ block(Base, Request, Opts) -> case Block of <<"current">> -> current(Base, Request, Opts); not_found -> current(Base, Request, Opts); - ID when ?IS_ID(ID) -> block({id, ID}, Opts); + ID when ?IS_BLOCK_ID(ID) -> block({id, ID}, Opts); MaybeHeight -> try hb_util:int(MaybeHeight) of - Int -> block({height, Int}, Opts) + Int -> block({height, Int}, Opts) catch _:_ -> { @@ -628,7 +640,8 @@ get_partial_chunk_test() -> }, Opts ), - ?assertEqual(ExpectedData, Data), + ?assertEqual(?DATA_CHUNK_SIZE, byte_size(Data)), + ?assertEqual(ExpectedData, binary:part(Data, 0, ExpectedLength)), ok after hb_mock_server:stop(ServerHandle) @@ -657,31 +670,40 @@ get_full_chunk_test() -> end. get_multi_chunk_test() -> - Offset = 377813969707255, - ExpectedLength = 526685, - Chunk0 = crypto:strong_rand_bytes(?DATA_CHUNK_SIZE), - Chunk1 = crypto:strong_rand_bytes(132270), - Chunk2 = crypto:strong_rand_bytes(132271), - ChunkData = #{ - Offset => Chunk0, - Offset + ?DATA_CHUNK_SIZE => Chunk1, - Offset + ?DATA_CHUNK_SIZE + 132270 => Chunk2 - }, - {ServerHandle, Opts} = start_mock_gateway(ChunkData), - try - ExpectedData = binary:part( - iolist_to_binary([Chunk0, Chunk1, Chunk2]), 0, ExpectedLength), - {ok, Data} = hb_ao:resolve( - #{ <<"device">> => <<"arweave@2.9-pre">> }, - #{ - <<"path">> => <<"chunk">>, - <<"offset">> => Offset, - <<"length">> => ExpectedLength - }, - Opts - ), - ?assertEqual(ExpectedData, Data), - ok - after - hb_mock_server:stop(ServerHandle) - end. \ No newline at end of file + %% http://tip-1.arweave.xyz:1984/tx/QL7_EnmrFtx-0wVgPr2IwaGWQT8vmPcF3R20CKMO3D4/offset + %% + Offset = 378092137521399, + ExpectedLength = 1264430, + Opts = #{}, + {ok, Data} = hb_ao:resolve( + #{ <<"device">> => <<"arweave@2.9-pre">> }, + #{ + <<"path">> => <<"chunk">>, + <<"offset">> => Offset, + <<"length">> => ExpectedLength + }, + Opts + ), + ?event(debug_test, {{data, byte_size(Data)}, {expected_length, ExpectedLength}}), + % ?assertEqual(ExpectedData, Data), + ok. + +%% @doc Query a chunk range that starts and ends in the middle of a chunk. 
+get_mid_chunk_test() -> + %% http://tip-1.arweave.xyz:1984/tx/QL7_EnmrFtx-0wVgPr2IwaGWQT8vmPcF3R20CKMO3D4/offset + %% + Offset = 378092137521399 + 200_000, + ExpectedLength = ?DATA_CHUNK_SIZE + 300_000, + Opts = #{}, + {ok, Data} = hb_ao:resolve( + #{ <<"device">> => <<"arweave@2.9-pre">> }, + #{ + <<"path">> => <<"chunk">>, + <<"offset">> => Offset, + <<"length">> => ExpectedLength + }, + Opts + ), + ?event(debug_test, {{data, byte_size(Data)}, {expected_length, ExpectedLength}}), + % ?assertEqual(ExpectedData, Data), + ok. \ No newline at end of file diff --git a/src/dev_copycat_arweave.erl b/src/dev_copycat_arweave.erl index a476994b7..f64e85fd9 100644 --- a/src/dev_copycat_arweave.erl +++ b/src/dev_copycat_arweave.erl @@ -81,49 +81,159 @@ process_block(BlockRes, _Req, Current, To, Opts) -> end. %% @doc Index the IDs of all transactions in the block if configured to do so. -% maybe_index_ids(Block, Opts) -> -% case hb_opts:get(arweave_index_ids, false, Opts) of -% false -> ok; -% true -> -% IndexStore = hb_opts:get(arweave_index_store, no_store, Opts), -% BlockOffset = hb_maps:get(<<"weave_size">>, Block, 0, Opts), -% lists:foreach( -% fun(TXID) -> -% TX = -% hb_ao:get( -% << -% ?ARWEAVE_DEVICE/binary, -% "/tx=", -% (hb_util:bin(TXID))/binary -% >>, -% Opts -% ), -% TXOffset = hb_maps:get(<<"offset">>, TX, 0, Opts), -% case is_bundle_tx(TX, Opts) of -% false -> ok; -% true -> -% {ok, BundleIndex} = download_bundle_header(TXID, Opts), -% hb_maps:map( -% fun(ItemID, #{ <<"offset">> := BundleOffset, <<"length">> := Length}) -> -% Offset = hb_util:bin(BundleOffset + TXOffset + BlockOffset), -% hb_store_arweave:write_offset( -% IndexStore, -% ItemID, -% Offset, -% Length -% ) -% end, -% BundleIndex, -% Opts -% ) -% end -% end, -% hb_maps:get(<<"txs">>, Block, #{}, Opts), -% Opts -% ) -% end. -% ok. 
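+%% A worked example of the offset arithmetic below (numbers assumed, not from
+%% a real block): for a block with weave_size = 1_000_000 and
+%% block_size = 300_000, the block's data occupies absolute weave bytes
+%% 700_001..1_000_000, so BlockStartOffset = 700_000. A bundle whose
+%% size-tagged end offset within the block is 100_000 then ends at absolute
+%% offset 800_000, and its first data item begins immediately after the
+%% ANS-104 bundle header (32 bytes of item count plus 64 bytes per entry).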
+maybe_index_ids(Block, Opts) -> + case hb_opts:get(arweave_index_ids, false, Opts) of + false -> ok; + true -> + IndexStore = hb_opts:get(arweave_index_store, no_store, Opts), + BlockEndOffset = hb_util:int( + hb_maps:get(<<"weave_size">>, Block, 0, Opts)), + BlockSize = hb_util:int( + hb_maps:get(<<"block_size">>, Block, 0, Opts)), + BlockStartOffset = BlockEndOffset - BlockSize, + ?event(debug_test, { + {block_end_offset, BlockEndOffset}, + {block_size, BlockSize}, + {block_start_offset, BlockStartOffset} + }), + TXs = resolve_tx_headers(hb_maps:get(<<"txs">>, Block, [], Opts), Opts), + Height = hb_maps:get(<<"height">>, Block, 0, Opts), + TXsWithData = ar_block:generate_size_tagged_list_from_txs(TXs, Height), + lists:foreach(fun + ({{padding, _PaddingRoot}, _EndOffset}) -> + ok; + ({{TX, _TXDataRoot}, EndOffset}) -> + ?event(debug_test, { + {tx, TX}, + {end_offset, BlockStartOffset + EndOffset}, + {tx_size, TX#tx.data_size}, + {is_bundle_tx, is_bundle_tx(TX, Opts)} + }), + case is_bundle_tx(TX, Opts) of + false -> ok; + true -> + TXEndOffset = BlockStartOffset + EndOffset, + TXStartOffset = TXEndOffset - TX#tx.data_size, + hb_store_arweave:write_offset( + IndexStore, + hb_util:encode(TX#tx.id), + true, + TXEndOffset, + TX#tx.data_size + ), + {ok, {BundleIndex, HeaderSize}} = download_bundle_header( + TXEndOffset, TX#tx.data_size, Opts), + ?event(debug_test, {{bundle_index, BundleIndex}, {header_size, HeaderSize}}), + lists:foldl( + fun({ItemID, Size}, OffsetAcc) -> + ItemEndOffset = OffsetAcc + Size, + ?event(debug_test, { + {item_id, {explicit, hb_util:encode(ItemID)}}, + {item_end_offset, ItemEndOffset}, + {size, Size} + }), + hb_store_arweave:write_offset( + IndexStore, + hb_util:encode(ItemID), + false, + ItemEndOffset, + Size + ), + ItemEndOffset + end, + TXStartOffset + HeaderSize, + BundleIndex + ) + end + end, + TXsWithData + ), + ok + end. + +is_bundle_tx(TX, _Opts) -> + dev_arweave_common:type(TX) =/= binary. +download_bundle_header(EndOffset, Size, Opts) -> + StartOffset = EndOffset - Size + 1, + {ok, Chunk} = hb_ao:resolve( + #{ <<"device">> => <<"arweave@2.9-pre">> }, + #{ + <<"path">> => <<"chunk">>, + <<"offset">> => StartOffset + }, + Opts + ), + {_ItemsBin, BundleIndex, HeaderSize} = ar_bundles:decode_bundle_header(Chunk), + {ok, {BundleIndex, HeaderSize}}. + +resolve_tx_headers(TXIDs, Opts) -> + lists:map( + fun(TXID) -> + {ok, StructuredTXHeader} = + hb_ao:resolve( + #{ <<"device">> => <<"arweave@2.9-pre">> }, + #{ + <<"path">> => <<"tx">>, + <<"tx">> => TXID, + <<"exclude-data">> => true + }, + Opts + ), + hb_message:convert( + StructuredTXHeader, + <<"tx@1.0">>, + <<"structured@1.0">>, + Opts) + end, + TXIDs + ). %%% Tests +index_ids_test() -> + %% Test block: https://viewblock.io/arweave/block/1827942 + %% Note: this block includes a data item with an Ethereum signature. This + %% signature type is not yet (as of Jan 2026) supported by ar_bundles.erl, + %% however we should still be able to index it (we just can't deserialize + %% it). + Store = [hb_test_utils:test_store()], + StoreOpts = #{ <<"index-store">> => Store }, + Opts = #{ + arweave_index_ids => true, + arweave_index_store => StoreOpts + }, + {ok, Block} = hb_ao:resolve( + << + ?ARWEAVE_DEVICE/binary, + "/block=5Ya_2_jLshzNodGRVrBwlAhcEowIFgVbvOcN4j_MJI4QfodQ7Nd8ke7CMN9OnpK0" + >>, + Opts + ), + ?event(debug_test, {Block}), + ?assertEqual(ok, maybe_index_ids(Block, Opts)), + + % Bundles with unsupprted signatures should still be indexed, but when + % we go to desiarlize the data it will fail. 
+ % ?assertEqual({badmatch, unsupported_tx_format}, + % hb_store_arweave:read( + % StoreOpts, + % <<"kK67S13W_8jM9JUw2umVamo0zh9v1DeVxWrru2evNco">>)), + + % This is a single item bundle with nested items. The immediately + % bundled item is indexed, the nested items are not. + {ok, TX5} = hb_store_arweave:read(StoreOpts, <<"c2ATDuTgwKCcHpAFZqSt13NC-tA4hdA7Aa2xBPuOzoE">>), + ?assertEqual(<<"871231847">>, hb_maps:get(<<"reward">>, TX5, #{})), + ?assert(hb_message:verify(TX5, all, #{})), + TX5Item = hb_ao:get(<<"1">>, TX5, #{}), + TX5ItemID = hb_message:id(TX5Item, signed), + ?assertEqual(<<"OBKr-7UrmjxFD-h-qP-XLuvCgtyuO_IDpBMgIytvusA">>, TX5ItemID), + TX5ItemRead = hb_store_arweave:read(StoreOpts, <<"XJq09oboLC7Z5LQ0lsg2klHa3pkCW_H1kWeBnrMLmfc">>), + ?event(debug_test, {{id, TX5ItemID}, {read, TX5ItemRead}}), + ?assert(hb_message:verify(TX5ItemRead, all, #{})), + ?assertEqual(TX5Item, TX5ItemRead), + + + % ?assert(hb_message:verify(TX5Item, all, #{})), + % {error not_found} = hb_store_arweave:read(StoreOpts, hb_message:id(TX5Item, signed)), + ok. \ No newline at end of file diff --git a/src/hb_store_arweave.erl b/src/hb_store_arweave.erl index 937d16549..a3904dc04 100644 --- a/src/hb_store_arweave.erl +++ b/src/hb_store_arweave.erl @@ -10,33 +10,47 @@ -define(ARWEAVE_INDEX_PATH, <<"~arweave@2.9-pre/offset">>). -read(StoreOpts = #{ <<"arweave-index-store">> := IndexStore }, ID) -> +read(StoreOpts = #{ <<"index-store">> := IndexStore }, ID) -> case hb_store:read(IndexStore, path(ID)) of {ok, Binary} -> - [IsTX, Offset, Length] = binary:split(Binary, <<":">>, [global]), + % EndOffset and Size is recorded (rather than StartOffset and + % Length) to preserve consistency with the Arweave API's + % `/tx//offset` endpoint. + [IsTX, EndOffset, Size] = binary:split(Binary, <<":">>, [global]), ?event( debug_test, - {reading_offset, {is_tx, IsTX}, {offset, Offset}, {length, Length}} + {reading_offset, {is_tx, IsTX}, + {end_offset, EndOffset}, {size, Size}} ), case hb_util:bool(IsTX) of true -> - load_bundle(ID, Offset, Length, StoreOpts); + load_bundle(ID, + hb_util:int(EndOffset), hb_util:int(Size), StoreOpts); false -> - load_item(Offset, Length, StoreOpts) + load_item( + hb_util:int(EndOffset), hb_util:int(Size), StoreOpts) end; {error, not_found} -> {error, not_found} end. -load_item(Offset, Length, Opts) -> - case read_chunks(Offset, Length, Opts) of +load_item(EndOffset, Size, Opts) -> + case read_chunks(EndOffset, Size, Opts) of {ok, SerializedItem} -> - to_message(ar_bundles:deserialize(SerializedItem), Opts); + { + ok, + hb_message:convert( + ar_bundles:deserialize(SerializedItem), + <<"structured@1.0">>, + <<"ans104@1.0">>, + Opts + ) + }; {error, Reason} -> {error, Reason} end. -load_bundle(ID, Offset, Length, Opts) -> +load_bundle(ID, EndOffset, Size, Opts) -> {ok, StructuredTXHeader} = hb_ao:resolve( #{ <<"device">> => <<"arweave@2.9-pre">> }, #{ <<"path">> => <<"tx">>, <<"tx">> => ID, <<"exclude-data">> => true }, @@ -47,50 +61,49 @@ load_bundle(ID, Offset, Length, Opts) -> <<"tx@1.0">>, <<"structured@1.0">>, Opts), - case read_chunks(Offset, Length, Opts) of + case read_chunks(EndOffset, Size, Opts) of {ok, SerializedItem} -> - to_message( - ar_bundles:deserialize(TXHeader#tx{ data = SerializedItem }), - Opts); + { + ok, + hb_message:convert( + TXHeader#tx{ data = SerializedItem }, + <<"structured@1.0">>, + <<"tx@1.0">>, + Opts + ) + }; {error, Reason} -> {error, Reason} end. 
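+%% Index entries are flat binaries of the form <<"IsTX:EndOffset:Size">>. For
+%% example, <<"1:363524457284025:8387">> marks a top-level bundle (IsTX = 1)
+%% whose data ends at absolute weave offset 363524457284025 and spans 8387
+%% bytes; these are the values exercised by write_read_tx_test/0 below.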
-read_chunks(Offset, Length, Opts) ->
+read_chunks(EndOffset, Size, Opts) ->
+    StartOffset = EndOffset - Size + 1,
     hb_ao:resolve(
         #{ <<"device">> => <<"arweave@2.9-pre">> },
         #{
             <<"path">> => <<"chunk">>,
-            <<"offset">> => Offset,
-            <<"length">> => Length
+            <<"offset">> => StartOffset,
+            <<"length">> => Size
         },
         Opts
     ).
 
-to_message(TX, Opts) ->
-    {
-        ok,
-        hb_message:convert(
-            TX,
-            <<"structured@1.0">>,
-            <<"tx@1.0">>,
-            Opts
-        )
-    }.
 
+%% @doc When recording an item or bundle's offset we use its EndOffset and
+%% Size rather than StartOffset and Length - this is for consistency with
+%% the Arweave API's `/tx/<id>/offset` endpoint which returns a global
+%% end offset and size.
-write_offset(Store, ID, IsTX, Offset, Length) ->
+write_offset(#{ <<"index-store">> := IndexStore }, ID, IsTX, EndOffset, Size) ->
     IsTxInt = hb_util:bool_int(IsTX),
-    hb_store:write(
-        Store,
-        path(ID),
-        <<
-            (hb_util:bin(IsTxInt))/binary,
-            ":",
-            (hb_util:bin(Offset))/binary,
-            ":",
-            (hb_util:bin(Length))/binary
-        >>
-    ).
+    Value = <<
+        (hb_util:bin(IsTxInt))/binary,
+        ":",
+        (hb_util:bin(EndOffset))/binary,
+        ":",
+        (hb_util:bin(Size))/binary
+    >>,
+    ?event(debug_test, {value, Value}),
+    hb_store:write(IndexStore, path(ID), Value).
 
 path(ID) ->
     <<
@@ -106,12 +119,12 @@ write_read_tx_test() ->
     Store = [hb_test_utils:test_store()],
     ?event(debug_test, {store, Store}),
     Opts = #{
-        <<"arweave-index-store">> => Store
+        <<"index-store">> => Store
     },
     ID = <<"bndIwac23-s0K11TLC1N7z472sLGAkiOdhds87ZywoE">>,
-    Offset = 363524457275639,
-    Length = 8387,
-    ok = write_offset(Store, ID, true, Offset, Length),
+    EndOffset = 363524457284025,
+    Size = 8387,
+    ok = write_offset(Opts, ID, true, EndOffset, Size),
     {ok, Bundle} = read(Opts, ID),
     ?assert(hb_message:verify(Bundle, all, #{})),
     {ok, Child} =
@@ -134,3 +147,5 @@ write_read_tx_test() ->
     },
     ?assert(hb_message:match(ExpectedChild, Child, only_present)),
     ok.
+
+%% XXX TODO: write/read for data item
\ No newline at end of file
diff --git a/src/include/ar.hrl b/src/include/ar.hrl
index 6406b6f51..51c90feb3 100644
--- a/src/include/ar.hrl
+++ b/src/include/ar.hrl
@@ -113,3 +113,8 @@
 -define(BUNDLE_KEYS, [
     <<"bundle-format">>, <<"bundle-version">>,
     <<"bundle-map">>]).
+
+%% The threshold was determined on the mainnet at the 2.5 fork block. The chunks
+%% submitted after the threshold must adhere to stricter validation rules.
+%% This offset is about halfway through partition 8.
+-define(STRICT_DATA_SPLIT_THRESHOLD, 30_607_159_107_830).

From 512fd166948543a750ca9756853d1dece4b2edeb Mon Sep 17 00:00:00 2001
From: James Piechota
Date: Fri, 23 Jan 2026 16:08:30 -0500
Subject: [PATCH 08/29] fix: chunk up L1 tx data according to the arweave-js logic

---
 src/ar_format.erl   |   4 +
 src/ar_tx.erl       |  68 ++++++++++-
 src/dev_arweave.erl | 280 +++++++++++++++++++++++++++++++-------------
 3 files changed, 262 insertions(+), 90 deletions(-)

diff --git a/src/ar_format.erl b/src/ar_format.erl
index e32e18eb5..c08a99c91 100644
--- a/src/ar_format.erl
+++ b/src/ar_format.erl
@@ -125,6 +125,7 @@ format_fields(TX, Indent) ->
         format_anchor(TX, Indent) ++
         format_quantity(TX, Indent) ++
         format_reward(TX, Indent) ++
+        format_data_size(TX, Indent) ++
         format_data_root(TX, Indent).
 
 format_format(TX, Indent) ->
@@ -152,6 +153,9 @@ format_quantity(TX, Indent) ->
 format_reward(TX, Indent) ->
     format_line("Reward: ~p", [TX#tx.reward], Indent + 1).
 
+format_data_size(TX, Indent) ->
+    format_line("Data Size: ~p", [TX#tx.data_size], Indent + 1).
+
 format_data_root(TX, Indent) ->
     format_line("Data Root: ~s", [
         case TX#tx.data_root of
diff --git a/src/ar_tx.erl b/src/ar_tx.erl
index aa3ae8bc1..079a2044d 100644
--- a/src/ar_tx.erl
+++ b/src/ar_tx.erl
@@ -12,6 +12,9 @@
 -include("include/hb.hrl").
 -include_lib("eunit/include/eunit.hrl").
 
+%% Minimum chunk size targeted by the arweave-js chunking algorithm.
+-define(MIN_CHUNK_SIZE, (32 * 1024)).
+
 %%%===================================================================
 %%% Public interface.
 %%%===================================================================
@@ -90,7 +93,10 @@ get_owner_address(#tx{ owner_address = OwnerAddress }) ->
     OwnerAddress.
 
 data_root(Bin) ->
-    Chunks = chunk_binary(?DATA_CHUNK_SIZE, Bin),
+    data_root(arweavejs, Bin).
+
+data_root(Mode, Bin) ->
+    Chunks = chunk_binary(Mode, ?DATA_CHUNK_SIZE, Bin),
     SizeTaggedChunks = chunks_to_size_tagged_chunks(Chunks),
     SizeTaggedChunkIDs = sized_chunks_to_sized_chunk_ids(SizeTaggedChunks),
     {Root, _} = ar_merkle:generate_tree(SizeTaggedChunkIDs),
@@ -213,8 +219,8 @@ verify_hash(#tx{ id = ID } = TX) ->
 %% check to verify that `data_root`, `data_size`, and `data` are consistent.
 verify_v2_data(#tx{ format = 2, data = ?DEFAULT_DATA }) ->
     true;
-verify_v2_data(#tx{ 
-    format = 2, data_root = DataRoot, 
+verify_v2_data(#tx{
+    format = 2, data_root = DataRoot,
     data_size = DataSize, data = Data }) ->
     (DataSize == byte_size(Data)) andalso (DataRoot == data_root(Data));
 verify_v2_data(_) ->
@@ -380,11 +386,36 @@ generate_chunk_id(Chunk) ->
 %% @doc Split the binary into chunks. Used for computing the Merkle roots of
 %% v1 transactions' data and computing Merkle proofs for v2 transactions' when
 %% their data is uploaded without proofs.
-chunk_binary(ChunkSize, Bin) when byte_size(Bin) < ChunkSize ->
-    [Bin];
 chunk_binary(ChunkSize, Bin) ->
+    chunk_binary(arweavejs, ChunkSize, Bin).
+
+%% @doc Split the binary into chunks using the requested mode.
+%% legacy: fixed-size chunking with a smaller final chunk.
+%% arweavejs: size-balanced chunking where the last two chunks may be small.
+%% This is the chunking logic used by the arweave-js library.
+%% Adapted from: https://github.com/ArweaveTeam/arweave-js/blob/39d8ef2799a2c555e6f9b0cc6adabd7cbc411bc8/src/common/lib/merkle.ts#L43
+chunk_binary(legacy, ChunkSize, Bin) when byte_size(Bin) < ChunkSize ->
+    [Bin];
+chunk_binary(legacy, ChunkSize, Bin) ->
    <<ChunkBin:ChunkSize/binary, Rest/binary>> = Bin,
-    [ChunkBin | chunk_binary(ChunkSize, Rest)].
+    [ChunkBin | chunk_binary(legacy, ChunkSize, Rest)];
+chunk_binary(arweavejs, ChunkSize, Bin) ->
+    chunk_binary_arweavejs(arweavejs, ChunkSize, Bin, []).
+
+chunk_binary_arweavejs(arweavejs, ChunkSize, Bin, Acc)
+        when byte_size(Bin) >= ChunkSize ->
+    BinSize = byte_size(Bin),
+    NextChunkSize = BinSize - ChunkSize,
+    ChunkSize2 =
+        case NextChunkSize > 0 andalso NextChunkSize < ?MIN_CHUNK_SIZE of
+            true ->
+                (BinSize + 1) div 2;
+            false -> ChunkSize
+        end,
+    <<Chunk:ChunkSize2/binary, Rest/binary>> = Bin,
+    chunk_binary_arweavejs(arweavejs, ChunkSize, Rest, [Chunk | Acc]);
+chunk_binary_arweavejs(arweavejs, _ChunkSize, Bin, Acc) ->
+    lists:reverse([Bin | Acc]).
 
 %% @doc Assign a byte offset to every chunk in the list.
 chunks_to_size_tagged_chunks(Chunks) ->
@@ -438,6 +469,31 @@ chunks_to_size_tagged_chunks(Chunks) ->
 new(Data, Reward) ->
         data_size = byte_size(Data)
     }.
 
+chunk_binary_legacy_test() ->
+    ChunkSize = 10,
+    Data = binary:copy(<<"a">>, 25),
+    ChunksLegacy = chunk_binary(legacy, ChunkSize, Data),
+    ?assertEqual([10, 10, 5], [byte_size(Chunk) || Chunk <- ChunksLegacy]),
+    ?assertEqual(ChunksLegacy, chunk_binary(legacy, ChunkSize, Data)).
+ +chunk_binary_arweavejs_balanced_test() -> + ChunkSize = ?DATA_CHUNK_SIZE, + MinChunkSize = ?MIN_CHUNK_SIZE, + DataSize = ChunkSize + MinChunkSize - 1, + Data = binary:copy(<<"b">>, DataSize), + Chunks = chunk_binary(arweavejs, ChunkSize, Data), + ExpectedFirst = (DataSize + 1) div 2, + ExpectedSecond = DataSize - ExpectedFirst, + ?assertEqual([ExpectedFirst, ExpectedSecond], [byte_size(Chunk) || Chunk <- Chunks]). + +chunk_binary_arweavejs_standard_test() -> + ChunkSize = ?DATA_CHUNK_SIZE, + MinChunkSize = ?MIN_CHUNK_SIZE, + DataSize = ChunkSize + MinChunkSize, + Data = binary:copy(<<"c">>, DataSize), + Chunks = chunk_binary(arweavejs, ChunkSize, Data), + ?assertEqual([ChunkSize, MinChunkSize], [byte_size(Chunk) || Chunk <- Chunks]). + sign_tx_test_() -> {timeout, 30, fun test_sign_tx/0}. diff --git a/src/dev_arweave.erl b/src/dev_arweave.erl index c7fe07d1b..0c2159eab 100644 --- a/src/dev_arweave.erl +++ b/src/dev_arweave.erl @@ -150,40 +150,50 @@ post_chunk(_Base, Request, Opts) -> get_chunk_range(_Base, Request, Opts) -> Offset = hb_util:int(hb_ao:get(<<"offset">>, Request, Opts)), - % get_chunk_range will attempt to query Length *or more* bytes, it will - % accumulate all the bytes that are returned by the gateway which in - % some cases will exceed the requested length. - Length = hb_util:int(hb_ao:get(<<"length">>, Request, ?DATA_CHUNK_SIZE, Opts)), + % Default to 1 to ensure a single chunk is read. + Length = hb_util:int(hb_ao:get(<<"length">>, Request, 1, Opts)), case get_chunk_range(Offset, Length, Opts, [], 0) of {ok, Chunks} -> Data = iolist_to_binary(Chunks), - StartOffset = ar_block:get_chunk_padded_offset(Offset) - ?DATA_CHUNK_SIZE + 1, - StartGap = Offset - StartOffset, - TruncatedLength = min(Length, byte_size(Data) - StartGap), - ?event(debug_test, { - get_chunk_range, - {offset, Offset}, {length, Length}, {data_size, byte_size(Data)}, - {start_offset, StartOffset}, {start_gap, StartGap}, - {truncated_length, TruncatedLength} - }), - Truncated = binary:part(Data, StartGap, TruncatedLength), - {ok, Truncated}; + % When no `length` is specified, we'll read only a single chunk. + % If `length` is specified, we'll read until we've accumulated + % `length` or more bytes, and then truncate down to `length` bytes. + case hb_maps:is_key(<<"length">>, Request, Opts) of + true -> + {ok, binary:part(Data, 0, Length)}; + false -> + {ok, Data} + end; {error, Reason} -> {error, Reason} end. 
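+%% Usage sketch (offset assumed): the tests below exercise this entrypoint as
+%%   {ok, Bytes} = hb_ao:resolve(
+%%       #{ <<"device">> => <<"arweave@2.9-pre">> },
+%%       #{ <<"path">> => <<"chunk">>,
+%%          <<"offset">> => Offset, <<"length">> => 1000 },
+%%       Opts
+%%   ),
+%% which returns exactly 1000 bytes starting at the requested absolute offset;
+%% omitting `length' returns the containing chunk from `offset' to its end.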
get_chunk_range(_Offset, Length, _Opts, Chunks, Size) when Size >= Length -> + ?event(debug_test, {end_get_chunk_range, {offset, _Offset}, {length, Length}, {size, Size}}), {ok, lists:reverse(Chunks)}; get_chunk_range(Offset, Length, Opts, Chunks, Size) -> + ?event(debug_test, {get_chunk_range, {offset, Offset}, {length, Length}, {size, Size}}), case get_chunk(Offset, Opts) of - {ok, Chunk} -> + {ok, JSON} -> + Chunk = hb_util:decode(maps:get(<<"chunk">>, JSON)), + ChunkSize = byte_size(Chunk), + AbsoluteEndOffset = hb_util:int(maps:get(<<"absolute_end_offset">>, JSON)), + AbsoluteStartOffset = AbsoluteEndOffset - ChunkSize + 1, + StartGap = Offset - AbsoluteStartOffset, + TruncatedLength = ChunkSize - StartGap, + ?event(debug_test, { + {absolute_end_offset, AbsoluteEndOffset}, + {absolute_start_offset, AbsoluteStartOffset}, + {chunk_size, ChunkSize}, {offset, Offset}, + {chunk_hash, hb_util:encode(crypto:hash(sha256, Chunk))}}), + SlicedChunk = binary:part(Chunk, StartGap, TruncatedLength), get_chunk_range( - Offset + ?DATA_CHUNK_SIZE, + AbsoluteEndOffset + 1, Length, Opts, - [Chunk | Chunks], - Size + ?DATA_CHUNK_SIZE + [SlicedChunk | Chunks], + Size + byte_size(SlicedChunk) ); {error, Reason} -> {error, Reason} @@ -191,15 +201,7 @@ get_chunk_range(Offset, Length, Opts, Chunks, Size) -> get_chunk(Offset, Opts) -> Path = <<"/chunk/", (hb_util:bin(Offset))/binary>>, - Res = request(<<"GET">>, Path, Opts), - ?event(debug_test, {{path, Path}, {get_chunk_response, Res}}), - case Res of - {ok, JSON } -> - Chunk = hb_util:decode(maps:get(<<"chunk">>, JSON)), - {ok, Chunk}; - {error, Reason} -> - {error, Reason} - end. + request(<<"GET">>, Path, Opts). %% @doc Retrieve (and cache) block information from Arweave. If the `block' key %% is present, it is used to look up the associated block. If it is of Arweave @@ -503,6 +505,36 @@ get_tx_basic_data_test() -> ?assert(hb_message:match(ExpectedMsg, StructuredWithHash, only_present)), ok. +%% @doc The data for this transaction ends with two smaller chunks. +get_tx_split_chunk_test() -> + {ok, Structured} = hb_ao:resolve( + #{ <<"device">> => <<"arweave@2.9-pre">> }, + #{ + <<"path">> => <<"tx">>, + <<"tx">> => <<"T2pluNnaavL7-S2GkO_m3pASLUqMH_XQ9IiIhZKfySs">>, + <<"exclude-data">> => false + }, + #{} + ), + ?assert(hb_message:verify(Structured, all, #{})), + ?assertEqual( + <<"T2pluNnaavL7-S2GkO_m3pASLUqMH_XQ9IiIhZKfySs">>, + hb_message:id(Structured, signed)), + ExpectedMsg = #{ + <<"reward">> => <<"6035386935">>, + <<"anchor">> => <<"PX16-598IrIMvLxFkvfNTWLVKXqXSmArOdW3o7X8jWMCH1fiNOjBZ2XjQlw0FOme">>, + <<"Contract">> => <<"KTzTXT_ANmF84fWEKHzWURD1LWd9QaFR9yfYUwH2Lxw">> + }, + ?assert(hb_message:match(ExpectedMsg, Structured, only_present)), + + Child = hb_ao:get(<<"1/2">>, Structured), + ?assert(hb_message:verify(Child, all, #{})), + ?event(debug_test, {child, {explicit, hb_message:id(Child, signed)}}), + ?assertEqual( + <<"8aJrRWtHcJvJ61qsH6agGkemzrtLw3W22xFrpCGAnTM">>, + hb_message:id(Child, signed)), + ok. + get_tx_basic_data_exclude_data_test() -> {ok, Structured} = hb_ao:resolve( #{ <<"device">> => <<"arweave@2.9-pre">> }, @@ -618,62 +650,53 @@ serialize_data_item_test_disabled() -> ?assert(ar_bundles:verify_item(VerifiedItem)), ok. -%% Note: the following tests use a mock gateway since https://arweave.net/chunk -%% is not reliable. The current implementation has it delegating to a random -%% backing node, and depending on which packing node is selected the request -%% may return valid data or 404. 
To make the tests reliable, we instead use -%% a mock server. -get_partial_chunk_test() -> - Offset = 377813969707255, +get_partial_chunk_post_split_test() -> + %% https://arweave.net/tx/QL7_EnmrFtx-0wVgPr2IwaGWQT8vmPcF3R20CKMO3D4/offset + %% + Offset = 378092137521399, ExpectedLength = 1000, - Chunk0 = crypto:strong_rand_bytes(?DATA_CHUNK_SIZE), - ChunkData = #{ Offset => Chunk0 }, - {ServerHandle, Opts} = start_mock_gateway(ChunkData), - try - ExpectedData = binary:part(Chunk0, 0, ExpectedLength), - {ok, Data} = hb_ao:resolve( - #{ <<"device">> => <<"arweave@2.9-pre">> }, - #{ - <<"path">> => <<"chunk">>, - <<"offset">> => Offset, - <<"length">> => ExpectedLength - }, - Opts - ), - ?assertEqual(?DATA_CHUNK_SIZE, byte_size(Data)), - ?assertEqual(ExpectedData, binary:part(Data, 0, ExpectedLength)), - ok - after - hb_mock_server:stop(ServerHandle) - end. + Opts = #{}, + {ok, Data} = hb_ao:resolve( + #{ <<"device">> => <<"arweave@2.9-pre">> }, + #{ + <<"path">> => <<"chunk">>, + <<"offset">> => Offset, + <<"length">> => ExpectedLength + }, + Opts + ), + ?assertEqual( + <<"G62E7qonT1RBmkC6e3pNJz_thpS9xkVD3qTJAk6o3Uc">>, + hb_util:encode(crypto:hash(sha256, Data)) + ), + ok. -get_full_chunk_test() -> - Offset = 377813969707255, +get_full_chunk_post_split_test() -> + %% https://arweave.net/tx/QL7_EnmrFtx-0wVgPr2IwaGWQT8vmPcF3R20CKMO3D4/offset + %% + Offset = 378092137521399, ExpectedLength = ?DATA_CHUNK_SIZE, - Chunk0 = crypto:strong_rand_bytes(?DATA_CHUNK_SIZE), - ChunkData = #{ Offset => Chunk0 }, - {ServerHandle, Opts} = start_mock_gateway(ChunkData), - try - {ok, Data} = hb_ao:resolve( - #{ <<"device">> => <<"arweave@2.9-pre">> }, - #{ - <<"path">> => <<"chunk">>, - <<"offset">> => Offset, - <<"length">> => ExpectedLength - }, - Opts - ), - ?assertEqual(Chunk0, Data), - ok - after - hb_mock_server:stop(ServerHandle) - end. + Opts = #{}, + {ok, Data} = hb_ao:resolve( + #{ <<"device">> => <<"arweave@2.9-pre">> }, + #{ + <<"path">> => <<"chunk">>, + <<"offset">> => Offset, + <<"length">> => ExpectedLength + }, + Opts + ), + ?assertEqual( + <<"LyTBdUe0rNmpqt8C-p7HksdiredXaa0wCBAPt3504W0">>, + hb_util:encode(crypto:hash(sha256, Data)) + ), + ok. -get_multi_chunk_test() -> - %% http://tip-1.arweave.xyz:1984/tx/QL7_EnmrFtx-0wVgPr2IwaGWQT8vmPcF3R20CKMO3D4/offset +get_multi_chunk_post_split_test() -> + %% https://arweave.net/tx/QL7_EnmrFtx-0wVgPr2IwaGWQT8vmPcF3R20CKMO3D4/offset %% Offset = 378092137521399, - ExpectedLength = 1264430, + ExpectedLength = ?DATA_CHUNK_SIZE * 3, Opts = #{}, {ok, Data} = hb_ao:resolve( #{ <<"device">> => <<"arweave@2.9-pre">> }, @@ -684,13 +707,16 @@ get_multi_chunk_test() -> }, Opts ), - ?event(debug_test, {{data, byte_size(Data)}, {expected_length, ExpectedLength}}), - % ?assertEqual(ExpectedData, Data), + ?assertEqual( + <<"4Cb_N0z0tMDwCiWrUbuzktfn-H6NLHT1btXGDo3CByI">>, + hb_util:encode(crypto:hash(sha256, Data)) + ), ok. + %% @doc Query a chunk range that starts and ends in the middle of a chunk. 
-get_mid_chunk_test() -> - %% http://tip-1.arweave.xyz:1984/tx/QL7_EnmrFtx-0wVgPr2IwaGWQT8vmPcF3R20CKMO3D4/offset +get_mid_chunk_post_split_test() -> + %% https://arweave.net/tx/QL7_EnmrFtx-0wVgPr2IwaGWQT8vmPcF3R20CKMO3D4/offset %% Offset = 378092137521399 + 200_000, ExpectedLength = ?DATA_CHUNK_SIZE + 300_000, @@ -704,6 +730,92 @@ get_mid_chunk_test() -> }, Opts ), - ?event(debug_test, {{data, byte_size(Data)}, {expected_length, ExpectedLength}}), - % ?assertEqual(ExpectedData, Data), + ?assertEqual( + <<"xkEZpGqDiCVuVZfGVyscmfYNZqYmgBLjOrMD2P_SfWs">>, + hb_util:encode(crypto:hash(sha256, Data)) + ), + ok. + +get_partial_chunk_pre_split_test() -> + %% https://arweave.net/tx/v4ophPvV-cNp5gkpkjMuUZ-lf-fBfm1Wk-pB4vJb00E/offset + %% + Offset = 30575701172109, + ExpectedLength = 1000, + Opts = #{}, + {ok, Data} = hb_ao:resolve( + #{ <<"device">> => <<"arweave@2.9-pre">> }, + #{ + <<"path">> => <<"chunk">>, + <<"offset">> => Offset, + <<"length">> => ExpectedLength + }, + Opts + ), + ?assertEqual( + <<"yU5tZyDCTZ4MFcT6lng74tvx1oIbPkpCw1VAJsSqeuo">>, + hb_util:encode(crypto:hash(sha256, Data)) + ), + ok. + +get_full_chunk_pre_split_test() -> + %% https://arweave.net/tx/v4ophPvV-cNp5gkpkjMuUZ-lf-fBfm1Wk-pB4vJb00E/offset + %% + Offset = 30575701172109, + ExpectedLength = ?DATA_CHUNK_SIZE, + Opts = #{}, + {ok, Data} = hb_ao:resolve( + #{ <<"device">> => <<"arweave@2.9-pre">> }, + #{ + <<"path">> => <<"chunk">>, + <<"offset">> => Offset, + <<"length">> => ExpectedLength + }, + Opts + ), + ?assertEqual( + <<"nVCvjEq9T5nxIR6jvglNbX1_CYCg0WifxfQoXhS4gik">>, + hb_util:encode(crypto:hash(sha256, Data)) + ), + ok. + +get_multi_chunk_pre_split_test() -> + %% https://arweave.net/tx/v4ophPvV-cNp5gkpkjMuUZ-lf-fBfm1Wk-pB4vJb00E/offset + %% + Offset = 30575701172109, + ExpectedLength = ?DATA_CHUNK_SIZE * 3, + Opts = #{}, + {ok, Data} = hb_ao:resolve( + #{ <<"device">> => <<"arweave@2.9-pre">> }, + #{ + <<"path">> => <<"chunk">>, + <<"offset">> => Offset, + <<"length">> => ExpectedLength + }, + Opts + ), + ?assertEqual( + <<"DfS3jtLXqG3zO_IFA3P-r55SUBoeJmeIh4Eim2Rldeo">>, + hb_util:encode(crypto:hash(sha256, Data)) + ), + ok. + +get_mid_chunk_pre_split_test() -> + %% https://arweave.net/tx/v4ophPvV-cNp5gkpkjMuUZ-lf-fBfm1Wk-pB4vJb00E/offset + %% + Offset = 30575701172109 + 200_000, + ExpectedLength = ?DATA_CHUNK_SIZE + 300_000, + Opts = #{}, + {ok, Data} = hb_ao:resolve( + #{ <<"device">> => <<"arweave@2.9-pre">> }, + #{ + <<"path">> => <<"chunk">>, + <<"offset">> => Offset, + <<"length">> => ExpectedLength + }, + Opts + ), + ?assertEqual( + <<"mgSfqsNapn_BXpbnIHtdeu3rQyvrjBaS0c7rEbUbtBU">>, + hb_util:encode(crypto:hash(sha256, Data)) + ), ok. \ No newline at end of file From 6ce7623132d0827f3bba735dda54ab5cb2e209ce Mon Sep 17 00:00:00 2001 From: James Piechota Date: Fri, 23 Jan 2026 16:09:42 -0500 Subject: [PATCH 09/29] fix: use neo-arweave.zephyrdev.xyz as the gateway for GET /chunk requests neo-arweave has a roundrobin scheme where it will try several nodes looking for a chunk. arweave.net delegates to a single node regardless of whether or not it has the chunk - this can yield unreliable results (same query sometimes returns data sometimes 404s) --- src/hb_opts.erl | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/hb_opts.erl b/src/hb_opts.erl index 60e665a04..b334bb7e5 100644 --- a/src/hb_opts.erl +++ b/src/hb_opts.erl @@ -287,6 +287,16 @@ default_message() -> } ] }, + #{ + % Routes for Arweave transaction requests to use a remote gateway. 
+ <<"template">> => <<"/chunk">>, + <<"node">> => + #{ + <<"match">> => <<"^/arweave">>, + <<"with">> => <<"https://neo-arweave.zephyrdev.xyz">>, + <<"opts">> => #{ http_client => httpc, protocol => http2 } + } + }, #{ % Routes for Arweave transaction requests to use a remote gateway. <<"template">> => <<"/arweave">>, From c629125f60a1f3fc684d1d266004fcd9f7598071 Mon Sep 17 00:00:00 2001 From: James Piechota Date: Fri, 23 Jan 2026 16:51:26 -0500 Subject: [PATCH 10/29] fix: use legacy chunking mode for format=1 transactions --- src/ar_tx.erl | 15 ++++++++++++--- src/dev_arweave_common.erl | 5 ++++- src/dev_bundler_dispatch.erl | 3 ++- src/dev_codec_tx.erl | 6 +++--- 4 files changed, 21 insertions(+), 8 deletions(-) diff --git a/src/ar_tx.erl b/src/ar_tx.erl index 079a2044d..5fc7b03b0 100644 --- a/src/ar_tx.erl +++ b/src/ar_tx.erl @@ -2,11 +2,12 @@ -module(ar_tx). -export([sign/2, verify/1, verify_tx_id/2]). --export([id/1, id/2, get_owner_address/1, data_root/1]). +-export([id/1, id/2, get_owner_address/1, data_root/1, data_root/2]). -export([generate_signature_data_segment/1, generate_chunk_id/1]). -export([json_struct_to_tx/1, tx_to_json_struct/1]). -export([generate_chunk_tree/1, generate_chunk_tree/2]). --export([chunk_binary/2, chunks_to_size_tagged_chunks/1, sized_chunks_to_sized_chunk_ids/1]). +-export([chunk_binary/2, chunk_binary/3, chunking_mode/1]). +-export([chunks_to_size_tagged_chunks/1, sized_chunks_to_sized_chunk_ids/1]). -export([get_weave_size_increase/2]). -include("include/hb.hrl"). @@ -367,10 +368,11 @@ tx_to_json_struct( %% Used to compute the Merkle roots of v1 transactions' data and to compute %% Merkle proofs for v2 transactions when their data is uploaded without proofs. generate_chunk_tree(TX) -> + Mode = chunking_mode(TX#tx.format), generate_chunk_tree(TX, sized_chunks_to_sized_chunk_ids( chunks_to_size_tagged_chunks( - chunk_binary(?DATA_CHUNK_SIZE, TX#tx.data) + chunk_binary(Mode, ?DATA_CHUNK_SIZE, TX#tx.data) ) ) ). @@ -389,6 +391,13 @@ generate_chunk_id(Chunk) -> chunk_binary(ChunkSize, Bin) -> chunk_binary(arweavejs, ChunkSize, Bin). +chunking_mode(1) -> + legacy; +chunking_mode(2) -> + arweavejs; +chunking_mode(_) -> + legacy. + %% @doc Split the binary into chunks using the requested mode. %% legacy: fixed-size chunking with a smaller final chunk. %% arweavejs: size-balanced chunking where the last two chunks may be small. diff --git a/src/dev_arweave_common.erl b/src/dev_arweave_common.erl index 7ae54889d..1e984ec20 100644 --- a/src/dev_arweave_common.erl +++ b/src/dev_arweave_common.erl @@ -189,9 +189,12 @@ reset_owner_address(TX) -> TX#tx{owner_address = ar_tx:get_owner_address(TX)}. +normalize_data_root(Item = #tx{data = Bin, format = 1}) + when is_binary(Bin) andalso Bin =/= ?DEFAULT_DATA -> + Item#tx{data_root = ar_tx:data_root(legacy, Bin)}; normalize_data_root(Item = #tx{data = Bin, format = 2}) when is_binary(Bin) andalso Bin =/= ?DEFAULT_DATA -> - Item#tx{data_root = ar_tx:data_root(Bin)}; + Item#tx{data_root = ar_tx:data_root(arweavejs, Bin)}; normalize_data_root(Item) -> Item. 
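+%% Worked example of the mode difference (sizes assumed): a 286_720 byte
+%% (280 KiB) payload splits as
+%%   ar_tx:chunk_binary(legacy,    ?DATA_CHUNK_SIZE, Bin) -> [262144, 24576]
+%%   ar_tx:chunk_binary(arweavejs, ?DATA_CHUNK_SIZE, Bin) -> [143360, 143360]
+%% because the 24 KiB tail falls below ?MIN_CHUNK_SIZE and triggers the
+%% balanced split. The two modes therefore yield different data roots for the
+%% same bytes, which is presumably why format=1 transactions keep the legacy
+%% split: it reproduces their original data roots.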
diff --git a/src/dev_bundler_dispatch.erl b/src/dev_bundler_dispatch.erl index 078067827..d1f290566 100644 --- a/src/dev_bundler_dispatch.erl +++ b/src/dev_bundler_dispatch.erl @@ -453,7 +453,8 @@ execute_task(#task{type = build_proofs, data = CommittedTX, opts = Opts} = Task) Data = TX#tx.data, DataRoot = TX#tx.data_root, DataSize = TX#tx.data_size, - Chunks = ar_tx:chunk_binary(?DATA_CHUNK_SIZE, Data), + Mode = ar_tx:chunking_mode(TX#tx.format), + Chunks = ar_tx:chunk_binary(Mode, ?DATA_CHUNK_SIZE, Data), SizeTaggedChunks = ar_tx:chunks_to_size_tagged_chunks(Chunks), SizeTaggedChunkIDs = ar_tx:sized_chunks_to_sized_chunk_ids(SizeTaggedChunks), {_Root, DataTree} = ar_merkle:generate_tree(SizeTaggedChunkIDs), diff --git a/src/dev_codec_tx.erl b/src/dev_codec_tx.erl index 1a0cf51c5..d68f76681 100644 --- a/src/dev_codec_tx.erl +++ b/src/dev_codec_tx.erl @@ -343,7 +343,7 @@ happy_tx_test() -> quantity = 1000, data = Data, data_size = byte_size(Data), - data_root = ar_tx:data_root(Data), + data_root = ar_tx:data_root(arweavejs, Data), reward = 2000 }, UnsignedTABM = #{ @@ -375,7 +375,7 @@ data_header_but_no_data_test() -> Anchor = crypto:strong_rand_bytes(32), Target = crypto:strong_rand_bytes(32), Data = <<"test-data">>, - DataRoot = ar_tx:data_root(Data), + DataRoot = ar_tx:data_root(arweavejs, Data), DataSize = byte_size(Data), UnsignedTX = #tx{ format = 2, @@ -556,7 +556,7 @@ ao_data_key_test() -> ], data = Data, data_size = byte_size(Data), - data_root = ar_tx:data_root(Data) + data_root = ar_tx:data_root(arweavejs, Data) }, SignedCommitment = #{ <<"commitment-device">> => <<"tx@1.0">>, From bfce13b9b05a42f9d47c109e30d6bab2428f0745 Mon Sep 17 00:00:00 2001 From: James Piechota Date: Sat, 24 Jan 2026 21:49:33 -0500 Subject: [PATCH 11/29] fix: add full block index test to dev_copycat_arweave --- src/dev_arweave.erl | 38 --------------- src/dev_copycat_arweave.erl | 96 ++++++++++++++++++++++++++++--------- src/hb_store_arweave.erl | 12 +++-- 3 files changed, 82 insertions(+), 64 deletions(-) diff --git a/src/dev_arweave.erl b/src/dev_arweave.erl index 0c2159eab..2aefada0b 100644 --- a/src/dev_arweave.erl +++ b/src/dev_arweave.erl @@ -170,10 +170,8 @@ get_chunk_range(_Base, Request, Opts) -> get_chunk_range(_Offset, Length, _Opts, Chunks, Size) when Size >= Length -> - ?event(debug_test, {end_get_chunk_range, {offset, _Offset}, {length, Length}, {size, Size}}), {ok, lists:reverse(Chunks)}; get_chunk_range(Offset, Length, Opts, Chunks, Size) -> - ?event(debug_test, {get_chunk_range, {offset, Offset}, {length, Length}, {size, Size}}), case get_chunk(Offset, Opts) of {ok, JSON} -> Chunk = hb_util:decode(maps:get(<<"chunk">>, JSON)), @@ -182,11 +180,6 @@ get_chunk_range(Offset, Length, Opts, Chunks, Size) -> AbsoluteStartOffset = AbsoluteEndOffset - ChunkSize + 1, StartGap = Offset - AbsoluteStartOffset, TruncatedLength = ChunkSize - StartGap, - ?event(debug_test, { - {absolute_end_offset, AbsoluteEndOffset}, - {absolute_start_offset, AbsoluteStartOffset}, - {chunk_size, ChunkSize}, {offset, Offset}, - {chunk_hash, hb_util:encode(crypto:hash(sha256, Chunk))}}), SlicedChunk = binary:part(Chunk, StartGap, TruncatedLength), get_chunk_range( AbsoluteEndOffset + 1, @@ -402,37 +395,6 @@ to_message(Path, {ok, #{ <<"body">> := Body }}, Opts) -> %%% Tests -start_mock_gateway(ChunkData) -> - Endpoints = [ - {"/chunk/:offset", chunk, - fun(Req) -> - Path = maps:get(<<"path">>, Req, <<>>), - Segments = binary:split(Path, <<"/">>, [global, trim_all]), - OffsetValue = hb_util:int(lists:last(Segments)), - case 
maps:get(OffsetValue, ChunkData, undefined) of - undefined -> - {404, <<"not found">>}; - Data -> - Body = hb_json:encode(#{ <<"chunk">> => hb_util:encode(Data) }), - {200, Body} - end - end} - ], - {ok, MockServer, ServerHandle} = hb_mock_server:start(Endpoints), - Opts = #{ - routes => [ - #{ - <<"template">> => <<"/arweave">>, - <<"node">> => #{ - <<"match">> => <<"^/arweave">>, - <<"with">> => MockServer, - <<"opts">> => #{ http_client => httpc, protocol => http2 } - } - } - ] - }, - {ServerHandle, Opts}. - post_ans104_tx_test() -> ServerOpts = #{ store => [hb_test_utils:test_store()] }, Server = hb_http_server:start_node(ServerOpts), diff --git a/src/dev_copycat_arweave.erl b/src/dev_copycat_arweave.erl index f64e85fd9..4fa8a49e0 100644 --- a/src/dev_copycat_arweave.erl +++ b/src/dev_copycat_arweave.erl @@ -212,28 +212,80 @@ index_ids_test() -> ), ?event(debug_test, {Block}), ?assertEqual(ok, maybe_index_ids(Block, Opts)), + % WbRAQbeyjPHgopBKyi0PLeKWvYZr3rgZvQ7QY3ASJS4 is a bundle signed with + % an Ethereum signature which is not supported by HB as of Jan 2026. + % The bundle should be indexed even though we can't deserialized the + % bundle itself. + ?assertException( + error, + {badmatch, unsupported_tx_format}, + hb_store_arweave:read( + StoreOpts, + <<"WbRAQbeyjPHgopBKyi0PLeKWvYZr3rgZvQ7QY3ASJS4">>) + ), + % These 3 items are within the WbRAQbeyjPHgopBKyi0PLeKWvYZr3rgZvQ7QY3ASJS4 + % bundle. + assert_item_read(StoreOpts, + <<"ATi9pQF_eqb99UK84R5rq8lGfRGpilVQOYyth7rXxh8">>), + assert_item_read(StoreOpts, + <<"4VSfUbhMVZQHW5VfVwQZOmC5fR3W21DZgFCyz8CA-cE">>), + assert_item_read(StoreOpts, + <<"ZQRHZhktk6dAtX9BlhO1teOtVlGHoyaWP25kAlhxrM4">>), + % The T2pluNnaavL7-S2GkO_m3pASLUqMH_XQ9IiIhZKfySs can be deserialized so + % we'll verify that some of its items were index and match the version + % in the deserialized bundle. + assert_bundle_read( + StoreOpts, + <<"T2pluNnaavL7-S2GkO_m3pASLUqMH_XQ9IiIhZKfySs">>, + [ + {<<"54K1ehEIKZxGSusgZzgbGYaHfllwWQ09-S9-eRUJg5Y">>, <<"1">>}, + {<<"MgatoEjlO_YtdbxFi9Q7Hxbs0YQVcChddhSS7FsdeIg">>, <<"19">>}, + {<<"z-oKJfhMq5qoVFrljEfiBKgumaJmCWVxNJaavR5aPE8">>, <<"26">>} + ] + ), + % Non-ans104 transaction in the block should not be indexed. + ?assertEqual({error, not_found}, + hb_store_arweave:read(StoreOpts, + <<"bXEgFm4K2b5VD64skBNAlS3I__4qxlM3Sm4Z5IXj3h8">>)), + % Another bundle with an unsupported signature should be indexed even if + % it can't be deserialized. + ?assertException( + error, + {badmatch, unsupported_tx_format}, + hb_store_arweave:read( + StoreOpts, + <<"kK67S13W_8jM9JUw2umVamo0zh9v1DeVxWrru2evNco">>) + ), + assert_bundle_read( + StoreOpts, + <<"c2ATDuTgwKCcHpAFZqSt13NC-tA4hdA7Aa2xBPuOzoE">>, + [ + {<<"OBKr-7UrmjxFD-h-qP-XLuvCgtyuO_IDpBMgIytvusA">>, <<"1">>} + ] + ), + ok. - % Bundles with unsupprted signatures should still be indexed, but when - % we go to desiarlize the data it will fail. - % ?assertEqual({badmatch, unsupported_tx_format}, - % hb_store_arweave:read( - % StoreOpts, - % <<"kK67S13W_8jM9JUw2umVamo0zh9v1DeVxWrru2evNco">>)), - - % This is a single item bundle with nested items. The immediately - % bundled item is indexed, the nested items are not. 
- {ok, TX5} = hb_store_arweave:read(StoreOpts, <<"c2ATDuTgwKCcHpAFZqSt13NC-tA4hdA7Aa2xBPuOzoE">>), - ?assertEqual(<<"871231847">>, hb_maps:get(<<"reward">>, TX5, #{})), - ?assert(hb_message:verify(TX5, all, #{})), - TX5Item = hb_ao:get(<<"1">>, TX5, #{}), - TX5ItemID = hb_message:id(TX5Item, signed), - ?assertEqual(<<"OBKr-7UrmjxFD-h-qP-XLuvCgtyuO_IDpBMgIytvusA">>, TX5ItemID), - TX5ItemRead = hb_store_arweave:read(StoreOpts, <<"XJq09oboLC7Z5LQ0lsg2klHa3pkCW_H1kWeBnrMLmfc">>), - ?event(debug_test, {{id, TX5ItemID}, {read, TX5ItemRead}}), - ?assert(hb_message:verify(TX5ItemRead, all, #{})), - ?assertEqual(TX5Item, TX5ItemRead), +assert_bundle_read(StoreOpts, BundleID, ExpectedItems) -> + ReadItems = + lists:map( + fun({ItemID, _Index}) -> + assert_item_read(StoreOpts, ItemID) + end, + ExpectedItems + ), + Bundle = assert_item_read(StoreOpts, BundleID), + lists:foreach( + fun({{_ItemID, Index}, Item}) -> + QueriedItem = hb_ao:get(Index, Bundle, #{}), + ?assertEqual(hb_maps:without(?AO_CORE_KEYS, Item), hb_maps:without(?AO_CORE_KEYS, QueriedItem)) + end, + lists:zip(ExpectedItems, ReadItems) + ), + ok. +assert_item_read(StoreOpts, ItemID) -> + {ok, Item} = hb_store_arweave:read(StoreOpts, ItemID), + ?assert(hb_message:verify(Item, all, #{})), + ?assertEqual(ItemID, hb_message:id(Item, signed)), + Item. - % ?assert(hb_message:verify(TX5Item, all, #{})), - % {error not_found} = hb_store_arweave:read(StoreOpts, hb_message:id(TX5Item, signed)), - ok. \ No newline at end of file diff --git a/src/hb_store_arweave.erl b/src/hb_store_arweave.erl index a3904dc04..fada7dc19 100644 --- a/src/hb_store_arweave.erl +++ b/src/hb_store_arweave.erl @@ -19,8 +19,12 @@ read(StoreOpts = #{ <<"index-store">> := IndexStore }, ID) -> [IsTX, EndOffset, Size] = binary:split(Binary, <<":">>, [global]), ?event( debug_test, - {reading_offset, {is_tx, IsTX}, - {end_offset, EndOffset}, {size, Size}} + {reading_offset, + {path, path(ID)}, + {is_tx, IsTX}, + {end_offset, EndOffset}, + {size, Size} + } ), case hb_util:bool(IsTX) of true -> @@ -30,7 +34,7 @@ read(StoreOpts = #{ <<"index-store">> := IndexStore }, ID) -> load_item( hb_util:int(EndOffset), hb_util:int(Size), StoreOpts) end; - {error, not_found} -> + not_found -> {error, not_found} end. @@ -102,7 +106,7 @@ write_offset(#{ <<"index-store">> := IndexStore }, ID, IsTX, EndOffset, Size) -> ":", (hb_util:bin(Size))/binary >>, - ?event(debug_test, {value, Value}), + ?event(debug_test, {{path, path(ID)}, {value, Value}}), hb_store:write(IndexStore, path(ID), Value). 
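+%% For example, path(<<"bndIwac23-s0K11TLC1N7z472sLGAkiOdhds87ZywoE">>) yields
+%% the index-store key
+%% <<"~arweave@2.9-pre/offset/bndIwac23-s0K11TLC1N7z472sLGAkiOdhds87ZywoE">>.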
path(ID) -> From 6fb2ce0619e7fb60eff7f11a4bece7467def5116 Mon Sep 17 00:00:00 2001 From: James Piechota Date: Sun, 25 Jan 2026 21:02:08 -0500 Subject: [PATCH 12/29] test: update dev_copycat_arweave test to use the regular device entrypoint --- src/dev_copycat_arweave.erl | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/dev_copycat_arweave.erl b/src/dev_copycat_arweave.erl index 4fa8a49e0..932cd0728 100644 --- a/src/dev_copycat_arweave.erl +++ b/src/dev_copycat_arweave.erl @@ -62,7 +62,7 @@ fetch_blocks(Req, Current, To, Opts) -> process_block(BlockRes, _Req, Current, To, Opts) -> case BlockRes of {ok, Block} -> - % maybe_index_ids(Block, Opts), + maybe_index_ids(Block, Opts), ?event( copycat_short, {arweave_block_cached, @@ -203,15 +203,15 @@ index_ids_test() -> arweave_index_ids => true, arweave_index_store => StoreOpts }, - {ok, Block} = hb_ao:resolve( - << - ?ARWEAVE_DEVICE/binary, - "/block=5Ya_2_jLshzNodGRVrBwlAhcEowIFgVbvOcN4j_MJI4QfodQ7Nd8ke7CMN9OnpK0" - >>, + {ok, 1827941} = hb_ao:resolve( + #{ <<"device">> => <<"copycat@1.0">> }, + #{ + <<"path">> => <<"arweave">>, + <<"from">> => 1827942, + <<"to">> => 1827941 + }, Opts ), - ?event(debug_test, {Block}), - ?assertEqual(ok, maybe_index_ids(Block, Opts)), % WbRAQbeyjPHgopBKyi0PLeKWvYZr3rgZvQ7QY3ASJS4 is a bundle signed with % an Ethereum signature which is not supported by HB as of Jan 2026. % The bundle should be indexed even though we can't deserialized the From 73b4fadf107174a60fd6e986c47ec1613500677c Mon Sep 17 00:00:00 2001 From: James Piechota Date: Mon, 26 Jan 2026 16:00:08 -0500 Subject: [PATCH 13/29] fix: enable hb_ao:resolve to work for indexed arweave items Main change was implementing hb_store_arweave:type/2 --- src/dev_copycat_arweave.erl | 73 +++++++++++++++++++++---------------- src/hb_opts.erl | 4 +- src/hb_store_arweave.erl | 28 ++++++++------ 3 files changed, 61 insertions(+), 44 deletions(-) diff --git a/src/dev_copycat_arweave.erl b/src/dev_copycat_arweave.erl index 932cd0728..24c53da94 100644 --- a/src/dev_copycat_arweave.erl +++ b/src/dev_copycat_arweave.erl @@ -91,11 +91,6 @@ maybe_index_ids(Block, Opts) -> BlockSize = hb_util:int( hb_maps:get(<<"block_size">>, Block, 0, Opts)), BlockStartOffset = BlockEndOffset - BlockSize, - ?event(debug_test, { - {block_end_offset, BlockEndOffset}, - {block_size, BlockSize}, - {block_start_offset, BlockStartOffset} - }), TXs = resolve_tx_headers(hb_maps:get(<<"txs">>, Block, [], Opts), Opts), Height = hb_maps:get(<<"height">>, Block, 0, Opts), TXsWithData = ar_block:generate_size_tagged_list_from_txs(TXs, Height), @@ -103,12 +98,6 @@ maybe_index_ids(Block, Opts) -> ({{padding, _PaddingRoot}, _EndOffset}) -> ok; ({{TX, _TXDataRoot}, EndOffset}) -> - ?event(debug_test, { - {tx, TX}, - {end_offset, BlockStartOffset + EndOffset}, - {tx_size, TX#tx.data_size}, - {is_bundle_tx, is_bundle_tx(TX, Opts)} - }), case is_bundle_tx(TX, Opts) of false -> ok; true -> @@ -123,15 +112,9 @@ maybe_index_ids(Block, Opts) -> ), {ok, {BundleIndex, HeaderSize}} = download_bundle_header( TXEndOffset, TX#tx.data_size, Opts), - ?event(debug_test, {{bundle_index, BundleIndex}, {header_size, HeaderSize}}), lists:foldl( fun({ItemID, Size}, OffsetAcc) -> ItemEndOffset = OffsetAcc + Size, - ?event(debug_test, { - {item_id, {explicit, hb_util:encode(ItemID)}}, - {item_end_offset, ItemEndOffset}, - {size, Size} - }), hb_store_arweave:write_offset( IndexStore, hb_util:encode(ItemID), @@ -197,9 +180,37 @@ index_ids_test() -> %% signature type is not yet (as of Jan 2026) 
supported by ar_bundles.erl, %% however we should still be able to index it (we just can't deserialize %% it). - Store = [hb_test_utils:test_store()], - StoreOpts = #{ <<"index-store">> => Store }, + TestStore = hb_test_utils:test_store(), + StoreOpts = #{ <<"index-store">> => [TestStore] }, + Store = [ + TestStore, + #{ + <<"store-module">> => hb_store_fs, + <<"name">> => <<"cache-mainnet">> + }, + #{ + <<"store-module">> => hb_store_arweave, + <<"name">> => <<"cache-arweave">>, + <<"index-store">> => [TestStore], + <<"arweave-node">> => <<"https://arweave.net">> + }, + #{ + <<"store-module">> => hb_store_gateway, + <<"subindex">> => [ + #{ + <<"name">> => <<"Data-Protocol">>, + <<"value">> => <<"ao">> + } + ], + <<"local-store">> => [TestStore] + }, + #{ + <<"store-module">> => hb_store_gateway, + <<"local-store">> => [TestStore] + } + ], Opts = #{ + store => Store, arweave_index_ids => true, arweave_index_store => StoreOpts }, @@ -225,17 +236,17 @@ index_ids_test() -> ), % These 3 items are within the WbRAQbeyjPHgopBKyi0PLeKWvYZr3rgZvQ7QY3ASJS4 % bundle. - assert_item_read(StoreOpts, + assert_item_read(Opts, <<"ATi9pQF_eqb99UK84R5rq8lGfRGpilVQOYyth7rXxh8">>), - assert_item_read(StoreOpts, + assert_item_read(Opts, <<"4VSfUbhMVZQHW5VfVwQZOmC5fR3W21DZgFCyz8CA-cE">>), - assert_item_read(StoreOpts, + assert_item_read(Opts, <<"ZQRHZhktk6dAtX9BlhO1teOtVlGHoyaWP25kAlhxrM4">>), % The T2pluNnaavL7-S2GkO_m3pASLUqMH_XQ9IiIhZKfySs can be deserialized so % we'll verify that some of its items were index and match the version % in the deserialized bundle. assert_bundle_read( - StoreOpts, + Opts, <<"T2pluNnaavL7-S2GkO_m3pASLUqMH_XQ9IiIhZKfySs">>, [ {<<"54K1ehEIKZxGSusgZzgbGYaHfllwWQ09-S9-eRUJg5Y">>, <<"1">>}, @@ -257,7 +268,7 @@ index_ids_test() -> <<"kK67S13W_8jM9JUw2umVamo0zh9v1DeVxWrru2evNco">>) ), assert_bundle_read( - StoreOpts, + Opts, <<"c2ATDuTgwKCcHpAFZqSt13NC-tA4hdA7Aa2xBPuOzoE">>, [ {<<"OBKr-7UrmjxFD-h-qP-XLuvCgtyuO_IDpBMgIytvusA">>, <<"1">>} @@ -265,27 +276,27 @@ index_ids_test() -> ), ok. -assert_bundle_read(StoreOpts, BundleID, ExpectedItems) -> +assert_bundle_read(Opts, BundleID, ExpectedItems) -> ReadItems = lists:map( fun({ItemID, _Index}) -> - assert_item_read(StoreOpts, ItemID) + assert_item_read(Opts, ItemID) end, ExpectedItems ), - Bundle = assert_item_read(StoreOpts, BundleID), + Bundle = assert_item_read(Opts, BundleID), lists:foreach( fun({{_ItemID, Index}, Item}) -> - QueriedItem = hb_ao:get(Index, Bundle, #{}), + QueriedItem = hb_ao:get(Index, Bundle, Opts), ?assertEqual(hb_maps:without(?AO_CORE_KEYS, Item), hb_maps:without(?AO_CORE_KEYS, QueriedItem)) end, lists:zip(ExpectedItems, ReadItems) ), ok. -assert_item_read(StoreOpts, ItemID) -> - {ok, Item} = hb_store_arweave:read(StoreOpts, ItemID), - ?assert(hb_message:verify(Item, all, #{})), +assert_item_read(Opts, ItemID) -> + {ok, Item} = hb_ao:resolve(ItemID, Opts), + ?assert(hb_message:verify(Item, all, Opts)), ?assertEqual(ItemID, hb_message:id(Item, signed)), Item. diff --git a/src/hb_opts.erl b/src/hb_opts.erl index b334bb7e5..64fc34ec2 100644 --- a/src/hb_opts.erl +++ b/src/hb_opts.erl @@ -403,7 +403,9 @@ default_message() -> % Options: % - fallback: Fallback to the index page % - error: Return 404 Not Found - manifest_404 => fallback + manifest_404 => fallback, + arweave_index_ids => true, + arweave_index_store => #{ <<"index-store">> => [?DEFAULT_PRIMARY_STORE] } }. 
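+%% A sketch of driving the indexer with these defaults (block height assumed;
+%% mirrors index_ids_test/0 in dev_copycat_arweave.erl):
+%%   hb_ao:resolve(
+%%       #{ <<"device">> => <<"copycat@1.0">> },
+%%       #{ <<"path">> => <<"arweave">>,
+%%          <<"from">> => 1827942, <<"to">> => 1827941 },
+%%       NodeOpts
+%%   ).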
%% @doc Get an option from the global options, optionally overriding with a diff --git a/src/hb_store_arweave.erl b/src/hb_store_arweave.erl index fada7dc19..a5f1c1cd8 100644 --- a/src/hb_store_arweave.erl +++ b/src/hb_store_arweave.erl @@ -2,7 +2,7 @@ %%% intermediate cache of offsets as an ID->ArweaveLocation mapping. -module(hb_store_arweave). %%% Store API: --export([read/2]). +-export([type/2, read/2]). %%% Indexing API: -export([write_offset/5]). -include("include/hb.hrl"). @@ -10,6 +10,21 @@ -define(ARWEAVE_INDEX_PATH, <<"~arweave@2.9-pre/offset">>). +%% @doc Get the type of the data at the given key. We potentially cache the +%% result, so that we don't have to read the data from the GraphQL route +%% multiple times. +type(StoreOpts, Key) -> + case read(StoreOpts, Key) of + not_found -> not_found; + {ok, _Data} -> + % TODO: + % - should this return composite for any index L1 bundles? + % - if so, I guess we need to implement list/2? + % - for now we don't index nested bundle children, but once we + % do we may nalso need to return composite for them. + simple + end. + read(StoreOpts = #{ <<"index-store">> := IndexStore }, ID) -> case hb_store:read(IndexStore, path(ID)) of {ok, Binary} -> @@ -17,15 +32,6 @@ read(StoreOpts = #{ <<"index-store">> := IndexStore }, ID) -> % Length) to preserve consistency with the Arweave API's % `/tx//offset` endpoint. [IsTX, EndOffset, Size] = binary:split(Binary, <<":">>, [global]), - ?event( - debug_test, - {reading_offset, - {path, path(ID)}, - {is_tx, IsTX}, - {end_offset, EndOffset}, - {size, Size} - } - ), case hb_util:bool(IsTX) of true -> load_bundle(ID, @@ -106,7 +112,6 @@ write_offset(#{ <<"index-store">> := IndexStore }, ID, IsTX, EndOffset, Size) -> ":", (hb_util:bin(Size))/binary >>, - ?event(debug_test, {{path, path(ID)}, {value, Value}}), hb_store:write(IndexStore, path(ID), Value). 
path(ID) -> @@ -121,7 +126,6 @@ path(ID) -> write_read_tx_test() -> Store = [hb_test_utils:test_store()], - ?event(debug_test, {store, Store}), Opts = #{ <<"index-store">> => Store }, From f153d81efa4258db970604913ca1c46cfea92010 Mon Sep 17 00:00:00 2001 From: James Piechota Date: Mon, 26 Jan 2026 16:21:12 -0500 Subject: [PATCH 14/29] fix: hb_store_arweave now uses start offset instead of end offset --- src/dev_copycat_arweave.erl | 9 ++++----- src/hb_store_arweave.erl | 39 +++++++++++++++---------------------- 2 files changed, 20 insertions(+), 28 deletions(-) diff --git a/src/dev_copycat_arweave.erl b/src/dev_copycat_arweave.erl index 24c53da94..5964b1220 100644 --- a/src/dev_copycat_arweave.erl +++ b/src/dev_copycat_arweave.erl @@ -107,22 +107,21 @@ maybe_index_ids(Block, Opts) -> IndexStore, hb_util:encode(TX#tx.id), true, - TXEndOffset, + TXStartOffset, TX#tx.data_size ), {ok, {BundleIndex, HeaderSize}} = download_bundle_header( TXEndOffset, TX#tx.data_size, Opts), lists:foldl( - fun({ItemID, Size}, OffsetAcc) -> - ItemEndOffset = OffsetAcc + Size, + fun({ItemID, Size}, ItemStartOffset) -> hb_store_arweave:write_offset( IndexStore, hb_util:encode(ItemID), false, - ItemEndOffset, + ItemStartOffset, Size ), - ItemEndOffset + ItemStartOffset + Size end, TXStartOffset + HeaderSize, BundleIndex diff --git a/src/hb_store_arweave.erl b/src/hb_store_arweave.erl index a5f1c1cd8..8dfbf087b 100644 --- a/src/hb_store_arweave.erl +++ b/src/hb_store_arweave.erl @@ -28,24 +28,21 @@ type(StoreOpts, Key) -> read(StoreOpts = #{ <<"index-store">> := IndexStore }, ID) -> case hb_store:read(IndexStore, path(ID)) of {ok, Binary} -> - % EndOffset and Size is recorded (rather than StartOffset and - % Length) to preserve consistency with the Arweave API's - % `/tx//offset` endpoint. - [IsTX, EndOffset, Size] = binary:split(Binary, <<":">>, [global]), + [IsTX, StartOffset, Length] = binary:split(Binary, <<":">>, [global]), case hb_util:bool(IsTX) of true -> load_bundle(ID, - hb_util:int(EndOffset), hb_util:int(Size), StoreOpts); + hb_util:int(StartOffset), hb_util:int(Length), StoreOpts); false -> load_item( - hb_util:int(EndOffset), hb_util:int(Size), StoreOpts) + hb_util:int(StartOffset), hb_util:int(Length), StoreOpts) end; not_found -> {error, not_found} end. -load_item(EndOffset, Size, Opts) -> - case read_chunks(EndOffset, Size, Opts) of +load_item(StartOffset, Length, Opts) -> + case read_chunks(StartOffset, Length, Opts) of {ok, SerializedItem} -> { ok, @@ -60,7 +57,7 @@ load_item(EndOffset, Size, Opts) -> {error, Reason} end. -load_bundle(ID, EndOffset, Size, Opts) -> +load_bundle(ID, StartOffset, Length, Opts) -> {ok, StructuredTXHeader} = hb_ao:resolve( #{ <<"device">> => <<"arweave@2.9-pre">> }, #{ <<"path">> => <<"tx">>, <<"tx">> => ID, <<"exclude-data">> => true }, @@ -71,7 +68,7 @@ load_bundle(ID, EndOffset, Size, Opts) -> <<"tx@1.0">>, <<"structured@1.0">>, Opts), - case read_chunks(EndOffset, Size, Opts) of + case read_chunks(StartOffset, Length, Opts) of {ok, SerializedItem} -> { ok, @@ -86,31 +83,26 @@ load_bundle(ID, EndOffset, Size, Opts) -> {error, Reason} end. -read_chunks(EndOffset, Size, Opts) -> - StartOffset = EndOffset - Size + 1, +read_chunks(StartOffset, Length, Opts) -> hb_ao:resolve( #{ <<"device">> => <<"arweave@2.9-pre">> }, #{ <<"path">> => <<"chunk">>, - <<"offset">> => StartOffset, - <<"length">> => Size + <<"offset">> => StartOffset + 1, + <<"length">> => Length }, Opts ). 
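+%% Offset convention sketch: the recorded StartOffset is exclusive (it equals
+%% EndOffset - Size), so an entry of {StartOffset, Length} covers absolute
+%% weave bytes (StartOffset + 1)..(StartOffset + Length); hence the chunk
+%% request above starts at StartOffset + 1.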
-
-%% @doc When recording an item or bundle's offset we use its EndOffset and
-%% Size in rather than StartOffset and Length - this is for consistency with
-%% the Arweave API's `/tx/<id>/offset` endpoint which returns a global
-%% end offset and size.
-write_offset(#{ <<"index-store">> := IndexStore }, ID, IsTX, EndOffset, Size) ->
+write_offset(
+    #{ <<"index-store">> := IndexStore }, ID, IsTX, StartOffset, Length) ->
     IsTxInt = hb_util:bool_int(IsTX),
     Value =
         <<
             (hb_util:bin(IsTxInt))/binary,
             ":",
-            (hb_util:bin(EndOffset))/binary,
+            (hb_util:bin(StartOffset))/binary,
             ":",
-            (hb_util:bin(Size))/binary
+            (hb_util:bin(Length))/binary
         >>,
     hb_store:write(IndexStore, path(ID), Value).
 
@@ -132,7 +124,8 @@ write_read_tx_test() ->
     ID = <<"bndIwac23-s0K11TLC1N7z472sLGAkiOdhds87ZywoE">>,
     EndOffset = 363524457284025,
     Size = 8387,
-    ok = write_offset(Opts, ID, true, EndOffset, Size),
+    StartOffset = EndOffset - Size,
+    ok = write_offset(Opts, ID, true, StartOffset, Size),
     {ok, Bundle} = read(Opts, ID),
     ?assert(hb_message:verify(Bundle, all, #{})),
     {ok, Child} =

From e5c114deab46f532fe707f8df8ae34ed472582ac Mon Sep 17 00:00:00 2001
From: James Piechota
Date: Mon, 26 Jan 2026 16:42:01 -0500
Subject: [PATCH 15/29] fix: update how ~copycat@1.0 iterates over the range
 of blocks to be indexed

- Old behavior: Count was exclusive and would keep going if `from` was
  less than `to`. e.g. `from=1000001&to=1000000` would index only block
  `1000001`, `from=999999&to=1000000` would index all blocks 999999 and
  lower.
- New behavior: Count is inclusive and stops when `from` is less than
  `to`. e.g. `from=1000001&to=1000000` will index blocks `1000001` and
  `1000000`, `from=999999&to=1000000` will index no blocks.
---
 src/dev_copycat_arweave.erl | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/dev_copycat_arweave.erl b/src/dev_copycat_arweave.erl
index 5964b1220..4cffca4ba 100644
--- a/src/dev_copycat_arweave.erl
+++ b/src/dev_copycat_arweave.erl
@@ -34,17 +34,17 @@ parse_range(Request, Opts) ->
             LatestHeight
     end,
     To = hb_maps:get(<<"to">>, Request, 0, Opts),
-    {From, To}.
+    {hb_util:int(From), hb_util:int(To)}.
 
 %% @doc Fetch blocks from an Arweave node between a given range.
-fetch_blocks(Req, Current, Current, _Opts) ->
+fetch_blocks(Req, Current, To, _Opts) when Current < To ->
     ?event(copycat_arweave,
         {arweave_block_indexing_completed,
-            {reached_target, Current},
+            {reached_target, To},
             {initial_request, Req}
         }
    ),
-    {ok, Current};
+    {ok, To};
 fetch_blocks(Req, Current, To, Opts) ->
     BlockRes =
         hb_ao:resolve(
@@ -213,12 +213,12 @@ index_ids_test() ->
         arweave_index_ids => true,
         arweave_index_store => StoreOpts
     },
-    {ok, 1827941} = hb_ao:resolve(
+    {ok, 1827942} = hb_ao:resolve(
         #{ <<"device">> => <<"copycat@1.0">> },
         #{
             <<"path">> => <<"arweave">>,
-            <<"from">> => 1827942,
-            <<"to">> => 1827941
+            <<"from">> => <<"1827942">>,
+            <<"to">> => <<"1827942">>
         },
         Opts
    ),

From a2ed35d65a9ff7ef4d133c43bf48f7994aabe0e4 Mon Sep 17 00:00:00 2001
From: James Piechota
Date: Mon, 26 Jan 2026 17:50:57 -0500
Subject: [PATCH 16/29] fix: support bundles with large headers

---
 src/ar_bundles.erl          |  17 ++--
 src/dev_arweave.erl         |   1 -
 src/dev_copycat_arweave.erl | 181 ++++++++++++++++++++++++----------
 3 files changed, 153 insertions(+), 46 deletions(-)

diff --git a/src/ar_bundles.erl b/src/ar_bundles.erl
index a692e15dd..3d6b8efd3 100644
--- a/src/ar_bundles.erl
+++ b/src/ar_bundles.erl
@@ -5,7 +5,7 @@
 -export([encode_tags/1, decode_tags/1]).
 -export([serialize/1, deserialize/1, serialize_bundle/3]).
 -export([data_item_signature_data/1]).
--export([decode_bundle_header/1]).
+-export([bundle_header_size/1, decode_bundle_header/1]).
 
 -include("include/hb.hrl").
 -include_lib("eunit/include/eunit.hrl").
@@ -468,7 +468,7 @@ find_single_layer(UnsignedID, Items) ->
     end.
 
 unbundle(<<Count:256/little, Content/binary>>) ->
-    {ItemsBin, Items, _HeaderSize} = decode_bundle_header(Count, Content),
+    {ItemsBin, Items} = decode_bundle_header(Count, Content),
     decode_bundle_items(Items, ItemsBin);
 unbundle(?DEFAULT_DATA) -> ?DEFAULT_DATA.
@@ -488,19 +488,22 @@ decode_bundle_items([{_ID, Size} | RestItems], ItemsBin) ->
         )
     ].
 
+bundle_header_size(<<Count:256/little, _/binary>>) ->
+    % Each item in the bundle header index consumes 64 bytes; the leading
+    % 32 bytes hold the item count.
+    32 + (Count * 64).
+
 decode_bundle_header(<<Count:256/little, Content/binary>>) ->
     decode_bundle_header(Count, Content).
-decode_bundle_header(Count, Bin) -> decode_bundle_header(Count, Bin, 32, []).
-decode_bundle_header(0, ItemsBin, HeaderSize, Header) ->
-    {ItemsBin, lists:reverse(Header), HeaderSize};
+decode_bundle_header(Count, Bin) -> decode_bundle_header(Count, Bin, []).
+decode_bundle_header(0, ItemsBin, Header) ->
+    {ItemsBin, lists:reverse(Header)};
 decode_bundle_header(
     Count,
     <<Size:256/little, ID:32/binary, Rest/binary>>,
-    HeaderSize,
     Header
 ) ->
-    decode_bundle_header(Count - 1, Rest, HeaderSize + 64, [{ID, Size} | Header]).
+    decode_bundle_header(Count - 1, Rest, [{ID, Size} | Header]).
 
 %% @doc Decode the signature from a binary format. Only RSA 4096 is currently supported.
 %% Note: the signature type '1' corresponds to RSA 4096 - but it is is written in
diff --git a/src/dev_arweave.erl b/src/dev_arweave.erl
index 2aefada0b..9173b9bfa 100644
--- a/src/dev_arweave.erl
+++ b/src/dev_arweave.erl
@@ -130,7 +130,6 @@ data(TXID, Opts) ->
     request(<<"GET">>, <<"/raw/", TXID/binary>>, Opts).
 
 chunk(Base, Request, Opts) ->
-    ?event(debug_test, {chunk, {base, Base}, {request, Request}, {opts, Opts}}),
     case hb_maps:get(<<"method">>, Request, <<"GET">>, Opts) of
         <<"POST">> -> post_chunk(Base, Request, Opts);
         <<"GET">> -> get_chunk_range(Base, Request, Opts)
diff --git a/src/dev_copycat_arweave.erl b/src/dev_copycat_arweave.erl
index 4cffca4ba..c19541308 100644
--- a/src/dev_copycat_arweave.erl
+++ b/src/dev_copycat_arweave.erl
@@ -110,8 +110,12 @@ maybe_index_ids(Block, Opts) ->
                             TXStartOffset,
                             TX#tx.data_size
                         ),
-                        {ok, {BundleIndex, HeaderSize}} = download_bundle_header(
-                            TXEndOffset, TX#tx.data_size, Opts),
+                        ?event(debug_test, {writing_bundle_offset,
+                            {tx, {explicit, hb_util:encode(TX#tx.id)}}
+                        }),
+                        {ok, {BundleIndex, HeaderSize}} =
+                            download_bundle_header(
+                                TXEndOffset, TX#tx.data_size, Opts),
                         lists:foldl(
                             fun({ItemID, Size}, ItemStartOffset) ->
                                 hb_store_arweave:write_offset(
@@ -138,7 +142,7 @@ is_bundle_tx(TX, _Opts) ->
 
 download_bundle_header(EndOffset, Size, Opts) ->
     StartOffset = EndOffset - Size + 1,
-    {ok, Chunk} = hb_ao:resolve(
+    {ok, FirstChunk} = hb_ao:resolve(
         #{ <<"device">> => <<"arweave@2.9-pre">> },
         #{
             <<"path">> => <<"chunk">>,
             <<"offset">> => StartOffset
         },
         Opts
    ),
-    {_ItemsBin, BundleIndex, HeaderSize} = ar_bundles:decode_bundle_header(Chunk),
+    % Most bundle headers can fit in a single chunk, but those with thousands
+    % of items might require multiple chunks to full represent the item
+    % index.
+ HeaderSize = ar_bundles:bundle_header_size(FirstChunk), + BundleHeader = + case HeaderSize =< byte_size(FirstChunk) of + true -> FirstChunk; + false -> + {ok, HeaderChunks} = hb_ao:resolve( + #{ <<"device">> => <<"arweave@2.9-pre">> }, + #{ + <<"path">> => <<"chunk">>, + <<"offset">> => StartOffset, + <<"length">> => HeaderSize + }, + Opts + ), + HeaderChunks + end, + {_ItemsBin, BundleIndex} = + ar_bundles:decode_bundle_header(BundleHeader), {ok, {BundleIndex, HeaderSize}}. + + resolve_tx_headers(TXIDs, Opts) -> lists:map( fun(TXID) -> @@ -179,40 +205,7 @@ index_ids_test() -> %% signature type is not yet (as of Jan 2026) supported by ar_bundles.erl, %% however we should still be able to index it (we just can't deserialize %% it). - TestStore = hb_test_utils:test_store(), - StoreOpts = #{ <<"index-store">> => [TestStore] }, - Store = [ - TestStore, - #{ - <<"store-module">> => hb_store_fs, - <<"name">> => <<"cache-mainnet">> - }, - #{ - <<"store-module">> => hb_store_arweave, - <<"name">> => <<"cache-arweave">>, - <<"index-store">> => [TestStore], - <<"arweave-node">> => <<"https://arweave.net">> - }, - #{ - <<"store-module">> => hb_store_gateway, - <<"subindex">> => [ - #{ - <<"name">> => <<"Data-Protocol">>, - <<"value">> => <<"ao">> - } - ], - <<"local-store">> => [TestStore] - }, - #{ - <<"store-module">> => hb_store_gateway, - <<"local-store">> => [TestStore] - } - ], - Opts = #{ - store => Store, - arweave_index_ids => true, - arweave_index_store => StoreOpts - }, + {_TestStore, StoreOpts, Opts} = setup_index_opts(), {ok, 1827942} = hb_ao:resolve( #{ <<"device">> => <<"copycat@1.0">> }, #{ @@ -275,6 +268,118 @@ index_ids_test() -> ), ok. +bundle_header_index_test() -> + {_TestStore, _StoreOpts, Opts} = setup_index_opts(), + TXID = <<"bnMTI7LglBGSaK5EdV_juh6GNtXLm0cd5lkd2q4nlT0">>, + {ok, #{ <<"body">> := OffsetBody }} = + hb_http:request( + #{ + <<"path">> => <<"/arweave/tx/", TXID/binary, "/offset">>, + <<"method">> => <<"GET">> + }, + Opts + ), + OffsetMsg = hb_json:decode(OffsetBody), + EndOffset = hb_util:int(maps:get(<<"offset">>, OffsetMsg)), + Size = hb_util:int(maps:get(<<"size">>, OffsetMsg)), + {ok, {BundleIndex, _HeaderSize}} = + download_bundle_header(EndOffset, Size, Opts), + ?assertEqual(15000, length(BundleIndex)), + ok. + +index_ids_perf() -> + {TestStore, _StoreOpts, Opts} = setup_index_opts(), + From = 1600100, + To = 1600000, + {IndexTimeUs, {ok, _}} = + timer:tc( + fun() -> + hb_ao:resolve( + #{ <<"device">> => <<"copycat@1.0">> }, + #{ + <<"path">> => <<"arweave">>, + <<"from">> => hb_util:bin(From), + <<"to">> => hb_util:bin(To) + }, + Opts + ) + end + ), + BlockCount = From - To + 1, + {TxCount, ItemCount} = count_index_entries(TestStore), + hb_format:eunit_print( + "Indexed ~s blocks in ~p ms (~s txs, ~s items)", + [ + hb_util:human_int(BlockCount), + IndexTimeUs / 1000, + hb_util:human_int(TxCount), + hb_util:human_int(ItemCount) + ] + ), + ok. 
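To make the header parsing above concrete: the byte layout that `decode_bundle_header/1` walks is a 256-bit little-endian item count, then one 64-byte entry (a 256-bit little-endian size followed by a 32-byte ID) per item, then the item payloads themselves. A sketch of constructing such a header; `example_bundle_header/0` is illustrative, not part of the patch:

```erlang
%% Builds a two-item ANS-104 bundle header plus payloads, matching the
%% patterns that decode_bundle_header/1 destructures above.
example_bundle_header() ->
    ID1 = crypto:strong_rand_bytes(32),
    ID2 = crypto:strong_rand_bytes(32),
    <<
        2:256/little,              % item count
        5:256/little, ID1/binary,  % entry 1: size 5
        2:256/little, ID2/binary,  % entry 2: size 2
        "abcde", "xy"              % payloads, concatenated in entry order
    >>.

%% bundle_header_size(example_bundle_header()) =:= 32 + 2 * 64 =:= 160.
```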
+
+setup_index_opts() ->
+    TestStore = hb_test_utils:test_store(),
+    StoreOpts = #{ <<"index-store">> => [TestStore] },
+    Store = [
+        TestStore,
+        #{
+            <<"store-module">> => hb_store_fs,
+            <<"name">> => <<"cache-mainnet">>
+        },
+        #{
+            <<"store-module">> => hb_store_arweave,
+            <<"name">> => <<"cache-arweave">>,
+            <<"index-store">> => [TestStore],
+            <<"arweave-node">> => <<"https://arweave.net">>
+        },
+        #{
+            <<"store-module">> => hb_store_gateway,
+            <<"subindex">> => [
+                #{
+                    <<"name">> => <<"Data-Protocol">>,
+                    <<"value">> => <<"ao">>
+                }
+            ],
+            <<"local-store">> => [TestStore]
+        },
+        #{
+            <<"store-module">> => hb_store_gateway,
+            <<"local-store">> => [TestStore]
+        }
+    ],
+    Opts = #{
+        store => Store,
+        arweave_index_ids => true,
+        arweave_index_store => StoreOpts
+    },
+    {TestStore, StoreOpts, Opts}.
+
+count_index_entries(IndexStore) ->
+    case hb_store:list(IndexStore, ?ARWEAVE_INDEX_PATH) of
+        {ok, Keys} ->
+            lists:foldl(
+                fun(Key, {TxCount, ItemCount}) ->
+                    Path = <<?ARWEAVE_INDEX_PATH/binary, "/", Key/binary>>,
+                    case hb_store:read(IndexStore, Path) of
+                        {ok, Value} ->
+                            [IsTx | _] =
+                                binary:split(Value, <<":">>, [global]),
+                            case hb_util:bool(IsTx) of
+                                true -> {TxCount + 1, ItemCount};
+                                false -> {TxCount, ItemCount + 1}
+                            end;
+                        _ ->
+                            {TxCount, ItemCount}
+                    end
+                end,
+                {0, 0},
+                Keys
+            );
+        not_found ->
+            {0, 0}
+    end.
+
 assert_bundle_read(Opts, BundleID, ExpectedItems) ->
     ReadItems =
         lists:map(

From 13a2eb31ac6bfe9e5970fd97827cbb1280146fac Mon Sep 17 00:00:00 2001
From: James Piechota
Date: Mon, 26 Jan 2026 20:44:45 -0500
Subject: [PATCH 17/29] fix: skip L1 TXs that are signed with ECDSA

---
 src/dev_copycat_arweave.erl | 117 +++++++++++++++---------------------
 1 file changed, 47 insertions(+), 70 deletions(-)

diff --git a/src/dev_copycat_arweave.erl b/src/dev_copycat_arweave.erl
index c19541308..2d322d49a 100644
--- a/src/dev_copycat_arweave.erl
+++ b/src/dev_copycat_arweave.erl
@@ -110,9 +110,6 @@ maybe_index_ids(Block, Opts) ->
                             TXStartOffset,
                             TX#tx.data_size
                         ),
-                        ?event(debug_test, {writing_bundle_offset,
-                            {tx, {explicit, hb_util:encode(TX#tx.id)}}
-                        }),
                         {ok, {BundleIndex, HeaderSize}} =
                             download_bundle_header(
                                 TXEndOffset, TX#tx.data_size, Opts),
@@ -176,26 +173,41 @@ download_bundle_header(EndOffset, Size, Opts) ->
 
 
 resolve_tx_headers(TXIDs, Opts) ->
-    lists:map(
-        fun(TXID) ->
-            {ok, StructuredTXHeader} =
-                hb_ao:resolve(
-                    #{ <<"device">> => <<"arweave@2.9-pre">> },
-                    #{
-                        <<"path">> => <<"tx">>,
-                        <<"tx">> => TXID,
-                        <<"exclude-data">> => true
-                    },
-                    Opts
-                ),
+    lists:foldr(
+        fun(TXID, Acc) ->
+            case resolve_tx_header(TXID, Opts) of
+                {ok, TX} -> [TX | Acc];
+                skip -> Acc
+            end
+        end,
+        [],
+        TXIDs
+    ).
+
+resolve_tx_header(TXID, Opts) ->
+    try
+        {ok, StructuredTXHeader} =
+            hb_ao:resolve(
+                #{ <<"device">> => <<"arweave@2.9-pre">> },
+                #{
+                    <<"path">> => <<"tx">>,
+                    <<"tx">> => TXID,
+                    <<"exclude-data">> => true
+                },
+                Opts
+            ),
+        {ok,
             hb_message:convert(
                 StructuredTXHeader,
                 <<"tx@1.0">>,
                 <<"structured@1.0">>,
-                Opts)
-        end,
-        TXIDs
-    ).
+                Opts)}
+    catch
+        error:{badmatch, {ecdsa, secp256k1}} ->
+            % Skip for now (until HyperBEAM supports Arweave L1 TXs
+            % ith ECDSA signatures)
+            skip
+    end.
-index_ids_perf() ->
-    {TestStore, _StoreOpts, Opts} = setup_index_opts(),
-    From = 1600100,
-    To = 1600000,
-    {IndexTimeUs, {ok, _}} =
-        timer:tc(
-            fun() ->
-                hb_ao:resolve(
-                    #{ <<"device">> => <<"copycat@1.0">> },
-                    #{
-                        <<"path">> => <<"arweave">>,
-                        <<"from">> => hb_util:bin(From),
-                        <<"to">> => hb_util:bin(To)
-                    },
-                    Opts
-                )
-            end
-        ),
-    BlockCount = From - To + 1,
-    {TxCount, ItemCount} = count_index_entries(TestStore),
-    hb_format:eunit_print(
-        "Indexed ~s blocks in ~p ms (~s txs, ~s items)",
+index_ids_ecdsa_test() ->
+    {_TestStore, _StoreOpts, Opts} = setup_index_opts(),
+    {ok, 1827904} = hb_ao:resolve(
+        #{ <<"device">> => <<"copycat@1.0">> },
+        #{
+            <<"path">> => <<"arweave">>,
+            <<"from">> => <<"1827904">>,
+            <<"to">> => <<"1827904">>
+        },
+        Opts
+    ),
+    assert_bundle_read(
+        Opts,
+        <<"VNhX_pSANk_8j0jZBR5bh_5jr-lkfbHDjtHd8FKqx7U">>,
         [
-            hb_util:human_int(BlockCount),
-            IndexTimeUs / 1000,
-            hb_util:human_int(TxCount),
-            hb_util:human_int(ItemCount)
+            {<<"3xDKhrCQcPuBtcm1ipZS5C9gAfFYClgHuHOHAXGfchM">>, <<"1">>},
+            {<<"JantC8f89VE-RidArHnU9589gY5T37NDXnWpI7H_psc">>, <<"7">>}
         ]
     ),
     ok.
@@ -355,31 +357,6 @@ setup_index_opts() ->
     },
     {TestStore, StoreOpts, Opts}.
 
-count_index_entries(IndexStore) ->
-    case hb_store:list(IndexStore, ?ARWEAVE_INDEX_PATH) of
-        {ok, Keys} ->
-            lists:foldl(
-                fun(Key, {TxCount, ItemCount}) ->
-                    Path = <<?ARWEAVE_INDEX_PATH/binary, "/", Key/binary>>,
-                    case hb_store:read(IndexStore, Path) of
-                        {ok, Value} ->
-                            [IsTx | _] =
-                                binary:split(Value, <<":">>, [global]),
-                            case hb_util:bool(IsTx) of
-                                true -> {TxCount + 1, ItemCount};
-                                false -> {TxCount, ItemCount + 1}
-                            end;
-                        _ ->
-                            {TxCount, ItemCount}
-                    end
-                end,
-                {0, 0},
-                Keys
-            );
-        not_found ->
-            {0, 0}
-    end.
-
 assert_bundle_read(Opts, BundleID, ExpectedItems) ->
     ReadItems =
         lists:map(

From 0c71ec6bd47d37fee695a63e107ff9d96baed1be Mon Sep 17 00:00:00 2001
From: James Piechota
Date: Mon, 26 Jan 2026 20:56:07 -0500
Subject: [PATCH 18/29] impr: log count of items indexed in each block

---
 src/dev_copycat_arweave.erl | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/src/dev_copycat_arweave.erl b/src/dev_copycat_arweave.erl
index 2d322d49a..d7bb0a4bd 100644
--- a/src/dev_copycat_arweave.erl
+++ b/src/dev_copycat_arweave.erl
@@ -62,11 +62,12 @@ fetch_blocks(Req, Current, To, Opts) ->
 process_block(BlockRes, _Req, Current, To, Opts) ->
     case BlockRes of
         {ok, Block} ->
-            maybe_index_ids(Block, Opts),
+            IndexedItems = maybe_index_ids(Block, Opts),
             ?event(
                 copycat_short,
                 {arweave_block_cached,
                     {height, Current},
+                    {indexed_items, IndexedItems},
                     {target, To}
                 }
            );
@@ -74,4 +78,47 @@
             ?event(
                 copycat_short,
                 {arweave_block_not_found,
                     {height, Current},
                     {target, To}
                 }
            )

%% @doc Index the IDs of all transactions in the block if configured to do so.
maybe_index_ids(Block, Opts) ->
     case hb_opts:get(arweave_index_ids, false, Opts) of
-        false -> ok;
+        false -> 0;
         true ->
             IndexStore = hb_opts:get(arweave_index_store, no_store, Opts),
             BlockEndOffset = hb_util:int(
@@ -94,12 +95,12 @@ maybe_index_ids(Block, Opts) ->
             TXs = resolve_tx_headers(hb_maps:get(<<"txs">>, Block, [], Opts), Opts),
             Height = hb_maps:get(<<"height">>, Block, 0, Opts),
             TXsWithData = ar_block:generate_size_tagged_list_from_txs(TXs, Height),
-            lists:foreach(fun
+            lists:sum(lists:map(fun
                 ({{padding, _PaddingRoot}, _EndOffset}) ->
-                    ok;
+                    0;
                 ({{TX, _TXDataRoot}, EndOffset}) ->
                     case is_bundle_tx(TX, Opts) of
-                        false -> ok;
+                        false -> 0;
                         true ->
                             TXEndOffset = BlockStartOffset + EndOffset,
                             TXStartOffset = TXEndOffset - TX#tx.data_size,
@@ -126,12 +127,12 @@ maybe_index_ids(Block, Opts) ->
                                 end,
                                 TXStartOffset + HeaderSize,
                                 BundleIndex
-                            )
+                            ),
+                            length(BundleIndex)
                     end
             end,
             TXsWithData
-            ),
-            ok
+            ))
     end.

From 93ccc5b125ffb4c869689834416c3e6953a3c097 Mon Sep 17 00:00:00 2001
From: James Piechota
Date: Tue, 27 Jan 2026 11:23:10 -0500
Subject: [PATCH 19/29] chore: cleanup

---
 src/ar_bundles.erl          |  36 +++++++-
 src/dev_copycat_arweave.erl | 162 ++++++++++++++++++------------------
 2 files changed, 118 insertions(+), 80 deletions(-)

diff --git a/src/ar_bundles.erl b/src/ar_bundles.erl
index 3d6b8efd3..9b72115ef 100644
--- a/src/ar_bundles.erl
+++ b/src/ar_bundles.erl
@@ -493,7 +493,9 @@ bundle_header_size(<<Count:256/little, _/binary>>) ->
     32 + (Count * 64).
 
 decode_bundle_header(<<Count:256/little, Content/binary>>) ->
-    decode_bundle_header(Count, Content).
+    decode_bundle_header(Count, Content);
+decode_bundle_header(<<>>) ->
+    {<<>>, []}.
 
 decode_bundle_header(Count, Bin) -> decode_bundle_header(Count, Bin, []).
 decode_bundle_header(0, ItemsBin, Header) ->
@@ -655,6 +657,38 @@ with_zero_length_tag_test() ->
     Deserialized = deserialize(Serialized),
     ?assertEqual(Item, Deserialized).
 
+decode_bundle_header_test() ->
+    ?assertEqual({<<>>, []}, decode_bundle_header(<<>>)),
+    Tail = <<"tail">>,
+    ?assertEqual(
+        {Tail, []},
+        decode_bundle_header(<<0:256/little, Tail/binary>>)
+    ),
+    ID1 = crypto:strong_rand_bytes(32),
+    Items1 = <<"abcde">>,
+    ?assertEqual(
+        {Items1, [{ID1, 5}]},
+        decode_bundle_header(<<1:256/little, 5:256/little, ID1:32/binary, Items1/binary>>)
+    ),
+    ID2 = crypto:strong_rand_bytes(32),
+    ID3 = crypto:strong_rand_bytes(32),
+    Items2 = <<"payload">>,
+    ?assertEqual(
+        {Items2, [{ID2, 4}, {ID3, 2}]},
+        decode_bundle_header(
+            <<
+                2:256/little,
+                4:256/little, ID2:32/binary,
+                2:256/little, ID3:32/binary,
+                Items2/binary
+            >>
+        )
+    ),
+    ?assertEqual(
+        {<<>>, [{ID1, 6}]},
+        decode_bundle_header(<<1:256/little, 6:256/little, ID1:32/binary>>)
+    ).
+
 unsigned_data_item_id_test() ->
     Item1 = deserialize(
         serialize(
diff --git a/src/dev_copycat_arweave.erl b/src/dev_copycat_arweave.erl
index d7bb0a4bd..4d7534104 100644
--- a/src/dev_copycat_arweave.erl
+++ b/src/dev_copycat_arweave.erl
@@ -7,9 +7,9 @@
 -export([arweave/3]).
 -include_lib("include/hb.hrl").
 -include_lib("eunit/include/eunit.hrl").
+-include_lib("public_key/include/OTP-PUB-KEY.hrl").
 
 -define(ARWEAVE_DEVICE, <<"~arweave@2.9-pre">>).
--define(ARWEAVE_INDEX_PATH, <<?ARWEAVE_DEVICE/binary, "/offset">>).
% GET /~cron@1.0/once&cron-path=~copycat@1.0/arweave @@ -26,12 +26,13 @@ parse_range(Request, Opts) -> case hb_maps:find(<<"from">>, Request, Opts) of {ok, Height} -> Height; error -> - {ok, LatestHeight} = - hb_ao:resolve( - <>, - Opts - ), - LatestHeight + case hb_ao:resolve( + <>, + Opts + ) of + {ok, LatestHeight} -> LatestHeight; + {error, _} -> 0 + end end, To = hb_maps:get(<<"to">>, Request, 0, Opts), {hb_util:int(From), hb_util:int(To)}. @@ -71,13 +72,13 @@ process_block(BlockRes, _Req, Current, To, Opts) -> {target, To} } ); - {error, not_found} -> + {error, _} = Error -> ?event( copycat_short, {arweave_block_not_found, {height, Current}, - {target, To} - } + {target, To}, + {reason, Error}} ) end. @@ -111,9 +112,10 @@ maybe_index_ids(Block, Opts) -> TXStartOffset, TX#tx.data_size ), - {ok, {BundleIndex, HeaderSize}} = + {ok, {BundleIndex, HeaderSize}} = download_bundle_header( - TXEndOffset, TX#tx.data_size, Opts), + TXEndOffset, TX#tx.data_size, Opts + ), lists:foldl( fun({ItemID, Size}, ItemStartOffset) -> hb_store_arweave:write_offset( @@ -140,38 +142,45 @@ is_bundle_tx(TX, _Opts) -> download_bundle_header(EndOffset, Size, Opts) -> StartOffset = EndOffset - Size + 1, - {ok, FirstChunk} = hb_ao:resolve( - #{ <<"device">> => <<"arweave@2.9-pre">> }, - #{ - <<"path">> => <<"chunk">>, - <<"offset">> => StartOffset - }, + case hb_ao:resolve( + << + ?ARWEAVE_DEVICE/binary, + "/chunk&offset=", + (hb_util:bin(StartOffset))/binary + >>, Opts - ), - % Most bundle headers can fit in a single chunk, but those with thousands - % of items might require multiple chunks to full represent the item - % index. - HeaderSize = ar_bundles:bundle_header_size(FirstChunk), - BundleHeader = - case HeaderSize =< byte_size(FirstChunk) of - true -> FirstChunk; - false -> - {ok, HeaderChunks} = hb_ao:resolve( - #{ <<"device">> => <<"arweave@2.9-pre">> }, - #{ - <<"path">> => <<"chunk">>, - <<"offset">> => StartOffset, - <<"length">> => HeaderSize - }, - Opts - ), - HeaderChunks - end, - {_ItemsBin, BundleIndex} = - ar_bundles:decode_bundle_header(BundleHeader), - {ok, {BundleIndex, HeaderSize}}. - + ) of + {ok, FirstChunk} -> + % Most bundle headers can fit in a single chunk, but those with + % thousands of items might require multiple chunks to fully + % represent the item index. + HeaderSize = ar_bundles:bundle_header_size(FirstChunk), + {ok, BundleHeader} = + header_chunk(HeaderSize, FirstChunk, StartOffset, Opts), + {_ItemsBin, BundleIndex} = + ar_bundles:decode_bundle_header(BundleHeader), + {ok, {BundleIndex, HeaderSize}}; + {error, _} -> + {ok, {[], 0}} + end. +header_chunk(HeaderSize, FirstChunk, _StartOffset, _Opts) + when HeaderSize =< byte_size(FirstChunk) -> + {ok, FirstChunk}; +header_chunk(HeaderSize, _FirstChunk, StartOffset, Opts) -> + case hb_ao:resolve( + << + ?ARWEAVE_DEVICE/binary, + "/chunk&offset=", + (hb_util:bin(StartOffset))/binary, + "&length=", + (hb_util:bin(HeaderSize))/binary + >>, + Opts + ) of + {ok, HeaderChunk} -> {ok, HeaderChunk}; + {error, _} -> {ok, <<>>} + end. 
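A quick size check makes clear when the multi-chunk path in `download_bundle_header/3` is taken. Assuming Arweave's usual 256 KiB (262144-byte) chunk size, which this diff does not state explicitly:

```erlang
%% Back-of-the-envelope helpers; the chunk size is an assumption based on
%% Arweave's default, not taken from this patch.
-define(ASSUMED_CHUNK_SIZE, 256 * 1024).

%% A single chunk can hold the header index for at most this many items:
max_entries_in_one_chunk() ->
    (?ASSUMED_CHUNK_SIZE - 32) div 64.  % = 4095

%% The 15000-item bundle exercised by bundle_header_index_test/0 needs
%% 32 + 15000 * 64 = 960032 header bytes, i.e. a read spanning 4 chunks.
```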
 resolve_tx_headers(TXIDs, Opts) ->
     lists:foldr(
         fun(TXID, Acc) ->
             case resolve_tx_header(TXID, Opts) of
                 {ok, TX} -> [TX | Acc];
                 skip -> Acc
             end
         end,
         [],
         TXIDs
     ).
 
 resolve_tx_header(TXID, Opts) ->
     try
-        {ok, StructuredTXHeader} =
-            hb_ao:resolve(
-                #{ <<"device">> => <<"arweave@2.9-pre">> },
-                #{
-                    <<"path">> => <<"tx">>,
-                    <<"tx">> => TXID,
-                    <<"exclude-data">> => true
-                },
-                Opts
-            ),
-        {ok,
-            hb_message:convert(
-                StructuredTXHeader,
-                <<"tx@1.0">>,
-                <<"structured@1.0">>,
-                Opts)}
+        case hb_ao:resolve(
+            <<
+                ?ARWEAVE_DEVICE/binary,
+                "/tx&tx=",
+                TXID/binary,
+                "&exclude-data=true"
+            >>,
+            Opts
+        ) of
+            {ok, StructuredTXHeader} ->
+                {ok,
+                    hb_message:convert(
+                        StructuredTXHeader,
+                        <<"tx@1.0">>,
+                        <<"structured@1.0">>,
+                        Opts)};
+            {error, _} ->
+                skip
+        end
     catch
         error:{badmatch, {ecdsa, secp256k1}} ->
-            % Skip for now (until HyperBEAM supports Arweave L1 TXs
-            % ith ECDSA signatures)
+            % Skip for now (until HyperBEAM supports Arweave L1 TXs
+            % with ECDSA signatures)
             skip
     end.
 
 %%% Tests
 
 index_ids_test() ->
@@ -219,15 +232,11 @@ index_ids_test() ->
     %% however we should still be able to index it (we just can't deserialize
     %% it).
     {_TestStore, StoreOpts, Opts} = setup_index_opts(),
-    {ok, 1827942} = hb_ao:resolve(
-        #{ <<"device">> => <<"copycat@1.0">> },
-        #{
-            <<"path">> => <<"arweave">>,
-            <<"from">> => <<"1827942">>,
-            <<"to">> => <<"1827942">>
-        },
-        Opts
-    ),
+    {ok, 1827942} =
+        hb_ao:resolve(
+            <<"~copycat@1.0/arweave&from=1827942&to=1827942">>,
+            Opts
+        ),
     % WbRAQbeyjPHgopBKyi0PLeKWvYZr3rgZvQ7QY3ASJS4 is a bundle signed with
     % an Ethereum signature which is not supported by HB as of Jan 2026.
     % The bundle should be indexed even though we can't deserialized the
@@ -302,15 +311,11 @@ bundle_header_index_test() ->
 
 index_ids_ecdsa_test() ->
     {_TestStore, _StoreOpts, Opts} = setup_index_opts(),
-    {ok, 1827904} = hb_ao:resolve(
-        #{ <<"device">> => <<"copycat@1.0">> },
-        #{
-            <<"path">> => <<"arweave">>,
-            <<"from">> => <<"1827904">>,
-            <<"to">> => <<"1827904">>
-        },
-        Opts
-    ),
+    {ok, 1827904} =
+        hb_ao:resolve(
+            <<"~copycat@1.0/arweave&from=1827904&to=1827904">>,
+            Opts
+        ),
     assert_bundle_read(
         Opts,
         <<"VNhX_pSANk_8j0jZBR5bh_5jr-lkfbHDjtHd8FKqx7U">>,
         [
             {<<"3xDKhrCQcPuBtcm1ipZS5C9gAfFYClgHuHOHAXGfchM">>, <<"1">>},
             {<<"JantC8f89VE-RidArHnU9589gY5T37NDXnWpI7H_psc">>, <<"7">>}
         ]
     ),
     ok.
@@ -381,4 +386,3 @@ assert_item_read(Opts, ItemID) ->
     ?assert(hb_message:verify(Item, all, Opts)),
     ?assertEqual(ItemID, hb_message:id(Item, signed)),
     Item.
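The cleanup above swaps message-map resolution for AO path-string resolution. Both calling styles appear in this series and are assumed equivalent here; a side-by-side sketch (function names are illustrative, not part of the patch):

```erlang
%% Message-map form, as used before the cleanup:
resolve_tx_by_map(TXID, Opts) ->
    hb_ao:resolve(
        #{ <<"device">> => <<"arweave@2.9-pre">> },
        #{ <<"path">> => <<"tx">>, <<"tx">> => TXID, <<"exclude-data">> => true },
        Opts
    ).

%% Path-string form, as used after the cleanup:
resolve_tx_by_path(TXID, Opts) ->
    hb_ao:resolve(
        <<"~arweave@2.9-pre/tx&tx=", TXID/binary, "&exclude-data=true">>,
        Opts
    ).
```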
- From 7ba7275c959707a48f623abbf5aab27ebc477d98 Mon Sep 17 00:00:00 2001 From: James Piechota Date: Tue, 27 Jan 2026 13:15:12 -0500 Subject: [PATCH 20/29] fix: allow RSA signatures of varying bitsize Rarely we find non-4096 bit RSA-signed transactions in the blockchain --- src/dev_codec_tx.erl | 52 +++++++++++++++++++++++++++++++------------- 1 file changed, 37 insertions(+), 15 deletions(-) diff --git a/src/dev_codec_tx.erl b/src/dev_codec_tx.erl index d68f76681..1cc85a760 100644 --- a/src/dev_codec_tx.erl +++ b/src/dev_codec_tx.erl @@ -197,7 +197,7 @@ enforce_valid_tx(TX) -> {invalid_field, anchor, TX#tx.anchor} ), hb_util:ok_or_throw(TX, - hb_util:check_size(TX#tx.owner, [byte_size(?DEFAULT_OWNER)]), + hb_util:check_type(TX#tx.owner, binary), {invalid_field, owner, TX#tx.owner} ), hb_util:ok_or_throw(TX, @@ -217,7 +217,7 @@ enforce_valid_tx(TX) -> {invalid_field, data_root, TX#tx.data_root} ), hb_util:ok_or_throw(TX, - hb_util:check_size(TX#tx.signature, [65, byte_size(?DEFAULT_SIG)]), + hb_util:check_type(TX#tx.signature, binary), {invalid_field, signature, TX#tx.signature} ), hb_util:ok_or_throw(TX, @@ -294,20 +294,14 @@ enforce_valid_tx_test() -> {unsigned_id_invalid_val, BaseTX#tx{unsigned_id = InvalidUnsignedID}, {invalid_field, unsigned_id, InvalidUnsignedID}}, {anchor_too_short_31, BaseTX#tx{anchor = BadID31}, {invalid_field, anchor, BadID31}}, {anchor_too_long_33, BaseTX#tx{anchor = BadID33}, {invalid_field, anchor, BadID33}}, - {owner_wrong_size, BaseTX#tx{owner = BadOwnerSize}, {invalid_field, owner, BadOwnerSize}}, - {owner_empty, BaseTX#tx{owner = <<>>}, {invalid_field, owner, <<>>}}, + {owner_wrong_type, BaseTX#tx{owner = "hello"}, {invalid_field, owner, "hello"}}, {target_too_short_31, BaseTX#tx{target = BadID31}, {invalid_field, target, BadID31}}, {target_too_long_33, BaseTX#tx{target = BadID33}, {invalid_field, target, BadID33}}, {quantity_not_integer, BaseTX#tx{quantity = <<"100">>}, {invalid_field, quantity, <<"100">>}}, {data_size_not_integer, BaseTX#tx{data_size = an_atom}, {invalid_field, data_size, an_atom}}, {data_root_too_short_31, BaseTX#tx{data_root = BadID31}, {invalid_field, data_root, BadID31}}, {data_root_too_long_33, BaseTX#tx{data_root = BadID33}, {invalid_field, data_root, BadID33}}, - {signature_invalid_size_1, BaseTX#tx{signature = SigInvalidSize1}, {invalid_field, signature, SigInvalidSize1}}, - {signature_invalid_size_64, BaseTX#tx{signature = SigInvalidSize64}, {invalid_field, signature, SigInvalidSize64}}, - {signature_invalid_size_66, BaseTX#tx{signature = SigInvalidSize66}, {invalid_field, signature, SigInvalidSize66}}, - {signature_invalid_size_511, BaseTX#tx{signature = SigInvalidSize511}, {invalid_field, signature, SigInvalidSize511}}, - {signature_too_long_513, BaseTX#tx{signature = SigTooLong513}, {invalid_field, signature, SigTooLong513}}, - {signature_empty, BaseTX#tx{signature = <<>>}, {invalid_field, signature, <<>>}}, + {signature_invalid_type, BaseTX#tx{signature = "hello"}, {invalid_field, signature, "hello"}}, {reward_not_integer, BaseTX#tx{reward = 1.0}, {invalid_field, reward, 1.0}}, {denomination_not_zero, BaseTX#tx{denomination = 1}, {invalid_field, denomination, 1}}, {signature_type_not_rsa, BaseTX#tx{signature_type = ?ECDSA_KEY_TYPE}, {invalid_field, signature_type, ?ECDSA_KEY_TYPE}}, @@ -786,6 +780,12 @@ real_ecdsa_single_item_bundle_tx_test_disabled() -> [] ). +real_2048_bit_rsa_tx_test() -> + do_real_tx_verify( + <<"tj76flZk936u0S2owyEzUFBvBAYle9Al5LH8zJ7icNc">>, + [<<"tj76flZk936u0S2owyEzUFBvBAYle9Al5LH8zJ7icNc">>] + ). 
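The size relationship motivating the relaxed owner and signature checks in this patch: an RSA signature (and modulus, i.e. the owner field) is KeyBits div 8 bytes wide, so the previous fixed-size checks only admitted the 4096-bit case. Illustrative helper, not part of the patch:

```erlang
%% Byte width of an RSA signature or modulus for a given key size.
rsa_sig_bytes(KeyBits) -> KeyBits div 8.

%% rsa_sig_bytes(4096) =:= 512,  the only size previously accepted;
%% rsa_sig_bytes(2048) =:= 256,  as in real_2048_bit_rsa_tx_test/0 above.
```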
+ real_no_data_tx_test() -> do_real_tx_verify( <<"N1Cyu67lQtmZMQlIZVFpNfy3xz6k9wEZ8LLeDbOebbk">>, @@ -825,24 +825,46 @@ do_real_tx_verify(TXID, ExpectedIDs) -> end, ?event(debug_test, {tx, {explicit, TX}}), ?assert(ar_tx:verify(TX)), - + StructuredTX = hb_message:convert( + TX, + <<"structured@1.0">>, + <<"tx@1.0">>, + Opts + ), + ?assert(hb_message:verify(StructuredTX, all, #{})), Deserialized = ar_bundles:deserialize(TX), ?event(debug_test, {deserialized}), + verify_items(Deserialized, ExpectedIDs, Opts). - verify_items(Deserialized, ExpectedIDs). - -verify_items(RootItem, ExpectedIDs) -> +verify_items(RootItem, ExpectedIDs, Opts) -> AllItems = flatten_items(RootItem), ?assertEqual(length(ExpectedIDs), length(AllItems)), [RootItem | NestedItems] = AllItems, [RootID | NestedIDs] = ExpectedIDs, + NormalizedRootItem = dev_arweave_common:normalize(RootItem), ?assert( - ar_tx:verify(dev_arweave_common:normalize(RootItem)), + ar_tx:verify(NormalizedRootItem), + hb_util:encode(RootItem#tx.id)), + StructuredRootItem = hb_message:convert( + NormalizedRootItem, + <<"structured@1.0">>, + <<"tx@1.0">>, + Opts + ), + ?assert(hb_message:verify(StructuredRootItem, all, Opts), hb_util:encode(RootItem#tx.id)), ?assertEqual(RootID, hb_util:encode(RootItem#tx.id)), lists:zipwith( fun(Item, ExpectedID) -> ?assert(ar_bundles:verify_item(Item), hb_util:encode(Item#tx.id)), + StructuredItem = hb_message:convert( + Item, + <<"structured@1.0">>, + <<"ans104@1.0">>, + Opts + ), + ?assert(hb_message:verify(StructuredItem, all, Opts), + hb_util:encode(Item#tx.id)), ?assertEqual(ExpectedID, hb_util:encode(Item#tx.id)) end, NestedItems, From cb101c87f04c17bd18cf8d285b68b304d0fe5632 Mon Sep 17 00:00:00 2001 From: James Piechota Date: Tue, 27 Jan 2026 15:36:08 -0500 Subject: [PATCH 21/29] fix: skip transactions with non-string tags --- src/dev_arweave_common.erl | 15 +++++++++++++++ src/dev_copycat_arweave.erl | 18 +++++++++++++++--- 2 files changed, 30 insertions(+), 3 deletions(-) diff --git a/src/dev_arweave_common.erl b/src/dev_arweave_common.erl index 1e984ec20..9f784f839 100644 --- a/src/dev_arweave_common.erl +++ b/src/dev_arweave_common.erl @@ -197,4 +197,19 @@ normalize_data_root(Item = #tx{data = Bin, format = 2}) Item#tx{data_root = ar_tx:data_root(arweavejs, Bin)}; normalize_data_root(Item) -> Item. +%%%=================================================================== +%%% Tests. +%%%=================================================================== + +tagfind_test() -> + Default = <<"default">>, + ?assertEqual( + <<"v1">>, + tagfind(<<"Foo">>, [{<<"fOo">>, <<"v1">>}], Default) + ), + ?assertEqual( + Default, + tagfind(<<"Missing">>, [{<<"foo">>, <<"v">>}], Default) + ). + diff --git a/src/dev_copycat_arweave.erl b/src/dev_copycat_arweave.erl index 4d7534104..e08cdb667 100644 --- a/src/dev_copycat_arweave.erl +++ b/src/dev_copycat_arweave.erl @@ -216,9 +216,15 @@ resolve_tx_header(TXID, Opts) -> skip end catch - error:{badmatch, {ecdsa, secp256k1}} -> - % Skip for now (until HyperBEAM supports Arweave L1 TXs - % with ECDSA signatures) + Class:Reason:_ -> + ?event( + copycat_short, + {arweave_tx_skipped, + {tx_id, {explicit, TXID}}, + {class, Class}, + {reason, Reason} + } + ), skip end. @@ -326,6 +332,12 @@ index_ids_ecdsa_test() -> ), ok. +non_string_tags_test() -> + {_TestStore, _StoreOpts, Opts} = setup_index_opts(), + ?assertEqual(skip, + resolve_tx_header(<<"752P6t4cOjMabYHqzC6hyLhxyo4YKZLblg7va_J21YE">>, Opts)), + ok. 
+ setup_index_opts() -> TestStore = hb_test_utils:test_store(), StoreOpts = #{ <<"index-store">> => [TestStore] }, From 59d9906b8384283a2ca92517022bc4c94ad4d632 Mon Sep 17 00:00:00 2001 From: James Piechota Date: Wed, 28 Jan 2026 19:23:36 +0000 Subject: [PATCH 22/29] impr: allow hb_http_client to retry on HTTP response --- src/dev_arweave.erl | 5 ++++- src/hb_http.erl | 16 ++++------------ src/hb_http_client.erl | 32 ++++++++++++++++++++++++++------ 3 files changed, 34 insertions(+), 19 deletions(-) diff --git a/src/dev_arweave.erl b/src/dev_arweave.erl index 9173b9bfa..0eac81db1 100644 --- a/src/dev_arweave.erl +++ b/src/dev_arweave.erl @@ -308,7 +308,10 @@ request(Method, Path, Opts) -> <<"path">> => <<"/arweave", Path/binary>>, <<"method">> => Method }, - Opts + Opts#{ + http_retry => 3, + http_retry_response => [error, failure] + } ), to_message(Path, Res, Opts). diff --git a/src/hb_http.erl b/src/hb_http.erl index 2f20d1764..9dc6acc12 100644 --- a/src/hb_http.erl +++ b/src/hb_http.erl @@ -180,7 +180,7 @@ request(Method, Peer, Path, RawMessage, Opts) -> ), case {Key, hb_maps:get(Key, Msg, undefined, Opts)} of {<<"body">>, undefined} -> - {response_status_to_atom(Status), <<>>}; + {hb_http_client:response_status_to_atom(Status), <<>>}; {_, undefined} -> {failure, << @@ -195,7 +195,7 @@ request(Method, Peer, Path, RawMessage, Opts) -> >> }; {_, Value} -> - {response_status_to_atom(Status), Value} + {hb_http_client:response_status_to_atom(Status), Value} end; false -> % Find the codec device from the headers, if set. @@ -215,14 +215,6 @@ request(Method, Peer, Path, RawMessage, Opts) -> ) end. -%% @doc Convert a HTTP status code to a status atom. -response_status_to_atom(Status) -> - case Status of - 201 -> created; - X when X < 400 -> ok; - X when X < 500 -> error; - _ -> failure - end. %% @doc Convert an HTTP response to a message. outbound_result_to_message(<<"ans104@1.0">>, Status, Headers, Body, Opts) -> @@ -233,7 +225,7 @@ outbound_result_to_message(<<"ans104@1.0">>, Status, Headers, Body, Opts) -> try ar_bundles:deserialize(Body) of Deserialized -> { - response_status_to_atom(Status), + hb_http_client:response_status_to_atom(Status), hb_message:convert( Deserialized, <<"structured@1.0">>, @@ -259,7 +251,7 @@ outbound_result_to_message(<<"ans104@1.0">>, Status, Headers, Body, Opts) -> outbound_result_to_message(<<"httpsig@1.0">>, Status, Headers, Body, Opts) -> ?event(http_outbound, {result_is_httpsig, {body, Body}}, Opts), { - response_status_to_atom(Status), + hb_http_client:response_status_to_atom(Status), http_response_to_httpsig(Status, Headers, Body, Opts) }. diff --git a/src/hb_http_client.erl b/src/hb_http_client.erl index 3611cc088..4ee9f008b 100644 --- a/src/hb_http_client.erl +++ b/src/hb_http_client.erl @@ -3,7 +3,7 @@ -module(hb_http_client). -behaviour(gen_server). -include("include/hb.hrl"). --export([start_link/1, request/2]). +-export([start_link/1, response_status_to_atom/1, request/2]). -export([init/1, handle_cast/2, handle_call/3, handle_info/2, terminate/2]). -record(state, { @@ -24,12 +24,28 @@ start_link(Opts) -> gen_server:start_link({local, ?MODULE}, ?MODULE, Opts, []). +%% @doc Convert a HTTP status code to a status atom. +response_status_to_atom(Status) -> + case Status of + 201 -> created; + X when X < 400 -> ok; + X when X < 500 -> error; + _ -> failure + end. + request(Args, Opts) -> request(Args, hb_opts:get(http_retry, ?DEFAULT_RETRIES, Opts), Opts). 
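To show how a caller opts into the new status-based retries: a response's HTTP status is first mapped to an atom (`created` for 201, `ok` below 400, `error` below 500, `failure` otherwise), and a retry is attempted only when that atom is listed in `http_retry_response`. A sketch mirroring the `dev_arweave:request/3` change above; `info_with_retries/1` is an illustrative name, not part of the patch:

```erlang
%% Opting a single request into retries on 4xx/5xx responses.
info_with_retries(Opts) ->
    RetryOpts = Opts#{
        http_retry => 3,                        % total retry budget
        http_retry_response => [error, failure] % status atoms worth retrying
    },
    hb_http:get(
        hb_opts:get(gateway, not_found, RetryOpts),
        #{ <<"path">> => <<"/info">> },
        RetryOpts
    ).
```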
request(Args, RemainingRetries, Opts) -> - case do_request(Args, Opts) of - {error, Details} -> maybe_retry(RemainingRetries, Args, Details, Opts); - {ok, Status, Headers, Body} -> {ok, Status, Headers, Body} + Response = do_request(Args, Opts), + case Response of + {error, _Details} -> maybe_retry(RemainingRetries, Args, Response, Opts); + {ok, Status, _Headers, _Body} -> + StatusAtom = response_status_to_atom(Status), + RetryResponses = hb_opts:get(http_retry_response, [], Opts), + case lists:member(StatusAtom, RetryResponses) of + true -> maybe_retry(RemainingRetries, Args, Response, Opts); + false -> Response + end end. do_request(Args, Opts) -> @@ -38,8 +54,8 @@ do_request(Args, Opts) -> httpc -> httpc_req(Args, Opts) end. -maybe_retry(0, _, ErrDetails, _) -> {error, ErrDetails}; -maybe_retry(Remaining, Args, ErrDetails, Opts) -> +maybe_retry(0, _, OriginalResponse, _) -> OriginalResponse; +maybe_retry(Remaining, Args, OriginalResponse, Opts) -> RetryBaseTime = hb_opts:get(http_retry_time, ?DEFAULT_RETRY_TIME, Opts), RetryTime = case hb_opts:get(http_retry_mode, backoff, Opts) of @@ -48,6 +64,10 @@ maybe_retry(Remaining, Args, ErrDetails, Opts) -> BaseRetries = hb_opts:get(http_retry, ?DEFAULT_RETRIES, Opts), RetryBaseTime * (1 + (BaseRetries - Remaining)) end, + ErrDetails = case OriginalResponse of + {error, Details} -> Details; + {ok, Status, _, _} -> Status + end, ?event( warning, {retrying_http_request, From dbb746031721102a436199a4f88a576fc3aaa05b Mon Sep 17 00:00:00 2001 From: James Piechota Date: Wed, 28 Jan 2026 21:29:49 +0000 Subject: [PATCH 23/29] fix: track and log skipped transactions while indexing --- src/dev_copycat_arweave.erl | 111 +++++++++++++++++++++--------------- 1 file changed, 66 insertions(+), 45 deletions(-) diff --git a/src/dev_copycat_arweave.erl b/src/dev_copycat_arweave.erl index e08cdb667..655762ed4 100644 --- a/src/dev_copycat_arweave.erl +++ b/src/dev_copycat_arweave.erl @@ -63,12 +63,13 @@ fetch_blocks(Req, Current, To, Opts) -> process_block(BlockRes, _Req, Current, To, Opts) -> case BlockRes of {ok, Block} -> - IndexedItems = maybe_index_ids(Block, Opts), + {IndexedItems, SkippedTxs} = maybe_index_ids(Block, Opts), ?event( copycat_short, {arweave_block_cached, {height, Current}, {indexed_items, IndexedItems}, + {skipped_txs, SkippedTxs}, {target, To} } ); @@ -85,7 +86,7 @@ process_block(BlockRes, _Req, Current, To, Opts) -> %% @doc Index the IDs of all transactions in the block if configured to do so. 
maybe_index_ids(Block, Opts) -> case hb_opts:get(arweave_index_ids, false, Opts) of - false -> 0; + false -> {0, 0}; true -> IndexStore = hb_opts:get(arweave_index_store, no_store, Opts), BlockEndOffset = hb_util:int( @@ -93,48 +94,61 @@ maybe_index_ids(Block, Opts) -> BlockSize = hb_util:int( hb_maps:get(<<"block_size">>, Block, 0, Opts)), BlockStartOffset = BlockEndOffset - BlockSize, - TXs = resolve_tx_headers(hb_maps:get(<<"txs">>, Block, [], Opts), Opts), + {TXs, SkippedFromHeaders} = resolve_tx_headers(hb_maps:get(<<"txs">>, Block, [], Opts), Opts), Height = hb_maps:get(<<"height">>, Block, 0, Opts), TXsWithData = ar_block:generate_size_tagged_list_from_txs(TXs, Height), - lists:sum(lists:map(fun - ({{padding, _PaddingRoot}, _EndOffset}) -> - 0; - ({{TX, _TXDataRoot}, EndOffset}) -> + {IndexedItems, SkippedFromBundles} = lists:foldl(fun + ({{padding, _PaddingRoot}, _EndOffset}, {ItemsAcc, SkippedAcc}) -> + {ItemsAcc, SkippedAcc}; + ({{TX, _TXDataRoot}, EndOffset}, {ItemsAcc, SkippedAcc}) -> case is_bundle_tx(TX, Opts) of - false -> 0; + false -> {ItemsAcc, SkippedAcc}; true -> + TXID = hb_util:encode(TX#tx.id), TXEndOffset = BlockStartOffset + EndOffset, TXStartOffset = TXEndOffset - TX#tx.data_size, hb_store_arweave:write_offset( IndexStore, - hb_util:encode(TX#tx.id), + TXID, true, TXStartOffset, TX#tx.data_size ), - {ok, {BundleIndex, HeaderSize}} = - download_bundle_header( - TXEndOffset, TX#tx.data_size, Opts - ), - lists:foldl( - fun({ItemID, Size}, ItemStartOffset) -> - hb_store_arweave:write_offset( - IndexStore, - hb_util:encode(ItemID), - false, - ItemStartOffset, - Size + case download_bundle_header( + TXEndOffset, TX#tx.data_size, Opts + ) of + {ok, {BundleIndex, HeaderSize}} -> + _ = lists:foldl( + fun({ItemID, Size}, ItemStartOffset) -> + hb_store_arweave:write_offset( + IndexStore, + hb_util:encode(ItemID), + false, + ItemStartOffset, + Size + ), + ItemStartOffset + Size + end, + TXStartOffset + HeaderSize, + BundleIndex ), - ItemStartOffset + Size - end, - TXStartOffset + HeaderSize, - BundleIndex - ), - length(BundleIndex) + {ItemsAcc + length(BundleIndex), SkippedAcc}; + {error, Reason} -> + ?event( + copycat_short, + {arweave_bundle_skipped, + {tx_id, {explicit, TXID}}, + {reason, Reason} + } + ), + {ItemsAcc, SkippedAcc + 1} + end end end, + {0, 0}, TXsWithData - )) + ), + {IndexedItems, SkippedFromHeaders + SkippedFromBundles} end. is_bundle_tx(TX, _Opts) -> @@ -155,20 +169,23 @@ download_bundle_header(EndOffset, Size, Opts) -> % thousands of items might require multiple chunks to fully % represent the item index. HeaderSize = ar_bundles:bundle_header_size(FirstChunk), - {ok, BundleHeader} = - header_chunk(HeaderSize, FirstChunk, StartOffset, Opts), - {_ItemsBin, BundleIndex} = - ar_bundles:decode_bundle_header(BundleHeader), - {ok, {BundleIndex, HeaderSize}}; - {error, _} -> - {ok, {[], 0}} + case header_chunk(HeaderSize, FirstChunk, StartOffset, Opts) of + {ok, BundleHeader} -> + {_ItemsBin, BundleIndex} = + ar_bundles:decode_bundle_header(BundleHeader), + {ok, {BundleIndex, HeaderSize}}; + Error -> + Error + end; + Error -> + Error end. 
header_chunk(HeaderSize, FirstChunk, _StartOffset, _Opts) when HeaderSize =< byte_size(FirstChunk) -> {ok, FirstChunk}; header_chunk(HeaderSize, _FirstChunk, StartOffset, Opts) -> - case hb_ao:resolve( + hb_ao:resolve( << ?ARWEAVE_DEVICE/binary, "/chunk&offset=", @@ -177,20 +194,17 @@ header_chunk(HeaderSize, _FirstChunk, StartOffset, Opts) -> (hb_util:bin(HeaderSize))/binary >>, Opts - ) of - {ok, HeaderChunk} -> {ok, HeaderChunk}; - {error, _} -> {ok, <<>>} - end. + ). resolve_tx_headers(TXIDs, Opts) -> lists:foldr( - fun(TXID, Acc) -> + fun(TXID, {Acc, SkippedAcc}) -> case resolve_tx_header(TXID, Opts) of - {ok, TX} -> [TX | Acc]; - skip -> Acc + {ok, TX} -> {[TX | Acc], SkippedAcc}; + skip -> {Acc, SkippedAcc + 1} end end, - [], + {[], 0}, TXIDs ). @@ -212,7 +226,14 @@ resolve_tx_header(TXID, Opts) -> <<"tx@1.0">>, <<"structured@1.0">>, Opts)}; - {error, _} -> + {error, ResolveError} -> + ?event( + copycat_short, + {arweave_tx_skipped, + {tx_id, {explicit, TXID}}, + {reason, ResolveError} + } + ), skip end catch From 1742301c8820ee1d91149055a343cca444ecb5b0 Mon Sep 17 00:00:00 2001 From: James Piechota Date: Thu, 29 Jan 2026 00:44:51 +0000 Subject: [PATCH 24/29] fix: don't set the arweave index options in the default hb_opts An example config.json: ``` { "arweave_index_ids": true, "arweave_index_store": { "index-store": [ { "store-module": "hb_store_lmdb", "name": "cache-mainnet/lmdb", "ao-types": "store-module=\"atom\"" } ] } } ``` --- src/dev_copycat_arweave.erl | 4 ++-- src/hb_opts.erl | 4 +--- src/hb_store_arweave.erl | 2 +- 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/src/dev_copycat_arweave.erl b/src/dev_copycat_arweave.erl index 655762ed4..71ed5602f 100644 --- a/src/dev_copycat_arweave.erl +++ b/src/dev_copycat_arweave.erl @@ -107,7 +107,7 @@ maybe_index_ids(Block, Opts) -> TXID = hb_util:encode(TX#tx.id), TXEndOffset = BlockStartOffset + EndOffset, TXStartOffset = TXEndOffset - TX#tx.data_size, - hb_store_arweave:write_offset( + ok = hb_store_arweave:write_offset( IndexStore, TXID, true, @@ -120,7 +120,7 @@ maybe_index_ids(Block, Opts) -> {ok, {BundleIndex, HeaderSize}} -> _ = lists:foldl( fun({ItemID, Size}, ItemStartOffset) -> - hb_store_arweave:write_offset( + ok = hb_store_arweave:write_offset( IndexStore, hb_util:encode(ItemID), false, diff --git a/src/hb_opts.erl b/src/hb_opts.erl index 64fc34ec2..b334bb7e5 100644 --- a/src/hb_opts.erl +++ b/src/hb_opts.erl @@ -403,9 +403,7 @@ default_message() -> % Options: % - fallback: Fallback to the index page % - error: Return 404 Not Found - manifest_404 => fallback, - arweave_index_ids => true, - arweave_index_store => #{ <<"index-store">> => [?DEFAULT_PRIMARY_STORE] } + manifest_404 => fallback }. %% @doc Get an option from the global options, optionally overriding with a diff --git a/src/hb_store_arweave.erl b/src/hb_store_arweave.erl index 8dfbf087b..9720d121a 100644 --- a/src/hb_store_arweave.erl +++ b/src/hb_store_arweave.erl @@ -15,7 +15,7 @@ %% multiple times. type(StoreOpts, Key) -> case read(StoreOpts, Key) of - not_found -> not_found; + {error, not_found} -> not_found; {ok, _Data} -> % TODO: % - should this return composite for any index L1 bundles? 
From 62f107161dca68d1cca2516e4e5fa808f00c620c Mon Sep 17 00:00:00 2001 From: James Piechota Date: Thu, 29 Jan 2026 08:59:15 -0500 Subject: [PATCH 25/29] fix: allow hb_http:request to handle an {error, _} response --- src/hb_http.erl | 56 +++++++++++++++++++++++++++++++++++-------------- 1 file changed, 40 insertions(+), 16 deletions(-) diff --git a/src/hb_http.erl b/src/hb_http.erl index 9dc6acc12..8f7f3cc2e 100644 --- a/src/hb_http.erl +++ b/src/hb_http.erl @@ -104,23 +104,34 @@ request(Method, Peer, Path, RawMessage, Opts) -> ), StartTime = os:system_time(millisecond), % Perform the HTTP request. - {_ErlStatus, Status, Headers, Body} = hb_http_client:request(Req, Opts), + Response = hb_http_client:request(Req, Opts), % Process the response. EndTime = os:system_time(millisecond), - ?event(http_outbound, - { - http_response, - {req, Req}, - {response, - #{ - status => Status, - headers => Headers, - body => Body - } - } - }, - Opts - ), + Duration = EndTime - StartTime, + case Response of + {_ErlStatus, Status, Headers, Body} -> + ?event(http_outbound, + { + http_response, + {req, Req}, + {response, + #{ + status => Status, + headers => Headers, + body => Body + } + } + }, + Opts + ), + request_response(Method, Peer, Path, Response, Duration, Opts); + Error -> + Error + end. + + +request_response(Method, Peer, Path, Response, Duration, Opts) -> + {_ErlStatus, Status, Headers, Body} = Response, % Convert the set-cookie headers into a cookie message, if they are present. % We do this by extracting the set-cookie headers and converting them into a % cookie message if they are present. @@ -161,7 +172,7 @@ request(Method, Peer, Path, RawMessage, Opts) -> ?event(http_short, {received, {status, Status}, - {duration, EndTime - StartTime}, + {duration, Duration}, {method, Method}, {peer, Peer}, {path, {string, Path}}, @@ -1246,3 +1257,16 @@ index_request_test() -> #{} ), ?assertEqual(<<"i like dogs!">>, hb_ao:get(<<"body">>, Res, #{})). + +request_error_handling_test() -> + Opts = #{}, + NonExistentDomain = <<"http://nonexistent.invalid:80">>, + Result = hb_http:request( + <<"GET">>, + NonExistentDomain, + <<"/">>, + #{}, + Opts + ), + % The result should be an error tuple, not crash with badmatch + ?assertMatch({error, _}, Result). From e94951b2047691bcafea3fdbcff1457b5f45758b Mon Sep 17 00:00:00 2001 From: James Piechota Date: Thu, 29 Jan 2026 14:52:08 -0500 Subject: [PATCH 26/29] wip: add some performance metrics (may remove later) enable `copycat_perf` to see metrics logged also written to `copycat_perf.csv` --- src/dev_copycat_arweave.erl | 396 +++++++++++++++++++++++++++--------- 1 file changed, 297 insertions(+), 99 deletions(-) diff --git a/src/dev_copycat_arweave.erl b/src/dev_copycat_arweave.erl index 71ed5602f..1cd1dae7a 100644 --- a/src/dev_copycat_arweave.erl +++ b/src/dev_copycat_arweave.erl @@ -11,6 +11,22 @@ -define(ARWEAVE_DEVICE, <<"~arweave@2.9-pre">>). +-record(perf, { + start_time = 0, %% monotonic start time (microseconds) + blocks_processed = 0, %% count (total) + resolve_block_time = 0, %% microseconds + resolve_tx_count = 0, %% count (total) + resolve_tx_time = 0, %% microseconds + items_indexed = 0, %% count (total) + write_offset_time = 0, %% microseconds + bundles_processed = 0, %% count (total) + download_bundle_header_time = 0, %% microseconds + last_block_items_indexed = 0, %% count (per-block) + last_block_total_txs = 0, %% count (per-block) + last_block_bundle_txs = 0, %% count (per-block) + last_block_skipped_txs = 0 %% count (per-block) +}). 
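With the `hb_http:request` change in [PATCH 25/29] above, transport-level failures now surface to the caller as an `{error, Reason}` tuple instead of crashing on a badmatch, as `request_error_handling_test/0` exercises. A minimal caller sketch; `probe_peer/2` is a hypothetical name:

```erlang
%% Branch on the transport error rather than crashing.
probe_peer(Peer, Opts) ->
    case hb_http:request(<<"GET">>, Peer, <<"/">>, #{}, Opts) of
        {error, Reason} -> {unreachable, Reason};
        {Status, Msg} -> {Status, Msg} % ok/created/error/failure + message
    end.
```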
+ % GET /~cron@1.0/once&cron-path=~copycat@1.0/arweave %% @doc Fetch blocks from an Arweave node between a given range, or from the @@ -18,7 +34,8 @@ %% fetch blocks from the latest known block towards the Genesis block. arweave(_Base, Request, Opts) -> {From, To} = parse_range(Request, Opts), - fetch_blocks(Request, From, To, Opts). + Perf = perf_init(), + fetch_blocks(Request, From, To, Perf, Opts). %% @doc Parse the range from the request. parse_range(Request, Opts) -> @@ -37,8 +54,123 @@ parse_range(Request, Opts) -> To = hb_maps:get(<<"to">>, Request, 0, Opts), {hb_util:int(From), hb_util:int(To)}. +%% @doc Initialize performance tracker at start of run. +perf_init() -> + #perf{start_time = erlang:monotonic_time(microsecond)}. + +%% @doc Calculate total elapsed time in microseconds. +perf_elapsed(#perf{start_time = Start}) -> + erlang:monotonic_time(microsecond) - Start. + +%% @doc Generic timing wrapper - updates perf record with timing data. +perf_time(Fun, Perf, TimeField) -> + {Time, Result} = timer:tc(Fun), + NewPerf = update_perf(Perf, TimeField, Time), + {Result, NewPerf}. + +%% @doc Update perf record with new timing data. +update_perf(Perf, resolve_block_time, Time) -> + Perf#perf{ + resolve_block_time = Perf#perf.resolve_block_time + Time + }; +update_perf(Perf, resolve_tx_time, Time) -> + Perf#perf{ + resolve_tx_time = Perf#perf.resolve_tx_time + Time, + resolve_tx_count = Perf#perf.resolve_tx_count + 1 + }; +update_perf(Perf, write_offset_time, Time) -> + Perf#perf{ + write_offset_time = Perf#perf.write_offset_time + Time + }; +update_perf(Perf, download_bundle_header_time, Time) -> + Perf#perf{ + download_bundle_header_time = Perf#perf.download_bundle_header_time + Time, + bundles_processed = Perf#perf.bundles_processed + 1 + }. + +%% @doc Calculate percentage of elapsed time that an operation took. +perf_pct(_OperationTime, 0) -> 0.0; +perf_pct(OperationTime, ElapsedTime) -> + (OperationTime / ElapsedTime) * 100.0. + +%% @doc Convert perf record to map for event logging (running totals). +perf_to_stats(Perf) -> + Elapsed = perf_elapsed(Perf), + #{ + blocks_processed => Perf#perf.blocks_processed, + resolve_block_time_us => Perf#perf.resolve_block_time, + resolve_block_pct => perf_pct(Perf#perf.resolve_block_time, Elapsed), + resolve_tx_count => Perf#perf.resolve_tx_count, + resolve_tx_time_us => Perf#perf.resolve_tx_time, + resolve_tx_pct => perf_pct(Perf#perf.resolve_tx_time, Elapsed), + items_indexed => Perf#perf.items_indexed, + write_offset_time_us => Perf#perf.write_offset_time, + write_offset_pct => perf_pct(Perf#perf.write_offset_time, Elapsed), + bundles_processed => Perf#perf.bundles_processed, + download_bundle_header_time_us => Perf#perf.download_bundle_header_time, + download_bundle_header_pct => perf_pct(Perf#perf.download_bundle_header_time, Elapsed), + elapsed_us => Elapsed + }. + +%% @doc Get CSV file path from options or use default. +perf_csv_path(Opts) -> + hb_opts:get(arweave_perf_csv_path, <<"copycat_perf.csv">>, Opts). + +%% @doc Write CSV header if file doesn't exist. +perf_csv_write_header(FilePath) -> + try + case filelib:is_file(FilePath) of + false -> + Header = "height,blocks_processed,resolve_block_time_us,resolve_block_pct," + "resolve_tx_count,resolve_tx_time_us,resolve_tx_pct," + "items_indexed,write_offset_time_us,write_offset_pct," + "bundles_processed,download_bundle_header_time_us,download_bundle_header_pct," + "elapsed_us\n", + file:write_file(FilePath, Header, [write]); + true -> + ok + end + catch + _:_ -> ok + end. 
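A worked example of the percentage math in `perf_pct/2` above, with all times in microseconds:

```erlang
%% 1.2s of resolve_block time inside a 4.8s elapsed run:
%%   perf_pct(1200000, 4800000) =:= 25.0,
%% while the first clause guards a fresh tracker against division by zero:
%%   perf_pct(0, 0) =:= 0.0.
```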
+ +%% @doc Format perf stats as CSV row. +perf_csv_format_row(Height, PerfStats) -> + BlocksProcessed = maps:get(blocks_processed, PerfStats), + ResolveBlockTime = maps:get(resolve_block_time_us, PerfStats), + ResolveBlockPct = maps:get(resolve_block_pct, PerfStats), + ResolveTxCount = maps:get(resolve_tx_count, PerfStats), + ResolveTxTime = maps:get(resolve_tx_time_us, PerfStats), + ResolveTxPct = maps:get(resolve_tx_pct, PerfStats), + ItemsIndexed = maps:get(items_indexed, PerfStats), + WriteOffsetTime = maps:get(write_offset_time_us, PerfStats), + WriteOffsetPct = maps:get(write_offset_pct, PerfStats), + BundlesProcessed = maps:get(bundles_processed, PerfStats), + DownloadBundleHeaderTime = maps:get(download_bundle_header_time_us, PerfStats), + DownloadBundleHeaderPct = maps:get(download_bundle_header_pct, PerfStats), + Elapsed = maps:get(elapsed_us, PerfStats), + io_lib:format("~w,~w,~w,~.2f,~w,~w,~.2f,~w,~w,~.2f,~w,~w,~.2f,~w\n", + [Height, BlocksProcessed, ResolveBlockTime, ResolveBlockPct, + ResolveTxCount, ResolveTxTime, ResolveTxPct, + ItemsIndexed, WriteOffsetTime, WriteOffsetPct, + BundlesProcessed, DownloadBundleHeaderTime, DownloadBundleHeaderPct, + Elapsed]). + +%% @doc Write perf stats to CSV file. +perf_csv_write(Height, PerfStats, Opts) -> + try + FilePath = perf_csv_path(Opts), + perf_csv_write_header(FilePath), + RowIoList = perf_csv_format_row(Height, PerfStats), + Row = iolist_to_binary(RowIoList), + file:write_file(FilePath, Row, [append]) + catch + _:_ -> ok + end. + + %% @doc Fetch blocks from an Arweave node between a given range. -fetch_blocks(Req, Current, To, _Opts) when Current < To -> +fetch_blocks(Req, Current, To, _Perf, _Opts) when Current < To -> ?event(copycat_arweave, {arweave_block_indexing_completed, {reached_target, To}, @@ -46,33 +178,51 @@ fetch_blocks(Req, Current, To, _Opts) when Current < To -> } ), {ok, To}; -fetch_blocks(Req, Current, To, Opts) -> - BlockRes = - hb_ao:resolve( - << - ?ARWEAVE_DEVICE/binary, - "/block=", - (hb_util:bin(Current))/binary - >>, - Opts - ), - process_block(BlockRes, Req, Current, To, Opts), - fetch_blocks(Req, Current - 1, To, Opts). +fetch_blocks(Req, Current, To, Perf, Opts) -> + {BlockRes, Perf1} = perf_time( + fun() -> + hb_ao:resolve( + << + ?ARWEAVE_DEVICE/binary, + "/block=", + (hb_util:bin(Current))/binary + >>, + Opts + ) + end, + Perf, + resolve_block_time + ), + Perf2 = Perf1#perf{blocks_processed = Perf1#perf.blocks_processed + 1}, + Perf3 = process_block(BlockRes, Req, Current, To, Perf2, Opts), + fetch_blocks(Req, Current - 1, To, Perf3, Opts). %% @doc Process a block. 
-process_block(BlockRes, _Req, Current, To, Opts) -> +process_block(BlockRes, _Req, Current, To, PerfAfterBlock, Opts) -> case BlockRes of {ok, Block} -> - {IndexedItems, SkippedTxs} = maybe_index_ids(Block, Opts), + PerfAfterIndex = maybe_index_ids(Block, PerfAfterBlock, Opts), + PerfStats = perf_to_stats(PerfAfterIndex), ?event( copycat_short, {arweave_block_cached, {height, Current}, - {indexed_items, IndexedItems}, - {skipped_txs, SkippedTxs}, + {items_indexed, PerfAfterIndex#perf.last_block_items_indexed}, + {total_txs, PerfAfterIndex#perf.last_block_total_txs}, + {bundle_txs, PerfAfterIndex#perf.last_block_bundle_txs}, + {skipped_txs, PerfAfterIndex#perf.last_block_skipped_txs}, {target, To} } - ); + ), + ?event( + copycat_perf, + {arweave_block_perf, + {height, Current}, + {perf, PerfStats} + } + ), + perf_csv_write(Current, PerfStats, Opts), + PerfAfterIndex; {error, _} = Error -> ?event( copycat_short, @@ -80,13 +230,20 @@ process_block(BlockRes, _Req, Current, To, Opts) -> {height, Current}, {target, To}, {reason, Error}} - ) + ), + PerfAfterBlock end. %% @doc Index the IDs of all transactions in the block if configured to do so. -maybe_index_ids(Block, Opts) -> +maybe_index_ids(Block, Perf, Opts) -> + TotalTXs = length(hb_maps:get(<<"txs">>, Block, [], Opts)), case hb_opts:get(arweave_index_ids, false, Opts) of - false -> {0, 0}; + false -> Perf#perf{ + last_block_items_indexed = 0, + last_block_total_txs = TotalTXs, + last_block_bundle_txs = 0, + last_block_skipped_txs = 0 + }; true -> IndexStore = hb_opts:get(arweave_index_store, no_store, Opts), BlockEndOffset = hb_util:int( @@ -94,45 +251,61 @@ maybe_index_ids(Block, Opts) -> BlockSize = hb_util:int( hb_maps:get(<<"block_size">>, Block, 0, Opts)), BlockStartOffset = BlockEndOffset - BlockSize, - {TXs, SkippedFromHeaders} = resolve_tx_headers(hb_maps:get(<<"txs">>, Block, [], Opts), Opts), + {TXs, SkippedFromHeaders, Perf1} = resolve_tx_headers(hb_maps:get(<<"txs">>, Block, [], Opts), Perf, Opts), + Perf2 = Perf1, Height = hb_maps:get(<<"height">>, Block, 0, Opts), TXsWithData = ar_block:generate_size_tagged_list_from_txs(TXs, Height), - {IndexedItems, SkippedFromBundles} = lists:foldl(fun - ({{padding, _PaddingRoot}, _EndOffset}, {ItemsAcc, SkippedAcc}) -> - {ItemsAcc, SkippedAcc}; - ({{TX, _TXDataRoot}, EndOffset}, {ItemsAcc, SkippedAcc}) -> + ItemsBefore = Perf2#perf.items_indexed, + {Perf3, BundleTXs, SkippedFromBundles} = lists:foldl(fun + ({{padding, _PaddingRoot}, _EndOffset}, {PerfAcc, BundleAcc, SkippedAcc}) -> + {PerfAcc, BundleAcc, SkippedAcc}; + ({{TX, _TXDataRoot}, EndOffset}, {PerfAcc, BundleAcc, SkippedAcc}) -> case is_bundle_tx(TX, Opts) of - false -> {ItemsAcc, SkippedAcc}; + false -> {PerfAcc, BundleAcc, SkippedAcc}; true -> TXID = hb_util:encode(TX#tx.id), TXEndOffset = BlockStartOffset + EndOffset, TXStartOffset = TXEndOffset - TX#tx.data_size, - ok = hb_store_arweave:write_offset( - IndexStore, - TXID, - true, - TXStartOffset, - TX#tx.data_size + {ok, PerfAfterWrite} = perf_time( + fun() -> + hb_store_arweave:write_offset( + IndexStore, + TXID, + true, + TXStartOffset, + TX#tx.data_size + ) + end, + PerfAcc, + write_offset_time ), - case download_bundle_header( - TXEndOffset, TX#tx.data_size, Opts - ) of + {BundleRes, Perf4} = download_bundle_header( + TXEndOffset, TX#tx.data_size, PerfAfterWrite, Opts + ), + case BundleRes of {ok, {BundleIndex, HeaderSize}} -> - _ = lists:foldl( - fun({ItemID, Size}, ItemStartOffset) -> - ok = hb_store_arweave:write_offset( - IndexStore, - hb_util:encode(ItemID), - 
false, - ItemStartOffset, - Size + {_, Perf5} = lists:foldl( + fun({ItemID, Size}, {ItemStartOffset, PerfFold}) -> + {ok, PerfUpdated} = perf_time( + fun() -> + hb_store_arweave:write_offset( + IndexStore, + hb_util:encode(ItemID), + false, + ItemStartOffset, + Size + ) + end, + PerfFold, + write_offset_time ), - ItemStartOffset + Size + {ItemStartOffset + Size, PerfUpdated} end, - TXStartOffset + HeaderSize, + {TXStartOffset + HeaderSize, Perf4}, BundleIndex ), - {ItemsAcc + length(BundleIndex), SkippedAcc}; + Perf6 = Perf5#perf{items_indexed = Perf5#perf.items_indexed + length(BundleIndex)}, + {Perf6, BundleAcc + 1, SkippedAcc}; {error, Reason} -> ?event( copycat_short, @@ -141,45 +314,60 @@ maybe_index_ids(Block, Opts) -> {reason, Reason} } ), - {ItemsAcc, SkippedAcc + 1} + {Perf4, BundleAcc + 1, SkippedAcc + 1} end end end, - {0, 0}, + {Perf2, 0, 0}, TXsWithData ), - {IndexedItems, SkippedFromHeaders + SkippedFromBundles} + ItemsAfter = Perf3#perf.items_indexed, + ItemsIndexed = ItemsAfter - ItemsBefore, + SkippedTXs = SkippedFromHeaders + SkippedFromBundles, + Perf3#perf{ + last_block_items_indexed = ItemsIndexed, + last_block_total_txs = TotalTXs, + last_block_bundle_txs = BundleTXs, + last_block_skipped_txs = SkippedTXs + } end. is_bundle_tx(TX, _Opts) -> dev_arweave_common:type(TX) =/= binary. -download_bundle_header(EndOffset, Size, Opts) -> - StartOffset = EndOffset - Size + 1, - case hb_ao:resolve( - << - ?ARWEAVE_DEVICE/binary, - "/chunk&offset=", - (hb_util:bin(StartOffset))/binary - >>, - Opts - ) of - {ok, FirstChunk} -> - % Most bundle headers can fit in a single chunk, but those with - % thousands of items might require multiple chunks to fully - % represent the item index. - HeaderSize = ar_bundles:bundle_header_size(FirstChunk), - case header_chunk(HeaderSize, FirstChunk, StartOffset, Opts) of - {ok, BundleHeader} -> - {_ItemsBin, BundleIndex} = - ar_bundles:decode_bundle_header(BundleHeader), - {ok, {BundleIndex, HeaderSize}}; +download_bundle_header(EndOffset, Size, Perf, Opts) -> + {Result, Perf1} = perf_time( + fun() -> + StartOffset = EndOffset - Size + 1, + case hb_ao:resolve( + << + ?ARWEAVE_DEVICE/binary, + "/chunk&offset=", + (hb_util:bin(StartOffset))/binary + >>, + Opts + ) of + {ok, FirstChunk} -> + % Most bundle headers can fit in a single chunk, but those with + % thousands of items might require multiple chunks to fully + % represent the item index. + HeaderSize = ar_bundles:bundle_header_size(FirstChunk), + case header_chunk(HeaderSize, FirstChunk, StartOffset, Opts) of + {ok, BundleHeader} -> + {_ItemsBin, BundleIndex} = + ar_bundles:decode_bundle_header(BundleHeader), + {ok, {BundleIndex, HeaderSize}}; + Error -> + Error + end; Error -> Error - end; - Error -> - Error - end. + end + end, + Perf, + download_bundle_header_time + ), + {Result, Perf1}. header_chunk(HeaderSize, FirstChunk, _StartOffset, _Opts) when HeaderSize =< byte_size(FirstChunk) -> @@ -196,36 +384,44 @@ header_chunk(HeaderSize, _FirstChunk, StartOffset, Opts) -> Opts ). -resolve_tx_headers(TXIDs, Opts) -> +resolve_tx_headers(TXIDs, Perf, Opts) -> lists:foldr( - fun(TXID, {Acc, SkippedAcc}) -> - case resolve_tx_header(TXID, Opts) of - {ok, TX} -> {[TX | Acc], SkippedAcc}; - skip -> {Acc, SkippedAcc + 1} + fun(TXID, {Acc, SkippedAcc, PerfAcc}) -> + {Res, PerfUpdated} = resolve_tx_header(TXID, PerfAcc, Opts), + case Res of + {ok, TX} -> {[TX | Acc], SkippedAcc, PerfUpdated}; + skip -> {Acc, SkippedAcc + 1, PerfUpdated} end end, - {[], 0}, + {[], 0, Perf}, TXIDs ). 
-resolve_tx_header(TXID, Opts) -> +resolve_tx_header(TXID, Perf, Opts) -> try - case hb_ao:resolve( - << - ?ARWEAVE_DEVICE/binary, - "/tx&tx=", - TXID/binary, - "&exclude-data=true" - >>, - Opts - ) of + {ResolveRes, Perf1} = perf_time( + fun() -> + hb_ao:resolve( + << + ?ARWEAVE_DEVICE/binary, + "/tx&tx=", + TXID/binary, + "&exclude-data=true" + >>, + Opts + ) + end, + Perf, + resolve_tx_time + ), + case ResolveRes of {ok, StructuredTXHeader} -> - {ok, + {{ok, hb_message:convert( StructuredTXHeader, <<"tx@1.0">>, <<"structured@1.0">>, - Opts)}; + Opts)}, Perf1}; {error, ResolveError} -> ?event( copycat_short, @@ -234,7 +430,7 @@ resolve_tx_header(TXID, Opts) -> {reason, ResolveError} } ), - skip + {skip, Perf1} end catch Class:Reason:_ -> @@ -246,7 +442,7 @@ resolve_tx_header(TXID, Opts) -> {reason, Reason} } ), - skip + {skip, Perf} end. @@ -331,8 +527,9 @@ bundle_header_index_test() -> OffsetMsg = hb_json:decode(OffsetBody), EndOffset = hb_util:int(maps:get(<<"offset">>, OffsetMsg)), Size = hb_util:int(maps:get(<<"size">>, OffsetMsg)), - {ok, {BundleIndex, _HeaderSize}} = - download_bundle_header(EndOffset, Size, Opts), + Perf = perf_init(), + {{ok, {BundleIndex, _HeaderSize}}, _Perf1} = + download_bundle_header(EndOffset, Size, Perf, Opts), ?assertEqual(15000, length(BundleIndex)), ok. @@ -355,8 +552,9 @@ index_ids_ecdsa_test() -> non_string_tags_test() -> {_TestStore, _StoreOpts, Opts} = setup_index_opts(), - ?assertEqual(skip, - resolve_tx_header(<<"752P6t4cOjMabYHqzC6hyLhxyo4YKZLblg7va_J21YE">>, Opts)), + Perf = perf_init(), + {Res, _Perf1} = resolve_tx_header(<<"752P6t4cOjMabYHqzC6hyLhxyo4YKZLblg7va_J21YE">>, Perf, Opts), + ?assertEqual(skip, Res), ok. setup_index_opts() -> From 28e4f2448ca4d84d357283ac4e1bec1b1b214f9a Mon Sep 17 00:00:00 2001 From: James Piechota Date: Thu, 29 Jan 2026 20:26:32 -0500 Subject: [PATCH 27/29] wip: move dev_copycat_arweave performance tracking over to hb_event --- src/dev_copycat_arweave.erl | 411 +++++++++++------------------------- 1 file changed, 119 insertions(+), 292 deletions(-) diff --git a/src/dev_copycat_arweave.erl b/src/dev_copycat_arweave.erl index 1cd1dae7a..e8202c608 100644 --- a/src/dev_copycat_arweave.erl +++ b/src/dev_copycat_arweave.erl @@ -11,22 +11,6 @@ -define(ARWEAVE_DEVICE, <<"~arweave@2.9-pre">>). --record(perf, { - start_time = 0, %% monotonic start time (microseconds) - blocks_processed = 0, %% count (total) - resolve_block_time = 0, %% microseconds - resolve_tx_count = 0, %% count (total) - resolve_tx_time = 0, %% microseconds - items_indexed = 0, %% count (total) - write_offset_time = 0, %% microseconds - bundles_processed = 0, %% count (total) - download_bundle_header_time = 0, %% microseconds - last_block_items_indexed = 0, %% count (per-block) - last_block_total_txs = 0, %% count (per-block) - last_block_bundle_txs = 0, %% count (per-block) - last_block_skipped_txs = 0 %% count (per-block) -}). - % GET /~cron@1.0/once&cron-path=~copycat@1.0/arweave %% @doc Fetch blocks from an Arweave node between a given range, or from the @@ -34,8 +18,7 @@ %% fetch blocks from the latest known block towards the Genesis block. arweave(_Base, Request, Opts) -> {From, To} = parse_range(Request, Opts), - Perf = perf_init(), - fetch_blocks(Request, From, To, Perf, Opts). + fetch_blocks(Request, From, To, Opts). %% @doc Parse the range from the request. parse_range(Request, Opts) -> @@ -54,123 +37,10 @@ parse_range(Request, Opts) -> To = hb_maps:get(<<"to">>, Request, 0, Opts), {hb_util:int(From), hb_util:int(To)}. 
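+
+%% Illustrative request shapes for parse_range/2 (heights assumed):
+%%   #{ <<"from">> => <<"1600000">>, <<"to">> => <<"1599000">> }
+%%       yields {1600000, 1599000};
+%%   omitting <<"to">> defaults the target height to 0, so the scan runs
+%%   towards the Genesis block.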
-%% @doc Initialize performance tracker at start of run. -perf_init() -> - #perf{start_time = erlang:monotonic_time(microsecond)}. - -%% @doc Calculate total elapsed time in microseconds. -perf_elapsed(#perf{start_time = Start}) -> - erlang:monotonic_time(microsecond) - Start. - -%% @doc Generic timing wrapper - updates perf record with timing data. -perf_time(Fun, Perf, TimeField) -> - {Time, Result} = timer:tc(Fun), - NewPerf = update_perf(Perf, TimeField, Time), - {Result, NewPerf}. - -%% @doc Update perf record with new timing data. -update_perf(Perf, resolve_block_time, Time) -> - Perf#perf{ - resolve_block_time = Perf#perf.resolve_block_time + Time - }; -update_perf(Perf, resolve_tx_time, Time) -> - Perf#perf{ - resolve_tx_time = Perf#perf.resolve_tx_time + Time, - resolve_tx_count = Perf#perf.resolve_tx_count + 1 - }; -update_perf(Perf, write_offset_time, Time) -> - Perf#perf{ - write_offset_time = Perf#perf.write_offset_time + Time - }; -update_perf(Perf, download_bundle_header_time, Time) -> - Perf#perf{ - download_bundle_header_time = Perf#perf.download_bundle_header_time + Time, - bundles_processed = Perf#perf.bundles_processed + 1 - }. - -%% @doc Calculate percentage of elapsed time that an operation took. -perf_pct(_OperationTime, 0) -> 0.0; -perf_pct(OperationTime, ElapsedTime) -> - (OperationTime / ElapsedTime) * 100.0. - -%% @doc Convert perf record to map for event logging (running totals). -perf_to_stats(Perf) -> - Elapsed = perf_elapsed(Perf), - #{ - blocks_processed => Perf#perf.blocks_processed, - resolve_block_time_us => Perf#perf.resolve_block_time, - resolve_block_pct => perf_pct(Perf#perf.resolve_block_time, Elapsed), - resolve_tx_count => Perf#perf.resolve_tx_count, - resolve_tx_time_us => Perf#perf.resolve_tx_time, - resolve_tx_pct => perf_pct(Perf#perf.resolve_tx_time, Elapsed), - items_indexed => Perf#perf.items_indexed, - write_offset_time_us => Perf#perf.write_offset_time, - write_offset_pct => perf_pct(Perf#perf.write_offset_time, Elapsed), - bundles_processed => Perf#perf.bundles_processed, - download_bundle_header_time_us => Perf#perf.download_bundle_header_time, - download_bundle_header_pct => perf_pct(Perf#perf.download_bundle_header_time, Elapsed), - elapsed_us => Elapsed - }. - -%% @doc Get CSV file path from options or use default. -perf_csv_path(Opts) -> - hb_opts:get(arweave_perf_csv_path, <<"copycat_perf.csv">>, Opts). - -%% @doc Write CSV header if file doesn't exist. -perf_csv_write_header(FilePath) -> - try - case filelib:is_file(FilePath) of - false -> - Header = "height,blocks_processed,resolve_block_time_us,resolve_block_pct," - "resolve_tx_count,resolve_tx_time_us,resolve_tx_pct," - "items_indexed,write_offset_time_us,write_offset_pct," - "bundles_processed,download_bundle_header_time_us,download_bundle_header_pct," - "elapsed_us\n", - file:write_file(FilePath, Header, [write]); - true -> - ok - end - catch - _:_ -> ok - end. - -%% @doc Format perf stats as CSV row. 
-perf_csv_format_row(Height, PerfStats) -> - BlocksProcessed = maps:get(blocks_processed, PerfStats), - ResolveBlockTime = maps:get(resolve_block_time_us, PerfStats), - ResolveBlockPct = maps:get(resolve_block_pct, PerfStats), - ResolveTxCount = maps:get(resolve_tx_count, PerfStats), - ResolveTxTime = maps:get(resolve_tx_time_us, PerfStats), - ResolveTxPct = maps:get(resolve_tx_pct, PerfStats), - ItemsIndexed = maps:get(items_indexed, PerfStats), - WriteOffsetTime = maps:get(write_offset_time_us, PerfStats), - WriteOffsetPct = maps:get(write_offset_pct, PerfStats), - BundlesProcessed = maps:get(bundles_processed, PerfStats), - DownloadBundleHeaderTime = maps:get(download_bundle_header_time_us, PerfStats), - DownloadBundleHeaderPct = maps:get(download_bundle_header_pct, PerfStats), - Elapsed = maps:get(elapsed_us, PerfStats), - io_lib:format("~w,~w,~w,~.2f,~w,~w,~.2f,~w,~w,~.2f,~w,~w,~.2f,~w\n", - [Height, BlocksProcessed, ResolveBlockTime, ResolveBlockPct, - ResolveTxCount, ResolveTxTime, ResolveTxPct, - ItemsIndexed, WriteOffsetTime, WriteOffsetPct, - BundlesProcessed, DownloadBundleHeaderTime, DownloadBundleHeaderPct, - Elapsed]). - -%% @doc Write perf stats to CSV file. -perf_csv_write(Height, PerfStats, Opts) -> - try - FilePath = perf_csv_path(Opts), - perf_csv_write_header(FilePath), - RowIoList = perf_csv_format_row(Height, PerfStats), - Row = iolist_to_binary(RowIoList), - file:write_file(FilePath, Row, [append]) - catch - _:_ -> ok - end. %% @doc Fetch blocks from an Arweave node between a given range. -fetch_blocks(Req, Current, To, _Perf, _Opts) when Current < To -> +fetch_blocks(Req, Current, To, _Opts) when Current < To -> ?event(copycat_arweave, {arweave_block_indexing_completed, {reached_target, To}, @@ -178,51 +48,36 @@ fetch_blocks(Req, Current, To, _Perf, _Opts) when Current < To -> } ), {ok, To}; -fetch_blocks(Req, Current, To, Perf, Opts) -> - {BlockRes, Perf1} = perf_time( - fun() -> - hb_ao:resolve( - << - ?ARWEAVE_DEVICE/binary, - "/block=", - (hb_util:bin(Current))/binary - >>, - Opts - ) - end, - Perf, - resolve_block_time - ), - Perf2 = Perf1#perf{blocks_processed = Perf1#perf.blocks_processed + 1}, - Perf3 = process_block(BlockRes, Req, Current, To, Perf2, Opts), - fetch_blocks(Req, Current - 1, To, Perf3, Opts). +fetch_blocks(Req, Current, To, Opts) -> + BlockRes = observe_event(<<"block">>, fun() -> + hb_ao:resolve( + << + ?ARWEAVE_DEVICE/binary, + "/block=", + (hb_util:bin(Current))/binary + >>, + Opts + ) + end), + process_block(BlockRes, Req, Current, To, Opts), + fetch_blocks(Req, Current - 1, To, Opts). %% @doc Process a block. 
-process_block(BlockRes, _Req, Current, To, PerfAfterBlock, Opts) -> +process_block(BlockRes, _Req, Current, To, Opts) -> case BlockRes of {ok, Block} -> - PerfAfterIndex = maybe_index_ids(Block, PerfAfterBlock, Opts), - PerfStats = perf_to_stats(PerfAfterIndex), + {ItemsIndexed, TotalTXs, BundleTXs, SkippedTXs} = maybe_index_ids(Block, Opts), ?event( copycat_short, {arweave_block_cached, {height, Current}, - {items_indexed, PerfAfterIndex#perf.last_block_items_indexed}, - {total_txs, PerfAfterIndex#perf.last_block_total_txs}, - {bundle_txs, PerfAfterIndex#perf.last_block_bundle_txs}, - {skipped_txs, PerfAfterIndex#perf.last_block_skipped_txs}, + {items_indexed, ItemsIndexed}, + {total_txs, TotalTXs}, + {bundle_txs, BundleTXs}, + {skipped_txs, SkippedTXs}, {target, To} } - ), - ?event( - copycat_perf, - {arweave_block_perf, - {height, Current}, - {perf, PerfStats} - } - ), - perf_csv_write(Current, PerfStats, Opts), - PerfAfterIndex; + ); {error, _} = Error -> ?event( copycat_short, @@ -230,20 +85,14 @@ process_block(BlockRes, _Req, Current, To, PerfAfterBlock, Opts) -> {height, Current}, {target, To}, {reason, Error}} - ), - PerfAfterBlock + ) end. %% @doc Index the IDs of all transactions in the block if configured to do so. -maybe_index_ids(Block, Perf, Opts) -> +maybe_index_ids(Block, Opts) -> TotalTXs = length(hb_maps:get(<<"txs">>, Block, [], Opts)), case hb_opts:get(arweave_index_ids, false, Opts) of - false -> Perf#perf{ - last_block_items_indexed = 0, - last_block_total_txs = TotalTXs, - last_block_bundle_txs = 0, - last_block_skipped_txs = 0 - }; + false -> {0, TotalTXs, 0, 0}; true -> IndexStore = hb_opts:get(arweave_index_store, no_store, Opts), BlockEndOffset = hb_util:int( @@ -251,61 +100,50 @@ maybe_index_ids(Block, Perf, Opts) -> BlockSize = hb_util:int( hb_maps:get(<<"block_size">>, Block, 0, Opts)), BlockStartOffset = BlockEndOffset - BlockSize, - {TXs, SkippedFromHeaders, Perf1} = resolve_tx_headers(hb_maps:get(<<"txs">>, Block, [], Opts), Perf, Opts), - Perf2 = Perf1, + {TXs, SkippedFromHeaders} = resolve_tx_headers(hb_maps:get(<<"txs">>, Block, [], Opts), Opts), Height = hb_maps:get(<<"height">>, Block, 0, Opts), TXsWithData = ar_block:generate_size_tagged_list_from_txs(TXs, Height), - ItemsBefore = Perf2#perf.items_indexed, - {Perf3, BundleTXs, SkippedFromBundles} = lists:foldl(fun - ({{padding, _PaddingRoot}, _EndOffset}, {PerfAcc, BundleAcc, SkippedAcc}) -> - {PerfAcc, BundleAcc, SkippedAcc}; - ({{TX, _TXDataRoot}, EndOffset}, {PerfAcc, BundleAcc, SkippedAcc}) -> + {ItemsIndexed, BundleTXs, SkippedFromBundles} = lists:foldl(fun + ({{padding, _PaddingRoot}, _EndOffset}, {ItemsAcc, BundleAcc, SkippedAcc}) -> + {ItemsAcc, BundleAcc, SkippedAcc}; + ({{TX, _TXDataRoot}, EndOffset}, {ItemsAcc, BundleAcc, SkippedAcc}) -> case is_bundle_tx(TX, Opts) of - false -> {PerfAcc, BundleAcc, SkippedAcc}; + false -> {ItemsAcc, BundleAcc, SkippedAcc}; true -> TXID = hb_util:encode(TX#tx.id), TXEndOffset = BlockStartOffset + EndOffset, TXStartOffset = TXEndOffset - TX#tx.data_size, - {ok, PerfAfterWrite} = perf_time( - fun() -> - hb_store_arweave:write_offset( - IndexStore, - TXID, - true, - TXStartOffset, - TX#tx.data_size - ) - end, - PerfAcc, - write_offset_time - ), - {BundleRes, Perf4} = download_bundle_header( - TXEndOffset, TX#tx.data_size, PerfAfterWrite, Opts + observe_event(<<"item_indexed">>, fun() -> + hb_store_arweave:write_offset( + IndexStore, + TXID, + true, + TXStartOffset, + TX#tx.data_size + ) + end), + BundleRes = download_bundle_header( + TXEndOffset, 
TX#tx.data_size, Opts ), case BundleRes of {ok, {BundleIndex, HeaderSize}} -> - {_, Perf5} = lists:foldl( - fun({ItemID, Size}, {ItemStartOffset, PerfFold}) -> - {ok, PerfUpdated} = perf_time( - fun() -> - hb_store_arweave:write_offset( - IndexStore, - hb_util:encode(ItemID), - false, - ItemStartOffset, - Size - ) - end, - PerfFold, - write_offset_time - ), - {ItemStartOffset + Size, PerfUpdated} + {_, ItemsCount} = lists:foldl( + fun({ItemID, Size}, {ItemStartOffset, ItemsCountAcc}) -> + observe_event(<<"item_indexed">>, fun() -> + hb_store_arweave:write_offset( + IndexStore, + hb_util:encode(ItemID), + false, + ItemStartOffset, + Size + ) + end), + {ItemStartOffset + Size, ItemsCountAcc + 1} end, - {TXStartOffset + HeaderSize, Perf4}, + {TXStartOffset + HeaderSize, 0}, BundleIndex ), - Perf6 = Perf5#perf{items_indexed = Perf5#perf.items_indexed + length(BundleIndex)}, - {Perf6, BundleAcc + 1, SkippedAcc}; + {ItemsAcc + ItemsCount, BundleAcc + 1, SkippedAcc}; {error, Reason} -> ?event( copycat_short, @@ -314,60 +152,48 @@ maybe_index_ids(Block, Perf, Opts) -> {reason, Reason} } ), - {Perf4, BundleAcc + 1, SkippedAcc + 1} + {ItemsAcc, BundleAcc + 1, SkippedAcc + 1} end end end, - {Perf2, 0, 0}, + {0, 0, 0}, TXsWithData ), - ItemsAfter = Perf3#perf.items_indexed, - ItemsIndexed = ItemsAfter - ItemsBefore, SkippedTXs = SkippedFromHeaders + SkippedFromBundles, - Perf3#perf{ - last_block_items_indexed = ItemsIndexed, - last_block_total_txs = TotalTXs, - last_block_bundle_txs = BundleTXs, - last_block_skipped_txs = SkippedTXs - } + {ItemsIndexed, TotalTXs, BundleTXs, SkippedTXs} end. is_bundle_tx(TX, _Opts) -> dev_arweave_common:type(TX) =/= binary. -download_bundle_header(EndOffset, Size, Perf, Opts) -> - {Result, Perf1} = perf_time( - fun() -> - StartOffset = EndOffset - Size + 1, - case hb_ao:resolve( - << - ?ARWEAVE_DEVICE/binary, - "/chunk&offset=", - (hb_util:bin(StartOffset))/binary - >>, - Opts - ) of - {ok, FirstChunk} -> - % Most bundle headers can fit in a single chunk, but those with - % thousands of items might require multiple chunks to fully - % represent the item index. - HeaderSize = ar_bundles:bundle_header_size(FirstChunk), - case header_chunk(HeaderSize, FirstChunk, StartOffset, Opts) of - {ok, BundleHeader} -> - {_ItemsBin, BundleIndex} = - ar_bundles:decode_bundle_header(BundleHeader), - {ok, {BundleIndex, HeaderSize}}; - Error -> - Error - end; - Error -> - Error - end - end, - Perf, - download_bundle_header_time - ), - {Result, Perf1}. +download_bundle_header(EndOffset, Size, Opts) -> + observe_event(<<"bundle_header">>, fun() -> + StartOffset = EndOffset - Size + 1, + case hb_ao:resolve( + << + ?ARWEAVE_DEVICE/binary, + "/chunk&offset=", + (hb_util:bin(StartOffset))/binary + >>, + Opts + ) of + {ok, FirstChunk} -> + % Most bundle headers can fit in a single chunk, but those with + % thousands of items might require multiple chunks to fully + % represent the item index. + HeaderSize = ar_bundles:bundle_header_size(FirstChunk), + case header_chunk(HeaderSize, FirstChunk, StartOffset, Opts) of + {ok, BundleHeader} -> + {_ItemsBin, BundleIndex} = + ar_bundles:decode_bundle_header(BundleHeader), + {ok, {BundleIndex, HeaderSize}}; + Error -> + Error + end; + Error -> + Error + end + end). header_chunk(HeaderSize, FirstChunk, _StartOffset, _Opts) when HeaderSize =< byte_size(FirstChunk) -> @@ -384,44 +210,40 @@ header_chunk(HeaderSize, _FirstChunk, StartOffset, Opts) -> Opts ). 
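+
+%% Worked example for the offset arithmetic above (illustrative numbers):
+%% a bundle whose data occupies absolute weave offsets 101..200 reports
+%% EndOffset = 200 and Size = 100, so StartOffset = 200 - 100 + 1 = 101 is
+%% the first byte of the bundle header. If ar_bundles:bundle_header_size/1
+%% reports a header larger than the first chunk, header_chunk/4 falls back
+%% to fetching enough additional chunk data to cover the whole header.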
-resolve_tx_headers(TXIDs, Perf, Opts) -> +resolve_tx_headers(TXIDs, Opts) -> lists:foldr( - fun(TXID, {Acc, SkippedAcc, PerfAcc}) -> - {Res, PerfUpdated} = resolve_tx_header(TXID, PerfAcc, Opts), + fun(TXID, {Acc, SkippedAcc}) -> + Res = resolve_tx_header(TXID, Opts), case Res of - {ok, TX} -> {[TX | Acc], SkippedAcc, PerfUpdated}; - skip -> {Acc, SkippedAcc + 1, PerfUpdated} + {ok, TX} -> {[TX | Acc], SkippedAcc}; + skip -> {Acc, SkippedAcc + 1} end end, - {[], 0, Perf}, + {[], 0}, TXIDs ). -resolve_tx_header(TXID, Perf, Opts) -> +resolve_tx_header(TXID, Opts) -> try - {ResolveRes, Perf1} = perf_time( - fun() -> - hb_ao:resolve( - << - ?ARWEAVE_DEVICE/binary, - "/tx&tx=", - TXID/binary, - "&exclude-data=true" - >>, - Opts - ) - end, - Perf, - resolve_tx_time - ), + ResolveRes = observe_event(<<"tx_header">>, fun() -> + hb_ao:resolve( + << + ?ARWEAVE_DEVICE/binary, + "/tx&tx=", + TXID/binary, + "&exclude-data=true" + >>, + Opts + ) + end), case ResolveRes of {ok, StructuredTXHeader} -> - {{ok, + {ok, hb_message:convert( StructuredTXHeader, <<"tx@1.0">>, <<"structured@1.0">>, - Opts)}, Perf1}; + Opts)}; {error, ResolveError} -> ?event( copycat_short, @@ -430,7 +252,7 @@ resolve_tx_header(TXID, Perf, Opts) -> {reason, ResolveError} } ), - {skip, Perf1} + skip end catch Class:Reason:_ -> @@ -442,9 +264,16 @@ resolve_tx_header(TXID, Perf, Opts) -> {reason, Reason} } ), - {skip, Perf} + skip end. +%% @doc Track an operation's execution time and count using hb_event:increment. +%% Always tracks both count and duration, regardless of success/failure. +observe_event(MetricName, Fun) -> + {Time, Result} = timer:tc(Fun), + hb_event:increment(<<"arweave_block_count">>, MetricName, #{}, 1), + hb_event:increment(<<"arweave_block_duration">>, MetricName, #{}, Time), + Result. %%% Tests @@ -527,9 +356,8 @@ bundle_header_index_test() -> OffsetMsg = hb_json:decode(OffsetBody), EndOffset = hb_util:int(maps:get(<<"offset">>, OffsetMsg)), Size = hb_util:int(maps:get(<<"size">>, OffsetMsg)), - Perf = perf_init(), - {{ok, {BundleIndex, _HeaderSize}}, _Perf1} = - download_bundle_header(EndOffset, Size, Perf, Opts), + {ok, {BundleIndex, _HeaderSize}} = + download_bundle_header(EndOffset, Size, Opts), ?assertEqual(15000, length(BundleIndex)), ok. @@ -552,8 +380,7 @@ index_ids_ecdsa_test() -> non_string_tags_test() -> {_TestStore, _StoreOpts, Opts} = setup_index_opts(), - Perf = perf_init(), - {Res, _Perf1} = resolve_tx_header(<<"752P6t4cOjMabYHqzC6hyLhxyo4YKZLblg7va_J21YE">>, Perf, Opts), + Res = resolve_tx_header(<<"752P6t4cOjMabYHqzC6hyLhxyo4YKZLblg7va_J21YE">>, Opts), ?assertEqual(skip, Res), ok. From 1caeaeb1aa00a47a29f8716bbba51197bf07a030 Mon Sep 17 00:00:00 2001 From: James Piechota Date: Thu, 29 Jan 2026 20:51:49 -0500 Subject: [PATCH 28/29] wip: add metric for full block processing --- src/dev_copycat_arweave.erl | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/src/dev_copycat_arweave.erl b/src/dev_copycat_arweave.erl index e8202c608..1c9daf3c2 100644 --- a/src/dev_copycat_arweave.erl +++ b/src/dev_copycat_arweave.erl @@ -7,7 +7,6 @@ -export([arweave/3]). -include_lib("include/hb.hrl"). -include_lib("eunit/include/eunit.hrl"). --include_lib("public_key/include/OTP-PUB-KEY.hrl"). -define(ARWEAVE_DEVICE, <<"~arweave@2.9-pre">>). 
@@ -49,7 +48,13 @@ fetch_blocks(Req, Current, To, _Opts) when Current < To ->
     ),
     {ok, To};
 fetch_blocks(Req, Current, To, Opts) ->
-    BlockRes = observe_event(<<"block">>, fun() ->
+    observe_event(<<"block_indexed">>, fun() ->
+        fetch_and_process_block(Current, To, Opts)
+    end),
+    fetch_blocks(Req, Current - 1, To, Opts).
+
+fetch_and_process_block(Current, To, Opts) ->
+    BlockRes = observe_event(<<"block_header">>, fun() ->
         hb_ao:resolve(
             <<
                 ?ARWEAVE_DEVICE/binary,
@@ -59,11 +64,10 @@ fetch_blocks(Req, Current, To, Opts) ->
             Opts
         )
     end),
-    process_block(BlockRes, Req, Current, To, Opts),
-    fetch_blocks(Req, Current - 1, To, Opts).
+    process_block(BlockRes, Current, To, Opts).
 
 %% @doc Process a block.
-process_block(BlockRes, _Req, Current, To, Opts) ->
+process_block(BlockRes, Current, To, Opts) ->
     case BlockRes of
         {ok, Block} ->
             {ItemsIndexed, TotalTXs, BundleTXs, SkippedTXs} = maybe_index_ids(Block, Opts),

From a14c38dc031d9b815147d0ff8ad5f254a1691fa1 Mon Sep 17 00:00:00 2001
From: James Piechota
Date: Fri, 30 Jan 2026 21:52:28 +0000
Subject: [PATCH 29/29] impr: parallelize arweave indexing, configurable via
 arweave_index_workers

---
 src/dev_copycat_arweave.erl | 209 +++++++++++++++++++++++++-----------
 src/hb_store_lmdb.erl       |   2 +-
 2 files changed, 150 insertions(+), 61 deletions(-)

diff --git a/src/dev_copycat_arweave.erl b/src/dev_copycat_arweave.erl
index 1c9daf3c2..4e5b820d7 100644
--- a/src/dev_copycat_arweave.erl
+++ b/src/dev_copycat_arweave.erl
@@ -98,7 +98,6 @@ maybe_index_ids(Block, Opts) ->
     case hb_opts:get(arweave_index_ids, false, Opts) of
         false -> {0, TotalTXs, 0, 0};
         true ->
-            IndexStore = hb_opts:get(arweave_index_store, no_store, Opts),
             BlockEndOffset = hb_util:int(
                 hb_maps:get(<<"weave_size">>, Block, 0, Opts)),
             BlockSize = hb_util:int(
@@ -107,66 +106,148 @@ maybe_index_ids(Block, Opts) ->
             {TXs, SkippedFromHeaders} = resolve_tx_headers(hb_maps:get(<<"txs">>, Block, [], Opts), Opts),
             Height = hb_maps:get(<<"height">>, Block, 0, Opts),
             TXsWithData = ar_block:generate_size_tagged_list_from_txs(TXs, Height),
-            {ItemsIndexed, BundleTXs, SkippedFromBundles} = lists:foldl(fun
-                ({{padding, _PaddingRoot}, _EndOffset}, {ItemsAcc, BundleAcc, SkippedAcc}) ->
-                    {ItemsAcc, BundleAcc, SkippedAcc};
-                ({{TX, _TXDataRoot}, EndOffset}, {ItemsAcc, BundleAcc, SkippedAcc}) ->
-                    case is_bundle_tx(TX, Opts) of
-                        false -> {ItemsAcc, BundleAcc, SkippedAcc};
-                        true ->
-                            TXID = hb_util:encode(TX#tx.id),
-                            TXEndOffset = BlockStartOffset + EndOffset,
-                            TXStartOffset = TXEndOffset - TX#tx.data_size,
-                            observe_event(<<"item_indexed">>, fun() ->
-                                hb_store_arweave:write_offset(
-                                    IndexStore,
-                                    TXID,
-                                    true,
-                                    TXStartOffset,
-                                    TX#tx.data_size
-                                )
-                            end),
-                            BundleRes = download_bundle_header(
-                                TXEndOffset, TX#tx.data_size, Opts
-                            ),
-                            case BundleRes of
-                                {ok, {BundleIndex, HeaderSize}} ->
-                                    {_, ItemsCount} = lists:foldl(
-                                        fun({ItemID, Size}, {ItemStartOffset, ItemsCountAcc}) ->
-                                            observe_event(<<"item_indexed">>, fun() ->
-                                                hb_store_arweave:write_offset(
-                                                    IndexStore,
-                                                    hb_util:encode(ItemID),
-                                                    false,
-                                                    ItemStartOffset,
-                                                    Size
-                                                )
-                                            end),
-                                            {ItemStartOffset + Size, ItemsCountAcc + 1}
-                                        end,
-                                        {TXStartOffset + HeaderSize, 0},
-                                        BundleIndex
-                                    ),
-                                    {ItemsAcc + ItemsCount, BundleAcc + 1, SkippedAcc};
-                                {error, Reason} ->
-                                    ?event(
-                                        copycat_short,
-                                        {arweave_bundle_skipped,
-                                            {tx_id, {explicit, TXID}},
-                                            {reason, Reason}
-                                        }
-                                    ),
-                                    {ItemsAcc, BundleAcc + 1, SkippedAcc + 1}
-                            end
-                    end
-                end,
-                {0, 0, 0},
-                TXsWithData
-            ),
+            % Filter out padding entries before processing
+            ValidTXs = lists:filter(
+                fun({{padding, _}, _}) ->
false; (_) -> true end, TXsWithData ), + {ItemsIndexed, BundleTXs, SkippedFromBundles} = + process_txs(ValidTXs, BlockStartOffset, Opts), SkippedTXs = SkippedFromHeaders + SkippedFromBundles, {ItemsIndexed, TotalTXs, BundleTXs, SkippedTXs} end. +%% @doc Apply Fun to each item in Items with parallel workers. +%% Fun takes an item and returns a result. +%% Returns a list of results in the same order as the input items. +%% Uses arweave_index_workers from Opts to determine max concurrency (default 1 = sequential). +parallel_map(Items, Fun, Opts) -> + MaxWorkers = max(1, hb_opts:get(arweave_index_workers, 1, Opts)), + Parent = self(), + ItemsWithRefs = [{Item, make_ref()} || Item <- Items], + % Spawn initial batch up to MaxWorkers + {ToSpawn, Remaining} = lists:split(min(length(ItemsWithRefs), MaxWorkers), ItemsWithRefs), + ActiveRefs = [spawn_worker(ItemWithRef, Fun, Parent) || ItemWithRef <- ToSpawn], + % Wait for workers to complete and refill pool, collecting results + ResultsMap = parallel_map_wait(ActiveRefs, Remaining, Fun, MaxWorkers, Parent, #{}), + % Return results in order by matching refs (inspired by pmap pattern) + [maps:get(Ref, ResultsMap) || {_Item, Ref} <- ItemsWithRefs]. + +%% @doc Spawn a worker process for a single item. +spawn_worker({Item, Ref}, Fun, Parent) -> + spawn(fun() -> + Result = Fun(Item), + Parent ! {pmap_work, Ref, Result} + end), + Ref. + +%% @doc Wait for workers to complete and refill the pool as slots become available. +parallel_map_wait([], [], _Fun, _MaxWorkers, _Parent, ResultsMap) -> + ResultsMap; +parallel_map_wait(ActiveRefs, Remaining, Fun, MaxWorkers, Parent, ResultsMap) -> + receive + {pmap_work, CompletedRef, Result} -> + % Store result and remove completed ref + NewResultsMap = ResultsMap#{CompletedRef => Result}, + NewActiveRefs = lists:delete(CompletedRef, ActiveRefs), + case Remaining of + [] -> + % No more items, just wait for remaining workers + parallel_map_wait(NewActiveRefs, [], Fun, MaxWorkers, Parent, NewResultsMap); + _ -> + % Spawn replacement worker + [NextItemWithRef | NewRemaining] = Remaining, + NextRef = spawn_worker(NextItemWithRef, Fun, Parent), + parallel_map_wait([NextRef | NewActiveRefs], NewRemaining, Fun, MaxWorkers, Parent, NewResultsMap) + end + end. + +%% @doc Process a single transaction and return its contribution to the counters. 
+%% Returns a map with keys: items_count, bundle_count, skipped_count +process_tx({{padding, _PaddingRoot}, _EndOffset}, _BlockStartOffset, _Opts) -> + #{items_count => 0, bundle_count => 0, skipped_count => 0}; +process_tx({{TX, _TXDataRoot}, EndOffset}, BlockStartOffset, Opts) -> + case is_bundle_tx(TX, Opts) of + false -> #{items_count => 0, bundle_count => 0, skipped_count => 0}; + true -> + IndexStore = hb_opts:get(arweave_index_store, no_store, Opts), + TXID = hb_util:encode(TX#tx.id), + TXEndOffset = BlockStartOffset + EndOffset, + TXStartOffset = TXEndOffset - TX#tx.data_size, + observe_event(<<"item_indexed">>, fun() -> + hb_store_arweave:write_offset( + IndexStore, + TXID, + true, + TXStartOffset, + TX#tx.data_size + ) + end), + BundleRes = download_bundle_header( + TXEndOffset, TX#tx.data_size, Opts + ), + case BundleRes of + {ok, {BundleIndex, HeaderSize}} -> + % Batch event tracking: measure total time and count for all write_offset calls + {TotalTime, {_, ItemsCount}} = timer:tc(fun() -> + lists:foldl( + fun({ItemID, Size}, {ItemStartOffset, ItemsCountAcc}) -> + hb_store_arweave:write_offset( + IndexStore, + hb_util:encode(ItemID), + false, + ItemStartOffset, + Size + ), + {ItemStartOffset + Size, ItemsCountAcc + 1} + end, + {TXStartOffset + HeaderSize, 0}, + BundleIndex + ) + end), + % Single event increment for the batch + record_event_metrics(<<"item_indexed">>, ItemsCount, TotalTime), + #{items_count => ItemsCount, bundle_count => 1, skipped_count => 0}; + {error, Reason} -> + ?event( + copycat_short, + {arweave_bundle_skipped, + {tx_id, {explicit, TXID}}, + {reason, Reason} + } + ), + #{items_count => 0, bundle_count => 1, skipped_count => 1} + end + end. + +%% @doc Process transactions: spawn workers and manage the worker pool. +%% This function processes transactions in parallel using parallel_map. +%% When arweave_index_workers <= 1, processes sequentially (one worker at a time). +%% When arweave_index_workers > 1, processes in parallel with the specified concurrency limit. +%% Returns {ItemsIndexed, BundleTXs, SkippedTXs}. +process_txs(ValidTXs, BlockStartOffset, Opts) -> + Results = parallel_map( + ValidTXs, + fun(TXWithData) -> process_tx(TXWithData, BlockStartOffset, Opts) end, + Opts + ), + Aggregated = lists:foldl( + fun(Result, Acc) -> + #{ + items_count => maps:get(items_count, Result, 0) + maps:get(items_count, Acc, 0), + bundle_count => maps:get(bundle_count, Result, 0) + maps:get(bundle_count, Acc, 0), + skipped_count => maps:get(skipped_count, Result, 0) + maps:get(skipped_count, Acc, 0) + } + end, + #{items_count => 0, bundle_count => 0, skipped_count => 0}, + Results + ), + { + maps:get(items_count, Aggregated, 0), + maps:get(bundle_count, Aggregated, 0), + maps:get(skipped_count, Aggregated, 0) + }. + is_bundle_tx(TX, _Opts) -> dev_arweave_common:type(TX) =/= binary. @@ -215,16 +296,20 @@ header_chunk(HeaderSize, _FirstChunk, StartOffset, Opts) -> ). resolve_tx_headers(TXIDs, Opts) -> + Results = parallel_map( + TXIDs, + fun(TXID) -> resolve_tx_header(TXID, Opts) end, + Opts + ), lists:foldr( - fun(TXID, {Acc, SkippedAcc}) -> - Res = resolve_tx_header(TXID, Opts), + fun(Res, {Acc, SkippedAcc}) -> case Res of {ok, TX} -> {[TX | Acc], SkippedAcc}; skip -> {Acc, SkippedAcc + 1} end end, {[], 0}, - TXIDs + Results ). resolve_tx_header(TXID, Opts) -> @@ -271,12 +356,16 @@ resolve_tx_header(TXID, Opts) -> skip end. +%% @doc Record event metrics (count and duration) using hb_event:increment. 
+record_event_metrics(MetricName, Count, Duration) ->
+    hb_event:increment(<<"arweave_block_count">>, MetricName, #{}, Count),
+    hb_event:increment(<<"arweave_block_duration">>, MetricName, #{}, Duration).
+
 %% @doc Track an operation's execution time and count using hb_event:increment.
 %% Always tracks both count and duration, regardless of success/failure.
 observe_event(MetricName, Fun) ->
     {Time, Result} = timer:tc(Fun),
-    hb_event:increment(<<"arweave_block_count">>, MetricName, #{}, 1),
-    hb_event:increment(<<"arweave_block_duration">>, MetricName, #{}, Time),
+    record_event_metrics(MetricName, 1, Time),
     Result.
 
 %%% Tests
diff --git a/src/hb_store_lmdb.erl b/src/hb_store_lmdb.erl
index 0a970513c..0e72bff66 100644
--- a/src/hb_store_lmdb.erl
+++ b/src/hb_store_lmdb.erl
@@ -30,7 +30,7 @@
 -include("include/hb.hrl").
 
 %% Configuration constants with reasonable defaults
--define(DEFAULT_SIZE, 16 * 1024 * 1024 * 1024). % 16GB default database size
+-define(DEFAULT_SIZE, 2 * 1024 * 1024 * 1024 * 1024). % 2TB default database size
 -define(CONNECT_TIMEOUT, 6000). % Timeout for server communication
 -define(DEFAULT_IDLE_FLUSH_TIME, 5). % Idle server time before auto-flush
 -define(DEFAULT_MAX_FLUSH_TIME, 50). % Maximum time between flushes
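
Usage sketch (illustrative; `arweave_index_ids', `arweave_index_workers' and
`arweave_index_store' all appear in the patches above, but the exact
node-configuration entry point is assumed):

    %% Index bundle items while copying blocks, running eight concurrent
    %% workers per block. arweave/3 ignores its base message, and the request
    %% map follows parse_range/2 (<<"to">> defaults to 0, i.e. scan towards
    %% Genesis). A real deployment must also point arweave_index_store at a
    %% writable store rather than the no_store default.
    Opts = #{
        arweave_index_ids => true,
        arweave_index_workers => 8
    },
    dev_copycat_arweave:arweave(#{}, #{ <<"from">> => <<"1600000">> }, Opts).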