diff --git a/.github/settings.yml b/.github/settings.yml new file mode 100644 index 0000000..2b2979e --- /dev/null +++ b/.github/settings.yml @@ -0,0 +1,46 @@ +# These settings are synced to GitHub by https://probot.github.io/apps/settings/ +_extends: .github + +teams: + - name: devs + permission: push + - name: bots + permission: push + - name: maintainers + permission: push + - name: erlang + permission: maintain + +branches: + - name: "master" + # https://developer.github.com/v3/repos/branches/#update-branch-protection + # Branch Protection settings. Set to null to disable + protection: + required_pull_request_reviews: + required_approving_review_count: 1 + dismiss_stale_reviews: false + require_code_owner_reviews: true + dismissal_restrictions: {} + required_status_checks: + strict: true + checks: + - context: "Load .env" + app_id: 15368 + - context: "Run checks / Build" + app_id: 15368 + - context: "Run checks / Check" + app_id: 15368 + - context: "Run checks / Dialyze" + app_id: 15368 + - context: "Run checks / Test" + app_id: 15368 + - context: "codecov/patch" + app_id: 254 + - context: "codecov/project" + app_id: 254 + enforce_admins: false + required_conversation_resolution: false + required_linear_history: true + restrictions: null + allow_force_pushes: false + allow_deletions: false diff --git a/.github/workflows/basic-linters.yml b/.github/workflows/basic-linters.yml new file mode 100644 index 0000000..6114f14 --- /dev/null +++ b/.github/workflows/basic-linters.yml @@ -0,0 +1,10 @@ +name: Vality basic linters + +on: + pull_request: + branches: + - "*" + +jobs: + lint: + uses: valitydev/base-workflows/.github/workflows/basic-linters.yml@v1 diff --git a/.github/workflows/erlang-checks.yml b/.github/workflows/erlang-checks.yml index d701276..f29549f 100644 --- a/.github/workflows/erlang-checks.yml +++ b/.github/workflows/erlang-checks.yml @@ -18,7 +18,7 @@ jobs: thrift-version: ${{ steps.thrift-version.outputs.version }} steps: - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v3 - run: grep -v '^#' .env >> $GITHUB_ENV - id: otp-version run: echo "::set-output name=version::$OTP_VERSION" @@ -30,9 +30,10 @@ jobs: run: name: Run checks needs: setup - uses: valitydev/erlang-workflows/.github/workflows/erlang-parallel-build.yml@v1.0.9 + uses: valitydev/erlang-workflows/.github/workflows/erlang-parallel-build.yml@v1.0.10 with: otp-version: ${{ needs.setup.outputs.otp-version }} rebar-version: ${{ needs.setup.outputs.rebar-version }} use-thrift: true thrift-version: ${{ needs.setup.outputs.thrift-version }} + upload-coverage: false diff --git a/LICENSE b/LICENSE index 2bb9ad2..d9a10c0 100644 --- a/LICENSE +++ b/LICENSE @@ -173,4 +173,4 @@ incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. - END OF TERMS AND CONDITIONS \ No newline at end of file + END OF TERMS AND CONDITIONS diff --git a/README.md b/README.md index 318a622..2c7f8b4 100644 --- a/README.md +++ b/README.md @@ -168,6 +168,80 @@ check_loan_limits(Limits, Context, N) -> ### Woody Event Handler Интерфейс для получения и логирования событий RPC библиотеки. Также содержит вспомогательные функции для удобного форматирования событий. Пример реализации _event handler_'а - [woody_event_handler_default.erl](src/woody_event_handler_default.erl). +Через опции обработчика можно сообщить параметры соответствия событий RPC для уровня логирования: + +``` erlang +woody_event_handler_default:handle_event(Event, RpcId, Meta, #{ + formatter_opts => ..., + events_severity => #{ + ['call service'] => debug, + ... + } +}). +``` + +Где эти параметры имеют значения по умолчанию в следующем виде: + +``` erlang +#{ + events_severity => #{ + %% Пограничные события работы клиента + ['client begin'] => debug, + ['client end'] => debug, + + %% Начало вызова сервиса, перед формированием запроса + ['call service'] => info, + + %% Результат вызова сервиса на клиенте + ['service result'] => info, + ['service result', error] => error, + %% Событие состоявшегося вызова с возвращённой ошибкой в качестве + %% результата + ['service result', warning] => warning, + + %% Клиентские события, включая обнаружения хоста + ['client send'] => debug, + ['client resolve begin'] => debug, + ['client resolve result'] => debug, + ['client receive'] => debug, + ['client receive', error] => warning, + + %% Непосредственные события обслуживания запроса сервером + ['server receive'] => debug, + ['server send'] => debug, + ['server send', error] => warning, + + %% Начало обслуживания вызова функции сервиса + ['invoke service handler'] => info, + + %% Завершение обслуживание вызова с разным итогом + ['service handler result'] => info, + ['service handler result', error, business] => info, + ['service handler result', error, system] => error, + %% Обслуживание вызова завершилось поправимой ошибкой; + %% по крайней мере она не в рамках бизнес-логики но и не системное + %% исключение + ['service handler result', warning] => warning, + + %% События кеширующей обертки клиента + ['client cache begin'] => debug, + ['client cache end'] => debug, + ['client cache hit'] => info, + ['client cache miss'] => debug, + ['client cache update'] => debug, + ['client cache result'] => debug, + + %% Внутренние ошибки с разным контекстом/происхождением + ['internal error', system] => error, + ['internal error', business] => warning, + %% Событие трассировки на уровне woody, см. далее + + ['trace event'] => debug + } +}. +``` + + ### Tracing Можно динамически включать и выключать трассировку http запросов и ответов. @@ -176,3 +250,36 @@ check_loan_limits(Limits, Context, N) -> application:set_env(woody, trace_http_server, true). application:unset_env(woody, trace_http_server). ``` + +### Prometheus metrics + +Чтобы осуществлять экспорт метрик следует добавить [соответствующий хэндлер](https://github.com/deadtrickster/prometheus-cowboy#exporting-metrics-with-handlers) для cowboy-сервера. + +``` erlang +{deps, [ + ... + {prometheus_cowboy, "0.1.8"} +]} +``` + +Для сбора серверных метрик необходимо на старте приложения объявить их + +``` erlang +ok = woody_ranch_prometheus_collector:setup() +``` + +Если дополнительно интересуют все метрики ковбоя то можно добавить [реализацию обсервера из библиотеки](https://github.com/deadtrickster/prometheus-cowboy?tab=readme-ov-file#exporting-cowboy2-metrics). + +Для сбора клиентских метрик необходимо на старте приложения объявить их + +``` erlang +ok = woody_hackney_prometheus_collector:setup() +``` + +Это будет публиковать целочисленные значения в шкале 'woody_hackney_pool_usage' с метками `pool` в качестве названия пула и `status` в качестве параметра соответствующего значения: + +- `in_use_count` -- используемые соединения в пуле; +- `free_count` -- свободное количество в пуле; +- `queue_count` -- очередь за свободными соединенеиями + +**TODO** Возможно стоит рассмотреть публикацию метрик по количеству исполняемых запросов в общем, с разбивкой по хосту и количества новых и переиспользуемых соедининий в каждом из пулов. [Хакни это предоставляет](https://github.com/benoitc/hackney/tree/1.18.0#metrics). diff --git a/rebar.config b/rebar.config index 6629b6d..8557a12 100644 --- a/rebar.config +++ b/rebar.config @@ -31,7 +31,9 @@ {gproc, "0.9.0"}, {cache, "2.3.3"}, {snowflake, {git, "https://github.com/valitydev/snowflake.git", {branch, "master"}}}, - {genlib, {git, "https://github.com/valitydev/genlib.git", {branch, "master"}}} + {genlib, {git, "https://github.com/valitydev/genlib.git", {branch, "master"}}}, + {prometheus, "4.8.1"}, + {opentelemetry_api, "1.2.1"} ]}. %% XRef checks @@ -102,17 +104,15 @@ {thrift, {git, "https://github.com/valitydev/thrift_erlang.git", {ref, "3f3e11246d90aefa8f58b35e4f2eab14c0c28bd2"}}}, - {woody_api_hay, - {git, "https://github.com/valitydev/woody_api_hay.git", - {ref, "4c39134cddaa9bf6fb8db18e7030ae64f1efb3a9"}}}, {damsel, {git, "https://github.com/valitydev/damsel.git", {ref, "3fa6f31db54b2ae781b27898ab4daf56bb36eb36"}}}, {mg_proto, {git, "https://github.com/valitydev/machinegun-proto.git", - {ref, "ebae56fe2b3e79e4eb34afc8cb55c9012ae989f8"}}} + {ref, "ebae56fe2b3e79e4eb34afc8cb55c9012ae989f8"}}}, + {opentelemetry, "1.3.0"} ]}, {dialyzer, [ - {plt_extra_apps, [thrift, how_are_you, eunit, proper, common_test, cth_readable]} + {plt_extra_apps, [thrift, prometheus, eunit, proper, common_test, cth_readable, opentelemetry]} ]} ]} ]}. diff --git a/rebar.lock b/rebar.lock index fcb28b5..02ffc10 100644 --- a/rebar.lock +++ b/rebar.lock @@ -12,7 +12,13 @@ {<<"idna">>,{pkg,<<"idna">>,<<"6.1.1">>},1}, {<<"metrics">>,{pkg,<<"metrics">>,<<"1.0.1">>},1}, {<<"mimerl">>,{pkg,<<"mimerl">>,<<"1.2.0">>},1}, + {<<"opentelemetry_api">>,{pkg,<<"opentelemetry_api">>,<<"1.2.1">>},0}, + {<<"opentelemetry_semantic_conventions">>, + {pkg,<<"opentelemetry_semantic_conventions">>,<<"0.2.0">>}, + 1}, {<<"parse_trans">>,{pkg,<<"parse_trans">>,<<"3.3.1">>},1}, + {<<"prometheus">>,{pkg,<<"prometheus">>,<<"4.8.1">>},0}, + {<<"quantile_estimator">>,{pkg,<<"quantile_estimator">>,<<"0.2.1">>},1}, {<<"ranch">>,{pkg,<<"ranch">>,<<"1.8.0">>},1}, {<<"snowflake">>, {git,"https://github.com/valitydev/snowflake.git", @@ -31,7 +37,11 @@ {<<"idna">>, <<"8A63070E9F7D0C62EB9D9FCB360A7DE382448200FBBD1B106CC96D3D8099DF8D">>}, {<<"metrics">>, <<"25F094DEA2CDA98213CECC3AEFF09E940299D950904393B2A29D191C346A8486">>}, {<<"mimerl">>, <<"67E2D3F571088D5CFD3E550C383094B47159F3EEE8FFA08E64106CDF5E981BE3">>}, + {<<"opentelemetry_api">>, <<"7B69ED4F40025C005DE0B74FCE8C0549625D59CB4DF12D15C32FE6DC5076FF42">>}, + {<<"opentelemetry_semantic_conventions">>, <<"B67FE459C2938FCAB341CB0951C44860C62347C005ACE1B50F8402576F241435">>}, {<<"parse_trans">>, <<"16328AB840CC09919BD10DAB29E431DA3AF9E9E7E7E6F0089DD5A2D2820011D8">>}, + {<<"prometheus">>, <<"FA76B152555273739C14B06F09F485CF6D5D301FE4E9D31B7FF803D26025D7A0">>}, + {<<"quantile_estimator">>, <<"EF50A361F11B5F26B5F16D0696E46A9E4661756492C981F7B2229EF42FF1CD15">>}, {<<"ranch">>, <<"8C7A100A139FD57F17327B6413E4167AC559FBC04CA7448E9BE9057311597A1D">>}, {<<"ssl_verify_fun">>, <<"CF344F5692C82D2CD7554F5EC8FD961548D4FD09E7D22F5B62482E5AEAEBD4B0">>}, {<<"unicode_util_compat">>, <<"BC84380C9AB48177092F43AC89E4DFA2C6D62B40B8BD132B1059ECC7232F9A78">>}]}, @@ -45,7 +55,11 @@ {<<"idna">>, <<"92376EB7894412ED19AC475E4A86F7B413C1B9FBB5BD16DCCD57934157944CEA">>}, {<<"metrics">>, <<"69B09ADDDC4F74A40716AE54D140F93BEB0FB8978D8636EADED0C31B6F099F16">>}, {<<"mimerl">>, <<"F278585650AA581986264638EBF698F8BB19DF297F66AD91B18910DFC6E19323">>}, + {<<"opentelemetry_api">>, <<"6D7A27B7CAD2AD69A09CABF6670514CAFCEC717C8441BEB5C96322BAC3D05350">>}, + {<<"opentelemetry_semantic_conventions">>, <<"D61FA1F5639EE8668D74B527E6806E0503EFC55A42DB7B5F39939D84C07D6895">>}, {<<"parse_trans">>, <<"07CD9577885F56362D414E8C4C4E6BDF10D43A8767ABB92D24CBE8B24C54888B">>}, + {<<"prometheus">>, <<"6EDFBE928D271C7F657A6F2C46258738086584BD6CAE4A000B8B9A6009BA23A5">>}, + {<<"quantile_estimator">>, <<"282A8A323CA2A845C9E6F787D166348F776C1D4A41EDE63046D72D422E3DA946">>}, {<<"ranch">>, <<"49FBCFD3682FAB1F5D109351B61257676DA1A2FDBE295904176D5E521A2DDFE5">>}, {<<"ssl_verify_fun">>, <<"BDB0D2471F453C88FF3908E7686F86F9BE327D065CC1EC16FA4540197EA04680">>}, {<<"unicode_util_compat">>, <<"25EEE6D67DF61960CF6A794239566599B09E17E668D3700247BC498638152521">>}]} diff --git a/src/woody.app.src b/src/woody.app.src index 2ca266b..0ae1ed0 100644 --- a/src/woody.app.src +++ b/src/woody.app.src @@ -10,7 +10,8 @@ cowboy, hackney, gproc, - cache + cache, + opentelemetry_api ]}, {optional_applications, [ thrift diff --git a/src/woody_client_thrift_http_transport.erl b/src/woody_client_thrift_http_transport.erl index 95c4cc9..874c455 100644 --- a/src/woody_client_thrift_http_transport.erl +++ b/src/woody_client_thrift_http_transport.erl @@ -128,7 +128,7 @@ send(Url, Body, Options, ResOpts, WoodyState) -> Headers = add_host_header(OldUrl, make_woody_headers(Context)), Options1 = set_defaults(Options), Options2 = set_timeouts(Options1, Context), - HeaderList = maps:to_list(Headers), + HeaderList = otel_propagator_text_map:inject(maps:to_list(Headers)), Result = hackney:request(post, NewUrl, HeaderList, Body, maps:to_list(Options2)), transform_request_results(Result); {error, Reason} -> diff --git a/src/woody_client_thrift_v2.erl b/src/woody_client_thrift_v2.erl index 383f933..dfe4ba8 100644 --- a/src/woody_client_thrift_v2.erl +++ b/src/woody_client_thrift_v2.erl @@ -155,7 +155,8 @@ send_call(Buffer, #{url := Url} = Opts, WoodyState) -> % reusing keep-alive connections to dead hosts case woody_resolver:resolve_url(Url, WoodyState, ResolverOpts) of {ok, {OldUrl, NewUrl}, ConnectOpts} -> - Headers = add_host_header(OldUrl, make_woody_headers(Context)), + Headers0 = add_host_header(OldUrl, make_woody_headers(Context)), + Headers1 = otel_propagator_text_map:inject(Headers0), TransportOpts1 = set_defaults(TransportOpts), TransportOpts2 = set_timeouts(TransportOpts1, Context), % NOTE @@ -163,7 +164,7 @@ send_call(Buffer, #{url := Url} = Opts, WoodyState) -> % `set_tls_overrides/2`. TransportOpts3 = append_connect_opts(TransportOpts2, ConnectOpts), TransportOpts4 = set_tls_overrides(TransportOpts3, OldUrl), - Result = hackney:request(post, NewUrl, Headers, Buffer, maps:to_list(TransportOpts4)), + Result = hackney:request(post, NewUrl, Headers1, Buffer, maps:to_list(TransportOpts4)), handle_response(Result, WoodyState); {error, Reason} -> Error = {error, {resolve_failed, Reason}}, diff --git a/src/woody_event_handler.erl b/src/woody_event_handler.erl index f819d41..493eea4 100644 --- a/src/woody_event_handler.erl +++ b/src/woody_event_handler.erl @@ -4,6 +4,7 @@ -export([handle_event/3, handle_event/4]). -export([get_event_severity/2]). +-export([get_event_severity/3]). -export([format_event/3, format_event/4]). -export([format_meta/2, format_meta/3]). -export([format_rpc_id/1]). @@ -221,7 +222,8 @@ -export_type([meta/0]). -type options() :: #{ - formatter_opts => woody_event_formatter:opts() + formatter_opts => woody_event_formatter:opts(), + events_severity => #{nonempty_list(atom()) => severity()} }. -export_type([options/0]). @@ -379,68 +381,77 @@ format_event(UnknownEventType, Meta, _Opts) -> {" unknown woody event type '~s' with meta ~p", [UnknownEventType, Meta]}. -spec get_event_severity(event(), event_meta()) -> severity(). -get_event_severity(?EV_CLIENT_BEGIN, _Meta) -> - debug; -get_event_severity(?EV_CLIENT_END, _Meta) -> - debug; -get_event_severity(?EV_CALL_SERVICE, _Meta) -> - info; -get_event_severity(?EV_SERVICE_RESULT, #{status := ok}) -> - info; -get_event_severity(?EV_SERVICE_RESULT, #{status := error, stack := _Stack}) -> - error; -get_event_severity(?EV_SERVICE_RESULT, #{status := error, result := _Error}) -> - warning; -get_event_severity(?EV_CLIENT_SEND, _Meta) -> - debug; -get_event_severity(?EV_CLIENT_RESOLVE_BEGIN, _Meta) -> - debug; -get_event_severity(?EV_CLIENT_RESOLVE_RESULT, _Meta) -> - debug; -get_event_severity(?EV_CLIENT_RECEIVE, #{status := ok}) -> - debug; -get_event_severity(?EV_CLIENT_RECEIVE, #{status := error}) -> - warning; -get_event_severity(?EV_SERVER_RECEIVE, _Meta) -> - debug; -get_event_severity(?EV_SERVER_SEND, #{status := ok}) -> - debug; -get_event_severity(?EV_SERVER_SEND, #{status := error}) -> - warning; -get_event_severity(?EV_INVOKE_SERVICE_HANDLER, _Meta) -> - info; -get_event_severity(?EV_SERVICE_HANDLER_RESULT, #{status := ok}) -> - info; -get_event_severity(?EV_SERVICE_HANDLER_RESULT, #{status := error, class := business}) -> - info; -get_event_severity(?EV_SERVICE_HANDLER_RESULT, #{status := error, class := system, stack := _Stack}) -> - error; -get_event_severity(?EV_SERVICE_HANDLER_RESULT, #{status := error, class := system, result := _Error}) -> - warning; -get_event_severity(?EV_CLIENT_CACHE_BEGIN, _Meta) -> - debug; -get_event_severity(?EV_CLIENT_CACHE_END, _Meta) -> - debug; -get_event_severity(?EV_CLIENT_CACHE_HIT, _Meta) -> - info; -get_event_severity(?EV_CLIENT_CACHE_MISS, _Meta) -> - debug; -get_event_severity(?EV_CLIENT_CACHE_UPDATE, _Meta) -> - debug; -get_event_severity(?EV_CLIENT_CACHE_RESULT, _Meta) -> - debug; -get_event_severity(?EV_INTERNAL_ERROR, #{error := _, class := _Class, stack := _Stack}) -> - error; -get_event_severity(?EV_INTERNAL_ERROR, #{error := _, reason := _Reason}) -> - warning; -get_event_severity(?EV_TRACE, _Meta) -> - debug; -get_event_severity(_UnknownEventType, _Meta) -> - warning. +get_event_severity(Event, Meta) -> + get_event_severity(Event, Meta, #{}). + +-spec get_event_severity(event(), event_meta(), options()) -> severity(). +get_event_severity(?EV_CLIENT_BEGIN = Event, _Meta, Opts) -> + map_severity([Event], Opts, debug); +get_event_severity(?EV_CLIENT_END = Event, _Meta, Opts) -> + map_severity([Event], Opts, debug); +get_event_severity(?EV_CALL_SERVICE = Event, _Meta, Opts) -> + map_severity([Event], Opts, info); +get_event_severity(?EV_SERVICE_RESULT = Event, #{status := ok}, Opts) -> + map_severity([Event], Opts, info); +get_event_severity(?EV_SERVICE_RESULT = Event, #{status := error, stack := _Stack}, Opts) -> + map_severity([Event, error], Opts, error); +get_event_severity(?EV_SERVICE_RESULT = Event, #{status := error, result := _Error}, Opts) -> + map_severity([Event, warning], Opts, warning); +get_event_severity(?EV_CLIENT_SEND = Event, _Meta, Opts) -> + map_severity([Event], Opts, debug); +get_event_severity(?EV_CLIENT_RESOLVE_BEGIN = Event, _Meta, Opts) -> + map_severity([Event], Opts, debug); +get_event_severity(?EV_CLIENT_RESOLVE_RESULT = Event, _Meta, Opts) -> + map_severity([Event], Opts, debug); +get_event_severity(?EV_CLIENT_RECEIVE = Event, #{status := ok}, Opts) -> + map_severity([Event], Opts, debug); +get_event_severity(?EV_CLIENT_RECEIVE = Event, #{status := error}, Opts) -> + map_severity([Event, error], Opts, warning); +get_event_severity(?EV_SERVER_RECEIVE = Event, _Meta, Opts) -> + map_severity([Event], Opts, debug); +get_event_severity(?EV_SERVER_SEND = Event, #{status := ok}, Opts) -> + map_severity([Event], Opts, debug); +get_event_severity(?EV_SERVER_SEND = Event, #{status := error}, Opts) -> + map_severity([Event, error], Opts, warning); +get_event_severity(?EV_INVOKE_SERVICE_HANDLER = Event, _Meta, Opts) -> + map_severity([Event], Opts, info); +get_event_severity(?EV_SERVICE_HANDLER_RESULT = Event, #{status := ok}, Opts) -> + map_severity([Event], Opts, info); +get_event_severity(?EV_SERVICE_HANDLER_RESULT = Event, #{status := error, class := business}, Opts) -> + map_severity([Event, error, business], Opts, info); +get_event_severity(?EV_SERVICE_HANDLER_RESULT = Event, #{status := error, class := system, stack := _Stack}, Opts) -> + map_severity([Event, error, system], Opts, error); +get_event_severity(?EV_SERVICE_HANDLER_RESULT = Event, #{status := error, class := system, result := _Error}, Opts) -> + map_severity([Event, warning], Opts, warning); +get_event_severity(?EV_CLIENT_CACHE_BEGIN = Event, _Meta, Opts) -> + map_severity([Event], Opts, debug); +get_event_severity(?EV_CLIENT_CACHE_END = Event, _Meta, Opts) -> + map_severity([Event], Opts, debug); +get_event_severity(?EV_CLIENT_CACHE_HIT = Event, _Meta, Opts) -> + map_severity([Event], Opts, info); +get_event_severity(?EV_CLIENT_CACHE_MISS = Event, _Meta, Opts) -> + map_severity([Event], Opts, debug); +get_event_severity(?EV_CLIENT_CACHE_UPDATE = Event, _Meta, Opts) -> + map_severity([Event], Opts, debug); +get_event_severity(?EV_CLIENT_CACHE_RESULT = Event, _Meta, Opts) -> + map_severity([Event], Opts, debug); +get_event_severity(?EV_INTERNAL_ERROR = Event, #{error := _, class := _Class, stack := _Stack}, Opts) -> + map_severity([Event, system], Opts, error); +get_event_severity(?EV_INTERNAL_ERROR = Event, #{error := _, reason := _Reason}, Opts) -> + map_severity([Event, business], Opts, warning); +get_event_severity(?EV_TRACE = Event, _Meta, Opts) -> + map_severity([Event], Opts, debug); +get_event_severity(UnknownEventType, _Meta, Opts) -> + map_severity([UnknownEventType], Opts, warning). %% %% Internal functions %% +-spec map_severity(nonempty_list(atom()), options(), severity()) -> severity(). +map_severity(Code, Opts, Default) -> + EventsSeverity = maps:get(events_severity, Opts, #{}), + maps:get(Code, EventsSeverity, Default). + -spec format_service_request(map(), options()) -> msg(). format_service_request(#{service_schema := {Module, Service}, function := Function, args := Args}, Opts) -> woody_event_formatter:format_call(Module, Service, Function, Args, get_formatter_opts(Opts)). @@ -1582,4 +1593,157 @@ exception_test_() -> ) ]. +-spec event_severity_defaults_test_() -> _. +event_severity_defaults_test_() -> + Opts = #{events_severity => default_severity_map()}, + Meta = #{role => server}, + [ + ?_assertEqual(debug, get_event_severity(?EV_CLIENT_BEGIN, Meta, Opts)), + ?_assertEqual(debug, get_event_severity(?EV_CLIENT_END, Meta, Opts)), + + ?_assertEqual(info, get_event_severity(?EV_CALL_SERVICE, Meta, Opts)), + + ?_assertEqual(info, get_event_severity(?EV_SERVICE_RESULT, Meta#{status => ok}, Opts)), + ?_assertEqual(error, get_event_severity(?EV_SERVICE_RESULT, Meta#{status => error, stack => []}, Opts)), + ?_assertEqual(warning, get_event_severity(?EV_SERVICE_RESULT, Meta#{status => error, result => {}}, Opts)), + + ?_assertEqual(debug, get_event_severity(?EV_CLIENT_SEND, Meta, Opts)), + ?_assertEqual(debug, get_event_severity(?EV_CLIENT_RESOLVE_BEGIN, Meta, Opts)), + ?_assertEqual(debug, get_event_severity(?EV_CLIENT_RESOLVE_RESULT, Meta, Opts)), + ?_assertEqual(debug, get_event_severity(?EV_CLIENT_RECEIVE, Meta#{status => ok}, Opts)), + ?_assertEqual(warning, get_event_severity(?EV_CLIENT_RECEIVE, Meta#{status => error}, Opts)), + + ?_assertEqual(debug, get_event_severity(?EV_SERVER_RECEIVE, Meta, Opts)), + ?_assertEqual(debug, get_event_severity(?EV_SERVER_SEND, Meta#{status => ok}, Opts)), + ?_assertEqual(warning, get_event_severity(?EV_SERVER_SEND, Meta#{status => error}, Opts)), + + ?_assertEqual(info, get_event_severity(?EV_INVOKE_SERVICE_HANDLER, Meta, Opts)), + + ?_assertEqual(info, get_event_severity(?EV_SERVICE_HANDLER_RESULT, Meta#{status => ok}, Opts)), + ?_assertEqual( + info, get_event_severity(?EV_SERVICE_HANDLER_RESULT, Meta#{status => error, class => business}, Opts) + ), + ?_assertEqual( + error, + get_event_severity(?EV_SERVICE_HANDLER_RESULT, Meta#{status => error, class => system, stack => []}, Opts) + ), + ?_assertEqual( + warning, + get_event_severity(?EV_SERVICE_HANDLER_RESULT, Meta#{status => error, class => system, result => {}}, Opts) + ), + + ?_assertEqual(debug, get_event_severity(?EV_CLIENT_CACHE_BEGIN, Meta, Opts)), + ?_assertEqual(debug, get_event_severity(?EV_CLIENT_CACHE_END, Meta, Opts)), + ?_assertEqual(info, get_event_severity(?EV_CLIENT_CACHE_HIT, Meta, Opts)), + ?_assertEqual(debug, get_event_severity(?EV_CLIENT_CACHE_MISS, Meta, Opts)), + ?_assertEqual(debug, get_event_severity(?EV_CLIENT_CACHE_UPDATE, Meta, Opts)), + ?_assertEqual(debug, get_event_severity(?EV_CLIENT_CACHE_RESULT, Meta, Opts)), + + ?_assertEqual( + error, get_event_severity(?EV_INTERNAL_ERROR, Meta#{error => test, class => throw, stack => []}, Opts) + ), + ?_assertEqual(warning, get_event_severity(?EV_INTERNAL_ERROR, Meta#{error => test, reason => test}, Opts)), + + ?_assertEqual(debug, get_event_severity(?EV_TRACE, Meta, Opts)) + ]. + +-spec event_severity_all_info_test_() -> _. +event_severity_all_info_test_() -> + ExpectAll = info, + Opts = #{events_severity => replace_all_with(ExpectAll, default_severity_map())}, + Meta = #{role => server}, + [ + ?_assertEqual(ExpectAll, get_event_severity(?EV_CLIENT_BEGIN, Meta, Opts)), + ?_assertEqual(ExpectAll, get_event_severity(?EV_CLIENT_END, Meta, Opts)), + + ?_assertEqual(ExpectAll, get_event_severity(?EV_CALL_SERVICE, Meta, Opts)), + + ?_assertEqual(ExpectAll, get_event_severity(?EV_SERVICE_RESULT, Meta#{status => ok}, Opts)), + ?_assertEqual(ExpectAll, get_event_severity(?EV_SERVICE_RESULT, Meta#{status => error, stack => []}, Opts)), + ?_assertEqual(ExpectAll, get_event_severity(?EV_SERVICE_RESULT, Meta#{status => error, result => {}}, Opts)), + + ?_assertEqual(ExpectAll, get_event_severity(?EV_CLIENT_SEND, Meta, Opts)), + ?_assertEqual(ExpectAll, get_event_severity(?EV_CLIENT_RESOLVE_BEGIN, Meta, Opts)), + ?_assertEqual(ExpectAll, get_event_severity(?EV_CLIENT_RESOLVE_RESULT, Meta, Opts)), + ?_assertEqual(ExpectAll, get_event_severity(?EV_CLIENT_RECEIVE, Meta#{status => ok}, Opts)), + ?_assertEqual(ExpectAll, get_event_severity(?EV_CLIENT_RECEIVE, Meta#{status => error}, Opts)), + + ?_assertEqual(ExpectAll, get_event_severity(?EV_SERVER_RECEIVE, Meta, Opts)), + ?_assertEqual(ExpectAll, get_event_severity(?EV_SERVER_SEND, Meta#{status => ok}, Opts)), + ?_assertEqual(ExpectAll, get_event_severity(?EV_SERVER_SEND, Meta#{status => error}, Opts)), + + ?_assertEqual(ExpectAll, get_event_severity(?EV_INVOKE_SERVICE_HANDLER, Meta, Opts)), + + ?_assertEqual(ExpectAll, get_event_severity(?EV_SERVICE_HANDLER_RESULT, Meta#{status => ok}, Opts)), + ?_assertEqual( + ExpectAll, get_event_severity(?EV_SERVICE_HANDLER_RESULT, Meta#{status => error, class => business}, Opts) + ), + ?_assertEqual( + ExpectAll, + get_event_severity(?EV_SERVICE_HANDLER_RESULT, Meta#{status => error, class => system, stack => []}, Opts) + ), + ?_assertEqual( + ExpectAll, + get_event_severity(?EV_SERVICE_HANDLER_RESULT, Meta#{status => error, class => system, result => {}}, Opts) + ), + + ?_assertEqual(ExpectAll, get_event_severity(?EV_CLIENT_CACHE_BEGIN, Meta, Opts)), + ?_assertEqual(ExpectAll, get_event_severity(?EV_CLIENT_CACHE_END, Meta, Opts)), + ?_assertEqual(ExpectAll, get_event_severity(?EV_CLIENT_CACHE_HIT, Meta, Opts)), + ?_assertEqual(ExpectAll, get_event_severity(?EV_CLIENT_CACHE_MISS, Meta, Opts)), + ?_assertEqual(ExpectAll, get_event_severity(?EV_CLIENT_CACHE_UPDATE, Meta, Opts)), + ?_assertEqual(ExpectAll, get_event_severity(?EV_CLIENT_CACHE_RESULT, Meta, Opts)), + + ?_assertEqual( + ExpectAll, get_event_severity(?EV_INTERNAL_ERROR, Meta#{error => test, class => throw, stack => []}, Opts) + ), + ?_assertEqual(ExpectAll, get_event_severity(?EV_INTERNAL_ERROR, Meta#{error => test, reason => test}, Opts)), + + ?_assertEqual(ExpectAll, get_event_severity(?EV_TRACE, Meta, Opts)) + ]. + +replace_all_with(NewSeverity, SeverityMap) -> + maps:map(fun(_K, _V) -> NewSeverity end, SeverityMap). + +default_severity_map() -> + #{ + ['client begin'] => debug, + ['client end'] => debug, + + ['call service'] => info, + + ['service result'] => info, + ['service result', error] => error, + ['service result', warning] => warning, + + ['client send'] => debug, + ['client resolve begin'] => debug, + ['client resolve result'] => debug, + ['client receive'] => debug, + ['client receive', error] => warning, + + ['server receive'] => debug, + ['server send'] => debug, + ['server send', error] => warning, + + ['invoke service handler'] => info, + + ['service handler result'] => info, + ['service handler result', error, business] => info, + ['service handler result', error, system] => error, + ['service handler result', warning] => warning, + + ['client cache begin'] => debug, + ['client cache end'] => debug, + ['client cache hit'] => info, + ['client cache miss'] => debug, + ['client cache update'] => debug, + ['client cache result'] => debug, + + ['internal error', system] => error, + ['internal error', business] => warning, + + ['trace event'] => debug + }. + -endif. diff --git a/src/woody_event_handler_default.erl b/src/woody_event_handler_default.erl index e7efdfc..9dcb6dd 100644 --- a/src/woody_event_handler_default.erl +++ b/src/woody_event_handler_default.erl @@ -22,7 +22,8 @@ handle_event(Event, RpcId, Meta, Opts) -> EHOpts = get_event_handler_opts(Opts), {Format, Msg} = woody_event_handler:format_event(Event, Meta, RpcId, EHOpts), - Function = get_logger_function(woody_event_handler:get_event_severity(Event, Meta)), + Severity = woody_event_handler:get_event_severity(Event, Meta, Opts), + Function = get_logger_function(Severity), _ = error_logger:Function(Format, Msg), ok. diff --git a/src/woody_event_handler_otel.erl b/src/woody_event_handler_otel.erl new file mode 100644 index 0000000..4307dac --- /dev/null +++ b/src/woody_event_handler_otel.erl @@ -0,0 +1,100 @@ +-module(woody_event_handler_otel). + +-include_lib("opentelemetry_api/include/opentelemetry.hrl"). +-include("woody_defs.hrl"). + +-behaviour(woody_event_handler). + +-export([handle_event/4]). + +-spec handle_event(Event, RpcId, Meta, Opts) -> ok when + Event :: woody_event_handler:event(), + RpcId :: woody:rpc_id() | undefined, + Meta :: woody_event_handler:event_meta(), + Opts :: woody:options(). + +-define(IS_SPAN_START(Event), Event =:= ?EV_CALL_SERVICE orelse Event =:= ?EV_INVOKE_SERVICE_HANDLER). +-define(IS_SPAN_END(Event), Event =:= ?EV_SERVICE_RESULT orelse Event =:= ?EV_SERVICE_HANDLER_RESULT). +-define(IS_CLIENT_INTERNAL(Event), + Event =:= ?EV_CLIENT_CACHE_HIT orelse + Event =:= ?EV_CLIENT_CACHE_MISS orelse + Event =:= ?EV_CLIENT_CACHE_UPDATE orelse + Event =:= ?EV_CLIENT_SEND orelse + Event =:= ?EV_CLIENT_RECEIVE +). + +%% Client events +handle_event(Event, RpcID, _Meta = #{url := Url}, _Opts) when ?IS_CLIENT_INTERNAL(Event) -> + with_span(otel_ctx:get_current(), mk_ref(RpcID), fun(SpanCtx) -> + _ = otel_span:add_event(SpanCtx, atom_to_binary(Event), #{url => Url}) + end); +%% Internal error handling +handle_event(?EV_INTERNAL_ERROR, RpcID, Meta = #{error := Error, class := Class, reason := Reason}, _Opts) -> + Stacktrace = maps:get(stack, Meta, []), + Details = io_lib:format("~ts: ~ts", [Error, Reason]), + with_span(otel_ctx:get_current(), mk_ref(RpcID), fun(SpanCtx) -> + _ = otel_span:record_exception(SpanCtx, genlib:define(Class, error), Details, Stacktrace, #{}), + otel_maybe_cleanup(Meta, SpanCtx) + end); +%% Registers span starts/ends for woody client calls and woody server function invocations. +handle_event(Event, RpcID, Meta, _Opts) when ?IS_SPAN_START(Event) -> + Tracer = opentelemetry:get_application_tracer(?MODULE), + span_start(Tracer, otel_ctx:get_current(), mk_ref(RpcID), mk_name(Meta), mk_opts(Event)); +handle_event(Event, RpcID, Meta, _Opts) when ?IS_SPAN_END(Event) -> + span_end(otel_ctx:get_current(), mk_ref(RpcID), fun(SpanCtx) -> + otel_maybe_erroneous_result(SpanCtx, Meta) + end); +handle_event(_Event, _RpcID, _Meta, _Opts) -> + ok. + +%% + +span_start(Tracer, Ctx, Key, SpanName, Opts) -> + SpanCtx = otel_tracer:start_span(Ctx, Tracer, SpanName, Opts), + Ctx1 = woody_util:span_stack_put(Key, SpanCtx, Ctx), + Ctx2 = otel_tracer:set_current_span(Ctx1, SpanCtx), + _ = otel_ctx:attach(Ctx2), + ok. + +span_end(Ctx, Key, OnBeforeEnd) -> + case woody_util:span_stack_pop(Key, Ctx) of + {error, notfound} -> + ok; + {ok, SpanCtx, ParentSpanCtx, Ctx1} -> + SpanCtx1 = OnBeforeEnd(SpanCtx), + _ = otel_span:end_span(SpanCtx1, undefined), + Ctx2 = otel_tracer:set_current_span(Ctx1, ParentSpanCtx), + _ = otel_ctx:attach(Ctx2), + ok + end. + +with_span(Ctx, Key, F) -> + SpanCtx = woody_util:span_stack_get(Key, Ctx, otel_tracer:current_span_ctx(Ctx)), + _ = F(SpanCtx), + ok. + +otel_maybe_cleanup(#{final := true}, SpanCtx) -> + _ = otel_span:end_span(SpanCtx, undefined), + otel_ctx:clear(), + ok; +otel_maybe_cleanup(_Meta, _SpanCtx) -> + ok. + +otel_maybe_erroneous_result(SpanCtx, Meta = #{status := error, result := Reason}) -> + Class = maps:get(except_class, Meta, error), + Stacktrace = maps:get(stack, Meta, []), + _ = otel_span:record_exception(SpanCtx, Class, Reason, Stacktrace, #{}), + SpanCtx; +otel_maybe_erroneous_result(SpanCtx, _Meta) -> + SpanCtx. + +mk_opts(?EV_CALL_SERVICE) -> + #{kind => ?SPAN_KIND_CLIENT}; +mk_opts(?EV_INVOKE_SERVICE_HANDLER) -> + #{kind => ?SPAN_KIND_SERVER}. + +mk_ref(#{span_id := WoodySpanId}) -> + WoodySpanId. + +mk_name(#{role := Role, service := Service, function := Function}) -> + woody_util:to_binary([Role, " ", Service, ":", Function]). diff --git a/src/woody_hackney_prometheus.erl b/src/woody_hackney_prometheus.erl new file mode 100644 index 0000000..193b011 --- /dev/null +++ b/src/woody_hackney_prometheus.erl @@ -0,0 +1,134 @@ +%% Translates hackney metrics for prometheus. +%% See +%% https://github.com/benoitc/hackney/tree/1.18.0?tab=readme-ov-file#metrics +%% +%% For pool metrics see dedicated collector module +%% `woody_hackney_prometheus_collector`. +-module(woody_hackney_prometheus). + +-export([ + new/2, + delete/1, + increment_counter/1, + increment_counter/2, + decrement_counter/1, + decrement_counter/2, + update_histogram/2, + update_gauge/2, + update_meter/2 +]). + +-type name() :: any(). +-type metric() :: counter | histogram | gauge | meter. + +%% API + +-define(NB_REQUESTS, hackney_nb_requests). +-define(TOTAL_REQUESTS, hackney_total_requests). +-define(HOST_NB_REQUESTS, hackney_host_nb_requests). +-define(HOST_CONNECT_TIMEOUT, hackney_host_connect_timeout). +-define(HOST_CONNECT_ERROR, hackney_host_connect_error). +-define(HOST_NEW_CONNECTION, hackney_host_new_connection). +-define(HOST_REUSE_CONNECTION, hackney_host_reuse_connection). + +%% Ignore unsupported metric +-spec new(metric(), name()) -> ok. +%% Total counters +new(counter, [hackney, nb_requests]) -> + true = prometheus_gauge:declare([ + {name, ?NB_REQUESTS}, + {registry, registry()}, + {labels, []}, + {help, "Number of running requests."} + ]), + %% Per host counters, see + %% https://github.com/benoitc/hackney/tree/1.18.0?tab=readme-ov-file#metrics-per-hosts + %% NOTE Hackney won't call `metrics:new/3` for those counters + true = prometheus_gauge:declare([ + {name, ?HOST_NB_REQUESTS}, + {registry, registry()}, + {labels, [host]}, + {help, "Number of running requests."} + ]), + [ + true = prometheus_counter:declare([ + {name, Name}, + {registry, registry()}, + {labels, [host]}, + {help, Help} + ]) + || {Name, Help} <- [ + {?HOST_CONNECT_TIMEOUT, "Number of connect timeout."}, + {?HOST_CONNECT_ERROR, "Number of timeout errors."}, + {?HOST_NEW_CONNECTION, "Number of new pool connections per host."}, + {?HOST_REUSE_CONNECTION, "Number of reused pool connections per host."} + ] + ], + ok; +new(counter, [hackney, total_requests]) -> + true = prometheus_counter:declare([ + {name, ?TOTAL_REQUESTS}, + {registry, registry()}, + {labels, []}, + {help, "Total number of requests."} + ]), + ok; +new(_Type, _Name) -> + ok. + +-spec delete(name()) -> ok. +delete(_Name) -> + ok. + +-spec increment_counter(name()) -> ok. +increment_counter(Name) -> + increment_counter(Name, 1). + +-spec increment_counter(name(), pos_integer()) -> ok. +increment_counter([hackney, nb_requests], Value) -> + prometheus_gauge:inc(registry(), ?NB_REQUESTS, [], Value); +increment_counter([hackney, total_requests], Value) -> + prometheus_counter:inc(registry(), ?TOTAL_REQUESTS, [], Value); +increment_counter([hackney, Host, nb_requests], Value) -> + prometheus_gauge:inc(registry(), ?HOST_NB_REQUESTS, [Host], Value); +increment_counter([hackney, Host, connect_timeout], Value) -> + prometheus_counter:inc(registry(), ?HOST_CONNECT_TIMEOUT, [Host], Value); +increment_counter([hackney, Host, connect_error], Value) -> + prometheus_counter:inc(registry(), ?HOST_CONNECT_ERROR, [Host], Value); +increment_counter([hackney_pool, Host, new_connection], Value) -> + prometheus_counter:inc(registry(), ?HOST_NEW_CONNECTION, [Host], Value); +increment_counter([hackney_pool, Host, reuse_connection], Value) -> + prometheus_counter:inc(registry(), ?HOST_REUSE_CONNECTION, [Host], Value); +increment_counter(_Name, _Value) -> + ok. + +-spec decrement_counter(name()) -> ok. +decrement_counter(Name) -> + decrement_counter(Name, 1). + +-spec decrement_counter(name(), pos_integer()) -> ok. +decrement_counter([hackney, nb_requests], Value) -> + prometheus_gauge:dec(registry(), ?NB_REQUESTS, [], Value); +decrement_counter([hackney, Host, nb_requests], Value) -> + prometheus_gauge:dec(registry(), ?HOST_NB_REQUESTS, [Host], Value); +decrement_counter(_Name, _Value) -> + ok. + +-spec update_histogram(name(), fun(() -> ok) | number()) -> ok. +update_histogram(_Name, Fun) when is_function(Fun, 0) -> + Fun(); +update_histogram(_Name, _Value) -> + ok. + +-spec update_gauge(name(), number()) -> ok. +update_gauge(_Name, _Value) -> + ok. + +-spec update_meter(name(), number()) -> ok. +update_meter(_Name, _Value) -> + ok. + +%% + +registry() -> + default. diff --git a/src/woody_hackney_prometheus_collector.erl b/src/woody_hackney_prometheus_collector.erl new file mode 100644 index 0000000..56dd06d --- /dev/null +++ b/src/woody_hackney_prometheus_collector.erl @@ -0,0 +1,84 @@ +-module(woody_hackney_prometheus_collector). + +-export([setup/0]). + +%% + +-behaviour(prometheus_collector). + +-export([collect_mf/2]). +-export([collect_metrics/2]). +-export([deregister_cleanup/1]). + +-define(POOL_USAGE, woody_hackney_pool_usage). + +%% Installation + +%% @doc Installs custom collector for hackney's pool metrics +-spec setup() -> ok. +setup() -> + prometheus_registry:register_collector(registry(), ?MODULE). + +%% Collector API + +-type data() :: [data_item()]. +-type data_item() :: {data_labels(), non_neg_integer()}. +-type data_labels() :: [{atom(), atom() | nonempty_string() | binary() | iolist()}]. +-type pool_stats() :: [{atom(), any()}]. + +-spec collect_mf(prometheus_registry:registry(), prometheus_collector:collect_mf_callback()) -> ok. +collect_mf(_Registry, Callback) -> + F = fun({Pool, _Pid}) -> + make_pool_data(Pool, get_pool_stats(Pool)) + end, + Data = lists:flatten(lists:map(F, get_hackney_pools())), + _ = Callback(create_gauge(Data)), + ok. + +-spec collect_metrics(prometheus_metric:name(), data()) -> + prometheus_model:'Metric'() | [prometheus_model:'Metric'()]. +collect_metrics(_Name, Data) -> + [prometheus_model_helpers:gauge_metric(Labels, Value) || {Labels, Value} <- Data]. + +-spec deregister_cleanup(prometheus_registry:registry()) -> ok. +deregister_cleanup(_Registry) -> + %% Nothing to clean up + ok. + +%% Private + +registry() -> + default. + +get_pool_stats(Pool) -> + %% NOTE It looks like 'hackney_pool' table data can occasionally contain + %% dead pools + try + hackney_pool:get_stats(Pool) + catch + %% "Time to make the chimi-fuckin'-changas." + exit:{noproc, _Reason} -> + [] + end. + +-spec create_gauge(data()) -> prometheus_model:'MetricFamily'(). +create_gauge(Data) -> + Help = "Connection pool status by used, free and queued connections count", + prometheus_model_helpers:create_mf(?POOL_USAGE, Help, gauge, ?MODULE, Data). + +-spec make_pool_data(atom(), pool_stats()) -> data(). +make_pool_data(Pool, Stats0) -> + Stats1 = maps:with([in_use_count, free_count, queue_count], maps:from_list(Stats0)), + lists:foldl(fun({S, V}, Data) -> [make_data_item(Pool, S, V) | Data] end, [], maps:to_list(Stats1)). + +make_data_item(Pool, Status, Value) -> + Labels = [{pool, Pool}, {status, Status}], + {Labels, Value}. + +get_hackney_pools() -> + %% Shamelessly pasted from + %% https://github.com/soundtrackyourbrand/prometheus-hackney-collector + %% + %% Basically, we rely on not publicly exposed table containing started pools + %% under hackney's hood. + ets:tab2list(hackney_pool). diff --git a/src/woody_ranch_prometheus_collector.erl b/src/woody_ranch_prometheus_collector.erl new file mode 100644 index 0000000..2ff92f8 --- /dev/null +++ b/src/woody_ranch_prometheus_collector.erl @@ -0,0 +1,84 @@ +-module(woody_ranch_prometheus_collector). + +-export([setup/0]). + +%% + +-behaviour(prometheus_collector). + +-export([collect_mf/2]). +-export([collect_metrics/2]). +-export([deregister_cleanup/1]). + +-define(CONNECTIONS, woody_ranch_listener_connections). + +%% Installation + +-spec setup() -> ok. +setup() -> + prometheus_registry:register_collector(registry(), ?MODULE). + +%% Collector API + +-type data() :: [data_item()]. +-type data_item() :: {data_labels(), non_neg_integer()}. +-type data_labels() :: [{atom(), atom() | nonempty_string() | binary() | iolist()}]. +-type maybe_woody_server_ref() :: {module(), ID :: atom()} | ranch:ref(). +-type ranch_info() :: + [{maybe_woody_server_ref(), [{atom(), any()}]}] + | #{maybe_woody_server_ref() => #{atom() => any()}}. + +-spec collect_mf(prometheus_registry:registry(), prometheus_collector:collect_mf_callback()) -> ok. +collect_mf(_Registry, Callback) -> + F = fun({ListenerRef, ListenerInfo}) -> + make_listener_data(ListenerRef, ListenerInfo) + end, + Data = lists:flatten(lists:map(F, get_listeners_info())), + _ = Callback(create_gauge(Data)), + ok. + +-spec collect_metrics(prometheus_metric:name(), data()) -> + prometheus_model:'Metric'() | [prometheus_model:'Metric'()]. +collect_metrics(_Name, Data) -> + [prometheus_model_helpers:gauge_metric(Labels, Value) || {Labels, Value} <- Data]. + +-spec deregister_cleanup(prometheus_registry:registry()) -> ok. +deregister_cleanup(_Registry) -> + %% Nothing to clean up + ok. + +%% Private + +registry() -> + default. + +-spec create_gauge(data()) -> prometheus_model:'MetricFamily'(). +create_gauge(Data) -> + Help = "Number of active connections", + prometheus_model_helpers:create_mf(?CONNECTIONS, Help, gauge, ?MODULE, Data). + +-spec make_listener_data(maybe_woody_server_ref(), #{atom() => any()}) -> data(). +make_listener_data(Ref, #{active_connections := V}) -> + Labels = [{listener, Ref}], + [{Labels, V}]; +make_listener_data(_Ref, _Info) -> + []. + +get_listeners_info() -> + lists:filter( + fun + ({_Ref, #{active_connections := _}}) -> true; + (_Else) -> false + end, + %% See https://ninenines.eu/docs/en/ranch/1.8/guide/listeners/#_obtaining_information_about_listeners + normalize_listeners_info(ranch:info()) + ). + +-dialyzer({no_match, normalize_listeners_info/1}). +-spec normalize_listeners_info(ranch_info()) -> [{maybe_woody_server_ref(), #{atom() => any()}}]. +%% Ranch v2 +normalize_listeners_info(#{} = Info) -> + maps:to_list(Info); +%% Ranch v1 +normalize_listeners_info(Info) -> + lists:map(fun({Ref, ListenerInfo}) -> {Ref, maps:from_list(ListenerInfo)} end, Info). diff --git a/src/woody_server_thrift_http_handler.erl b/src/woody_server_thrift_http_handler.erl index 824696c..897d9c7 100644 --- a/src/woody_server_thrift_http_handler.erl +++ b/src/woody_server_thrift_http_handler.erl @@ -393,6 +393,7 @@ check_deadline(Deadline, Req, State = #{url := Url, woody_state := WoodyState}) -spec check_metadata_headers(woody:http_headers(), cowboy_req:req(), state()) -> check_result(). check_metadata_headers(Headers, Req, State = #{woody_state := WoodyState, regexp_meta := ReMeta}) -> + _OtelCtx = otel_propagator_text_map:extract(maps:to_list(Headers)), WoodyState1 = set_metadata(find_metadata(Headers, ReMeta), WoodyState), {ok, Req, update_woody_state(State, WoodyState1, Req)}. diff --git a/src/woody_server_thrift_v2.erl b/src/woody_server_thrift_v2.erl index a9c6e3c..2caaf61 100644 --- a/src/woody_server_thrift_v2.erl +++ b/src/woody_server_thrift_v2.erl @@ -407,6 +407,7 @@ check_deadline(Deadline, Req, State = #{url := Url, woody_state := WoodyState}) -spec check_metadata_headers(woody:http_headers(), cowboy_req:req(), state()) -> check_result(). check_metadata_headers(Headers, Req, State = #{woody_state := WoodyState, regexp_meta := ReMeta}) -> + _OtelCtx = otel_propagator_text_map:extract(maps:to_list(Headers)), WoodyState1 = set_metadata(find_metadata(Headers, ReMeta), WoodyState), {ok, Req, update_woody_state(State, WoodyState1, Req)}. diff --git a/src/woody_util.erl b/src/woody_util.erl index 0172fd3..7c7e5fc 100644 --- a/src/woody_util.erl +++ b/src/woody_util.erl @@ -11,6 +11,12 @@ -define(DEFAULT_HANDLER_OPTS, undefined). +%% + +-export([span_stack_put/3]). +-export([span_stack_get/3]). +-export([span_stack_pop/2]). + %% %% Internal API %% @@ -51,3 +57,41 @@ get_rpc_type({Module, Service}, Function) -> -spec get_rpc_reply_type(_ThriftReplyType) -> woody:rpc_type(). get_rpc_reply_type(oneway_void) -> cast; get_rpc_reply_type(_) -> call. + +%% OTEL context span helpers +%% NOTE Those helpers are designed specifically to manage stacking spans during +%% woody client (or server) calls _inside_ one single process context. +%% Thus, use of process dictionary via `otel_ctx'. + +-define(OTEL_SPANS_STACK, 'spans_ctx_stack'). + +-type span_key() :: atom() | binary() | string(). +-type maybe_span_ctx() :: opentelemetry:span_ctx() | undefined. + +-spec span_stack_put(span_key(), opentelemetry:span_ctx(), otel_ctx:t()) -> otel_ctx:t(). +span_stack_put(Key, SpanCtx, Context) -> + Stack = otel_ctx:get_value(Context, ?OTEL_SPANS_STACK, []), + Entry = {Key, SpanCtx, otel_tracer:current_span_ctx(Context)}, + otel_ctx:set_value(Context, ?OTEL_SPANS_STACK, [Entry | Stack]). + +-spec span_stack_get(span_key(), otel_ctx:t(), maybe_span_ctx()) -> maybe_span_ctx(). +span_stack_get(Key, Context, Default) -> + Stack = otel_ctx:get_value(Context, ?OTEL_SPANS_STACK, []), + case lists:keyfind(Key, 1, Stack) of + false -> + Default; + {_Key, SpanCtx, _ParentSpanCtx} -> + SpanCtx + end. + +-spec span_stack_pop(span_key(), otel_ctx:t()) -> + {ok, opentelemetry:span_ctx(), maybe_span_ctx(), otel_ctx:t()} | {error, notfound}. +span_stack_pop(Key, Context) -> + Stack = otel_ctx:get_value(Context, ?OTEL_SPANS_STACK, []), + case lists:keytake(Key, 1, Stack) of + false -> + {error, notfound}; + {value, {_Key, SpanCtx, ParentSpanCtx}, Stack1} -> + Context1 = otel_ctx:set_value(Context, ?OTEL_SPANS_STACK, Stack1), + {ok, SpanCtx, ParentSpanCtx, Context1} + end. diff --git a/test/woody_ct_otel_collector.erl b/test/woody_ct_otel_collector.erl new file mode 100644 index 0000000..9617cb4 --- /dev/null +++ b/test/woody_ct_otel_collector.erl @@ -0,0 +1,97 @@ +-module(woody_ct_otel_collector). + +-behaviour(gen_server). + +-export([ + start_link/0, + get_trace/1, + get_traces/0 +]). + +-export([ + init/1, + handle_call/3, + handle_cast/2, + handle_info/2 +]). + +-include_lib("opentelemetry/include/otel_span.hrl"). + +-type span() :: #span{}. + +-type span_node() :: #{span := span(), children := [span_node()]}. + +-type trace() :: #{ + id := opentelemetry:trace_id(), + node := span_node() +}. + +% + +-spec start_link() -> {ok, pid()}. +start_link() -> + gen_server:start_link({local, ?MODULE}, ?MODULE, [], []). + +-spec get_trace(opentelemetry:trace_id()) -> {ok, trace()} | {error, notfound}. +get_trace(TraceId) -> + gen_server:call(?MODULE, {trace, TraceId}). + +-spec get_traces() -> {ok, [trace()]}. +get_traces() -> + gen_server:call(?MODULE, traces). + +-spec init(_) -> {ok, _}. +init(_Opts) -> + {ok, #{}}. + +-spec handle_info(_, T) -> {noreply, T}. +handle_info({span, Span}, State0) -> + State1 = maps:update_with(Span#span.trace_id, fun(V) -> [Span | V] end, [Span], State0), + {noreply, State1}; +handle_info(_Msg, State) -> + {noreply, State}. + +-spec handle_call(_, _, T) -> {noreply, T}. +handle_call(traces, _From, State) -> + Result = maps:map(fun(TraceId, Spans) -> build_trace(TraceId, Spans) end, State), + {reply, maps:values(Result), State}; +handle_call({trace, TraceId}, _From, State) -> + Result = + case maps:get(TraceId, State, undefined) of + undefined -> {error, notfound}; + Spans -> {ok, build_trace(TraceId, Spans)} + end, + {reply, Result, State}; +handle_call(_Msg, _From, State) -> + {noreply, State}. + +-spec handle_cast(_, T) -> {noreply, T}. +handle_cast(_Msg, State) -> + {noreply, State}. + +% + +build_trace(TraceId, Spans0) -> + Spans1 = lists:sort(fun(#span{start_time = A}, #span{start_time = B}) -> A < B end, Spans0), + [RootSpan | _] = lists:filter( + fun + (#span{parent_span_id = undefined}) -> true; + (_) -> false + end, + Spans1 + ), + #{ + id => TraceId, + node => lists:foldl(fun(Span, RootNode) -> update_node(Span, RootNode) end, new_span_node(RootSpan), Spans1) + }. + +update_node( + Span = #span{parent_span_id = ParentId}, + SpanNode = #{span := #span{span_id = ParentId}, children := Children} +) -> + SpanNode#{children => [new_span_node(Span) | Children]}; +update_node(Span, SpanNode = #{children := Children}) -> + SpanNode#{children => lists:map(fun(Child) -> update_node(Span, Child) end, Children)}. + +new_span_node(Span) -> + #{span => Span, children => []}. diff --git a/test/woody_tests_SUITE.erl b/test/woody_tests_SUITE.erl index a8e0f06..9de03d5 100644 --- a/test/woody_tests_SUITE.erl +++ b/test/woody_tests_SUITE.erl @@ -3,6 +3,9 @@ -include_lib("common_test/include/ct.hrl"). -include_lib("stdlib/include/assert.hrl"). -include_lib("hackney/include/hackney_lib.hrl"). +-include_lib("opentelemetry_api/include/opentelemetry.hrl"). +-include_lib("opentelemetry/include/otel_span.hrl"). +-include_lib("prometheus/include/prometheus_model.hrl"). -include("woody_test_thrift.hrl"). -include("woody_defs.hrl"). @@ -111,6 +114,11 @@ -define(WEAPON_STACK_OVERFLOW, pos_out_of_boundaries). -define(BAD_POWERUP_REPLY, powerup_unknown). +-define(OTEL_SPAN(Name, Children), #{span := #span{name = Name}, children := Children}). +-define(OTEL_SPAN(Name, SpanAttributes, Children), #{ + span := #span{name = Name, attributes = SpanAttributes}, children := Children +}). + -type config() :: [{atom(), any()}]. -type case_name() :: atom(). -type group_name() :: atom(). @@ -273,6 +281,8 @@ cross_test_groups() -> } }. +-define(RANCH_REF, woody_ct). + %% %% starting/stopping %% @@ -280,20 +290,13 @@ init_per_suite(C) -> % dbg:tracer(), dbg:p(all, c), % dbg:tpl({woody_joint_workers, '_', '_'}, x), %%Apps = genlib_app:start_application_with(woody, [{trace_http_server, true}]), - application:set_env(hackney, mod_metrics, woody_client_metrics), - application:set_env(woody, woody_client_metrics_options, #{ - metric_key_mapping => #{ - [hackney, nb_requests] => [hackney, requests_in_process] - } - }), - application:set_env(how_are_you, metrics_handlers, [ - {woody_api_hay, #{ - interval => 1000 - }} - ]), + ok = application:set_env(hackney, mod_metrics, woody_hackney_prometheus), + {ok, MetricsApps} = application:ensure_all_started(prometheus), {ok, Apps} = application:ensure_all_started(woody), - {ok, HayApps} = application:ensure_all_started(how_are_you), - [{apps, HayApps ++ Apps} | C]. + ok = woody_ranch_prometheus_collector:setup(), + ok = woody_hackney_prometheus_collector:setup(), + {ok, OtelApps} = setup_opentelemetry(), + [{apps, OtelApps ++ MetricsApps ++ Apps} | C]. end_per_suite(C) -> % unset so it won't report metrics next suite @@ -326,21 +329,21 @@ init_per_testcase(find_multiple_pools_test, C) -> {ok, Sup} = start_tc_sup(), Pool1 = {swords, 15000, 100}, Pool2 = {shields, undefined, 50}, - ok = start_woody_server_with_pools(woody_ct, Sup, ['Weapons', 'Powerups'], [Pool1, Pool2], C), + ok = start_woody_server_with_pools(?RANCH_REF, Sup, ['Weapons', 'Powerups'], [Pool1, Pool2], C), [{sup, Sup} | C]; init_per_testcase(calls_with_cache, C) -> {ok, Sup} = start_tc_sup(), {ok, _} = start_caching_client(caching_client_ct, Sup), - {ok, _} = start_woody_server(woody_ct, Sup, ['Weapons', 'Powerups'], C), + {ok, _} = start_woody_server(?RANCH_REF, Sup, ['Weapons', 'Powerups'], C), [{sup, Sup} | C]; init_per_testcase(server_handled_client_timeout_test, C) -> {ok, Sup} = start_tc_sup(), {ok, _} = supervisor:start_child(Sup, server_timeout_event_handler:child_spec()), - {ok, _} = start_woody_server(woody_ct, Sup, ['Weapons', 'Powerups'], server_timeout_event_handler, C), + {ok, _} = start_woody_server(?RANCH_REF, Sup, ['Weapons', 'Powerups'], server_timeout_event_handler, C), [{sup, Sup} | C]; init_per_testcase(_, C) -> {ok, Sup} = start_tc_sup(), - {ok, _} = start_woody_server(woody_ct, Sup, ['Weapons', 'Powerups'], C), + {ok, _} = start_woody_server(?RANCH_REF, Sup, ['Weapons', 'Powerups'], C), [{sup, Sup} | C]. init_per_group(woody_resolver, Config) -> @@ -352,6 +355,20 @@ init_per_group(Name, Config) -> end_per_group(_Name, _Config) -> ok. +setup_opentelemetry() -> + ok = application:set_env([ + {opentelemetry, [ + {text_map_propagators, [ + baggage, + trace_context + ]}, + {span_processor, simple}, + {traces_exporter, {otel_exporter_pid, woody_ct_otel_collector}} + ]} + ]), + ok = application:start(opentelemetry), + {ok, [opentelemetry]}. + start_tc_sup() -> supervisor:start_link({local, ?MODULE}, ?MODULE, []). @@ -359,7 +376,7 @@ start_error_server(TC, Sup) -> Code = get_fail_code(TC), Dispatch = cowboy_router:compile([{'_', [{?PATH_WEAPONS, ?MODULE, Code}]}]), Server = ranch:child_spec( - woody_ct, + ?RANCH_REF, ranch_tcp, [{ip, ?SERVER_IP}, {port, ?SERVER_PORT}], cowboy_clear, @@ -673,8 +690,37 @@ call_pass_thru_ok_test(C) -> Armor = <<"AntiGrav Boots">>, Context = make_context(<<"call_pass_thru_ok">>), Expect = {ok, genlib_map:get(Armor, powerups())}, + Baggage = #{<<"service">> => <<"Powerups">>}, + SpanName = <<"test Powerups:proxy_get_powerup">>, + %% NOTE: `otel_baggage` uses `otel_ctx` that relies on __process dictionary__ + ok = otel_baggage:set(Baggage), + Tracer = opentelemetry:get_application_tracer(?MODULE), + %% NOTE: Starts span and sets its context as current span in __process dictionary__ + SpanCtx = otel_tracer:set_current_span(otel_tracer:start_span(Tracer, SpanName, #{})), Expect = call(Context, 'Powerups', proxy_get_powerup, {Armor, self_to_bin()}, C), - {ok, _} = receive_msg(Armor, Context). + %% NOTE: Ends span and puts context into __process dictionary__ + _ = otel_tracer:set_current_span(otel_span:end_span(SpanCtx, undefined)), + {ok, _} = receive_msg(Armor, Context), + {ok, #{node := Root}} = woody_ct_otel_collector:get_trace(SpanCtx#span_ctx.trace_id), + %% Prepare otel attributes to match against + ProxyAttrs = mk_otel_attributes(Baggage), + Attrs = mk_otel_attributes(Baggage#{<<"proxied">> => <<"true">>}), + ?assertMatch( + ?OTEL_SPAN(SpanName, [ + ?OTEL_SPAN(<<"client Powerups:proxy_get_powerup">>, [ + %% Upon invocation event baggage is expected to be put in span attributes + ?OTEL_SPAN(<<"server Powerups:proxy_get_powerup">>, ProxyAttrs, [ + %% New client starts call here + ?OTEL_SPAN(<<"client Powerups:get_powerup">>, [ + ?OTEL_SPAN(<<"server Powerups:get_powerup">>, Attrs, [ + %% Expect no child spans uphere + ]) + ]) + ]) + ]) + ]), + Root + ). call_pass_thru_except_test(C) -> Armor = <<"Shield Belt">>, @@ -775,7 +821,18 @@ call_fail_w_no_headers(Id, Class, _Code) -> find_multiple_pools_test(_) -> true = is_pid(hackney_pool:find_pool(swords)), - true = is_pid(hackney_pool:find_pool(shields)). + true = is_pid(hackney_pool:find_pool(shields)), + MF = smuggle_mf_return_value(fun(F) -> woody_hackney_prometheus_collector:collect_mf(default, F) end), + %% We can't know order and values from hackney's pool info, but we can + %% expect that values for those pools must be provided. However exact number + %% of values can vary based on order of testcase execution and other + %% side-effects. + ValuesCount = length([swords, shields]) * length([queue_count, in_use_count, free_count]), + ?assertMatch( + #'MetricFamily'{type = 'GAUGE', name = <<"woody_hackney_pool_usage">>, metric = Values} when + length(Values) >= ValuesCount, + MF + ). call_thrift_multiplexed_test(_) -> Client = make_thrift_multiplexed_client( @@ -985,7 +1042,7 @@ init(_) -> {ok, { {one_for_one, 1, 1}, - [] + [#{id => woody_ct_otel_collector, start => {woody_ct_otel_collector, start_link, []}}] }}. %% @@ -1015,6 +1072,8 @@ handle_function(ProxyGetPowerup, {Name, To}, Context, _Opts) when ProxyGetPowerup =:= proxy_get_powerup orelse ProxyGetPowerup =:= bad_proxy_get_powerup -> + %% NOTE: Merges baggage, requires #{binary() => binary()} + ok = otel_baggage:set(#{<<"proxied">> => <<"true">>}), % NOTE % Client may return `{exception, _}` tuple with some business level exception % here, yet handler expects us to `throw/1` them. This is expected here it @@ -1054,6 +1113,12 @@ handle_event( -> _ = handle_proxy_event(Event, Code, TraceId, ParentId), log_event(Event, RpcId, Meta); +%% Handle invocation +handle_event(Event = ?EV_INVOKE_SERVICE_HANDLER, RpcId, Meta, _) -> + log_event(Event, RpcId, Meta), + SpanCtx = otel_tracer:current_span_ctx(), + _ = otel_span:set_attributes(SpanCtx, maps:map(fun(_Key, {Value, _Metadata}) -> Value end, otel_baggage:get_all())), + ok; handle_event(Event, RpcId, Meta, _) -> log_event(Event, RpcId, Meta). @@ -1084,6 +1149,7 @@ handle_proxy_event(Event, Code, Descr) -> erlang:error(badarg, [Event, Code, Descr]). log_event(Event, RpcId, Meta) -> + ok = woody_event_handler_otel:handle_event(Event, RpcId, Meta, []), woody_ct_event_h:handle_event(Event, RpcId, Meta, []). %% @@ -1169,6 +1235,7 @@ gun_test_basic(Id, Gun, Expect, C) -> catch Class:Reason -> ok end, + ok = assert_connections_metrics(), check_msg(Gun, Context). get_except({ok, _}) -> @@ -1285,3 +1352,50 @@ handle_sleep(Context) -> BinTimer -> timer:sleep(binary_to_integer(BinTimer)) end. + +mk_otel_attributes(Attributes) -> + SpanAttributeCountLimit = otel_span_limits:attribute_count_limit(), + SpanAttributeValueLengthLimit = otel_span_limits:attribute_value_length_limit(), + otel_attributes:new(Attributes, SpanAttributeCountLimit, SpanAttributeValueLengthLimit). + +assert_connections_metrics() -> + MF = smuggle_mf_return_value(fun(F) -> woody_ranch_prometheus_collector:collect_mf(default, F) end), + %% Sadly we can't match on listener ref since its an iodata representation of tuple. + ?assertMatch( + #'MetricFamily'{ + name = <<"woody_ranch_listener_connections">>, + type = 'GAUGE', + metric = [ + #'Metric'{ + label = [#'LabelPair'{name = <<"listener">>, value = _Ref}], + gauge = #'Gauge'{value = V} + } + ] + } when V >= 1, + MF + ), + %% NOTE See `prometheus_model_helpers:ensure_binary_or_string/1` + RefImpl1 = io_lib:format("~p", [{woody_server_thrift_http_handler, ?RANCH_REF}]), + RefImpl2 = io_lib:format("~p", [{woody_server_thrift_v2, ?RANCH_REF}]), + ?assertMatch( + Expected when Expected =:= RefImpl1 orelse Expected =:= RefImpl2, + get_metric_ref(hd(MF#'MetricFamily'.metric)) + ). + +get_metric_ref(#'Metric'{label = Labels}) -> + get_metric_ref_(Labels). + +get_metric_ref_([]) -> + undefined; +get_metric_ref_([#'LabelPair'{name = <<"listener">>, value = Ref} | _]) -> + Ref; +get_metric_ref_([_Label | Labels]) -> + get_metric_ref_(Labels). + +%% Collector module expects 'ok' return. Because of that dialyzer won't quit bitching. +smuggle_mf_return_value(Fun) -> + _ = Fun(fun(MF) -> + _ = erlang:put(smuggle, MF), + ok + end), + erlang:get(smuggle).