From 3ad4fd81752f0f8882faee7897b90723be4b35d6 Mon Sep 17 00:00:00 2001 From: Zebreus Date: Wed, 8 Jan 2025 16:50:47 +0100 Subject: [PATCH 01/75] Create only one tls connector for all jetstream sockets --- src/database/repo_indexer.rs | 2 ++ src/websocket/conn.rs | 26 ++++---------------------- src/websocket/mod.rs | 32 +++++++++++++++++++++++++++++++- 3 files changed, 37 insertions(+), 23 deletions(-) diff --git a/src/database/repo_indexer.rs b/src/database/repo_indexer.rs index d6039ef..059811e 100644 --- a/src/database/repo_indexer.rs +++ b/src/database/repo_indexer.rs @@ -8,12 +8,14 @@ use futures::stream::{Stream, TryStreamExt}; use ipld_core::cid::Cid; use iroh_car::CarReader; use log::{debug, error, info, warn}; +use reqwest::Client; use serde::{Deserialize, Serialize}; use std::{ collections::{BTreeMap, BTreeSet}, sync::Arc, }; use surrealdb::{engine::any::Any, Surreal}; +use tokio::sync::Semaphore; pub async fn start_full_repo_indexer(db: Surreal, max_tasks: usize) -> anyhow::Result<()> { let mut processed_dids: BTreeSet = BTreeSet::new(); diff --git a/src/websocket/conn.rs b/src/websocket/conn.rs index 1ba1d0c..374bc8b 100644 --- a/src/websocket/conn.rs +++ b/src/websocket/conn.rs @@ -1,4 +1,4 @@ -use std::{future::Future, sync::Arc}; +use std::future::Future; use anyhow::Context; use fastwebsockets::{handshake, WebSocket}; @@ -11,13 +11,7 @@ use hyper::{ use hyper_util::rt::TokioIo; use log::{debug, info}; use tokio::{net::TcpStream, task}; -use tokio_rustls::{ - rustls::{ - pki_types::{pem::PemObject, CertificateDer, ServerName}, - ClientConfig, RootCertStore, - }, - TlsConnector, -}; +use tokio_rustls::{rustls::pki_types::ServerName, TlsConnector}; /// A tokio executor for hyper struct TokioExecutor; @@ -36,18 +30,9 @@ where /// Connect to a websocket server pub async fn connect_tls( host: &String, - certificate: &String, + connector: &TlsConnector, cursor: Option, ) -> anyhow::Result>> { - // prepare tls store - debug!(target: "indexer", "Creating tls store for certificate: {}", certificate); - let mut tls_store = RootCertStore::empty(); - let tls_cert = CertificateDer::from_pem_file(certificate) - .with_context(|| format!("Unable to parse certificate from: {}", certificate))?; - tls_store - .add(tls_cert) - .with_context(|| format!("Unable to add certificate to tls store: {}", certificate))?; - // create tcp connection to server debug!(target: "indexer", "Connecting to: {}", host); let addr = format!("{}:443", host); @@ -57,10 +42,7 @@ pub async fn connect_tls( // encrypt the tcp stream with tls debug!(target: "indexer", "Establishing tls connection to: {}", host); - let tls_config = ClientConfig::builder() - .with_root_certificates(tls_store) - .with_no_client_auth(); - let connector = TlsConnector::from(Arc::new(tls_config)); + let tls_domain = ServerName::try_from(host.clone()) .with_context(|| format!("Invalid dns name: {}", host))?; let tls_stream = connector diff --git a/src/websocket/mod.rs b/src/websocket/mod.rs index 66085dd..1c01dd6 100644 --- a/src/websocket/mod.rs +++ b/src/websocket/mod.rs @@ -13,6 +13,13 @@ use hyper_util::rt::TokioIo; use log::{info, trace, warn}; use surrealdb::{engine::any::Any, Surreal}; use tokio::time::sleep; +use tokio_rustls::{ + rustls::{ + pki_types::{pem::PemObject, CertificateDer}, + ClientConfig, RootCertStore, + }, + TlsConnector, +}; mod conn; pub mod events; @@ -40,6 +47,29 @@ pub async fn start( cursor: u64, db: Surreal, ) -> anyhow::Result<()> { + // prepare tls store + let cloned_certificate_path = certificate.clone(); + 
log::debug!(target: "indexer", "Creating tls store for certificate: {}", cloned_certificate_path); + let mut tls_store = RootCertStore::empty(); + let tls_cert = CertificateDer::from_pem_file(certificate).with_context(|| { + format!( + "Unable to parse certificate from: {}", + cloned_certificate_path + ) + })?; + tls_store.add(tls_cert).with_context(|| { + format!( + "Unable to add certificate to tls store: {}", + cloned_certificate_path + ) + })?; + let tls_config = Arc::new( + ClientConfig::builder() + .with_root_certificates(Arc::new(tls_store)) + .with_no_client_auth(), + ); + let connector = TlsConnector::from(tls_config.clone()); + // create a shared state info!(target: "indexer", "Entering websocket loop"); let state = Arc::new(SharedState { @@ -62,7 +92,7 @@ pub async fn start( // create websocket connection info!(target: "indexer", "Establishing new connection to: {}", host); - let ws = conn::connect_tls(&host, &certificate, cursor).await; + let ws = conn::connect_tls(&host, &connector, cursor).await; if let Err(e) = ws { warn!(target: "indexer", "Unable to open websocket connection to {}: {:?}", host, e); sleep(Duration::from_secs(5)).await; From ceadafcc6b9c70dd0706532c83bfcc9baf1ba3b1 Mon Sep 17 00:00:00 2001 From: Zebreus Date: Thu, 20 Feb 2025 11:19:05 +0100 Subject: [PATCH 02/75] Format Cargo.toml --- Cargo.toml | 56 ++++++++++++++++++------------------------------------ 1 file changed, 19 insertions(+), 37 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 97603de..db3fbc3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -6,61 +6,43 @@ authors = ["redsolver", "PancakeTAS"] description = "ATProto/Bluesky Indexer powered by SurrealDB and Jetstream" [dependencies] -anyhow = "1.0.96" - -hyper = "1.6.0" +anyhow = "1.0.93" +hyper = "1.5.1" hyper-util = "0.1.10" - -tokio = { version = "1.43.0", features = ["parking_lot", "rt-multi-thread"] } -tokio-rustls = "0.26.1" +tokio = { version = "1.41.1", features = ["parking_lot", "rt-multi-thread"] } +tokio-rustls = "0.26.0" tokio-util = { version = "0.7.13", features = ["io"] } - rsky-pds = { git = "https://github.com/blacksky-algorithms/rsky.git" } - -fastwebsockets = { version = "0.10.0", features = ["upgrade"] } - -atrium-api = { version = "0.25.0", default-features = false, features = [ +fastwebsockets = { version = "0.8.0", features = ["upgrade"] } +atrium-api = { version = "0.24.8", default-features = false, features = [ "namespace-appbsky", "namespace-chatbsky", ] } - -serde = { version = "1.0.218", features = ["derive"] } +serde = { version = "1.0.215", features = ["derive"] } simd-json = "0.14.3" - num_cpus = "1.16.0" - -log = "0.4.25" - -clap = { version = "4.5.30", features = ["derive"] } - +log = "0.4.22" +clap = { version = "4.5.21", features = ["derive"] } colog = "1.3.0" -colored = "3.0.0" -chrono = "0.4.39" - +colored = "2.1.0" +chrono = "0.4.38" mimalloc = "0.1.43" - -surrealdb = { version = "2.2.1", features = ["kv-mem", "kv-rocksdb"] } +surrealdb = { version = "2.1.3", features = ["kv-mem", "kv-rocksdb"] } surrealdb-tikv-client = "0.3.0-surreal.1" - regex = "1.11.1" lazy_static = "1.5.0" -ipld-core = "0.4.2" - -atrium-xrpc-client = "0.5.11" -reqwest = { version = "0.12.12", features = ["json", "stream"] } - +ipld-core = "0.4.1" +atrium-xrpc-client = "0.5.10" +reqwest = { version = "0.12.9", features = ["json", "stream"] } iroh-car = "0.5.1" futures = "0.3.31" -serde_ipld_dagcbor = "0.6.2" +serde_ipld_dagcbor = "0.6.1" serde_bytes = "0.11.15" async-channel = "2.3.1" -console-subscriber = "0.4.1" -rand = "0.9.0" -bytes 
= "1.10.0" [profile.release] -lto = true -# strip = true +lto = false +strip = true opt-level = 3 -# panic = 'abort' +panic = 'abort' codegen-units = 1 From f055a3be745f8f94427ab225731fd9a7255e9c85 Mon Sep 17 00:00:00 2001 From: Zebreus Date: Thu, 20 Feb 2025 11:19:52 +0100 Subject: [PATCH 03/75] Remove outdated information from readme --- README.md | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index e10e63a..00f674d 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,5 @@ # SkyFeed Indexer + ATProto/Bluesky Indexer, powered by [SurrealDB](https://github.com/surrealdb/surrealdb) and [Jetstream](https://github.com/bluesky-social/jetstream), written in [Rust](https://www.rust-lang.org/). The indexer attaches a websocket to a Jetstream endpoint and converts all received events to SurrealDB queries. Temporary outtages are handled by the cursor system, which allows the indexer to resume indexing from the last known event. @@ -6,9 +7,10 @@ The indexer attaches a websocket to a Jetstream endpoint and converts all receiv The database can then be used to run powerful queries on the network data or build advanced custom feeds. All skyfeed.xyz feeds are powered by this service. ## Installation + 1. Install the latest stable rust compiler from [rustup.rs](https://rustup.rs/). 2. Install either onto your system or into a docker container a [SurrealDB](https://surrealdb.com/docs/surrealdb/installation/running). -3. Generate a secure password, which may be generated using `openssl rand -base64 32` or `pwgen -s 32 1`. -4. Launch SurrealDB with the following flags: `surreal start --user root --pass --bind 127.0.0.1:8000 :`. -5. Clone the repository and run `cargo build --release`. -6. Launch the indexer with `./target/release/skyfeed-indexer [--help]`. +3. Clone the repository and run `cargo build --release`. +4. Launch the indexer with `./target/release/skyfeed-indexer [--help]`. + +You may need to increase the ulimit for the number of open files. You can do this by running `ulimit -n 1000000`. 
From 389e8ecbdf9fe18fce955696397b60ef56fbc173 Mon Sep 17 00:00:00 2001 From: Zebreus Date: Thu, 20 Feb 2025 12:56:28 +0100 Subject: [PATCH 04/75] Move the index_repo functionality into a separate module --- src/database/repo_indexer.rs | 168 +-------------------- src/database/repo_indexer/index_repo.rs | 185 ++++++++++++++++++++++++ 2 files changed, 192 insertions(+), 161 deletions(-) create mode 100644 src/database/repo_indexer/index_repo.rs diff --git a/src/database/repo_indexer.rs b/src/database/repo_indexer.rs index 059811e..a5cd30a 100644 --- a/src/database/repo_indexer.rs +++ b/src/database/repo_indexer.rs @@ -1,31 +1,22 @@ -use crate::database::{handlers::on_commit_event_createorupdate, utils::unsafe_user_key_to_did}; +use crate::database::utils::unsafe_user_key_to_did; use anyhow::Context; -use atrium_api::{ - record::KnownRecord, - types::string::{Did, RecordKey}, -}; -use futures::stream::{Stream, TryStreamExt}; -use ipld_core::cid::Cid; -use iroh_car::CarReader; +use index_repo::index_repo; use log::{debug, error, info, warn}; use reqwest::Client; use serde::{Deserialize, Serialize}; -use std::{ - collections::{BTreeMap, BTreeSet}, - sync::Arc, -}; +use std::{collections::BTreeSet, sync::Arc}; use surrealdb::{engine::any::Any, Surreal}; -use tokio::sync::Semaphore; + +mod index_repo; pub async fn start_full_repo_indexer(db: Surreal, max_tasks: usize) -> anyhow::Result<()> { let mut processed_dids: BTreeSet = BTreeSet::new(); let (tx, rx) = async_channel::bounded(10_000); let state = Arc::new(SharedState { - rx: rx, + rx, db, http_client: Client::new(), - http_semaphore: Semaphore::new(1000), }); info!(target: "indexer", "Spinning up {} handler tasks", max_tasks); @@ -106,11 +97,10 @@ struct BskyFollowRes { } #[derive(Debug)] -struct SharedState { +pub struct SharedState { rx: async_channel::Receiver, db: Surreal, http_client: Client, - http_semaphore: Semaphore, } async fn task_handler(state: Arc, task_id: u64) -> anyhow::Result<()> { @@ -152,150 +142,6 @@ async fn task_handler(state: Arc, task_id: u64) -> anyhow::Result<( } } -async fn index_repo(state: &Arc, did: &String) -> anyhow::Result<()> { - let did_key = crate::database::utils::did_to_key(did.as_str())?; - - { - let li: Option = state.db.select(("li_did", &did_key)).await?; - if li.is_some() { - // debug!("skip {}", did); - return Ok(()); - } - } - - let timestamp_us = std::time::SystemTime::now() - .duration_since(std::time::UNIX_EPOCH) - .unwrap() - .as_micros(); - - let resp = state - .http_client - .get(format!("https://plc.directory/{}", did)) - .send() - .await? - .json::() - .await?; - - if let Some(service) = resp.service.first() { - let _permit = state.http_semaphore.acquire().await.unwrap(); - - let files: Vec<(ipld_core::cid::Cid, Vec)> = { - let custom_client = Client::new(); - let car_res = custom_client - .get(format!( - "{}/xrpc/com.atproto.sync.getRepo?did={}", - service.service_endpoint, did, - )) - .send() - .await?; - let car_res_bytes = car_res.bytes().await?; - - let buf_reader = tokio::io::BufReader::new(&car_res_bytes[..]); - - let car_reader = CarReader::new(buf_reader).await?; - /* .bytes_stream() - .map_err(std::io::Error::other); */ - - //let reader = tokio_util::io::StreamReader::new(stream); - - //let reader = stream.into_async_read().compat(); - //let car_reader = CarReader::new(reader).await?; - - car_reader.stream().try_collect().await? 
- // drop(car_res); - }; - //drop(buf_reader); - - let mut map: BTreeMap> = BTreeMap::new(); - - for f in &files { - map.insert(f.0, &f.1); - } - - for file in &files { - let node_data_res = serde_ipld_dagcbor::from_reader::(&file.1[..]); - - if let Ok(node_data) = node_data_res { - let mut key = "".to_string(); - for e in node_data.e { - let k = String::from_utf8(e.k)?; - key = format!("{}{}", key.split_at(e.p as usize).0, k); - - let block = map.get(&e.v); - - if let Some(b) = block { - let record_res = serde_ipld_dagcbor::from_reader::(&b[..]); - if record_res.is_ok() { - let record = record_res.unwrap(); - - let mut parts = key.split("/"); - - let res = on_commit_event_createorupdate( - &state.db, - Did::new(did.clone()).unwrap(), - did_key.clone(), - parts.next().unwrap().to_string(), - RecordKey::new(parts.next().unwrap().to_string()).unwrap(), - record, - ) - .await; - if res.is_err() { - warn!( - "on_commit_event_createorupdate {} {}", - res.unwrap_err(), - did - ); - } - } - } - } - } - } - let _: Option = state - .db - .upsert(("li_did", did_key)) - .content(LastIndexedTimestamp { - time_us: timestamp_us as u64, - time_dt: chrono::Utc::now().into(), - error: None, - }) - .await?; - drop(_permit); - } - Ok(()) -} - -#[derive(Deserialize, Debug)] -struct PlcDirectoryDidResponse { - #[serde(rename = "alsoKnownAs")] - also_known_as: Vec, - service: Vec, -} - -#[derive(Deserialize, Debug)] -struct PlcDirectoryDidResponseService { - #[serde(rename = "serviceEndpoint")] - service_endpoint: String, - #[serde(rename = "type")] - type_: String, - id: String, -} - -#[derive(Deserialize, Debug)] -pub struct TreeEntry { - pub p: u8, - #[serde(with = "serde_bytes")] - pub k: Vec, - pub v: Cid, - pub t: Option, -} - -#[derive(Deserialize, Debug)] -pub struct NodeData { - pub l: Option, - pub e: Vec, -} - /// Database struct for a repo indexing timestamp #[derive(Debug, Serialize, Deserialize)] pub struct LastIndexedTimestamp { diff --git a/src/database/repo_indexer/index_repo.rs b/src/database/repo_indexer/index_repo.rs new file mode 100644 index 0000000..f4c2028 --- /dev/null +++ b/src/database/repo_indexer/index_repo.rs @@ -0,0 +1,185 @@ +use super::{LastIndexedTimestamp, SharedState}; +use crate::database::{definitions::Record, handlers::on_commit_event_createorupdate}; +use atrium_api::{ + record::KnownRecord, + types::string::{Did, RecordKey}, +}; +use futures::TryStreamExt; +use ipld_core::cid::{Cid, CidGeneric}; +use iroh_car::CarReader; +use log::warn; +use reqwest::Client; +use serde::Deserialize; +use serde_ipld_dagcbor::from_reader; +use std::{ + collections::BTreeMap, + sync::{Arc, LazyLock}, +}; + +/// There should only be one request client to make use of connection pooling +// TODO: Dont use a global client +static REQWEST_CLIENT: LazyLock = LazyLock::new(|| Client::new()); + +#[derive(Deserialize, Debug)] +#[allow(dead_code)] +struct PlcDirectoryDidResponse { + #[serde(rename = "alsoKnownAs")] + also_known_as: Vec, + service: Vec, +} + +#[derive(Deserialize, Debug)] +#[allow(dead_code)] +struct PlcDirectoryDidResponseService { + #[serde(rename = "serviceEndpoint")] + service_endpoint: String, + #[serde(rename = "type")] + type_: String, + id: String, +} + +#[derive(Deserialize, Debug)] +#[allow(dead_code)] +/// https://atproto.com/specs/repository +pub struct TreeEntry { + #[serde(rename = "p")] + /// Count of bytes shared with previous TreeEntry in this Node (if any) + pub prefix_len: u8, + #[serde(with = "serde_bytes", rename = "k")] + /// Remainder of key for this TreeEntry, 
after "prefixlen" have been removed + pub key_suffix: Vec, + #[serde(rename = "v")] + /// Link to the record data (CBOR) for this entry + pub value: Cid, + #[serde(rename = "t")] + /// Link to a sub-tree Node at a lower level which has keys sorting after this TreeEntry's key (to the "right"), but before the next TreeEntry's key in this Node (if any) + pub tree: Option, +} + +#[derive(Deserialize, Debug)] +#[allow(dead_code)] +/// https://atproto.com/specs/repository +pub struct NodeData { + #[serde(rename = "l")] + /// Link to sub-tree Node on a lower level and with all keys sorting before keys at this node + pub left: Option, + #[serde(rename = "e")] + /// All the entries in the node + pub entries: Vec, +} + +/// Insert a file into a map +async fn insert_into_map( + mut files: BTreeMap>, + file: (CidGeneric<64>, Vec), +) -> anyhow::Result>> { + let (cid, data) = file; + files.insert(cid, data); + Ok(files) +} + +/// Get the contents of a repo with the given DID (Decentralized Identifier) +async fn get_files( + service: &PlcDirectoryDidResponseService, + did: &str, +) -> anyhow::Result>> { + let car_res = REQWEST_CLIENT + .get(format!( + "{}/xrpc/com.atproto.sync.getRepo?did={}", + service.service_endpoint, did, + )) + .send() + .await?; + let car_res_bytes = car_res.bytes().await?; + + let buf_reader = tokio::io::BufReader::new(&car_res_bytes[..]); + + // TODO: Benchmark CarReader. This is probably not the right place for parsing logic + let car_reader = CarReader::new(buf_reader).await?; + + let files = car_reader + .stream() + .map_err(|e| e.into()) + .try_fold(BTreeMap::new(), insert_into_map) + .await; + + files +} + +/// Indexes the repo with the given DID (Decentralized Identifier) +pub async fn index_repo(state: &Arc, did: &String) -> anyhow::Result<()> { + let did_key = crate::database::utils::did_to_key(did.as_str())?; + + if state + .db + .select::>(("li_did", &did_key)) + .await? + .is_some() + { + // debug!("skip {}", did); + return Ok(()); + }; + + let now = std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap() + .as_micros(); + + let resp = state + .http_client + .get(format!("https://plc.directory/{}", did)) + .send() + .await? 
+ .json::() + .await?; + + let Some(service) = resp.service.first() else { + return Ok(()); + }; + let files = get_files(service, did).await?; + + for file in &files { + let Ok(node_data) = from_reader::(&file.1[..]) else { + continue; + }; + let mut key = "".to_string(); + for entry in node_data.entries { + let k = String::from_utf8(entry.key_suffix)?; + key = format!("{}{}", key.split_at(entry.prefix_len as usize).0, k); + + let Some(block) = files.get(&entry.value) else { + continue; + }; + + let Ok(record) = from_reader::(&block[..]) else { + continue; + }; + + let mut parts = key.split("/"); + + let res = on_commit_event_createorupdate( + &state.db, + Did::new(did.clone()).unwrap(), + did_key.clone(), + parts.next().unwrap().to_string(), + RecordKey::new(parts.next().unwrap().to_string()).unwrap(), + record, + ) + .await; + + if let Err(error) = res { + warn!("on_commit_event_createorupdate {} {}", error, did); + } + } + } + let _: Option = state + .db + .upsert(("li_did", did_key)) + .content(LastIndexedTimestamp { + time_us: now as u64, + time_dt: chrono::Utc::now().into(), + error: None, + }) + .await?; + Ok(()) +} From 9c68917d497f6d6309c89faff6507a68b9430f54 Mon Sep 17 00:00:00 2001 From: Zebreus Date: Thu, 20 Feb 2025 14:50:56 +0100 Subject: [PATCH 05/75] Refactor repo indexer --- src/database/repo_indexer.rs | 182 +++++++++++++++++------------------ 1 file changed, 87 insertions(+), 95 deletions(-) diff --git a/src/database/repo_indexer.rs b/src/database/repo_indexer.rs index a5cd30a..9a72240 100644 --- a/src/database/repo_indexer.rs +++ b/src/database/repo_indexer.rs @@ -1,7 +1,8 @@ use crate::database::utils::unsafe_user_key_to_did; -use anyhow::Context; +use anyhow::{anyhow, Context}; +use async_channel::{Receiver, Sender}; use index_repo::index_repo; -use log::{debug, error, info, warn}; +use log::{error, info, warn}; use reqwest::Client; use serde::{Deserialize, Serialize}; use std::{collections::BTreeSet, sync::Arc}; @@ -9,8 +10,31 @@ use surrealdb::{engine::any::Any, Surreal}; mod index_repo; +#[derive(Deserialize)] +struct BskyFollowRes { + #[serde(rename = "in")] + pub from: surrealdb::RecordId, + #[serde(rename = "out")] + pub to: surrealdb::RecordId, + pub id: surrealdb::RecordId, +} + +#[derive(Debug)] +pub struct SharedState { + rx: Receiver, + db: Surreal, + http_client: Client, +} + +/// Database struct for a repo indexing timestamp +#[derive(Debug, Serialize, Deserialize)] +pub struct LastIndexedTimestamp { + pub time_us: u64, + pub time_dt: surrealdb::Datetime, + pub error: Option, +} + pub async fn start_full_repo_indexer(db: Surreal, max_tasks: usize) -> anyhow::Result<()> { - let mut processed_dids: BTreeSet = BTreeSet::new(); let (tx, rx) = async_channel::bounded(10_000); let state = Arc::new(SharedState { @@ -21,58 +45,89 @@ pub async fn start_full_repo_indexer(db: Surreal, max_tasks: usize) -> anyh info!(target: "indexer", "Spinning up {} handler tasks", max_tasks); - for i in 0..max_tasks { + for thread_id in 0..max_tasks { let state = state.clone(); tokio::spawn(async move { - /* thread::Builder::new() - .name(format!("Indexer Thread {}", i)) - .spawn(move || { - Builder::new_current_thread() - .enable_io() - .enable_time() - .build() + let result = repo_fetcher_task(state).await; + let error = result + .and::<()>(Err(anyhow!("Handler thread should never exit"))) + .unwrap_err(); + error!(target: "indexer", "Handler thread {} failed: {:?}", thread_id, error); + }); + } + + repo_discovery_task(state, tx).await.unwrap(); + + Ok(()) +} + +async fn 
repo_fetcher_task(state: Arc) -> anyhow::Result<()> { + loop { + let did = state.rx.recv().await.unwrap(); + + let result = index_repo(&state, &did).await; + + if let Err(error) = result { + warn!(target: "indexer", "Failed to index repo {}: {}", did, error); + + let error_message = format!("{}", error); + if format!("{}", error) == "Failed to parse CAR file: early eof" { + // TODO: Document what this case does + + let did_key = crate::database::utils::did_to_key(did.as_str())?; + let timestamp_us = std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) .unwrap() - .block_on(async { */ - let res = task_handler(state, i as u64).await; - if let Err(e) = res { - error!(target: "indexer", "Handler thread {} failed: {:?}", i, e); - } else { - debug!(target: "indexer", "Handler thread {} exited", i); + .as_micros(); + let _: Option = state + .db + .upsert(("li_did", did_key)) + .content(LastIndexedTimestamp { + time_us: timestamp_us as u64, + time_dt: chrono::Utc::now().into(), + error: Some(error_message), + }) + .await?; } - }); - /* }) - .context("Failed to spawn handler thread")?; */ + } } +} + +async fn repo_discovery_task(state: Arc, tx: Sender) -> anyhow::Result<()> { + // An ID that was used before the earliest data we are interested in + const OLDEST_USEFUL_ANCHOR: &str = "3juj4"; - let mut anchor = "3juj4".to_string(); + let mut processed_dids: BTreeSet = BTreeSet::new(); + let mut anchor = OLDEST_USEFUL_ANCHOR.to_string(); loop { - info!(target: "repo_indexer", "anchor {}", anchor); + info!(target: "indexer", "anchor {}", anchor); - let mut res = state + let mut result = state .db .query(format!( "SELECT id,in,out FROM follow:{}.. LIMIT 500000;", anchor )) .await?; - let likes_res: Vec = res.take(0)?; + let follows: Vec = result.take(0)?; - if likes_res.is_empty() { + if follows.is_empty() { tokio::time::sleep(std::time::Duration::from_millis(10000)).await; continue; } - anchor = format!("{}", likes_res.last().unwrap().id.key()); + anchor = format!("{}", follows.last().unwrap().id.key()); let mut dids: BTreeSet = BTreeSet::new(); - for like in likes_res { - for record_id in vec![like.from, like.to] { + for follow in &follows { + for record_id in [&follow.from, &follow.to] { let did = unsafe_user_key_to_did(&format!("{}", record_id.key())); - if !processed_dids.contains(&did) { - dids.insert(did.clone()); - processed_dids.insert(did); + if processed_dids.contains(&did) { + continue; } + dids.insert(did.clone()); + processed_dids.insert(did); } } @@ -82,70 +137,7 @@ pub async fn start_full_repo_indexer(db: Surreal, max_tasks: usize) -> anyh .context("Failed to send message to handler thread")?; } + // TODO: Remove and add proper backpressure tokio::time::sleep(std::time::Duration::from_millis(10)).await; } } - -#[derive(Deserialize)] -struct BskyFollowRes { - #[serde(rename = "in")] - pub from: surrealdb::RecordId, - #[serde(rename = "out")] - pub to: surrealdb::RecordId, - - pub id: surrealdb::RecordId, -} - -#[derive(Debug)] -pub struct SharedState { - rx: async_channel::Receiver, - db: Surreal, - http_client: Client, -} - -async fn task_handler(state: Arc, task_id: u64) -> anyhow::Result<()> { - tokio::time::sleep(std::time::Duration::from_millis(task_id * 42)).await; - // loop infinitely, handling repo index tasks - loop { - // get the next repo to be indexed - let did = { - let x = state.rx.recv().await; - x.unwrap() - }; - - let res = index_repo(&state, &did).await; - if let Err(e) = res { - let e_str = format!("{}", e); - if e_str == "Failed to parse CAR file: 
early eof" { - let did_key = crate::database::utils::did_to_key(did.as_str())?; - let timestamp_us = std::time::SystemTime::now() - .duration_since(std::time::UNIX_EPOCH) - .unwrap() - .as_micros(); - let _: Option = state - .db - .upsert(("li_did", did_key)) - .content(LastIndexedTimestamp { - time_us: timestamp_us as u64, - time_dt: chrono::Utc::now().into(), - error: Some(e_str), - }) - .await?; - } else { - warn!(target: "indexer", "Failed to index repo {}: {}", did, e); - } - /* match e.try_into() { - iroh_car::Error::Parsing(e) => {} - _ => {} - } */ - } - } -} - -/// Database struct for a repo indexing timestamp -#[derive(Debug, Serialize, Deserialize)] -pub struct LastIndexedTimestamp { - pub time_us: u64, - pub time_dt: surrealdb::Datetime, - pub error: Option, -} From 7ef01f308eb10600e59549b86756bafbe294c7b1 Mon Sep 17 00:00:00 2001 From: Zebreus Date: Thu, 20 Feb 2025 15:41:43 +0100 Subject: [PATCH 06/75] Properly manage the queue of the discovery task --- src/database/repo_indexer.rs | 77 +++++++++++++++++++++++------------- 1 file changed, 50 insertions(+), 27 deletions(-) diff --git a/src/database/repo_indexer.rs b/src/database/repo_indexer.rs index 9a72240..05cfaec 100644 --- a/src/database/repo_indexer.rs +++ b/src/database/repo_indexer.rs @@ -5,8 +5,9 @@ use index_repo::index_repo; use log::{error, info, warn}; use reqwest::Client; use serde::{Deserialize, Serialize}; -use std::{collections::BTreeSet, sync::Arc}; +use std::{collections::HashSet, sync::Arc, time::Duration}; use surrealdb::{engine::any::Any, Surreal}; +use tokio::time::sleep; mod index_repo; @@ -34,9 +35,19 @@ pub struct LastIndexedTimestamp { pub error: Option, } -pub async fn start_full_repo_indexer(db: Surreal, max_tasks: usize) -> anyhow::Result<()> { - let (tx, rx) = async_channel::bounded(10_000); +/// Size of the queue of discovered repos +const QUEUE_SIZE: usize = 200; +/// How long to sleep, when the discovery queue is full +const DISCOVERY_QUEUE_FULL_BACKOFF: Duration = Duration::from_millis(500); +/// How long to sleep, when the we caught up with discovery +const DISCOVERY_CAUGHT_UP_BACKOFF: Duration = Duration::from_millis(500); +/// An ID that was used before the earliest data we are interested in +const OLDEST_USEFUL_ANCHOR: &str = "3juj4"; +/// How long to sleep, when the queue is empty +const FETCHER_BACKOFF: Duration = Duration::from_millis(500); +pub async fn start_full_repo_indexer(db: Surreal, max_tasks: usize) -> anyhow::Result<()> { + let (tx, rx) = async_channel::bounded(QUEUE_SIZE); let state = Arc::new(SharedState { rx, db, @@ -63,7 +74,10 @@ pub async fn start_full_repo_indexer(db: Surreal, max_tasks: usize) -> anyh async fn repo_fetcher_task(state: Arc) -> anyhow::Result<()> { loop { - let did = state.rx.recv().await.unwrap(); + let Ok(did) = state.rx.try_recv() else { + sleep(FETCHER_BACKOFF).await; + continue; + }; let result = index_repo(&state, &did).await; @@ -94,50 +108,59 @@ async fn repo_fetcher_task(state: Arc) -> anyhow::Result<()> { } async fn repo_discovery_task(state: Arc, tx: Sender) -> anyhow::Result<()> { - // An ID that was used before the earliest data we are interested in - const OLDEST_USEFUL_ANCHOR: &str = "3juj4"; - - let mut processed_dids: BTreeSet = BTreeSet::new(); + // Max number of items in the queue + let queue_capacity = tx.capacity().unwrap_or(QUEUE_SIZE); + assert!(queue_capacity > 3); + // Start fetching when the queue is emptier than this + let fetch_threshold = (((queue_capacity / 3) * 2) - 1).max(1); + // How many items to fetch at once + let 
fetch_amount = (queue_capacity / 3).max(1);
+    // Start printing a warning when the queue is too empty
+    let warning_threshold = queue_capacity / 3;
+
+    let mut processed_dids: HashSet<String> = HashSet::new();
     let mut anchor = OLDEST_USEFUL_ANCHOR.to_string();
     loop {
-        info!(target: "indexer", "anchor {}", anchor);
+        if tx.len() > fetch_threshold {
+            sleep(DISCOVERY_QUEUE_FULL_BACKOFF).await;
+            continue;
+        }
 
+        info!(target: "indexer", "Discovering follows starting from {}", anchor);
         let mut result = state
             .db
+            // TODO: Fix the possible SQL injection
             .query(format!(
-                "SELECT id,in,out FROM follow:{}.. LIMIT 500000;",
-                anchor
+                "SELECT id,in,out FROM follow:{}.. LIMIT {};",
+                anchor, fetch_amount
             ))
             .await?;
         let follows: Vec<BskyFollowRes> = result.take(0)?;
 
-        if follows.is_empty() {
-            tokio::time::sleep(std::time::Duration::from_millis(10000)).await;
+        let Some(anchor_key) = follows.last().map(|follow| follow.id.key()) else {
+            sleep(DISCOVERY_CAUGHT_UP_BACKOFF).await;
             continue;
-        }
-
-        anchor = format!("{}", follows.last().unwrap().id.key());
+        };
+        anchor = format!("{}", anchor_key);
 
-        let mut dids: BTreeSet<String> = BTreeSet::new();
+        let processed_dids_before = processed_dids.len();
         for follow in &follows {
             for record_id in [&follow.from, &follow.to] {
                 let did = unsafe_user_key_to_did(&format!("{}", record_id.key()));
                 if processed_dids.contains(&did) {
                     continue;
                 }
-                dids.insert(did.clone());
-                processed_dids.insert(did);
+                processed_dids.insert(did.clone());
+                tx.send(did)
+                    .await
+                    .context("Failed to send message to handler thread")?;
             }
         }
 
-        for did in dids {
-            tx.send(did)
-                .await
-                .context("Failed to send message to handler thread")?;
+        // Warn if it looks like the queue size or the backoff were chosen incorrectly
+        let new_follows = processed_dids.len() - processed_dids_before;
+        if new_follows != 0 && follows.len() == fetch_amount && tx.len() < warning_threshold {
+            warn!(target: "indexer", "Queue is not getting filled up fast enough. 
Consider increasing the queue size or decreasing the backoff."); } - - // TODO: Remove and add proper backpressure - tokio::time::sleep(std::time::Duration::from_millis(10)).await; } } From d1fae9b0d625f140383457c07a432f5eb9626bb0 Mon Sep 17 00:00:00 2001 From: Zebreus Date: Fri, 21 Feb 2025 11:13:17 +0100 Subject: [PATCH 07/75] Work on index repo performance --- src/database/repo_indexer/index_repo.rs | 113 ++++++++++++++++-------- 1 file changed, 77 insertions(+), 36 deletions(-) diff --git a/src/database/repo_indexer/index_repo.rs b/src/database/repo_indexer/index_repo.rs index f4c2028..7d9ad2c 100644 --- a/src/database/repo_indexer/index_repo.rs +++ b/src/database/repo_indexer/index_repo.rs @@ -13,8 +13,11 @@ use serde::Deserialize; use serde_ipld_dagcbor::from_reader; use std::{ collections::BTreeMap, + string::FromUtf8Error, sync::{Arc, LazyLock}, }; +use tokio::task::block_in_place; +use tokio_util::io::StreamReader; /// There should only be one request client to make use of connection pooling // TODO: Dont use a global client @@ -68,6 +71,12 @@ pub struct NodeData { pub entries: Vec, } +struct DatabaseUpdate { + collection: String, + rkey: RecordKey, + record: KnownRecord, +} + /// Insert a file into a map async fn insert_into_map( mut files: BTreeMap>, @@ -90,13 +99,17 @@ async fn get_files( )) .send() .await?; - let car_res_bytes = car_res.bytes().await?; + let bytes_stream = car_res + .bytes_stream() + .map_err(|error| std::io::Error::new(std::io::ErrorKind::Other, error)); + let reader = StreamReader::new(bytes_stream); + // TODO: Figure out what the second parameter does + // let reader = rs_car_sync::CarReader::new(&car_res_bytes, false); - let buf_reader = tokio::io::BufReader::new(&car_res_bytes[..]); + // let buf_reader = tokio::io::BufReader::new(&car_res_bytes[..]); // TODO: Benchmark CarReader. 
This is probably not the right place for parsing logic - let car_reader = CarReader::new(buf_reader).await?; - + let car_reader = CarReader::new(reader).await?; let files = car_reader .stream() .map_err(|e| e.into()) @@ -106,6 +119,52 @@ async fn get_files( files } +fn files_to_updates( + files: BTreeMap>, +) -> Result, FromUtf8Error> { + // TODO: Understand this logic and whether this can be done streaming + let mut result = Vec::new(); + for file in &files { + let Ok(node_data) = from_reader::(&file.1[..]) else { + continue; + }; + let mut key = "".to_string(); + for entry in node_data.entries { + let k = String::from_utf8(entry.key_suffix)?; + key = format!("{}{}", key.split_at(entry.prefix_len as usize).0, k); + + let Some(block) = files.get(&entry.value) else { + continue; + }; + + let Ok(record) = from_reader::(&block[..]) else { + continue; + }; + + let mut parts = key.split("/"); + + let update = DatabaseUpdate { + collection: parts.next().unwrap().to_string(), + rkey: RecordKey::new(parts.next().unwrap().to_string()).unwrap(), + record, + }; + result.push(update); + // let res = on_commit_event_createorupdate( + // db, + // Did::new(did.clone()).unwrap(), + // did_key.clone(), + // parts.next().unwrap().to_string(), + // RecordKey::new(parts.next().unwrap().to_string()).unwrap(), + // record, + // ) + // if let Err(error) = res { + // warn!("on_commit_event_createorupdate {} {}", error, did); + // } + } + } + return Ok(result); +} + /// Indexes the repo with the given DID (Decentralized Identifier) pub async fn index_repo(state: &Arc, did: &String) -> anyhow::Result<()> { let did_key = crate::database::utils::did_to_key(did.as_str())?; @@ -137,39 +196,21 @@ pub async fn index_repo(state: &Arc, did: &String) -> anyhow::Resul return Ok(()); }; let files = get_files(service, did).await?; + let updates = block_in_place(|| files_to_updates(files))?; + for update in updates { + // TODO: Figure out what this does and whether we can batch these updates + let res = on_commit_event_createorupdate( + &state.db, + Did::new(did.clone()).unwrap(), + did_key.clone(), + update.collection, + update.rkey, + update.record, + ) + .await; - for file in &files { - let Ok(node_data) = from_reader::(&file.1[..]) else { - continue; - }; - let mut key = "".to_string(); - for entry in node_data.entries { - let k = String::from_utf8(entry.key_suffix)?; - key = format!("{}{}", key.split_at(entry.prefix_len as usize).0, k); - - let Some(block) = files.get(&entry.value) else { - continue; - }; - - let Ok(record) = from_reader::(&block[..]) else { - continue; - }; - - let mut parts = key.split("/"); - - let res = on_commit_event_createorupdate( - &state.db, - Did::new(did.clone()).unwrap(), - did_key.clone(), - parts.next().unwrap().to_string(), - RecordKey::new(parts.next().unwrap().to_string()).unwrap(), - record, - ) - .await; - - if let Err(error) = res { - warn!("on_commit_event_createorupdate {} {}", error, did); - } + if let Err(error) = res { + warn!("on_commit_event_createorupdate {} {}", error, did); } } let _: Option = state From e5756d5dccd445e55f8ceb3d4d8f512020cb472e Mon Sep 17 00:00:00 2001 From: Zebreus Date: Mon, 24 Feb 2025 16:13:38 +0100 Subject: [PATCH 08/75] Add tokio-console instrumentation --- .cargo/config.toml | 2 ++ Cargo.toml | 1 + README.md | 6 ++++++ flake.nix | 2 ++ src/main.rs | 1 + 5 files changed, 12 insertions(+) create mode 100644 .cargo/config.toml diff --git a/.cargo/config.toml b/.cargo/config.toml new file mode 100644 index 0000000..bff29e6 --- /dev/null +++ 
b/.cargo/config.toml @@ -0,0 +1,2 @@ +[build] +rustflags = ["--cfg", "tokio_unstable"] diff --git a/Cargo.toml b/Cargo.toml index db3fbc3..f96bfba 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -39,6 +39,7 @@ futures = "0.3.31" serde_ipld_dagcbor = "0.6.1" serde_bytes = "0.11.15" async-channel = "2.3.1" +console-subscriber = "0.4.1" [profile.release] lto = false diff --git a/README.md b/README.md index 00f674d..08e0c2d 100644 --- a/README.md +++ b/README.md @@ -14,3 +14,9 @@ The database can then be used to run powerful queries on the network data or bui 4. Launch the indexer with `./target/release/skyfeed-indexer [--help]`. You may need to increase the ulimit for the number of open files. You can do this by running `ulimit -n 1000000`. + +## Debugging and profiling + +### tokio + +You can use tokio-console to get more insights into what the tokio tasks are currently doing. Just run `tokio-console` while the indexer is running. diff --git a/flake.nix b/flake.nix index bba4138..e56f0ac 100644 --- a/flake.nix +++ b/flake.nix @@ -58,6 +58,8 @@ pkgs.openssl pkgs.pkg-config pkgs.clang + + pkgs.tokio-console ]; shellHook = '' diff --git a/src/main.rs b/src/main.rs index d1dfb77..fce0bd6 100644 --- a/src/main.rs +++ b/src/main.rs @@ -19,6 +19,7 @@ static GLOBAL: mimalloc::MiMalloc = mimalloc::MiMalloc; /// Entry point for the application fn main() { + console_subscriber::init(); // parse command line arguments let args = config::parse_args(); From 5fed4d8a995689d0051b04060ada3c8933fe8e45 Mon Sep 17 00:00:00 2001 From: Zebreus Date: Mon, 24 Feb 2025 17:31:21 +0100 Subject: [PATCH 09/75] Convert the repo indexer to use a stream --- src/database/repo_indexer.rs | 261 +++++++------- src/database/repo_indexer/index_repo.rs | 339 +++++++++++++----- src/database/repo_indexer/repo_stream.rs | 146 ++++++++ .../repo_indexer/repo_stream_nofuture.rs | 146 ++++++++ 4 files changed, 678 insertions(+), 214 deletions(-) create mode 100644 src/database/repo_indexer/repo_stream.rs create mode 100644 src/database/repo_indexer/repo_stream_nofuture.rs diff --git a/src/database/repo_indexer.rs b/src/database/repo_indexer.rs index 05cfaec..3b7a806 100644 --- a/src/database/repo_indexer.rs +++ b/src/database/repo_indexer.rs @@ -1,15 +1,13 @@ -use crate::database::utils::unsafe_user_key_to_did; -use anyhow::{anyhow, Context}; -use async_channel::{Receiver, Sender}; -use index_repo::index_repo; -use log::{error, info, warn}; +use futures::StreamExt; +use index_repo::PipelineItem; +use log::{error, info}; +use repo_stream::RepoStream; use reqwest::Client; use serde::{Deserialize, Serialize}; -use std::{collections::HashSet, sync::Arc, time::Duration}; use surrealdb::{engine::any::Any, Surreal}; -use tokio::time::sleep; mod index_repo; +mod repo_stream; #[derive(Deserialize)] struct BskyFollowRes { @@ -20,13 +18,6 @@ struct BskyFollowRes { pub id: surrealdb::RecordId, } -#[derive(Debug)] -pub struct SharedState { - rx: Receiver, - db: Surreal, - http_client: Client, -} - /// Database struct for a repo indexing timestamp #[derive(Debug, Serialize, Deserialize)] pub struct LastIndexedTimestamp { @@ -35,132 +26,136 @@ pub struct LastIndexedTimestamp { pub error: Option, } -/// Size of the queue of discovered repos -const QUEUE_SIZE: usize = 200; -/// How long to sleep, when the discovery queue is full -const DISCOVERY_QUEUE_FULL_BACKOFF: Duration = Duration::from_millis(500); -/// How long to sleep, when the we caught up with discovery -const DISCOVERY_CAUGHT_UP_BACKOFF: Duration = Duration::from_millis(500); /// An ID that was 
used before the earliest data we are interested in const OLDEST_USEFUL_ANCHOR: &str = "3juj4"; -/// How long to sleep, when the queue is empty -const FETCHER_BACKOFF: Duration = Duration::from_millis(500); - -pub async fn start_full_repo_indexer(db: Surreal, max_tasks: usize) -> anyhow::Result<()> { - let (tx, rx) = async_channel::bounded(QUEUE_SIZE); - let state = Arc::new(SharedState { - rx, - db, - http_client: Client::new(), - }); - - info!(target: "indexer", "Spinning up {} handler tasks", max_tasks); +/// The size of the buffer between each pipeline stage in elements +const BUFFER_SIZE: usize = 200; - for thread_id in 0..max_tasks { - let state = state.clone(); - tokio::spawn(async move { - let result = repo_fetcher_task(state).await; - let error = result - .and::<()>(Err(anyhow!("Handler thread should never exit"))) - .unwrap_err(); - error!(target: "indexer", "Handler thread {} failed: {:?}", thread_id, error); - }); - } - - repo_discovery_task(state, tx).await.unwrap(); - - Ok(()) +pub async fn to_async(did: String) -> String { + println!("did: {}", did); + did } -async fn repo_fetcher_task(state: Arc) -> anyhow::Result<()> { - loop { - let Ok(did) = state.rx.try_recv() else { - sleep(FETCHER_BACKOFF).await; - continue; - }; - - let result = index_repo(&state, &did).await; - - if let Err(error) = result { - warn!(target: "indexer", "Failed to index repo {}: {}", did, error); +pub async fn start_full_repo_indexer(db: Surreal, max_tasks: usize) -> anyhow::Result<()> { + let http_client = Client::new(); - let error_message = format!("{}", error); - if format!("{}", error) == "Failed to parse CAR file: early eof" { - // TODO: Document what this case does + info!(target: "indexer", "Spinning up {} handler tasks", max_tasks); - let did_key = crate::database::utils::did_to_key(did.as_str())?; - let timestamp_us = std::time::SystemTime::now() - .duration_since(std::time::UNIX_EPOCH) - .unwrap() - .as_micros(); - let _: Option = state - .db - .upsert(("li_did", did_key)) - .content(LastIndexedTimestamp { - time_us: timestamp_us as u64, - time_dt: chrono::Utc::now().into(), - error: Some(error_message), - }) - .await?; + RepoStream::new(OLDEST_USEFUL_ANCHOR.to_string(), &db) + .map(to_async) + .buffer_unordered(BUFFER_SIZE) + .map(|x| { + println!("gonna process {}", x); + x + }) + .map(|did| { + let db = &db; + let http_client = &http_client; + let item = PipelineItem::new(db, http_client, did); + item + }) + .map(|item| async { item.check_indexed().await }) + .buffer_unordered(BUFFER_SIZE) + .filter_map(|result| async { + if let Err(error) = &result { + error!(target: "indexer", "Failed to index repo: {}", error); } - } - } -} - -async fn repo_discovery_task(state: Arc, tx: Sender) -> anyhow::Result<()> { - // Max number of items in the queue - let queue_capacity = tx.capacity().unwrap_or(QUEUE_SIZE); - assert!(queue_capacity > 3); - // Start fetching when the queue is emptier than this - let fetch_threshold = (((queue_capacity / 3) * 2) - 1).max(1); - // How many items to fetch at once - let fetch_amount = (queue_capacity / 3).max(1); - // Start printing a warning when the queue is too empty - let warning_threshold = queue_capacity / 3; - - let mut processed_dids: HashSet = HashSet::new(); - let mut anchor = OLDEST_USEFUL_ANCHOR.to_string(); - loop { - if tx.len() > fetch_threshold { - sleep(DISCOVERY_QUEUE_FULL_BACKOFF).await; - continue; - } - - info!(target: "indexer", "Discovering follows starting from {}", anchor); - let mut result = state - .db - // TODO: Fix the possible SQL 
injection - .query(format!( - "SELECT id,in,out FROM follow:{}.. LIMIT {};", - anchor, fetch_amount - )) - .await?; - let follows: Vec = result.take(0)?; - - let Some(anchor_key) = follows.last().map(|follow| follow.id.key()) else { - sleep(DISCOVERY_CAUGHT_UP_BACKOFF).await; - continue; - }; - anchor = format!("{}", anchor_key); - - let processed_dids_before = processed_dids.len(); - for follow in &follows { - for record_id in [&follow.from, &follow.to] { - let did = unsafe_user_key_to_did(&format!("{}", record_id.key())); - if processed_dids.contains(&did) { - continue; - } - processed_dids.insert(did.clone()); - tx.send(did) - .await - .context("Failed to send message to handler thread")?; + result.ok() + }) + .map(|item| async { item.get_service().await }) + .buffer_unordered(BUFFER_SIZE) + .filter_map(|result| async { + if let Err(error) = &result { + error!(target: "indexer", "Failed to index repo: {}", error); } - } - - // Warn if it looks like the queue size or the backoff were choosen incorrectly - let new_follows = processed_dids.len() - processed_dids_before; - if new_follows != 0 && follows.len() == fetch_amount && tx.len() < warning_threshold { - warn!(target: "indexer", "Queue is not getting filled up fast enough. Consider increasing the queue size or decreasing the backoff."); - } - } + result.ok() + }) + .map(|item| async { item.download_repo().await }) + .buffer_unordered(BUFFER_SIZE) + .filter_map(|result| async { + if let Err(error) = &result { + error!(target: "indexer", "Failed to index repo: {}", error); + } + result.ok() + }) + .map(|item| async { item.deserialize_repo().await }) + .buffer_unordered(BUFFER_SIZE) + .filter_map(|result| async { + if let Err(error) = &result { + error!(target: "indexer", "Failed to index repo: {}", error); + } + result.ok() + }) + .map(|item| async { item.files_to_updates().await }) + .buffer_unordered(BUFFER_SIZE) + .filter_map(|result| async { + if let Err(error) = &result { + error!(target: "indexer", "Failed to index repo: {}", error); + } + result.ok() + }) + .map(|item| async { item.apply_updates().await }) + .buffer_unordered(BUFFER_SIZE) + .filter_map(|result| async { + if let Err(error) = &result { + error!(target: "indexer", "Failed to index repo: {}", error); + } + result.ok() + }) + .for_each(|x| async { + x.print_report().await; + }) + .await; + + panic!("Done, this should not happen"); + + // .map(|did| { + // // let state = state.clone(); + // let db = &db; + // let client = &client; + // async move { + // let result = index_repo(&db, &client, &did).await; + // if let Err(error) = result { + // warn!(target: "indexer", "Failed to index repo {}: {}", did, error); + + // let error_message = format!("{}", error); + // if format!("{}", error) == "Failed to parse CAR file: early eof" { + // // TODO: Document what this case does + + // let did_key = crate::database::utils::did_to_key(did.as_str()).unwrap(); + // let timestamp_us = std::time::SystemTime::now() + // .duration_since(std::time::UNIX_EPOCH) + // .unwrap() + // .as_micros(); + // let _: Option = db + // .upsert(("li_did", did_key)) + // .content(LastIndexedTimestamp { + // time_us: timestamp_us as u64, + // time_dt: chrono::Utc::now().into(), + // error: Some(error_message), + // }) + // .await + // .unwrap(); + // } + // } + // } + // }) + // .buffer_unordered(200) + // .for_each(|x| async { + // println!("finished stream"); + // }) + // .await; + + // for thread_id in 0..max_tasks { + // let state = state.clone(); + // tokio::spawn(async move { + // let result = 
repo_fetcher_task(state).await; + // let error = result + // .and::<()>(Err(anyhow!("Handler thread should never exit"))) + // .unwrap_err(); + // error!(target: "indexer", "Handler thread {} failed: {:?}", thread_id, error); + // }); + // } + + // repo_discovery_task(state, tx).await.unwrap(); } diff --git a/src/database/repo_indexer/index_repo.rs b/src/database/repo_indexer/index_repo.rs index 7d9ad2c..7fc43b9 100644 --- a/src/database/repo_indexer/index_repo.rs +++ b/src/database/repo_indexer/index_repo.rs @@ -1,23 +1,20 @@ -use super::{LastIndexedTimestamp, SharedState}; +use super::LastIndexedTimestamp; use crate::database::{definitions::Record, handlers::on_commit_event_createorupdate}; use atrium_api::{ record::KnownRecord, types::string::{Did, RecordKey}, }; use futures::TryStreamExt; +use hyper::body::Bytes; use ipld_core::cid::{Cid, CidGeneric}; use iroh_car::CarReader; use log::warn; use reqwest::Client; use serde::Deserialize; use serde_ipld_dagcbor::from_reader; -use std::{ - collections::BTreeMap, - string::FromUtf8Error, - sync::{Arc, LazyLock}, -}; -use tokio::task::block_in_place; -use tokio_util::io::StreamReader; +use std::{collections::BTreeMap, string::FromUtf8Error, sync::LazyLock, time::Duration}; +use surrealdb::{engine::any::Any, Surreal}; +use tokio::task::spawn_blocking; /// There should only be one request client to make use of connection pooling // TODO: Dont use a global client @@ -87,39 +84,8 @@ async fn insert_into_map( Ok(files) } -/// Get the contents of a repo with the given DID (Decentralized Identifier) -async fn get_files( - service: &PlcDirectoryDidResponseService, - did: &str, -) -> anyhow::Result>> { - let car_res = REQWEST_CLIENT - .get(format!( - "{}/xrpc/com.atproto.sync.getRepo?did={}", - service.service_endpoint, did, - )) - .send() - .await?; - let bytes_stream = car_res - .bytes_stream() - .map_err(|error| std::io::Error::new(std::io::ErrorKind::Other, error)); - let reader = StreamReader::new(bytes_stream); - // TODO: Figure out what the second parameter does - // let reader = rs_car_sync::CarReader::new(&car_res_bytes, false); - - // let buf_reader = tokio::io::BufReader::new(&car_res_bytes[..]); - - // TODO: Benchmark CarReader. This is probably not the right place for parsing logic - let car_reader = CarReader::new(reader).await?; - let files = car_reader - .stream() - .map_err(|e| e.into()) - .try_fold(BTreeMap::new(), insert_into_map) - .await; - - files -} - -fn files_to_updates( +/// Convert downloaded files into database updates. 
Blocks the thread +fn files_to_updates_blocking( files: BTreeMap>, ) -> Result, FromUtf8Error> { // TODO: Understand this logic and whether this can be done streaming @@ -149,59 +115,93 @@ fn files_to_updates( record, }; result.push(update); - // let res = on_commit_event_createorupdate( - // db, - // Did::new(did.clone()).unwrap(), - // did_key.clone(), - // parts.next().unwrap().to_string(), - // RecordKey::new(parts.next().unwrap().to_string()).unwrap(), - // record, - // ) - // if let Err(error) = res { - // warn!("on_commit_event_createorupdate {} {}", error, did); - // } } } return Ok(result); } -/// Indexes the repo with the given DID (Decentralized Identifier) -pub async fn index_repo(state: &Arc, did: &String) -> anyhow::Result<()> { - let did_key = crate::database::utils::did_to_key(did.as_str())?; +/// Check if a repo is already indexed +async fn check_indexed(db: &Surreal, did: &str) -> anyhow::Result { + let did_key = crate::database::utils::did_to_key(did)?; - if state - .db + Ok(db .select::>(("li_did", &did_key)) .await? - .is_some() - { - // debug!("skip {}", did); - return Ok(()); - }; - - let now = std::time::SystemTime::now() - .duration_since(std::time::UNIX_EPOCH) - .unwrap() - .as_micros(); - - let resp = state - .http_client + .is_some()) +} + +/// Get the plc response service for the repo +async fn get_plc_service( + http_client: &Client, + did: &str, +) -> anyhow::Result> { + let resp = http_client .get(format!("https://plc.directory/{}", did)) .send() .await? .json::() .await?; + let service = resp.service.into_iter().next(); + Ok(service) +} + +/// Download a repo from the given service +async fn download_repo( + service: &PlcDirectoryDidResponseService, + did: &str, +) -> anyhow::Result { + let get_repo_response = REQWEST_CLIENT + .get(format!( + "{}/xrpc/com.atproto.sync.getRepo?did={}", + service.service_endpoint, did, + )) + .send() + .await?; + let bytes = get_repo_response.bytes().await?; + return Ok(bytes); +} + +/// Download the file for the given repo into a map +async fn deserialize_repo(bytes: Bytes) -> anyhow::Result>> { + // let reader = StreamReader::new(bytes.as_ref()); + // TODO: Figure out what the second parameter does + // let reader = rs_car_sync::CarReader::new(&car_res_bytes, false); + + // let buf_reader = tokio::io::BufReader::new(&car_res_bytes[..]); + + // TODO: Benchmark CarReader. 
This is probably not the right place for parsing logic + let car_reader = CarReader::new(bytes.as_ref()).await?; + let files = car_reader + .stream() + .map_err(|e| e.into()) + .try_fold(BTreeMap::new(), insert_into_map) + .await; + + files +} + +/// Convert downloaded files into database updates +async fn files_to_updates( + files: BTreeMap>, +) -> anyhow::Result> { + // TODO: Look into using block_in_place instead of spawn_blocking + let result = spawn_blocking(|| files_to_updates_blocking(files)).await??; + Ok(result) +} + +/// Apply updates to the database +async fn apply_updates( + db: &Surreal, + did: &str, + updates: Vec, + update_timestamp: &Duration, +) -> anyhow::Result<()> { + let did_key = crate::database::utils::did_to_key(did)?; - let Some(service) = resp.service.first() else { - return Ok(()); - }; - let files = get_files(service, did).await?; - let updates = block_in_place(|| files_to_updates(files))?; for update in updates { - // TODO: Figure out what this does and whether we can batch these updates let res = on_commit_event_createorupdate( - &state.db, - Did::new(did.clone()).unwrap(), + db, + Did::new(did.into()).unwrap(), did_key.clone(), update.collection, update.rkey, @@ -213,14 +213,191 @@ pub async fn index_repo(state: &Arc, did: &String) -> anyhow::Resul warn!("on_commit_event_createorupdate {} {}", error, did); } } - let _: Option = state - .db + let _: Option = db .upsert(("li_did", did_key)) .content(LastIndexedTimestamp { - time_us: now as u64, + time_us: update_timestamp.as_micros() as u64, time_dt: chrono::Utc::now().into(), error: None, }) .await?; Ok(()) } + +// /// Indexes the repo with the given DID (Decentralized Identifier) +// async fn index_repo(db: &Surreal, http_client: &Client, did: &String) -> anyhow::Result<()> { +// { +// if check_indexed(&db, &did).await? { +// return Ok(()); +// } +// } + +// let now = std::time::SystemTime::now() +// .duration_since(std::time::UNIX_EPOCH) +// .unwrap(); + +// let service = { +// let Some(service) = get_plc_service(&http_client, &did).await? else { +// return Ok(()); +// }; +// service +// }; + +// let repo = { download_repo(&service, &did).await? }; +// let files = { deserialize_repo(repo).await? }; + +// let updates = { files_to_updates(files).await? }; +// let update_result = { apply_updates(&db, &did, updates, &now).await? }; +// Ok(()) +// } + +/// No processing has been done on this item +pub struct New {} + +/// It was verified that the item is not indexed yet +pub struct NotIndexed {} +/// Has a service +pub struct WithService { + service: PlcDirectoryDidResponseService, + // TODO: Figure out why now is created this early + now: std::time::Duration, +} +/// Has files +pub struct WithRepo { + now: std::time::Duration, + repo: Bytes, +} + +pub struct WithFiles { + now: std::time::Duration, + files: BTreeMap>, +} +/// Has converted the files to update +pub struct WithUpdates { + now: std::time::Duration, + updates: Vec, +} +/// Updates have been applied +pub struct Done {} + +pub struct PipelineItem<'a, State> { + db: &'a Surreal, + http_client: &'a Client, + did: String, + state: State, +} + +impl<'a> PipelineItem<'a, New> { + pub fn new( + db: &'a Surreal, + http_client: &'a Client, + did: String, + ) -> PipelineItem<'a, New> { + PipelineItem::<'a, New> { + db, + http_client, + did, + state: New {}, + } + } +} + +impl<'a> PipelineItem<'a, New> { + pub async fn check_indexed(self) -> anyhow::Result> { + if check_indexed(&self.db, &self.did).await? 
{ + // TODO: Handle this better, as this is not really an error + return Err(anyhow::anyhow!("Already indexed")); + } + Ok(PipelineItem::<'a, NotIndexed> { + db: self.db, + http_client: self.http_client, + did: self.did, + state: NotIndexed {}, + }) + } +} + +impl<'a> PipelineItem<'a, NotIndexed> { + pub async fn get_service(self) -> anyhow::Result> { + let service = get_plc_service(&self.http_client, &self.did).await?; + let Some(service) = service else { + // TODO: Handle this better, as this is not really an error + return Err(anyhow::anyhow!("Failed to get a plc service")); + }; + Ok(PipelineItem::<'a, WithService> { + db: self.db, + http_client: self.http_client, + did: self.did, + state: WithService { + service: service, + now: std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap(), + }, + }) + } +} + +impl<'a> PipelineItem<'a, WithService> { + pub async fn download_repo(self) -> anyhow::Result> { + let repo = download_repo(&self.state.service, &self.did).await?; + Ok(PipelineItem::<'a, WithRepo> { + db: self.db, + http_client: self.http_client, + did: self.did, + state: WithRepo { + now: self.state.now, + repo, + }, + }) + } +} + +impl<'a> PipelineItem<'a, WithRepo> { + pub async fn deserialize_repo(self) -> anyhow::Result> { + let files = deserialize_repo(self.state.repo).await?; + Ok(PipelineItem::<'a, WithFiles> { + db: self.db, + http_client: self.http_client, + did: self.did, + state: WithFiles { + now: self.state.now, + files, + }, + }) + } +} + +impl<'a> PipelineItem<'a, WithFiles> { + pub async fn files_to_updates(self) -> anyhow::Result> { + let updates = files_to_updates(self.state.files).await?; + Ok(PipelineItem::<'a, WithUpdates> { + db: self.db, + http_client: self.http_client, + did: self.did, + state: WithUpdates { + now: self.state.now, + updates, + }, + }) + } +} + +impl<'a> PipelineItem<'a, WithUpdates> { + pub async fn apply_updates(self) -> anyhow::Result> { + apply_updates(&self.db, &self.did, self.state.updates, &self.state.now).await?; + Ok(PipelineItem::<'a, Done> { + db: self.db, + http_client: self.http_client, + did: self.did, + state: Done {}, + }) + } +} + +impl<'a> PipelineItem<'a, Done> { + pub async fn print_report(self) -> () { + // TODO: This is only for printing debug stuff + println!("Indexed {}", self.did); + } +} diff --git a/src/database/repo_indexer/repo_stream.rs b/src/database/repo_indexer/repo_stream.rs new file mode 100644 index 0000000..5b91dff --- /dev/null +++ b/src/database/repo_indexer/repo_stream.rs @@ -0,0 +1,146 @@ +use std::{ + collections::{HashSet, VecDeque}, + future::{Future, IntoFuture}, + task::Poll, +}; + +use futures::Stream; +use log::info; +use surrealdb::{engine::any::Any, Surreal}; + +use crate::database::{repo_indexer::BskyFollowRes, utils::unsafe_user_key_to_did}; + +pub struct RepoStream<'a> { + buffer: VecDeque, + processed_dids: HashSet, + anchor: String, + db: &'a Surreal, + db_future: Option< + std::pin::Pin< + Box< + dyn Future> + + Send + + Sync + + 'a, + >, + >, + >, +} + +impl<'a> RepoStream<'a> { + pub fn new(anchor: String, db: &'a Surreal) -> Self { + return Self { + buffer: VecDeque::new(), + processed_dids: HashSet::new(), + anchor, + db, + db_future: None, + }; + } +} + +const FETCH_AMOUNT: usize = 100; + +// async fn get_repos_from(db: &Surreal, anchor: &str) -> Vec { +// info!(target: "indexer", "Discovering follows starting from {}", anchor); +// let mut result = db +// // TODO: Fix the possible SQL injection +// .query(format!( +// "SELECT id,in,out FROM follow:{}.. 
LIMIT {};", +// anchor, FETCH_AMOUNT +// )); +// let follows: Vec = result.take(0)?; + +// let Some(anchor_key) = follows.last().map(|follow| follow.id.key()) else { +// sleep(DISCOVERY_CAUGHT_UP_BACKOFF).await; +// continue; +// }; +// } + +impl<'a> Stream for RepoStream<'a> { + type Item = String; + + fn poll_next( + mut self: std::pin::Pin<&mut Self>, + cx: &mut std::task::Context<'_>, + ) -> std::task::Poll> { + loop { + if let Some(next) = self.buffer.pop_front() { + return Poll::Ready(Some(next)); + } + + info!(target: "indexer", "Discovering follows starting from {}", self.anchor); + let db_future = if self.db_future.is_some() { + self.db_future.as_mut().unwrap() + } else { + let result = self + .db + // TODO: Fix the possible SQL injection + .query(format!( + "SELECT id,in,out FROM follow:{}.. LIMIT {};", + self.anchor, FETCH_AMOUNT + )); + // let mut future: std::pin::Pin< + // Box< + // dyn Future> + // + Send + // + Sync + // + 'a, + // >, + // > + let future = result.into_future(); + self.db_future = Some(future); + self.db_future.as_mut().unwrap() + }; + + let Poll::Ready(result) = Future::poll(db_future.as_mut(), cx) else { + return Poll::Pending; + }; + self.db_future = None; + + let mut result = result.unwrap(); + + // let mut result: surrealdb::method::Query<'_, Any> = self + // .db + // // TODO: Fix the possible SQL injection + // .query(format!( + // "SELECT id,in,out FROM follow:{}.. LIMIT {};", + // self.anchor, FETCH_AMOUNT + // )); + let follows: Vec = result.take(0).unwrap(); + + let Some(anchor_key) = follows.last().map(|follow| follow.id.key()) else { + // sleep(DISCOVERY_CAUGHT_UP_BACKOFF).await; + // continue; + // TODO: Sleep again + return Poll::Pending; + }; + self.anchor = format!("{}", anchor_key); + + for follow in &follows { + for record_id in [&follow.from, &follow.to] { + let did = unsafe_user_key_to_did(&format!("{}", record_id.key())); + if self.processed_dids.contains(&did) { + continue; + } + self.processed_dids.insert(did.clone()); + self.buffer.push_back(did); + // tx.send(did) + // .await + // .context("Failed to send message to handler thread")?; + } + } + + if let Some(next) = self.buffer.pop_front() { + return Poll::Ready(Some(next)); + } + return Poll::Pending; + + // Warn if it looks like the queue size or the backoff were choosen incorrectly + // let new_follows = self.processed_dids.len() - processed_dids_before; + // if new_follows != 0 && follows.len() == fetch_amount && tx.len() < warning_threshold { + // warn!(target: "indexer", "Queue is not getting filled up fast enough. 
Consider increasing the queue size or decreasing the backoff."); + // } + } + } +} diff --git a/src/database/repo_indexer/repo_stream_nofuture.rs b/src/database/repo_indexer/repo_stream_nofuture.rs new file mode 100644 index 0000000..928bb18 --- /dev/null +++ b/src/database/repo_indexer/repo_stream_nofuture.rs @@ -0,0 +1,146 @@ +use std::{ + collections::{HashSet, VecDeque}, + future::{Future, IntoFuture}, + task::Poll, +}; + +use futures::Stream; +use log::info; +use surrealdb::{engine::any::Any, Surreal}; + +use crate::database::{repo_indexer::BskyFollowRes, utils::unsafe_user_key_to_did}; + +pub struct RepoStream<'a> { + buffer: VecDeque, + processed_dids: HashSet, + anchor: String, + db: &'a Surreal, + db_future: Option< + std::pin::Pin< + Box< + dyn Future> + + Send + + Sync + + 'a, + >, + >, + >, +} + +impl<'a> RepoStream<'a> { + pub fn new(anchor: String, db: &'a Surreal) -> Self { + return Self { + buffer: VecDeque::new(), + processed_dids: HashSet::new(), + anchor, + db, + db_future: None, + }; + } +} + +const FETCH_AMOUNT: usize = 100; + +// async fn get_repos_from(db: &Surreal, anchor: &str) -> Vec { +// info!(target: "indexer", "Discovering follows starting from {}", anchor); +// let mut result = db +// // TODO: Fix the possible SQL injection +// .query(format!( +// "SELECT id,in,out FROM follow:{}.. LIMIT {};", +// anchor, FETCH_AMOUNT +// )); +// let follows: Vec = result.take(0)?; + +// let Some(anchor_key) = follows.last().map(|follow| follow.id.key()) else { +// sleep(DISCOVERY_CAUGHT_UP_BACKOFF).await; +// continue; +// }; +// } + +impl<'a> Stream for RepoStream<'a> { + type Item = String; + + fn poll_next( + mut self: std::pin::Pin<&mut Self>, + cx: &mut std::task::Context<'_>, + ) -> std::task::Poll> { + loop { + if let Some(next) = self.buffer.pop_front() { + return Poll::Ready(Some(next)); + } + + info!(target: "indexer", "Discovering follows starting from {}", self.anchor); + let db_future = if self.db_future.is_some() { + self.db_future.as_mut().unwrap() + } else { + let mut result = self + .db + // TODO: Fix the possible SQL injection + .query(format!( + "SELECT id,in,out FROM follow:{}.. LIMIT {};", + self.anchor, FETCH_AMOUNT + )); + // let mut future: std::pin::Pin< + // Box< + // dyn Future> + // + Send + // + Sync + // + 'a, + // >, + // > + let mut future = result.into_future(); + self.db_future = Some(future); + self.db_future.as_mut().unwrap() + }; + + let Poll::Ready(result) = Future::poll(db_future.as_mut(), cx) else { + return Poll::Pending; + }; + self.db_future = None; + + let mut result = result.unwrap(); + + // let mut result: surrealdb::method::Query<'_, Any> = self + // .db + // // TODO: Fix the possible SQL injection + // .query(format!( + // "SELECT id,in,out FROM follow:{}.. 
LIMIT {};", + // self.anchor, FETCH_AMOUNT + // )); + let follows: Vec = result.take(0).unwrap(); + + let Some(anchor_key) = follows.last().map(|follow| follow.id.key()) else { + // sleep(DISCOVERY_CAUGHT_UP_BACKOFF).await; + // continue; + // TODO: Sleep again + return Poll::Pending; + }; + self.anchor = format!("{}", anchor_key); + + for follow in &follows { + for record_id in [&follow.from, &follow.to] { + let did = unsafe_user_key_to_did(&format!("{}", record_id.key())); + if self.processed_dids.contains(&did) { + continue; + } + self.processed_dids.insert(did.clone()); + self.buffer.push_back(did); + // tx.send(did) + // .await + // .context("Failed to send message to handler thread")?; + } + } + + if let Some(next) = self.buffer.pop_front() { + return Poll::Ready(Some(next)); + } + return Poll::Pending; + + // Warn if it looks like the queue size or the backoff were choosen incorrectly + // let new_follows = self.processed_dids.len() - processed_dids_before; + // if new_follows != 0 && follows.len() == fetch_amount && tx.len() < warning_threshold { + // warn!(target: "indexer", "Queue is not getting filled up fast enough. Consider increasing the queue size or decreasing the backoff."); + // } + } + } +} From 78c374fd53ea50709c64c03e7c97c66e263649db Mon Sep 17 00:00:00 2001 From: Zebreus Date: Mon, 24 Feb 2025 22:37:42 +0100 Subject: [PATCH 10/75] Add opentelemetry --- Cargo.toml | 11 ++ src/config.rs | 8 +- src/database/definitions.rs | 2 +- src/database/handlers.rs | 2 +- src/database/mod.rs | 2 +- src/database/repo_indexer.rs | 13 +-- src/database/repo_indexer/index_repo.rs | 4 +- src/database/repo_indexer/repo_stream.rs | 2 +- .../repo_indexer/repo_stream_nofuture.rs | 2 +- src/main.rs | 106 +++++++++++++++--- src/websocket/conn.rs | 2 +- src/websocket/mod.rs | 13 +-- 12 files changed, 121 insertions(+), 46 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index f96bfba..cab3aa5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -40,6 +40,17 @@ serde_ipld_dagcbor = "0.6.1" serde_bytes = "0.11.15" async-channel = "2.3.1" console-subscriber = "0.4.1" +opentelemetry = { version = "0.28.0", features = ["metrics"] } +opentelemetry_sdk = { version = "0.28.0", features = ["metrics", "rt-tokio"] } +opentelemetry-stdout = { version = "0.28.0", features = ["metrics", "trace"] } +opentelemetry-otlp = { version = "0.28.0", features = [ + "grpc-tonic", + "metrics", +] } +tracing = "0.1.41" +tracing-subscriber = "0.3.19" +opentelemetry-appender-tracing = "0.28.1" +tonic = "0.12.3" [profile.release] lto = false diff --git a/src/config.rs b/src/config.rs index f886b79..166d327 100644 --- a/src/config.rs +++ b/src/config.rs @@ -1,6 +1,6 @@ use clap::{ArgAction, Parser}; use colored::Colorize; -use log::{info, LevelFilter}; +use tracing::{info, level_filters::LevelFilter}; /// Command line arguments #[derive(Parser, Debug)] @@ -64,9 +64,9 @@ impl Args { /// Verbosity to log level pub fn log_level(self: &Self) -> LevelFilter { match self.verbosity { - 0 => LevelFilter::Info, - 1 => LevelFilter::Debug, - _ => LevelFilter::Trace, + 0 => LevelFilter::INFO, + 1 => LevelFilter::DEBUG, + _ => LevelFilter::TRACE, } } } diff --git a/src/database/definitions.rs b/src/database/definitions.rs index 306c737..d4f1292 100644 --- a/src/database/definitions.rs +++ b/src/database/definitions.rs @@ -1,7 +1,7 @@ use anyhow::Context; -use log::{debug, info}; use serde::{Deserialize, Serialize}; use surrealdb::{engine::any::Any, Datetime, RecordId, Surreal}; +use tracing::{debug, info}; /// Database struct for a bluesky 
profile #[derive(Debug, Serialize)] diff --git a/src/database/handlers.rs b/src/database/handlers.rs index 63e8781..6dd550f 100644 --- a/src/database/handlers.rs +++ b/src/database/handlers.rs @@ -9,8 +9,8 @@ use atrium_api::{ }, }; use chrono::Utc; -use log::warn; use surrealdb::{engine::any::Any, RecordId, Surreal}; +use tracing::warn; use crate::websocket::events::{Commit, Kind}; diff --git a/src/database/mod.rs b/src/database/mod.rs index 4187cce..45cc810 100644 --- a/src/database/mod.rs +++ b/src/database/mod.rs @@ -1,7 +1,7 @@ use anyhow::{Context, Result}; use definitions::{JetstreamCursor, Record}; -use log::{debug, info}; use surrealdb::{engine::any::Any, opt::auth::Root, RecordId, Surreal}; +use tracing::{debug, info}; pub mod definitions; pub mod handlers; diff --git a/src/database/repo_indexer.rs b/src/database/repo_indexer.rs index 3b7a806..6e9c95f 100644 --- a/src/database/repo_indexer.rs +++ b/src/database/repo_indexer.rs @@ -1,10 +1,10 @@ use futures::StreamExt; use index_repo::PipelineItem; -use log::{error, info}; use repo_stream::RepoStream; use reqwest::Client; use serde::{Deserialize, Serialize}; use surrealdb::{engine::any::Any, Surreal}; +use tracing::{error, info}; mod index_repo; mod repo_stream; @@ -31,23 +31,14 @@ const OLDEST_USEFUL_ANCHOR: &str = "3juj4"; /// The size of the buffer between each pipeline stage in elements const BUFFER_SIZE: usize = 200; -pub async fn to_async(did: String) -> String { - println!("did: {}", did); - did -} - pub async fn start_full_repo_indexer(db: Surreal, max_tasks: usize) -> anyhow::Result<()> { let http_client = Client::new(); info!(target: "indexer", "Spinning up {} handler tasks", max_tasks); RepoStream::new(OLDEST_USEFUL_ANCHOR.to_string(), &db) - .map(to_async) + .map(|did| async { did }) .buffer_unordered(BUFFER_SIZE) - .map(|x| { - println!("gonna process {}", x); - x - }) .map(|did| { let db = &db; let http_client = &http_client; diff --git a/src/database/repo_indexer/index_repo.rs b/src/database/repo_indexer/index_repo.rs index 7fc43b9..66ce3d7 100644 --- a/src/database/repo_indexer/index_repo.rs +++ b/src/database/repo_indexer/index_repo.rs @@ -8,13 +8,13 @@ use futures::TryStreamExt; use hyper::body::Bytes; use ipld_core::cid::{Cid, CidGeneric}; use iroh_car::CarReader; -use log::warn; use reqwest::Client; use serde::Deserialize; use serde_ipld_dagcbor::from_reader; use std::{collections::BTreeMap, string::FromUtf8Error, sync::LazyLock, time::Duration}; use surrealdb::{engine::any::Any, Surreal}; use tokio::task::spawn_blocking; +use tracing::{trace, warn}; /// There should only be one request client to make use of connection pooling // TODO: Dont use a global client @@ -398,6 +398,6 @@ impl<'a> PipelineItem<'a, WithUpdates> { impl<'a> PipelineItem<'a, Done> { pub async fn print_report(self) -> () { // TODO: This is only for printing debug stuff - println!("Indexed {}", self.did); + trace!("Indexed {}", self.did); } } diff --git a/src/database/repo_indexer/repo_stream.rs b/src/database/repo_indexer/repo_stream.rs index 5b91dff..45b400c 100644 --- a/src/database/repo_indexer/repo_stream.rs +++ b/src/database/repo_indexer/repo_stream.rs @@ -5,8 +5,8 @@ use std::{ }; use futures::Stream; -use log::info; use surrealdb::{engine::any::Any, Surreal}; +use tracing::info; use crate::database::{repo_indexer::BskyFollowRes, utils::unsafe_user_key_to_did}; diff --git a/src/database/repo_indexer/repo_stream_nofuture.rs b/src/database/repo_indexer/repo_stream_nofuture.rs index 928bb18..4792629 100644 --- 
a/src/database/repo_indexer/repo_stream_nofuture.rs +++ b/src/database/repo_indexer/repo_stream_nofuture.rs @@ -5,8 +5,8 @@ use std::{ }; use futures::Stream; -use log::info; use surrealdb::{engine::any::Any, Surreal}; +use tracing::info; use crate::database::{repo_indexer::BskyFollowRes, utils::unsafe_user_key_to_did}; diff --git a/src/main.rs b/src/main.rs index fce0bd6..bc1c0bb 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,30 +1,92 @@ -use std::sync::atomic::{AtomicUsize, Ordering}; - -use ::log::{error, info}; use anyhow::Context; use config::Args; use database::repo_indexer::start_full_repo_indexer; +use opentelemetry::global; +use opentelemetry_appender_tracing::layer::OpenTelemetryTracingBridge; +use opentelemetry_otlp::{LogExporter, MetricExporter, SpanExporter}; +use opentelemetry_sdk::{ + logs::SdkLoggerProvider, metrics::SdkMeterProvider, trace::SdkTracerProvider, Resource, +}; +use std::sync::{ + atomic::{AtomicUsize, Ordering}, + LazyLock, +}; use surrealdb::{engine::any::Any, Surreal}; use tokio::runtime::Builder; use tokio_rustls::rustls::crypto::aws_lc_rs::default_provider; +use tracing::{error, info}; +use tracing_subscriber::{prelude::*, EnvFilter}; mod config; mod database; -mod log; mod websocket; /// Override the global allocator with mimalloc #[global_allocator] static GLOBAL: mimalloc::MiMalloc = mimalloc::MiMalloc; +const RESOURCE: LazyLock = LazyLock::new(|| { + Resource::builder() + .with_service_name("rust-indexer") + .build() +}); + +fn init_logger() -> SdkLoggerProvider { + let otlp_log_exporter = LogExporter::builder().with_tonic().build().unwrap(); + let logger_provider = SdkLoggerProvider::builder() + .with_resource(RESOURCE.clone()) + .with_batch_exporter(otlp_log_exporter) + .build(); + let otel_filter = EnvFilter::new("info") + .add_directive("hyper=off".parse().unwrap()) + .add_directive("h2=off".parse().unwrap()) + .add_directive("opentelemetry=off".parse().unwrap()) + .add_directive("tonic=off".parse().unwrap()) + .add_directive("reqwest=off".parse().unwrap()); + let otel_layer = OpenTelemetryTracingBridge::new(&logger_provider).with_filter(otel_filter); + + let tokio_console_layer = console_subscriber::spawn(); + + let stdout_filter = EnvFilter::new("info").add_directive("opentelemetry=info".parse().unwrap()); + let stdout_layer = tracing_subscriber::fmt::layer() + .with_thread_names(true) + .with_filter(stdout_filter); + + tracing_subscriber::registry() + .with(tokio_console_layer) + .with(otel_layer) + .with(stdout_layer) + .init(); + logger_provider +} + +fn init_metrics() -> SdkMeterProvider { + let otlp_metric_exporter = MetricExporter::builder().with_tonic().build().unwrap(); + + let meter_provider = SdkMeterProvider::builder() + // .with_periodic_exporter(exporter) + .with_periodic_exporter(otlp_metric_exporter) + .with_resource(RESOURCE.clone()) + .build(); + global::set_meter_provider(meter_provider.clone()); + + meter_provider +} + +fn init_tracer() -> SdkTracerProvider { + let otlp_span_exporter = SpanExporter::builder().with_tonic().build().unwrap(); + + let tracer_provider = SdkTracerProvider::builder() + .with_simple_exporter(otlp_span_exporter) + .build(); + global::set_tracer_provider(tracer_provider.clone()); + + tracer_provider +} + /// Entry point for the application fn main() { - console_subscriber::init(); - // parse command line arguments let args = config::parse_args(); - - // initialize logging and dump configuration - log::init(args.log_level()); args.dump(); // build async runtime @@ -37,7 +99,7 @@ fn main() { 
.thread_name_fn(|| { static ATOMIC: AtomicUsize = AtomicUsize::new(0); let id = ATOMIC.fetch_add(1, Ordering::Relaxed); - format!("Tokio Async Thread {}", id) + format!("Thread {}", id) }) .build() .unwrap() @@ -49,7 +111,7 @@ fn main() { .thread_name_fn(|| { static ATOMIC: AtomicUsize = AtomicUsize::new(0); let id = ATOMIC.fetch_add(1, Ordering::Relaxed); - format!("Tokio Async Thread {}", id) + format!("Thread {}", id) }) .build() .unwrap() @@ -68,6 +130,10 @@ fn main() { /// Asynchronous main function async fn application_main(args: Args) -> anyhow::Result<()> { + let tracer_provider = init_tracer(); + let metrics_provider = init_metrics(); + let logger_provider = init_logger(); + // connect to the database let db = database::connect(args.db, &args.username, &args.password) .await @@ -84,9 +150,9 @@ async fn application_main(args: Args) -> anyhow::Result<()> { for host in jetstream_hosts { let db_clone = db.clone(); let certificate = args.certificate.clone(); - + let (name, _) = host.split_at(18); std::thread::Builder::new() - .name(format!("Jetstream Consumer {}", host)) + .name(format!("{}", name)) .spawn(move || { Builder::new_current_thread() .enable_io() @@ -124,12 +190,20 @@ async fn application_main(args: Args) -> anyhow::Result<()> { if args.mode == "full" { start_full_repo_indexer(db, args.max_tasks.unwrap_or(num_cpus::get() * 50)).await?; + } else { + loop { + tokio::time::sleep(std::time::Duration::from_millis(1000)).await; + } } - loop { - tokio::time::sleep(std::time::Duration::from_millis(100)).await; - } + // TODO: Also handle shutdown when an error occurs + tracer_provider.shutdown().unwrap(); + metrics_provider.shutdown().unwrap(); + logger_provider.shutdown().unwrap(); + + Ok(()) } + async fn start_jetstream_consumer( db: Surreal, host: String, diff --git a/src/websocket/conn.rs b/src/websocket/conn.rs index 374bc8b..9d6c4b4 100644 --- a/src/websocket/conn.rs +++ b/src/websocket/conn.rs @@ -9,9 +9,9 @@ use hyper::{ Request, }; use hyper_util::rt::TokioIo; -use log::{debug, info}; use tokio::{net::TcpStream, task}; use tokio_rustls::{rustls::pki_types::ServerName, TlsConnector}; +use tracing::{debug, info}; /// A tokio executor for hyper struct TokioExecutor; diff --git a/src/websocket/mod.rs b/src/websocket/mod.rs index 1c01dd6..61f810b 100644 --- a/src/websocket/mod.rs +++ b/src/websocket/mod.rs @@ -1,3 +1,7 @@ +use anyhow::Context; +use fastwebsockets::{OpCode, WebSocket}; +use hyper::upgrade::Upgraded; +use hyper_util::rt::TokioIo; use std::{ sync::{ atomic::{AtomicU64, Ordering}, @@ -5,12 +9,6 @@ use std::{ }, time::{Duration, Instant}, }; - -use anyhow::Context; -use fastwebsockets::{OpCode, WebSocket}; -use hyper::upgrade::Upgraded; -use hyper_util::rt::TokioIo; -use log::{info, trace, warn}; use surrealdb::{engine::any::Any, Surreal}; use tokio::time::sleep; use tokio_rustls::{ @@ -20,6 +18,7 @@ use tokio_rustls::{ }, TlsConnector, }; +use tracing::{debug, info, trace, warn}; mod conn; pub mod events; @@ -49,7 +48,7 @@ pub async fn start( ) -> anyhow::Result<()> { // prepare tls store let cloned_certificate_path = certificate.clone(); - log::debug!(target: "indexer", "Creating tls store for certificate: {}", cloned_certificate_path); + debug!(target: "indexer", "Creating tls store for certificate: {}", cloned_certificate_path); let mut tls_store = RootCertStore::empty(); let tls_cert = CertificateDer::from_pem_file(certificate).with_context(|| { format!( From a4cbbf4524e5c963bffa43fc503a4c965d56985a Mon Sep 17 00:00:00 2001 From: Zebreus Date: Mon, 24 Feb 2025 
22:38:12 +0100
Subject: [PATCH 11/75] Update dependencies

---
 Cargo.toml | 37 +++++++++++++++++++++----------------
 1 file changed, 21 insertions(+), 16 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index cab3aa5..2d85407 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -6,37 +6,42 @@ authors = ["redsolver", "PancakeTAS"]
 description = "ATProto/Bluesky Indexer powered by SurrealDB and Jetstream"
 
 [dependencies]
-anyhow = "1.0.93"
-hyper = "1.5.1"
+anyhow = "1.0.96"
+hyper = "1.6.0"
 hyper-util = "0.1.10"
-tokio = { version = "1.41.1", features = ["parking_lot", "rt-multi-thread"] }
-tokio-rustls = "0.26.0"
+tokio = { version = "1.43.0", features = [
+    "parking_lot",
+    "rt-multi-thread",
+    "tracing",
+    "full",
+] }
+tokio-rustls = "0.26.1"
 tokio-util = { version = "0.7.13", features = ["io"] }
 rsky-pds = { git = "https://github.com/blacksky-algorithms/rsky.git" }
-fastwebsockets = { version = "0.8.0", features = ["upgrade"] }
-atrium-api = { version = "0.24.8", default-features = false, features = [
+fastwebsockets = { version = "0.10.0", features = ["upgrade"] }
+atrium-api = { version = "0.25.0", default-features = false, features = [
     "namespace-appbsky",
     "namespace-chatbsky",
 ] }
-serde = { version = "1.0.215", features = ["derive"] }
+serde = { version = "1.0.218", features = ["derive"] }
 simd-json = "0.14.3"
 num_cpus = "1.16.0"
-log = "0.4.22"
-clap = { version = "4.5.21", features = ["derive"] }
+# log = "0.4.22"
+clap = { version = "4.5.31", features = ["derive"] }
 colog = "1.3.0"
-colored = "2.1.0"
-chrono = "0.4.38"
+colored = "3.0.0"
+chrono = "0.4.39"
 mimalloc = "0.1.43"
-surrealdb = { version = "2.1.3", features = ["kv-mem", "kv-rocksdb"] }
+surrealdb = { version = "2.2.1", features = ["kv-mem", "kv-rocksdb"] }
 surrealdb-tikv-client = "0.3.0-surreal.1"
 regex = "1.11.1"
 lazy_static = "1.5.0"
-ipld-core = "0.4.1"
-atrium-xrpc-client = "0.5.10"
-reqwest = { version = "0.12.9", features = ["json", "stream"] }
+ipld-core = "0.4.2"
+atrium-xrpc-client = "0.5.11"
+reqwest = { version = "0.12.12", features = ["json", "stream"] }
 iroh-car = "0.5.1"
 futures = "0.3.31"
-serde_ipld_dagcbor = "0.6.1"
+serde_ipld_dagcbor = "0.6.2"
 serde_bytes = "0.11.15"
 async-channel = "2.3.1"
 console-subscriber = "0.4.1"

From d5ccdb830c271bf4030e978dbbff4d51253c022c Mon Sep 17 00:00:00 2001
From: Zebreus
Date: Tue, 25 Feb 2025 13:01:03 +0100
Subject: [PATCH 12/75] Add information about opentelemetry to README

---
 README.md | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/README.md b/README.md
index 08e0c2d..54d647a 100644
--- a/README.md
+++ b/README.md
@@ -20,3 +20,15 @@ You may need to increase the ulimit for the number of open files. You can do thi
 ### tokio
 
 You can use tokio-console to get more insights into what the tokio tasks are currently doing. Just run `tokio-console` while the indexer is running.
+
+### opentelemetry
+
+The application uses opentelemetry for metrics, traces, and logs. It exports signals via the OTLP gRPC protocol. You can configure the exporter with the usual opentelemetry environment variables.
+
+To spin up a Docker container with a collector and Grafana, use
+
+```
+docker run -p 3000:3000 -p 4317:4317 --rm -ti grafana/otel-lgtm
+```
+
+and then visit `localhost:3000` and login as `admin` with password `admin`.
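+
+For example, to point the exporter at a collector on another host, set the standard OTLP environment variables before starting the indexer (a sketch: the endpoint is a placeholder, and which variables are honored depends on the opentelemetry-otlp version):
+
+```
+export OTEL_EXPORTER_OTLP_ENDPOINT=http://my-collector:4317
+export OTEL_EXPORTER_OTLP_PROTOCOL=grpc
+```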
From 046554354ce15fd40d0c6167b45e861540aada8e Mon Sep 17 00:00:00 2001 From: Zebreus Date: Tue, 25 Feb 2025 17:46:50 +0100 Subject: [PATCH 13/75] Add system metrics reporting --- Cargo.toml | 5 ++ src/main.rs | 8 +- src/metrics_reporter.rs | 182 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 191 insertions(+), 4 deletions(-) create mode 100644 src/metrics_reporter.rs diff --git a/Cargo.toml b/Cargo.toml index 2d85407..62d4f02 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -56,6 +56,11 @@ tracing = "0.1.41" tracing-subscriber = "0.3.19" opentelemetry-appender-tracing = "0.28.1" tonic = "0.12.3" +opentelemetry-semantic-conventions = { version = "0.28.0", features = [ + "semconv_experimental", +] } +sys-info = "0.9.1" +sysinfo = "0.33.1" [profile.release] lto = false diff --git a/src/main.rs b/src/main.rs index bc1c0bb..134512b 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,7 +1,7 @@ use anyhow::Context; use config::Args; use database::repo_indexer::start_full_repo_indexer; -use opentelemetry::global; +use metrics_reporter::export_system_metrics; use opentelemetry_appender_tracing::layer::OpenTelemetryTracingBridge; use opentelemetry_otlp::{LogExporter, MetricExporter, SpanExporter}; use opentelemetry_sdk::{ @@ -19,6 +19,7 @@ use tracing_subscriber::{prelude::*, EnvFilter}; mod config; mod database; +mod metrics_reporter; mod websocket; /// Override the global allocator with mimalloc @@ -130,9 +131,8 @@ fn main() { /// Asynchronous main function async fn application_main(args: Args) -> anyhow::Result<()> { - let tracer_provider = init_tracer(); - let metrics_provider = init_metrics(); - let logger_provider = init_logger(); + // Start exporting system metrics + tokio::task::spawn_blocking(export_system_metrics); // connect to the database let db = database::connect(args.db, &args.username, &args.password) diff --git a/src/metrics_reporter.rs b/src/metrics_reporter.rs new file mode 100644 index 0000000..c10a89f --- /dev/null +++ b/src/metrics_reporter.rs @@ -0,0 +1,182 @@ +use opentelemetry::{global, KeyValue}; +use opentelemetry_semantic_conventions::{ + attribute::{ + NETWORK_INTERFACE_NAME, NETWORK_IO_DIRECTION, SYSTEM_CPU_LOGICAL_NUMBER, SYSTEM_DEVICE, + SYSTEM_MEMORY_STATE, + }, + metric::{ + SYSTEM_CPU_FREQUENCY, SYSTEM_CPU_LOGICAL_COUNT, SYSTEM_CPU_UTILIZATION, + SYSTEM_LINUX_MEMORY_AVAILABLE, SYSTEM_MEMORY_LIMIT, SYSTEM_MEMORY_USAGE, + SYSTEM_MEMORY_UTILIZATION, SYSTEM_NETWORK_ERRORS, SYSTEM_NETWORK_IO, + SYSTEM_NETWORK_PACKETS, + }, +}; +use std::time::Duration; +use sysinfo::{Networks, System}; +use tokio::task::{block_in_place, yield_now}; + +const METRICS_INTERVAL: Duration = Duration::from_secs(2); +pub async fn export_system_metrics() { + let meter = global::meter("system"); + + let mut system = System::new_all(); + let mut networks = Networks::new(); + tokio::task::block_in_place(|| { + system.refresh_all(); + networks.refresh(true); + }); + yield_now().await; + + // let uptime_meter = meter + // .f64_gauge(SYSTEM_UPTIME) + // .with_description("The time the system has been running") + // .build(); + let cpu_utilization_meter = meter + .f64_gauge(SYSTEM_CPU_UTILIZATION) + .with_description("Difference in system.cpu.time since the last measurement, divided by the elapsed time and number of logical CPUs") + .build(); + let cpu_logical_count_meter = meter + .i64_up_down_counter(SYSTEM_CPU_LOGICAL_COUNT) + .with_description("Reports the number of logical (virtual) processor cores created by the operating system to manage multitasking") + .build(); + let cpu_frequency_meter 
= meter + .f64_gauge(SYSTEM_CPU_FREQUENCY) + .with_description("Reports the current frequency of the CPU in Hz") + .build(); + let memory_usage_meter = meter + .i64_up_down_counter(SYSTEM_MEMORY_USAGE) + .with_description("Reports memory in use by state") + .build(); + let memory_limit_meter = meter + .i64_up_down_counter(SYSTEM_MEMORY_LIMIT) + .with_description("Total memory available in the system") + .build(); + let memory_utilization_meter = meter.f64_gauge(SYSTEM_MEMORY_UTILIZATION).build(); + let memory_available_meter = meter + .i64_up_down_counter(SYSTEM_LINUX_MEMORY_AVAILABLE) + .with_description("An estimate of how much memory is available for starting new applications, without causing swapping") + .build(); + // let network_dropped_meter = meter + // .u64_counter(SYSTEM_NETWORK_DROPPED) + // .with_description( + // "Count of packets that are dropped or discarded even though there was no error", + // ) + // .build(); + let network_packets_meter = meter.u64_counter(SYSTEM_NETWORK_PACKETS).build(); + let network_errors_meter = meter.u64_counter(SYSTEM_NETWORK_ERRORS).build(); + let network_io_meter = meter.u64_counter(SYSTEM_NETWORK_IO).build(); + // let network_connections_meter = meter + // .i64_up_down_counter(SYSTEM_NETWORK_CONNECTIONS) + // .build(); + + let mut previous_cpu_logical_count = 0; + let mut previous_free_memory = 0u64; + let mut previous_total_memory = 0u64; + let mut previous_used_memory = 0u64; + let mut previous_availabe_memory = 0u64; + + let mut last_update = tokio::time::Instant::now(); + loop { + tokio::time::sleep_until(last_update + METRICS_INTERVAL).await; + last_update = tokio::time::Instant::now(); + + block_in_place(|| { + system.refresh_cpu_all(); + system.refresh_memory(); + networks.refresh(true); + }); + yield_now().await; + block_in_place(|| { + for (id, cpu) in system.cpus().iter().enumerate() { + cpu_utilization_meter.record( + cpu.cpu_usage() as f64, + &[KeyValue::new(SYSTEM_CPU_LOGICAL_NUMBER, id as i64)], + ); + cpu_frequency_meter.record( + cpu.frequency() as f64, + &[KeyValue::new(SYSTEM_CPU_LOGICAL_NUMBER, id as i64)], + ); + } + cpu_utilization_meter.record(system.global_cpu_usage() as f64, &[]); + + cpu_logical_count_meter + .add(system.cpus().len() as i64 - previous_cpu_logical_count, &[]); + previous_cpu_logical_count = system.cpus().len() as i64; + + memory_usage_meter.add( + system.used_memory() as i64 - previous_used_memory as i64, + &[KeyValue::new(SYSTEM_MEMORY_STATE, "used")], + ); + memory_usage_meter.add( + system.free_memory() as i64 - previous_free_memory as i64, + &[KeyValue::new(SYSTEM_MEMORY_STATE, "free")], + ); + memory_limit_meter.add( + system.total_memory() as i64 - previous_total_memory as i64, + &[KeyValue::new(SYSTEM_MEMORY_STATE, "total")], + ); + memory_utilization_meter.record( + system.used_memory() as f64 / system.total_memory() as f64, + &[KeyValue::new(SYSTEM_MEMORY_STATE, "used")], + ); + memory_utilization_meter.record( + system.free_memory() as f64 / system.total_memory() as f64, + &[KeyValue::new(SYSTEM_MEMORY_STATE, "free")], + ); + memory_available_meter.add( + system.available_memory() as i64 - previous_availabe_memory as i64, + &[KeyValue::new(SYSTEM_MEMORY_STATE, "available")], + ); + + previous_free_memory = system.free_memory(); + previous_total_memory = system.total_memory(); + previous_used_memory = system.used_memory(); + previous_availabe_memory = system.available_memory(); + + for (name, data) in networks.iter() { + network_packets_meter.add( + data.packets_received(), + &[ + 
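+                    // The direction attribute below follows the OTel semantic
+                    // conventions: "receive" for inbound, "transmit" for
+                    // outbound traffic.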
KeyValue::new(SYSTEM_DEVICE, name.clone()), + KeyValue::new(NETWORK_IO_DIRECTION, "receive"), + ], + ); + network_packets_meter.add( + data.packets_transmitted(), + &[ + KeyValue::new(SYSTEM_DEVICE, name.clone()), + KeyValue::new(NETWORK_IO_DIRECTION, "transmit"), + ], + ); + network_errors_meter.add( + data.errors_on_received(), + &[ + KeyValue::new(NETWORK_INTERFACE_NAME, name.clone()), + KeyValue::new(NETWORK_IO_DIRECTION, "receive"), + ], + ); + network_errors_meter.add( + data.errors_on_transmitted(), + &[ + KeyValue::new(NETWORK_INTERFACE_NAME, name.clone()), + KeyValue::new(NETWORK_IO_DIRECTION, "transmit"), + ], + ); + network_io_meter.add( + data.received(), + &[ + KeyValue::new(NETWORK_INTERFACE_NAME, name.clone()), + KeyValue::new(NETWORK_IO_DIRECTION, "receive"), + ], + ); + network_io_meter.add( + data.transmitted(), + &[ + KeyValue::new(NETWORK_INTERFACE_NAME, name.clone()), + KeyValue::new(NETWORK_IO_DIRECTION, "transmit"), + ], + ); + } + }); + } +} From c467685a5cff50e46110edf2fb96e20ca00c63a5 Mon Sep 17 00:00:00 2001 From: Zebreus Date: Tue, 25 Feb 2025 18:21:39 +0100 Subject: [PATCH 14/75] Improve units in metrics reporter --- src/metrics_reporter.rs | 36 +++++++++++++++++++----------------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/src/metrics_reporter.rs b/src/metrics_reporter.rs index c10a89f..37f0f3f 100644 --- a/src/metrics_reporter.rs +++ b/src/metrics_reporter.rs @@ -27,47 +27,49 @@ pub async fn export_system_metrics() { }); yield_now().await; - // let uptime_meter = meter - // .f64_gauge(SYSTEM_UPTIME) - // .with_description("The time the system has been running") - // .build(); let cpu_utilization_meter = meter .f64_gauge(SYSTEM_CPU_UTILIZATION) .with_description("Difference in system.cpu.time since the last measurement, divided by the elapsed time and number of logical CPUs") + .with_unit("1") .build(); let cpu_logical_count_meter = meter .i64_up_down_counter(SYSTEM_CPU_LOGICAL_COUNT) .with_description("Reports the number of logical (virtual) processor cores created by the operating system to manage multitasking") + .with_unit("{cpu}") .build(); let cpu_frequency_meter = meter .f64_gauge(SYSTEM_CPU_FREQUENCY) .with_description("Reports the current frequency of the CPU in Hz") + .with_unit("{Hz}") .build(); let memory_usage_meter = meter .i64_up_down_counter(SYSTEM_MEMORY_USAGE) .with_description("Reports memory in use by state") + .with_unit("By") .build(); let memory_limit_meter = meter .i64_up_down_counter(SYSTEM_MEMORY_LIMIT) .with_description("Total memory available in the system") + .with_unit("By") + .build(); + let memory_utilization_meter = meter + .f64_gauge(SYSTEM_MEMORY_UTILIZATION) + .with_unit("1") .build(); - let memory_utilization_meter = meter.f64_gauge(SYSTEM_MEMORY_UTILIZATION).build(); let memory_available_meter = meter .i64_up_down_counter(SYSTEM_LINUX_MEMORY_AVAILABLE) .with_description("An estimate of how much memory is available for starting new applications, without causing swapping") + .with_unit("By") + .build(); + let network_packets_meter = meter + .u64_counter(SYSTEM_NETWORK_PACKETS) + .with_unit("{packet}") + .build(); + let network_errors_meter = meter + .u64_counter(SYSTEM_NETWORK_ERRORS) + .with_unit("{error}") .build(); - // let network_dropped_meter = meter - // .u64_counter(SYSTEM_NETWORK_DROPPED) - // .with_description( - // "Count of packets that are dropped or discarded even though there was no error", - // ) - // .build(); - let network_packets_meter = 
meter.u64_counter(SYSTEM_NETWORK_PACKETS).build(); - let network_errors_meter = meter.u64_counter(SYSTEM_NETWORK_ERRORS).build(); - let network_io_meter = meter.u64_counter(SYSTEM_NETWORK_IO).build(); - // let network_connections_meter = meter - // .i64_up_down_counter(SYSTEM_NETWORK_CONNECTIONS) - // .build(); + let network_io_meter = meter.u64_counter(SYSTEM_NETWORK_IO).with_unit("By").build(); let mut previous_cpu_logical_count = 0; let mut previous_free_memory = 0u64; From d24fd4deca1bd8dd86985ab5f4231f440fa327a0 Mon Sep 17 00:00:00 2001 From: Zebreus Date: Tue, 25 Feb 2025 18:39:35 +0100 Subject: [PATCH 15/75] Improve telemetry setup --- Cargo.toml | 2 + src/main.rs | 232 +++++++++++++++++++++++++++++++++++++++++++--------- 2 files changed, 197 insertions(+), 37 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 62d4f02..0d500be 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -59,6 +59,8 @@ tonic = "0.12.3" opentelemetry-semantic-conventions = { version = "0.28.0", features = [ "semconv_experimental", ] } +tracing-opentelemetry = "0.29.0" +opentelemetry-resource-detectors = "0.7.0" sys-info = "0.9.1" sysinfo = "0.33.1" diff --git a/src/main.rs b/src/main.rs index 134512b..0532313 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,21 +1,40 @@ +#![feature(type_changing_struct_update)] + use anyhow::Context; use config::Args; use database::repo_indexer::start_full_repo_indexer; use metrics_reporter::export_system_metrics; +use opentelemetry::{global, trace::TracerProvider as _, KeyValue}; use opentelemetry_appender_tracing::layer::OpenTelemetryTracingBridge; use opentelemetry_otlp::{LogExporter, MetricExporter, SpanExporter}; +use opentelemetry_resource_detectors::{ + HostResourceDetector, OsResourceDetector, ProcessResourceDetector, +}; use opentelemetry_sdk::{ - logs::SdkLoggerProvider, metrics::SdkMeterProvider, trace::SdkTracerProvider, Resource, + logs::SdkLoggerProvider, + metrics::{PeriodicReader, SdkMeterProvider}, + propagation::TraceContextPropagator, + resource::EnvResourceDetector, + trace::{RandomIdGenerator, Sampler, SdkTracerProvider}, + Resource, }; -use std::sync::{ - atomic::{AtomicUsize, Ordering}, - LazyLock, +use opentelemetry_semantic_conventions::{ + attribute::{DEPLOYMENT_ENVIRONMENT_NAME, SERVICE_NAME, SERVICE_VERSION}, + resource::{HOST_NAME, OS_BUILD_ID, OS_DESCRIPTION, OS_NAME, OS_VERSION, SERVICE_INSTANCE_ID}, + SCHEMA_URL, }; -use surrealdb::{engine::any::Any, Surreal}; -use tokio::runtime::Builder; +use std::{ + process::exit, + sync::{ + atomic::{AtomicUsize, Ordering}, + Arc, LazyLock, + }, +}; +use surrealdb::{engine::any::Any, Surreal, Uuid}; +use tokio::{runtime::Builder, signal::ctrl_c, time::interval_at}; use tokio_rustls::rustls::crypto::aws_lc_rs::default_provider; -use tracing::{error, info}; -use tracing_subscriber::{prelude::*, EnvFilter}; +use tracing::{error, span}; +use tracing_subscriber::{prelude::*, EnvFilter, Registry}; mod config; mod database; @@ -27,24 +46,74 @@ mod websocket; static GLOBAL: mimalloc::MiMalloc = mimalloc::MiMalloc; const RESOURCE: LazyLock = LazyLock::new(|| { + let instance_id = Uuid::new_v4(); + + let mut attributes = vec![ + KeyValue::new(SERVICE_NAME, env!("CARGO_PKG_NAME")), + KeyValue::new(SERVICE_VERSION, env!("CARGO_PKG_VERSION")), + KeyValue::new(SERVICE_INSTANCE_ID, instance_id.to_string()), + KeyValue::new(DEPLOYMENT_ENVIRONMENT_NAME, "develop"), + ]; + + if let Ok(linux_sys_info) = sys_info::linux_os_release() { + if let Some(build_id) = linux_sys_info.build_id { + attributes.push(KeyValue::new(OS_BUILD_ID, 
build_id)); + } + if let Some(pretty_name) = linux_sys_info.pretty_name { + attributes.push(KeyValue::new(OS_DESCRIPTION, pretty_name)); + } + if let Some(name) = linux_sys_info.name { + attributes.push(KeyValue::new(OS_NAME, name)); + } + if let Some(version_id) = linux_sys_info.version_id { + attributes.push(KeyValue::new(OS_VERSION, version_id)); + } + } else { + if let Ok(os_version) = sys_info::os_release() { + attributes.push(KeyValue::new(OS_DESCRIPTION, os_version)); + } + if let Ok(os_name) = sys_info::os_type() { + attributes.push(KeyValue::new(OS_NAME, os_name)); + } + } + + if let Ok(hostname) = sys_info::hostname() { + attributes.push(KeyValue::new(HOST_NAME, hostname)); + } + Resource::builder() - .with_service_name("rust-indexer") + .with_schema_url( + [ + KeyValue::new(SERVICE_NAME, env!("CARGO_PKG_NAME")), + KeyValue::new(SERVICE_VERSION, env!("CARGO_PKG_VERSION")), + KeyValue::new(DEPLOYMENT_ENVIRONMENT_NAME, "develop"), + ], + SCHEMA_URL, + ) + .with_attributes(attributes) + .with_detectors(&[ + Box::new(EnvResourceDetector::new()), + Box::new(HostResourceDetector::default()), + Box::new(ProcessResourceDetector), + Box::new(OsResourceDetector), + // Box::new(OsResourceDetector::new()), + ]) .build() }); -fn init_logger() -> SdkLoggerProvider { - let otlp_log_exporter = LogExporter::builder().with_tonic().build().unwrap(); - let logger_provider = SdkLoggerProvider::builder() - .with_resource(RESOURCE.clone()) - .with_batch_exporter(otlp_log_exporter) - .build(); - let otel_filter = EnvFilter::new("info") +fn init_observability() -> OtelGuard { + let tracer_provider = init_tracer(); + let meter_provider = init_meter(); + let logger_provider = init_logger(); + + let otel_log_filter = EnvFilter::new("info") .add_directive("hyper=off".parse().unwrap()) .add_directive("h2=off".parse().unwrap()) .add_directive("opentelemetry=off".parse().unwrap()) .add_directive("tonic=off".parse().unwrap()) .add_directive("reqwest=off".parse().unwrap()); - let otel_layer = OpenTelemetryTracingBridge::new(&logger_provider).with_filter(otel_filter); + let otel_log_layer = + OpenTelemetryTracingBridge::new(&logger_provider).with_filter(otel_log_filter); let tokio_console_layer = console_subscriber::spawn(); @@ -53,21 +122,46 @@ fn init_logger() -> SdkLoggerProvider { .with_thread_names(true) .with_filter(stdout_filter); + let tracer = tracer_provider.tracer("tracing-otel-subscriber"); tracing_subscriber::registry() .with(tokio_console_layer) - .with(otel_layer) + .with(otel_log_layer) .with(stdout_layer) + .with(tracing_opentelemetry::MetricsLayer::new( + meter_provider.clone(), + )) + .with(tracing_opentelemetry::OpenTelemetryLayer::new(tracer)) .init(); + OtelGuard { + tracer_provider, + meter_provider, + logger_provider, + } +} + +fn init_logger() -> SdkLoggerProvider { + let otlp_log_exporter = LogExporter::builder().with_tonic().build().unwrap(); + let logger_provider = SdkLoggerProvider::builder() + .with_resource(RESOURCE.clone()) + .with_batch_exporter(otlp_log_exporter) + .build(); logger_provider } -fn init_metrics() -> SdkMeterProvider { - let otlp_metric_exporter = MetricExporter::builder().with_tonic().build().unwrap(); +fn init_meter() -> SdkMeterProvider { + let otlp_metric_exporter = MetricExporter::builder() + .with_tonic() + .with_temporality(opentelemetry_sdk::metrics::Temporality::default()) + .build() + .unwrap(); + + let periodic_reader = PeriodicReader::builder(otlp_metric_exporter) + .with_interval(std::time::Duration::from_secs(5)) + .build(); let meter_provider = 
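        // The PeriodicReader configured above pushes metrics on a fixed
        // interval; the provider is also installed globally below so that
        // global::meter(...) works from anywhere in the codebase.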
SdkMeterProvider::builder() - // .with_periodic_exporter(exporter) - .with_periodic_exporter(otlp_metric_exporter) .with_resource(RESOURCE.clone()) + .with_reader(periodic_reader) .build(); global::set_meter_provider(meter_provider.clone()); @@ -75,16 +169,45 @@ fn init_metrics() -> SdkMeterProvider { } fn init_tracer() -> SdkTracerProvider { + global::set_text_map_propagator(TraceContextPropagator::new()); + let otlp_span_exporter = SpanExporter::builder().with_tonic().build().unwrap(); let tracer_provider = SdkTracerProvider::builder() - .with_simple_exporter(otlp_span_exporter) + .with_resource(RESOURCE.clone()) + .with_sampler(Sampler::ParentBased(Box::new(Sampler::TraceIdRatioBased( + 1.0, + )))) + .with_id_generator(RandomIdGenerator::default()) + .with_batch_exporter(otlp_span_exporter) + // .with_simple_exporter(otlp_span_exporter) .build(); global::set_tracer_provider(tracer_provider.clone()); tracer_provider } +struct OtelGuard { + tracer_provider: SdkTracerProvider, + meter_provider: SdkMeterProvider, + logger_provider: SdkLoggerProvider, +} + +impl Drop for OtelGuard { + fn drop(&mut self) { + eprintln!("Shutting down observability"); + if let Err(err) = self.tracer_provider.shutdown() { + eprintln!("{err:?}"); + } + if let Err(err) = self.meter_provider.shutdown() { + eprintln!("{err:?}"); + } + if let Err(err) = self.logger_provider.shutdown() { + eprintln!("{err:?}"); + } + } +} + /// Entry point for the application fn main() { let args = config::parse_args(); @@ -131,8 +254,40 @@ fn main() { /// Asynchronous main function async fn application_main(args: Args) -> anyhow::Result<()> { + let otel_guard = Arc::new(init_observability()); + + let handler_otel_guard = otel_guard.clone(); + tokio::spawn(async move { + ctrl_c().await.unwrap(); + eprintln!("Preparing for unclean exit"); + + handler_otel_guard.logger_provider.shutdown().unwrap(); + handler_otel_guard.meter_provider.shutdown().unwrap(); + handler_otel_guard.tracer_provider.shutdown().unwrap(); + tokio::time::sleep(std::time::Duration::from_secs(1)).await; + + eprintln!("Exiting"); + exit(1); + }); + // Start exporting system metrics - tokio::task::spawn_blocking(export_system_metrics); + tokio::task::spawn(export_system_metrics()); + + // // Create a tracing layer with the configured tracer + // let telemetry = + // tracing_opentelemetry::layer().with_tracer(otel_guard.tracer_provider.tracer("testingsss")); + + // // Use the tracing subscriber `Registry`, or any other subscriber + // // that impls `LookupSpan` + // let subscriber = Registry::default().with(telemetry); + // // Trace executed code + // tracing::subscriber::with_default(subscriber, || { + // // Spans will be sent to the configured OpenTelemetry exporter + // let root = span!(tracing::Level::TRACE, "app_start", work_units = 2); + // let _enter = root.enter(); + + // error!("This event will be logged in the root span."); + // }); // connect to the database let db = database::connect(args.db, &args.username, &args.password) @@ -171,20 +326,28 @@ async fn application_main(args: Args) -> anyhow::Result<()> { let db_clone = db.clone(); tokio::spawn(async move { - let mut last_count = 0; + let indexed_repos_counter = global::meter("indexer") + .u64_counter("indexer.repos.indexed") + .with_description("Total number of indexed repos") + .with_unit("{repo}") + .build(); + let mut last_count: u64 = 0; + let mut interval = interval_at( + tokio::time::Instant::now(), + tokio::time::Duration::from_secs(2), + ); loop { let mut res = db_clone .query("SELECT count() as c 
FROM li_did GROUP ALL;") .await .unwrap(); let count: Option = res.take((0, "c")).unwrap(); - info!( - "fully indexed repo count: {} with {} repos/10s", - count.unwrap_or(0), - count.unwrap_or(0) - last_count - ); - last_count = count.unwrap_or(0); - tokio::time::sleep(std::time::Duration::from_millis(10000)).await; + let count = count.unwrap_or(last_count as i64) as u64; + + indexed_repos_counter.add(count - last_count, &[]); + last_count = count; + + interval.tick().await; } }); @@ -196,11 +359,6 @@ async fn application_main(args: Args) -> anyhow::Result<()> { } } - // TODO: Also handle shutdown when an error occurs - tracer_provider.shutdown().unwrap(); - metrics_provider.shutdown().unwrap(); - logger_provider.shutdown().unwrap(); - Ok(()) } From 51d2c995183262124cc0c233f079a75ec2d76b76 Mon Sep 17 00:00:00 2001 From: Zebreus Date: Tue, 25 Feb 2025 18:41:20 +0100 Subject: [PATCH 16/75] Improve interval accuracy in metrics reporter --- src/metrics_reporter.rs | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/src/metrics_reporter.rs b/src/metrics_reporter.rs index 37f0f3f..634daaf 100644 --- a/src/metrics_reporter.rs +++ b/src/metrics_reporter.rs @@ -1,3 +1,5 @@ +use std::time::Duration; + use opentelemetry::{global, KeyValue}; use opentelemetry_semantic_conventions::{ attribute::{ @@ -11,9 +13,11 @@ use opentelemetry_semantic_conventions::{ SYSTEM_NETWORK_PACKETS, }, }; -use std::time::Duration; use sysinfo::{Networks, System}; -use tokio::task::{block_in_place, yield_now}; +use tokio::{ + task::{block_in_place, yield_now}, + time::{interval_at, Instant}, +}; const METRICS_INTERVAL: Duration = Duration::from_secs(2); pub async fn export_system_metrics() { @@ -77,10 +81,9 @@ pub async fn export_system_metrics() { let mut previous_used_memory = 0u64; let mut previous_availabe_memory = 0u64; - let mut last_update = tokio::time::Instant::now(); + let mut interval = interval_at(Instant::now(), METRICS_INTERVAL); loop { - tokio::time::sleep_until(last_update + METRICS_INTERVAL).await; - last_update = tokio::time::Instant::now(); + interval.tick().await; block_in_place(|| { system.refresh_cpu_all(); From eaf942902eed88fec3d1802240717b052c0a61a3 Mon Sep 17 00:00:00 2001 From: Zebreus Date: Tue, 25 Feb 2025 18:44:44 +0100 Subject: [PATCH 17/75] Refactor repo indexer task --- src/database/repo_indexer.rs | 57 ++---------------------------------- src/main.rs | 6 ++-- 2 files changed, 6 insertions(+), 57 deletions(-) diff --git a/src/database/repo_indexer.rs b/src/database/repo_indexer.rs index 6e9c95f..2d53254 100644 --- a/src/database/repo_indexer.rs +++ b/src/database/repo_indexer.rs @@ -4,7 +4,7 @@ use repo_stream::RepoStream; use reqwest::Client; use serde::{Deserialize, Serialize}; use surrealdb::{engine::any::Any, Surreal}; -use tracing::{error, info}; +use tracing::error; mod index_repo; mod repo_stream; @@ -31,11 +31,9 @@ const OLDEST_USEFUL_ANCHOR: &str = "3juj4"; /// The size of the buffer between each pipeline stage in elements const BUFFER_SIZE: usize = 200; -pub async fn start_full_repo_indexer(db: Surreal, max_tasks: usize) -> anyhow::Result<()> { +pub async fn start_full_repo_indexer(db: &Surreal) -> anyhow::Result<()> { let http_client = Client::new(); - info!(target: "indexer", "Spinning up {} handler tasks", max_tasks); - RepoStream::new(OLDEST_USEFUL_ANCHOR.to_string(), &db) .map(|did| async { did }) .buffer_unordered(BUFFER_SIZE) @@ -93,60 +91,11 @@ pub async fn start_full_repo_indexer(db: Surreal, max_tasks: usize) -> anyh } result.ok() }) + 
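            // Every item reaching this point completed all pipeline stages;
            // items that failed a stage were logged and filtered out above.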
.take(10) .for_each(|x| async { x.print_report().await; }) .await; panic!("Done, this should not happen"); - - // .map(|did| { - // // let state = state.clone(); - // let db = &db; - // let client = &client; - // async move { - // let result = index_repo(&db, &client, &did).await; - // if let Err(error) = result { - // warn!(target: "indexer", "Failed to index repo {}: {}", did, error); - - // let error_message = format!("{}", error); - // if format!("{}", error) == "Failed to parse CAR file: early eof" { - // // TODO: Document what this case does - - // let did_key = crate::database::utils::did_to_key(did.as_str()).unwrap(); - // let timestamp_us = std::time::SystemTime::now() - // .duration_since(std::time::UNIX_EPOCH) - // .unwrap() - // .as_micros(); - // let _: Option = db - // .upsert(("li_did", did_key)) - // .content(LastIndexedTimestamp { - // time_us: timestamp_us as u64, - // time_dt: chrono::Utc::now().into(), - // error: Some(error_message), - // }) - // .await - // .unwrap(); - // } - // } - // } - // }) - // .buffer_unordered(200) - // .for_each(|x| async { - // println!("finished stream"); - // }) - // .await; - - // for thread_id in 0..max_tasks { - // let state = state.clone(); - // tokio::spawn(async move { - // let result = repo_fetcher_task(state).await; - // let error = result - // .and::<()>(Err(anyhow!("Handler thread should never exit"))) - // .unwrap_err(); - // error!(target: "indexer", "Handler thread {} failed: {:?}", thread_id, error); - // }); - // } - - // repo_discovery_task(state, tx).await.unwrap(); } diff --git a/src/main.rs b/src/main.rs index 0532313..03db01d 100644 --- a/src/main.rs +++ b/src/main.rs @@ -33,8 +33,8 @@ use std::{ use surrealdb::{engine::any::Any, Surreal, Uuid}; use tokio::{runtime::Builder, signal::ctrl_c, time::interval_at}; use tokio_rustls::rustls::crypto::aws_lc_rs::default_provider; -use tracing::{error, span}; -use tracing_subscriber::{prelude::*, EnvFilter, Registry}; +use tracing::error; +use tracing_subscriber::{prelude::*, EnvFilter}; mod config; mod database; @@ -352,7 +352,7 @@ async fn application_main(args: Args) -> anyhow::Result<()> { }); if args.mode == "full" { - start_full_repo_indexer(db, args.max_tasks.unwrap_or(num_cpus::get() * 50)).await?; + start_full_repo_indexer(&db).await?; } else { loop { tokio::time::sleep(std::time::Duration::from_millis(1000)).await; From ff3097282ce4f7844a0b233b078d34f3cfcfdef5 Mon Sep 17 00:00:00 2001 From: Zebreus Date: Tue, 25 Feb 2025 19:07:56 +0100 Subject: [PATCH 18/75] Moved indexed repos metric into the indexer --- src/database/repo_indexer.rs | 22 +++++++++++++++-- src/main.rs | 47 ++---------------------------------- 2 files changed, 22 insertions(+), 47 deletions(-) diff --git a/src/database/repo_indexer.rs b/src/database/repo_indexer.rs index 2d53254..04c67a8 100644 --- a/src/database/repo_indexer.rs +++ b/src/database/repo_indexer.rs @@ -1,10 +1,11 @@ use futures::StreamExt; use index_repo::PipelineItem; +use opentelemetry::global; use repo_stream::RepoStream; use reqwest::Client; use serde::{Deserialize, Serialize}; use surrealdb::{engine::any::Any, Surreal}; -use tracing::error; +use tracing::{error, warn}; mod index_repo; mod repo_stream; @@ -34,6 +35,23 @@ const BUFFER_SIZE: usize = 200; pub async fn start_full_repo_indexer(db: &Surreal) -> anyhow::Result<()> { let http_client = Client::new(); + let meter = global::meter("indexer"); + let repos_indexed = meter + .u64_counter("indexer.repos.indexed") + .with_description("Total number of indexed repos") + 
.with_unit("repo") + .build(); + + let mut res = db + .query("SELECT count() as c FROM li_did GROUP ALL;") + .await + .unwrap(); + let count = res.take::>((0, "c")).unwrap().unwrap_or(0); + if count == 0 { + warn!("Started with 0 repos, this might be a bug"); + } + repos_indexed.add(count as u64, &[]); + RepoStream::new(OLDEST_USEFUL_ANCHOR.to_string(), &db) .map(|did| async { did }) .buffer_unordered(BUFFER_SIZE) @@ -91,9 +109,9 @@ pub async fn start_full_repo_indexer(db: &Surreal) -> anyhow::Result<()> { } result.ok() }) - .take(10) .for_each(|x| async { x.print_report().await; + repos_indexed.add(1, &[]); }) .await; diff --git a/src/main.rs b/src/main.rs index 03db01d..39609a0 100644 --- a/src/main.rs +++ b/src/main.rs @@ -31,7 +31,7 @@ use std::{ }, }; use surrealdb::{engine::any::Any, Surreal, Uuid}; -use tokio::{runtime::Builder, signal::ctrl_c, time::interval_at}; +use tokio::{runtime::Builder, signal::ctrl_c}; use tokio_rustls::rustls::crypto::aws_lc_rs::default_provider; use tracing::error; use tracing_subscriber::{prelude::*, EnvFilter}; @@ -151,7 +151,7 @@ fn init_logger() -> SdkLoggerProvider { fn init_meter() -> SdkMeterProvider { let otlp_metric_exporter = MetricExporter::builder() .with_tonic() - .with_temporality(opentelemetry_sdk::metrics::Temporality::default()) + .with_temporality(opentelemetry_sdk::metrics::Temporality::Cumulative) .build() .unwrap(); @@ -273,22 +273,6 @@ async fn application_main(args: Args) -> anyhow::Result<()> { // Start exporting system metrics tokio::task::spawn(export_system_metrics()); - // // Create a tracing layer with the configured tracer - // let telemetry = - // tracing_opentelemetry::layer().with_tracer(otel_guard.tracer_provider.tracer("testingsss")); - - // // Use the tracing subscriber `Registry`, or any other subscriber - // // that impls `LookupSpan` - // let subscriber = Registry::default().with(telemetry); - // // Trace executed code - // tracing::subscriber::with_default(subscriber, || { - // // Spans will be sent to the configured OpenTelemetry exporter - // let root = span!(tracing::Level::TRACE, "app_start", work_units = 2); - // let _enter = root.enter(); - - // error!("This event will be logged in the root span."); - // }); - // connect to the database let db = database::connect(args.db, &args.username, &args.password) .await @@ -324,33 +308,6 @@ async fn application_main(args: Args) -> anyhow::Result<()> { .context("Failed to spawn jetstream consumer thread")?; } - let db_clone = db.clone(); - tokio::spawn(async move { - let indexed_repos_counter = global::meter("indexer") - .u64_counter("indexer.repos.indexed") - .with_description("Total number of indexed repos") - .with_unit("{repo}") - .build(); - let mut last_count: u64 = 0; - let mut interval = interval_at( - tokio::time::Instant::now(), - tokio::time::Duration::from_secs(2), - ); - loop { - let mut res = db_clone - .query("SELECT count() as c FROM li_did GROUP ALL;") - .await - .unwrap(); - let count: Option = res.take((0, "c")).unwrap(); - let count = count.unwrap_or(last_count as i64) as u64; - - indexed_repos_counter.add(count - last_count, &[]); - last_count = count; - - interval.tick().await; - } - }); - if args.mode == "full" { start_full_repo_indexer(&db).await?; } else { From 3e6a1ef5ce6c83ad5f156e429d086c5447e4e095 Mon Sep 17 00:00:00 2001 From: Zebreus Date: Tue, 25 Feb 2025 20:12:50 +0100 Subject: [PATCH 19/75] Split main.rs into multiple files --- src/database/repo_indexer.rs | 7 +- src/jetstream_consumer.rs | 58 ++++++++ src/main.rs | 278 
++--------------------------------- src/metrics_reporter.rs | 2 +- src/observability.rs | 205 ++++++++++++++++++++++++++ 5 files changed, 282 insertions(+), 268 deletions(-) create mode 100644 src/jetstream_consumer.rs create mode 100644 src/observability.rs diff --git a/src/database/repo_indexer.rs b/src/database/repo_indexer.rs index 04c67a8..1111449 100644 --- a/src/database/repo_indexer.rs +++ b/src/database/repo_indexer.rs @@ -30,9 +30,9 @@ pub struct LastIndexedTimestamp { /// An ID that was used before the earliest data we are interested in const OLDEST_USEFUL_ANCHOR: &str = "3juj4"; /// The size of the buffer between each pipeline stage in elements -const BUFFER_SIZE: usize = 200; +const BUFFER_SIZE: usize = 1; -pub async fn start_full_repo_indexer(db: &Surreal) -> anyhow::Result<()> { +pub async fn start_full_repo_indexer(db: Surreal) -> anyhow::Result<()> { let http_client = Client::new(); let meter = global::meter("indexer"); @@ -115,5 +115,6 @@ pub async fn start_full_repo_indexer(db: &Surreal) -> anyhow::Result<()> { }) .await; - panic!("Done, this should not happen"); + // panic!("Done, this should not happen"); + Ok(()) } diff --git a/src/jetstream_consumer.rs b/src/jetstream_consumer.rs new file mode 100644 index 0000000..253eaab --- /dev/null +++ b/src/jetstream_consumer.rs @@ -0,0 +1,58 @@ +use anyhow::Context; +use surrealdb::{engine::any::Any, Surreal}; +use tokio::runtime::Builder; + +use crate::{database, websocket}; + +pub async fn attach_jetstream(db: Surreal, certificate: String) -> anyhow::Result<()> { + let jetstream_hosts = vec![ + "jetstream1.us-west.bsky.network", + "jetstream2.us-east.bsky.network", + "test-jetstream.skyfeed.moe", + "jetstream2.us-west.bsky.network", + "jetstream1.us-east.bsky.network", + ]; + + for host in jetstream_hosts { + let db_clone = db.clone(); + let certificate = certificate.clone(); + let (name, _) = host.split_at(18); + std::thread::Builder::new() + .name(format!("{}", name)) + .spawn(move || { + Builder::new_current_thread() + .enable_io() + .enable_time() + .build() + .unwrap() + .block_on(async { + start_jetstream_consumer(db_clone, host.to_string(), certificate) + .await + .context("jetstream consumer failed") + .unwrap(); + }); + }) + .context("Failed to spawn jetstream consumer thread")?; + } + + Ok(()) +} + +async fn start_jetstream_consumer( + db: Surreal, + host: String, + certificate: String, +) -> anyhow::Result<()> { + // fetch initial cursor + let cursor = database::fetch_cursor(&db, &host) + .await + .context("Failed to fetch cursor from database")? 
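+        // fetch_cursor returns an Option of the stored JetstreamCursor; when
+        // no position is saved for this host yet, fall back to cursor 0 below.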
+ .map_or(0, |e| e.time_us); + + // enter websocket event loop + websocket::start(host, certificate, cursor, db) + .await + .context("WebSocket event loop failed")?; + + Ok(()) +} diff --git a/src/main.rs b/src/main.rs index 39609a0..36b047a 100644 --- a/src/main.rs +++ b/src/main.rs @@ -3,211 +3,28 @@ use anyhow::Context; use config::Args; use database::repo_indexer::start_full_repo_indexer; +use jetstream_consumer::attach_jetstream; use metrics_reporter::export_system_metrics; -use opentelemetry::{global, trace::TracerProvider as _, KeyValue}; -use opentelemetry_appender_tracing::layer::OpenTelemetryTracingBridge; -use opentelemetry_otlp::{LogExporter, MetricExporter, SpanExporter}; -use opentelemetry_resource_detectors::{ - HostResourceDetector, OsResourceDetector, ProcessResourceDetector, -}; -use opentelemetry_sdk::{ - logs::SdkLoggerProvider, - metrics::{PeriodicReader, SdkMeterProvider}, - propagation::TraceContextPropagator, - resource::EnvResourceDetector, - trace::{RandomIdGenerator, Sampler, SdkTracerProvider}, - Resource, -}; -use opentelemetry_semantic_conventions::{ - attribute::{DEPLOYMENT_ENVIRONMENT_NAME, SERVICE_NAME, SERVICE_VERSION}, - resource::{HOST_NAME, OS_BUILD_ID, OS_DESCRIPTION, OS_NAME, OS_VERSION, SERVICE_INSTANCE_ID}, - SCHEMA_URL, -}; +use observability::init_observability; use std::{ process::exit, - sync::{ - atomic::{AtomicUsize, Ordering}, - Arc, LazyLock, - }, + sync::atomic::{AtomicUsize, Ordering}, }; -use surrealdb::{engine::any::Any, Surreal, Uuid}; -use tokio::{runtime::Builder, signal::ctrl_c}; +use tokio::runtime::Builder; use tokio_rustls::rustls::crypto::aws_lc_rs::default_provider; use tracing::error; -use tracing_subscriber::{prelude::*, EnvFilter}; mod config; mod database; +mod jetstream_consumer; mod metrics_reporter; +mod observability; mod websocket; /// Override the global allocator with mimalloc #[global_allocator] static GLOBAL: mimalloc::MiMalloc = mimalloc::MiMalloc; -const RESOURCE: LazyLock = LazyLock::new(|| { - let instance_id = Uuid::new_v4(); - - let mut attributes = vec![ - KeyValue::new(SERVICE_NAME, env!("CARGO_PKG_NAME")), - KeyValue::new(SERVICE_VERSION, env!("CARGO_PKG_VERSION")), - KeyValue::new(SERVICE_INSTANCE_ID, instance_id.to_string()), - KeyValue::new(DEPLOYMENT_ENVIRONMENT_NAME, "develop"), - ]; - - if let Ok(linux_sys_info) = sys_info::linux_os_release() { - if let Some(build_id) = linux_sys_info.build_id { - attributes.push(KeyValue::new(OS_BUILD_ID, build_id)); - } - if let Some(pretty_name) = linux_sys_info.pretty_name { - attributes.push(KeyValue::new(OS_DESCRIPTION, pretty_name)); - } - if let Some(name) = linux_sys_info.name { - attributes.push(KeyValue::new(OS_NAME, name)); - } - if let Some(version_id) = linux_sys_info.version_id { - attributes.push(KeyValue::new(OS_VERSION, version_id)); - } - } else { - if let Ok(os_version) = sys_info::os_release() { - attributes.push(KeyValue::new(OS_DESCRIPTION, os_version)); - } - if let Ok(os_name) = sys_info::os_type() { - attributes.push(KeyValue::new(OS_NAME, os_name)); - } - } - - if let Ok(hostname) = sys_info::hostname() { - attributes.push(KeyValue::new(HOST_NAME, hostname)); - } - - Resource::builder() - .with_schema_url( - [ - KeyValue::new(SERVICE_NAME, env!("CARGO_PKG_NAME")), - KeyValue::new(SERVICE_VERSION, env!("CARGO_PKG_VERSION")), - KeyValue::new(DEPLOYMENT_ENVIRONMENT_NAME, "develop"), - ], - SCHEMA_URL, - ) - .with_attributes(attributes) - .with_detectors(&[ - Box::new(EnvResourceDetector::new()), - Box::new(HostResourceDetector::default()), - 
Box::new(ProcessResourceDetector), - Box::new(OsResourceDetector), - // Box::new(OsResourceDetector::new()), - ]) - .build() -}); - -fn init_observability() -> OtelGuard { - let tracer_provider = init_tracer(); - let meter_provider = init_meter(); - let logger_provider = init_logger(); - - let otel_log_filter = EnvFilter::new("info") - .add_directive("hyper=off".parse().unwrap()) - .add_directive("h2=off".parse().unwrap()) - .add_directive("opentelemetry=off".parse().unwrap()) - .add_directive("tonic=off".parse().unwrap()) - .add_directive("reqwest=off".parse().unwrap()); - let otel_log_layer = - OpenTelemetryTracingBridge::new(&logger_provider).with_filter(otel_log_filter); - - let tokio_console_layer = console_subscriber::spawn(); - - let stdout_filter = EnvFilter::new("info").add_directive("opentelemetry=info".parse().unwrap()); - let stdout_layer = tracing_subscriber::fmt::layer() - .with_thread_names(true) - .with_filter(stdout_filter); - - let tracer = tracer_provider.tracer("tracing-otel-subscriber"); - tracing_subscriber::registry() - .with(tokio_console_layer) - .with(otel_log_layer) - .with(stdout_layer) - .with(tracing_opentelemetry::MetricsLayer::new( - meter_provider.clone(), - )) - .with(tracing_opentelemetry::OpenTelemetryLayer::new(tracer)) - .init(); - OtelGuard { - tracer_provider, - meter_provider, - logger_provider, - } -} - -fn init_logger() -> SdkLoggerProvider { - let otlp_log_exporter = LogExporter::builder().with_tonic().build().unwrap(); - let logger_provider = SdkLoggerProvider::builder() - .with_resource(RESOURCE.clone()) - .with_batch_exporter(otlp_log_exporter) - .build(); - logger_provider -} - -fn init_meter() -> SdkMeterProvider { - let otlp_metric_exporter = MetricExporter::builder() - .with_tonic() - .with_temporality(opentelemetry_sdk::metrics::Temporality::Cumulative) - .build() - .unwrap(); - - let periodic_reader = PeriodicReader::builder(otlp_metric_exporter) - .with_interval(std::time::Duration::from_secs(5)) - .build(); - - let meter_provider = SdkMeterProvider::builder() - .with_resource(RESOURCE.clone()) - .with_reader(periodic_reader) - .build(); - global::set_meter_provider(meter_provider.clone()); - - meter_provider -} - -fn init_tracer() -> SdkTracerProvider { - global::set_text_map_propagator(TraceContextPropagator::new()); - - let otlp_span_exporter = SpanExporter::builder().with_tonic().build().unwrap(); - - let tracer_provider = SdkTracerProvider::builder() - .with_resource(RESOURCE.clone()) - .with_sampler(Sampler::ParentBased(Box::new(Sampler::TraceIdRatioBased( - 1.0, - )))) - .with_id_generator(RandomIdGenerator::default()) - .with_batch_exporter(otlp_span_exporter) - // .with_simple_exporter(otlp_span_exporter) - .build(); - global::set_tracer_provider(tracer_provider.clone()); - - tracer_provider -} - -struct OtelGuard { - tracer_provider: SdkTracerProvider, - meter_provider: SdkMeterProvider, - logger_provider: SdkLoggerProvider, -} - -impl Drop for OtelGuard { - fn drop(&mut self) { - eprintln!("Shutting down observability"); - if let Err(err) = self.tracer_provider.shutdown() { - eprintln!("{err:?}"); - } - if let Err(err) = self.meter_provider.shutdown() { - eprintln!("{err:?}"); - } - if let Err(err) = self.logger_provider.shutdown() { - eprintln!("{err:?}"); - } - } -} - /// Entry point for the application fn main() { let args = config::parse_args(); @@ -246,94 +63,27 @@ fn main() { let err = rt.block_on(application_main(args)); if let Err(e) = &err { error!(target: "indexer", "{:?}", e); + exit(1); } - - // exit - 
std::process::exit(if err.is_ok() { 0 } else { 1 }); } /// Asynchronous main function async fn application_main(args: Args) -> anyhow::Result<()> { - let otel_guard = Arc::new(init_observability()); - - let handler_otel_guard = otel_guard.clone(); - tokio::spawn(async move { - ctrl_c().await.unwrap(); - eprintln!("Preparing for unclean exit"); - - handler_otel_guard.logger_provider.shutdown().unwrap(); - handler_otel_guard.meter_provider.shutdown().unwrap(); - handler_otel_guard.tracer_provider.shutdown().unwrap(); - tokio::time::sleep(std::time::Duration::from_secs(1)).await; - - eprintln!("Exiting"); - exit(1); - }); - - // Start exporting system metrics - tokio::task::spawn(export_system_metrics()); + let _otel_guard = init_observability().await; // connect to the database let db = database::connect(args.db, &args.username, &args.password) .await .context("Failed to connect to the database")?; - let jetstream_hosts = vec![ - "jetstream1.us-west.bsky.network", - "jetstream2.us-east.bsky.network", - "test-jetstream.skyfeed.moe", - "jetstream2.us-west.bsky.network", - "jetstream1.us-east.bsky.network", - ]; - - for host in jetstream_hosts { - let db_clone = db.clone(); - let certificate = args.certificate.clone(); - let (name, _) = host.split_at(18); - std::thread::Builder::new() - .name(format!("{}", name)) - .spawn(move || { - Builder::new_current_thread() - .enable_io() - .enable_time() - .build() - .unwrap() - .block_on(async { - start_jetstream_consumer(db_clone, host.to_string(), certificate) - .await - .context("jetstream consumer failed") - .unwrap(); - }); - }) - .context("Failed to spawn jetstream consumer thread")?; - } - + let metrics_task = tokio::spawn(export_system_metrics()); + let jetstream_task = tokio::spawn(attach_jetstream(db.clone(), args.certificate.clone())); if args.mode == "full" { - start_full_repo_indexer(&db).await?; - } else { - loop { - tokio::time::sleep(std::time::Duration::from_millis(1000)).await; - } + start_full_repo_indexer(db.clone()).await?; } - Ok(()) -} - -async fn start_jetstream_consumer( - db: Surreal, - host: String, - certificate: String, -) -> anyhow::Result<()> { - // fetch initial cursor - let cursor = database::fetch_cursor(&db, &host) - .await - .context("Failed to fetch cursor from database")? 
- .map_or(0, |e| e.time_us); - - // enter websocket event loop - websocket::start(host, certificate, cursor, db) - .await - .context("WebSocket event loop failed")?; - + // TODO: To something smart if one of the tasks exits + metrics_task.await??; + jetstream_task.await??; Ok(()) } diff --git a/src/metrics_reporter.rs b/src/metrics_reporter.rs index 634daaf..753e2ee 100644 --- a/src/metrics_reporter.rs +++ b/src/metrics_reporter.rs @@ -20,7 +20,7 @@ use tokio::{ }; const METRICS_INTERVAL: Duration = Duration::from_secs(2); -pub async fn export_system_metrics() { +pub async fn export_system_metrics() -> anyhow::Result<()> { let meter = global::meter("system"); let mut system = System::new_all(); diff --git a/src/observability.rs b/src/observability.rs new file mode 100644 index 0000000..9c534e3 --- /dev/null +++ b/src/observability.rs @@ -0,0 +1,205 @@ +use opentelemetry::{global, trace::TracerProvider as _, KeyValue}; +use opentelemetry_appender_tracing::layer::OpenTelemetryTracingBridge; +use opentelemetry_otlp::{LogExporter, MetricExporter, SpanExporter}; +use opentelemetry_resource_detectors::{ + HostResourceDetector, OsResourceDetector, ProcessResourceDetector, +}; +use opentelemetry_sdk::{ + logs::SdkLoggerProvider, + metrics::{PeriodicReader, SdkMeterProvider}, + propagation::TraceContextPropagator, + resource::EnvResourceDetector, + trace::{RandomIdGenerator, Sampler, SdkTracerProvider}, + Resource, +}; +use opentelemetry_semantic_conventions::{ + attribute::{DEPLOYMENT_ENVIRONMENT_NAME, SERVICE_NAME, SERVICE_VERSION}, + resource::{HOST_NAME, OS_BUILD_ID, OS_DESCRIPTION, OS_NAME, OS_VERSION, SERVICE_INSTANCE_ID}, + SCHEMA_URL, +}; +use std::{ + process::exit, + sync::{Arc, LazyLock}, +}; +use surrealdb::Uuid; +use tokio::signal::ctrl_c; +use tracing_subscriber::{layer::SubscriberExt, util::SubscriberInitExt, EnvFilter, Layer}; + +const RESOURCE: LazyLock = LazyLock::new(|| { + let instance_id = Uuid::new_v4(); + + let mut attributes = vec![ + KeyValue::new(SERVICE_NAME, env!("CARGO_PKG_NAME")), + KeyValue::new(SERVICE_VERSION, env!("CARGO_PKG_VERSION")), + KeyValue::new(SERVICE_INSTANCE_ID, instance_id.to_string()), + KeyValue::new(DEPLOYMENT_ENVIRONMENT_NAME, "develop"), + ]; + + if let Ok(linux_sys_info) = sys_info::linux_os_release() { + if let Some(build_id) = linux_sys_info.build_id { + attributes.push(KeyValue::new(OS_BUILD_ID, build_id)); + } + if let Some(pretty_name) = linux_sys_info.pretty_name { + attributes.push(KeyValue::new(OS_DESCRIPTION, pretty_name)); + } + if let Some(name) = linux_sys_info.name { + attributes.push(KeyValue::new(OS_NAME, name)); + } + if let Some(version_id) = linux_sys_info.version_id { + attributes.push(KeyValue::new(OS_VERSION, version_id)); + } + } else { + if let Ok(os_version) = sys_info::os_release() { + attributes.push(KeyValue::new(OS_DESCRIPTION, os_version)); + } + if let Ok(os_name) = sys_info::os_type() { + attributes.push(KeyValue::new(OS_NAME, os_name)); + } + } + + if let Ok(hostname) = sys_info::hostname() { + attributes.push(KeyValue::new(HOST_NAME, hostname)); + } + + Resource::builder() + .with_schema_url( + [ + KeyValue::new(SERVICE_NAME, env!("CARGO_PKG_NAME")), + KeyValue::new(SERVICE_VERSION, env!("CARGO_PKG_VERSION")), + KeyValue::new(DEPLOYMENT_ENVIRONMENT_NAME, "develop"), + ], + SCHEMA_URL, + ) + .with_attributes(attributes) + .with_detectors(&[ + Box::new(EnvResourceDetector::new()), + Box::new(HostResourceDetector::default()), + Box::new(ProcessResourceDetector), + Box::new(OsResourceDetector), + // 
Box::new(OsResourceDetector::new()), + ]) + .build() +}); + +pub async fn init_observability() -> Arc { + let tracer_provider = init_tracer(); + let meter_provider = init_meter(); + let logger_provider = init_logger(); + + let otel_log_filter = EnvFilter::new("info") + .add_directive("hyper=off".parse().unwrap()) + .add_directive("h2=off".parse().unwrap()) + .add_directive("opentelemetry=off".parse().unwrap()) + .add_directive("tonic=off".parse().unwrap()) + .add_directive("reqwest=off".parse().unwrap()); + let otel_log_layer = + OpenTelemetryTracingBridge::new(&logger_provider).with_filter(otel_log_filter); + + let tokio_console_layer = console_subscriber::spawn(); + + let stdout_filter = EnvFilter::new("info").add_directive("opentelemetry=info".parse().unwrap()); + let stdout_layer = tracing_subscriber::fmt::layer() + .with_thread_names(true) + .with_filter(stdout_filter); + + let tracer = tracer_provider.tracer("tracing-otel-subscriber"); + tracing_subscriber::registry() + .with(tokio_console_layer) + .with(otel_log_layer) + .with(stdout_layer) + .with(tracing_opentelemetry::MetricsLayer::new( + meter_provider.clone(), + )) + .with(tracing_opentelemetry::OpenTelemetryLayer::new(tracer)) + .init(); + + // TODO: Replace this hacky mess with something less broken + let guard = Arc::new(OtelGuard { + tracer_provider, + meter_provider, + logger_provider, + }); + let handler_otel_guard = guard.clone(); + tokio::spawn(async move { + ctrl_c().await.unwrap(); + eprintln!("Preparing for unclean exit"); + + handler_otel_guard.logger_provider.shutdown().unwrap(); + handler_otel_guard.meter_provider.shutdown().unwrap(); + handler_otel_guard.tracer_provider.shutdown().unwrap(); + tokio::time::sleep(std::time::Duration::from_secs(1)).await; + + eprintln!("Exiting"); + exit(1); + }); + guard +} + +fn init_logger() -> SdkLoggerProvider { + let otlp_log_exporter = LogExporter::builder().with_tonic().build().unwrap(); + let logger_provider = SdkLoggerProvider::builder() + .with_resource(RESOURCE.clone()) + .with_batch_exporter(otlp_log_exporter) + .build(); + logger_provider +} + +fn init_meter() -> SdkMeterProvider { + let otlp_metric_exporter = MetricExporter::builder() + .with_tonic() + .with_temporality(opentelemetry_sdk::metrics::Temporality::Cumulative) + .build() + .unwrap(); + + let periodic_reader = PeriodicReader::builder(otlp_metric_exporter) + .with_interval(std::time::Duration::from_secs(5)) + .build(); + + let meter_provider = SdkMeterProvider::builder() + .with_resource(RESOURCE.clone()) + .with_reader(periodic_reader) + .build(); + global::set_meter_provider(meter_provider.clone()); + + meter_provider +} + +fn init_tracer() -> SdkTracerProvider { + global::set_text_map_propagator(TraceContextPropagator::new()); + + let otlp_span_exporter = SpanExporter::builder().with_tonic().build().unwrap(); + + let tracer_provider = SdkTracerProvider::builder() + .with_resource(RESOURCE.clone()) + .with_sampler(Sampler::ParentBased(Box::new(Sampler::TraceIdRatioBased( + 1.0, + )))) + .with_id_generator(RandomIdGenerator::default()) + .with_batch_exporter(otlp_span_exporter) + // .with_simple_exporter(otlp_span_exporter) + .build(); + global::set_tracer_provider(tracer_provider.clone()); + + tracer_provider +} + +pub struct OtelGuard { + tracer_provider: SdkTracerProvider, + meter_provider: SdkMeterProvider, + logger_provider: SdkLoggerProvider, +} + +impl Drop for OtelGuard { + fn drop(&mut self) { + eprintln!("Shutting down observability"); + if let Err(err) = self.tracer_provider.shutdown() { + 
eprintln!("{err:?}"); + } + if let Err(err) = self.meter_provider.shutdown() { + eprintln!("{err:?}"); + } + if let Err(err) = self.logger_provider.shutdown() { + eprintln!("{err:?}"); + } + } +} From 6c96c097092d2a0d907bfb6655a0a27c0e1a9792 Mon Sep 17 00:00:00 2001 From: Zebreus Date: Tue, 25 Feb 2025 22:02:52 +0100 Subject: [PATCH 20/75] Enable LTO for release builds --- Cargo.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 0d500be..da487a4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -65,8 +65,8 @@ sys-info = "0.9.1" sysinfo = "0.33.1" [profile.release] -lto = false -strip = true +lto = true +strip = false opt-level = 3 panic = 'abort' codegen-units = 1 From d4aff7a130d49428b1e98eee9bfe00b3d51b4035 Mon Sep 17 00:00:00 2001 From: Zebreus Date: Tue, 25 Feb 2025 22:08:14 +0100 Subject: [PATCH 21/75] Document tracing layers --- src/observability.rs | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/src/observability.rs b/src/observability.rs index 9c534e3..e3afdc0 100644 --- a/src/observability.rs +++ b/src/observability.rs @@ -86,6 +86,7 @@ pub async fn init_observability() -> Arc { let meter_provider = init_meter(); let logger_provider = init_logger(); + // Exports logs to otel let otel_log_filter = EnvFilter::new("info") .add_directive("hyper=off".parse().unwrap()) .add_directive("h2=off".parse().unwrap()) @@ -95,14 +96,27 @@ pub async fn init_observability() -> Arc { let otel_log_layer = OpenTelemetryTracingBridge::new(&logger_provider).with_filter(otel_log_filter); + // Exports tokio stats for tokio-console let tokio_console_layer = console_subscriber::spawn(); + // Prints logs to stdout let stdout_filter = EnvFilter::new("info").add_directive("opentelemetry=info".parse().unwrap()); let stdout_layer = tracing_subscriber::fmt::layer() .with_thread_names(true) .with_filter(stdout_filter); + // Exports tracing traces to opentelemetry + let tracing_filter = EnvFilter::new("info") + .add_directive("hyper=off".parse().unwrap()) + .add_directive("h2=off".parse().unwrap()) + .add_directive("opentelemetry=off".parse().unwrap()) + .add_directive("tonic=off".parse().unwrap()) + .add_directive("reqwest=off".parse().unwrap()); let tracer = tracer_provider.tracer("tracing-otel-subscriber"); + let tracing_layer = + tracing_opentelemetry::OpenTelemetryLayer::new(tracer).with_filter(tracing_filter); + + // Add all layers tracing_subscriber::registry() .with(tokio_console_layer) .with(otel_log_layer) @@ -110,7 +124,7 @@ pub async fn init_observability() -> Arc { .with(tracing_opentelemetry::MetricsLayer::new( meter_provider.clone(), )) - .with(tracing_opentelemetry::OpenTelemetryLayer::new(tracer)) + .with(tracing_layer) .init(); // TODO: Replace this hacky mess with something less broken From 42b6786190cd1003e2b313cbb5c3dcdb3ec14f5c Mon Sep 17 00:00:00 2001 From: Zebreus Date: Tue, 25 Feb 2025 22:09:24 +0100 Subject: [PATCH 22/75] Instrument on_commit_event_createorupdate --- src/database/handlers.rs | 102 +++++++++++++++++++++++++++++++++------ 1 file changed, 87 insertions(+), 15 deletions(-) diff --git a/src/database/handlers.rs b/src/database/handlers.rs index 6dd550f..4b66d6f 100644 --- a/src/database/handlers.rs +++ b/src/database/handlers.rs @@ -9,8 +9,9 @@ use atrium_api::{ }, }; use chrono::Utc; +use std::future::IntoFuture; use surrealdb::{engine::any::Any, RecordId, Surreal}; -use tracing::warn; +use tracing::{instrument, span, warn, Instrument, Level}; use crate::websocket::events::{Commit, Kind}; @@ 
-92,6 +93,7 @@ pub async fn handle_event(db: &Surreal, event: Kind) -> Result<()> { } /// If the new commit is a create or update, handle it +#[instrument(skip(db, record))] pub async fn on_commit_event_createorupdate( db: &Surreal, did: Did, @@ -132,7 +134,12 @@ pub async fn on_commit_event_createorupdate( extra_data: process_extra_data(&d.extra_data)?, }; // TODO this should be a db.upsert(...).merge(...) - let _: Option = db.upsert(("did", did_key)).content(profile).await?; + let _: Option = db + .upsert(("did", did_key)) + .content(profile) + .into_future() + .instrument(span!(Level::INFO, "upsert")) + .await?; } KnownRecord::AppBskyGraphFollow(d) => { // TODO ensure_valid_rkey_strict(rkey.as_str())?; @@ -146,7 +153,11 @@ pub async fn on_commit_event_createorupdate( from, to, id, created_at ); - let _ = db.query(query).await?; + let _ = db + .query(query) + .into_future() + .instrument(span!(Level::INFO, "query")) + .await?; } KnownRecord::AppBskyFeedLike(d) => { // TODO ensure_valid_rkey_strict(rkey.as_str())?; @@ -160,7 +171,11 @@ pub async fn on_commit_event_createorupdate( from, to, id, created_at ); - let _ = db.query(query).await?; + let _ = db + .query(query) + .into_future() + .instrument(span!(Level::INFO, "query")) + .await?; } KnownRecord::AppBskyFeedRepost(d) => { // TODO ensure_valid_rkey_strict(rkey.as_str())?; @@ -174,7 +189,11 @@ pub async fn on_commit_event_createorupdate( from, to, id, created_at ); - let _ = db.query(query).await?; + let _ = db + .query(query) + .into_future() + .instrument(span!(Level::INFO, "query")) + .await?; } KnownRecord::AppBskyGraphBlock(d) => { // TODO ensure_valid_rkey_strict(rkey.as_str())?; @@ -188,7 +207,11 @@ pub async fn on_commit_event_createorupdate( from, to, id, created_at ); - let _ = db.query(query).await?; + let _ = db + .query(query) + .into_future() + .instrument(span!(Level::INFO, "query")) + .await?; } KnownRecord::AppBskyGraphListblock(d) => { // TODO ensure_valid_rkey_strict(rkey.as_str())?; @@ -202,7 +225,11 @@ pub async fn on_commit_event_createorupdate( from, to, id, created_at ); - let _ = db.query(query).await?; + let _ = db + .query(query) + .into_future() + .instrument(span!(Level::INFO, "query")) + .await?; } KnownRecord::AppBskyGraphListitem(d) => { // TODO ensure_valid_rkey_strict(rkey.as_str())?; @@ -218,7 +245,11 @@ pub async fn on_commit_event_createorupdate( from, to, id, created_at ); - let _ = db.query(query).await?; + let _ = db + .query(query) + .into_future() + .instrument(span!(Level::INFO, "query")) + .await?; } KnownRecord::AppBskyFeedGenerator(d) => { let did_key = utils::did_to_key(did.as_str())?; @@ -238,7 +269,12 @@ pub async fn on_commit_event_createorupdate( ), extra_data: process_extra_data(&d.extra_data)?, }; - let _: Option = db.upsert(("feed", id)).content(feed).await?; + let _: Option = db + .upsert(("feed", id)) + .content(feed) + .into_future() + .instrument(span!(Level::INFO, "upsert")) + .await?; } KnownRecord::AppBskyGraphList(d) => { let did_key = utils::did_to_key(did.as_str())?; @@ -256,7 +292,12 @@ pub async fn on_commit_event_createorupdate( purpose: d.purpose.clone(), extra_data: process_extra_data(&d.extra_data)?, }; - let _: Option = db.upsert(("list", id)).content(list).await?; + let _: Option = db + .upsert(("list", id)) + .content(list) + .into_future() + .instrument(span!(Level::INFO, "upsert")) + .await?; } KnownRecord::AppBskyFeedThreadgate(d) => { let did_key = utils::did_to_key(did.as_str())?; @@ -264,6 +305,8 @@ pub async fn on_commit_event_createorupdate( let _: 
Option = db .upsert(("lex_app_bsky_feed_threadgate", id)) .content(d) + .into_future() + .instrument(span!(Level::INFO, "upsert")) .await?; } KnownRecord::AppBskyGraphStarterpack(d) => { @@ -272,6 +315,8 @@ pub async fn on_commit_event_createorupdate( let _: Option = db .upsert(("lex_app_bsky_graph_starterpack", id)) .content(d) + .into_future() + .instrument(span!(Level::INFO, "upsert")) .await?; } KnownRecord::AppBskyFeedPostgate(d) => { @@ -280,6 +325,8 @@ pub async fn on_commit_event_createorupdate( let _: Option = db .upsert(("lex_app_bsky_feed_postgate", id)) .content(d) + .into_future() + .instrument(span!(Level::INFO, "upsert")) .await?; } KnownRecord::ChatBskyActorDeclaration(d) => { @@ -288,6 +335,8 @@ pub async fn on_commit_event_createorupdate( let _: Option = db .upsert(("lex_chat_bsky_actor_declaration", id)) .content(d) + .into_future() + .instrument(span!(Level::INFO, "upsert")) .await?; } KnownRecord::AppBskyLabelerService(d) => { @@ -296,6 +345,8 @@ pub async fn on_commit_event_createorupdate( let _: Option = db .upsert(("lex_app_bsky_labeler_service", id)) .content(d) + .into_future() + .instrument(span!(Level::INFO, "upsert")) .await?; } KnownRecord::AppBskyFeedPost(d) => { @@ -379,7 +430,11 @@ pub async fn on_commit_event_createorupdate( id ); - let _ = db.query(query).await?; + let _ = db + .query(query) + .into_future() + .instrument(span!(Level::INFO, "query")) + .await?; } } @@ -449,14 +504,23 @@ pub async fn on_commit_event_createorupdate( extra_data: process_extra_data(&d.extra_data)?, }; let parent = post.parent.clone(); - let _: Option = db.upsert(("post", id.clone())).content(post).await?; + let _: Option = db + .upsert(("post", id.clone())) + .content(post) + .into_future() + .instrument(span!(Level::INFO, "upsert")) + .await?; if parent.is_some() { let query1 = format!( "RELATE did:{}->replies->post:{} SET id = '{}';", did_key, id, id ); - let _ = db.query(query1).await?; + let _ = db + .query(query1) + .into_future() + .instrument(span!(Level::INFO, "query")) + .await?; let query2 = format!( "RELATE post:{}->replyto->{} SET id = '{}';", @@ -464,13 +528,21 @@ pub async fn on_commit_event_createorupdate( parent.unwrap(), id ); - let _ = db.query(query2).await?; + let _ = db + .query(query2) + .into_future() + .instrument(span!(Level::INFO, "query")) + .await?; } else { let query = format!( "RELATE did:{}->posts->post:{} SET id = '{}';", did_key, id, id ); - let _ = db.query(query).await?; + let _ = db + .query(query) + .into_future() + .instrument(span!(Level::INFO, "query")) + .await?; } } _ => { From 6527dec1f9a5e46a6e8062db1212b77c0de5cfbd Mon Sep 17 00:00:00 2001 From: Zebreus Date: Tue, 25 Feb 2025 22:15:19 +0100 Subject: [PATCH 23/75] Instrument index_repo --- src/database/repo_indexer/index_repo.rs | 67 ++++++++++++++----------- 1 file changed, 37 insertions(+), 30 deletions(-) diff --git a/src/database/repo_indexer/index_repo.rs b/src/database/repo_indexer/index_repo.rs index 66ce3d7..53cd7e8 100644 --- a/src/database/repo_indexer/index_repo.rs +++ b/src/database/repo_indexer/index_repo.rs @@ -14,7 +14,7 @@ use serde_ipld_dagcbor::from_reader; use std::{collections::BTreeMap, string::FromUtf8Error, sync::LazyLock, time::Duration}; use surrealdb::{engine::any::Any, Surreal}; use tokio::task::spawn_blocking; -use tracing::{trace, warn}; +use tracing::{info, instrument, span, trace, warn, Level, Span}; /// There should only be one request client to make use of connection pooling // TODO: Dont use a global client @@ -85,8 +85,9 @@ async fn 
insert_into_map( } /// Convert downloaded files into database updates. Blocks the thread +#[instrument(skip_all)] fn files_to_updates_blocking( - files: BTreeMap>, + files: BTreeMap>, ) -> Result, FromUtf8Error> { // TODO: Understand this logic and whether this can be done streaming let mut result = Vec::new(); @@ -121,6 +122,7 @@ fn files_to_updates_blocking( } /// Check if a repo is already indexed +#[instrument()] async fn check_indexed(db: &Surreal, did: &str) -> anyhow::Result { let did_key = crate::database::utils::did_to_key(did)?; @@ -131,6 +133,7 @@ async fn check_indexed(db: &Surreal, did: &str) -> anyhow::Result { } /// Get the plc response service for the repo +#[instrument(skip_all)] async fn get_plc_service( http_client: &Client, did: &str, @@ -146,6 +149,7 @@ async fn get_plc_service( } /// Download a repo from the given service +#[instrument(skip_all)] async fn download_repo( service: &PlcDirectoryDidResponseService, did: &str, @@ -158,17 +162,17 @@ async fn download_repo( .send() .await?; let bytes = get_repo_response.bytes().await?; + info!( + "Downloaded repo {} with size {:.2} MB", + did, + bytes.len() as f64 / (1000.0 * 1000.0) + ); return Ok(bytes); } /// Download the file for the given repo into a map -async fn deserialize_repo(bytes: Bytes) -> anyhow::Result>> { - // let reader = StreamReader::new(bytes.as_ref()); - // TODO: Figure out what the second parameter does - // let reader = rs_car_sync::CarReader::new(&car_res_bytes, false); - - // let buf_reader = tokio::io::BufReader::new(&car_res_bytes[..]); - +#[instrument(skip_all)] +async fn deserialize_repo(bytes: Bytes) -> anyhow::Result>> { // TODO: Benchmark CarReader. This is probably not the right place for parsing logic let car_reader = CarReader::new(bytes.as_ref()).await?; let files = car_reader @@ -181,15 +185,15 @@ async fn deserialize_repo(bytes: Bytes) -> anyhow::Result>, -) -> anyhow::Result> { +#[instrument(skip_all)] +async fn files_to_updates(files: BTreeMap>) -> anyhow::Result> { // TODO: Look into using block_in_place instead of spawn_blocking let result = spawn_blocking(|| files_to_updates_blocking(files)).await??; Ok(result) } /// Apply updates to the database +#[instrument(skip_all)] async fn apply_updates( db: &Surreal, did: &str, @@ -284,6 +288,7 @@ pub struct PipelineItem<'a, State> { db: &'a Surreal, http_client: &'a Client, did: String, + span: Span, state: State, } @@ -293,31 +298,37 @@ impl<'a> PipelineItem<'a, New> { http_client: &'a Client, did: String, ) -> PipelineItem<'a, New> { + let span = span!(target: "backfill", parent: None, Level::INFO, "pipeline_item"); + span.record("did", did.clone()); + span.in_scope(|| { + trace!("Start backfilling repo"); + }); PipelineItem::<'a, New> { db, http_client, did, + span, state: New {}, } } } impl<'a> PipelineItem<'a, New> { + #[instrument(skip(self), parent = &self.span)] pub async fn check_indexed(self) -> anyhow::Result> { if check_indexed(&self.db, &self.did).await? 
{ // TODO: Handle this better, as this is not really an error return Err(anyhow::anyhow!("Already indexed")); } Ok(PipelineItem::<'a, NotIndexed> { - db: self.db, - http_client: self.http_client, - did: self.did, state: NotIndexed {}, + ..self }) } } impl<'a> PipelineItem<'a, NotIndexed> { + #[instrument(skip(self), parent = &self.span)] pub async fn get_service(self) -> anyhow::Result> { let service = get_plc_service(&self.http_client, &self.did).await?; let Some(service) = service else { @@ -325,77 +336,73 @@ impl<'a> PipelineItem<'a, NotIndexed> { return Err(anyhow::anyhow!("Failed to get a plc service")); }; Ok(PipelineItem::<'a, WithService> { - db: self.db, - http_client: self.http_client, - did: self.did, state: WithService { service: service, now: std::time::SystemTime::now() .duration_since(std::time::UNIX_EPOCH) .unwrap(), }, + ..self }) } } impl<'a> PipelineItem<'a, WithService> { + #[instrument(skip(self), parent = &self.span)] pub async fn download_repo(self) -> anyhow::Result> { let repo = download_repo(&self.state.service, &self.did).await?; Ok(PipelineItem::<'a, WithRepo> { - db: self.db, - http_client: self.http_client, - did: self.did, state: WithRepo { now: self.state.now, repo, }, + ..self }) } } impl<'a> PipelineItem<'a, WithRepo> { + #[instrument(skip(self), parent = &self.span)] pub async fn deserialize_repo(self) -> anyhow::Result> { + info!("Deserializing repo {}", self.did); let files = deserialize_repo(self.state.repo).await?; Ok(PipelineItem::<'a, WithFiles> { - db: self.db, - http_client: self.http_client, - did: self.did, state: WithFiles { now: self.state.now, files, }, + ..self }) } } impl<'a> PipelineItem<'a, WithFiles> { + #[instrument(skip(self), parent = &self.span)] pub async fn files_to_updates(self) -> anyhow::Result> { let updates = files_to_updates(self.state.files).await?; Ok(PipelineItem::<'a, WithUpdates> { - db: self.db, - http_client: self.http_client, - did: self.did, state: WithUpdates { now: self.state.now, updates, }, + ..self }) } } impl<'a> PipelineItem<'a, WithUpdates> { + #[instrument(skip(self), parent = &self.span)] pub async fn apply_updates(self) -> anyhow::Result> { apply_updates(&self.db, &self.did, self.state.updates, &self.state.now).await?; Ok(PipelineItem::<'a, Done> { - db: self.db, - http_client: self.http_client, - did: self.did, state: Done {}, + ..self }) } } impl<'a> PipelineItem<'a, Done> { + #[instrument(skip(self), parent = &self.span)] pub async fn print_report(self) -> () { // TODO: This is only for printing debug stuff trace!("Indexed {}", self.did); From 9b0a15d20d2353fc09becd9de9a742a09441fdc2 Mon Sep 17 00:00:00 2001 From: Zebreus Date: Wed, 26 Feb 2025 15:52:04 +0100 Subject: [PATCH 24/75] Add curl to the nix flake --- flake.nix | 1 + 1 file changed, 1 insertion(+) diff --git a/flake.nix b/flake.nix index e56f0ac..38f6f4b 100644 --- a/flake.nix +++ b/flake.nix @@ -56,6 +56,7 @@ pkgs.rust-analyzer-nightly pkgs.openssl + pkgs.curl pkgs.pkg-config pkgs.clang From 6a7497983d44e3f0838a2e0680800a8fe7c96325 Mon Sep 17 00:00:00 2001 From: Zebreus Date: Wed, 26 Feb 2025 15:55:12 +0100 Subject: [PATCH 25/75] Add multiplier for the download buffer size --- src/database/repo_indexer.rs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/database/repo_indexer.rs b/src/database/repo_indexer.rs index 1111449..b2bc04f 100644 --- a/src/database/repo_indexer.rs +++ b/src/database/repo_indexer.rs @@ -30,7 +30,9 @@ pub struct LastIndexedTimestamp { /// An ID that was used before the earliest data we are 
interested in const OLDEST_USEFUL_ANCHOR: &str = "3juj4"; /// The size of the buffer between each pipeline stage in elements -const BUFFER_SIZE: usize = 1; +const BUFFER_SIZE: usize = 30; +/// Buffer size multiplier for the download stage +const DOWNLOAD_BUFFER_SIZE: usize = 6; pub async fn start_full_repo_indexer(db: Surreal) -> anyhow::Result<()> { let http_client = Client::new(); @@ -78,7 +80,7 @@ pub async fn start_full_repo_indexer(db: Surreal) -> anyhow::Result<()> { result.ok() }) .map(|item| async { item.download_repo().await }) - .buffer_unordered(BUFFER_SIZE) + .buffer_unordered(BUFFER_SIZE * DOWNLOAD_BUFFER_SIZE) .filter_map(|result| async { if let Err(error) = &result { error!(target: "indexer", "Failed to index repo: {}", error); From 7f602ceb65a80265061e6141de486d06ba3d4f43 Mon Sep 17 00:00:00 2001 From: Zebreus Date: Wed, 26 Feb 2025 16:45:46 +0100 Subject: [PATCH 26/75] Add more pipeline metrics --- src/database/repo_indexer.rs | 173 +++++++++++++++++++++++++---------- 1 file changed, 127 insertions(+), 46 deletions(-) diff --git a/src/database/repo_indexer.rs b/src/database/repo_indexer.rs index b2bc04f..0160428 100644 --- a/src/database/repo_indexer.rs +++ b/src/database/repo_indexer.rs @@ -1,6 +1,6 @@ use futures::StreamExt; use index_repo::PipelineItem; -use opentelemetry::global; +use opentelemetry::{global, KeyValue}; use repo_stream::RepoStream; use reqwest::Client; use serde::{Deserialize, Serialize}; @@ -34,6 +34,72 @@ const BUFFER_SIZE: usize = 30; /// Buffer size multiplier for the download stage const DOWNLOAD_BUFFER_SIZE: usize = 6; +// Make this less hacky +macro_rules! stage { + ($metric:ident, $stage:literal, $next:literal, $item:ident -> $content:expr) => { + |$item| async { + $metric.add( + -1, + &[ + KeyValue::new("stage", $stage), + KeyValue::new("state", "queued"), + ], + ); + $metric.add( + 1, + &[ + KeyValue::new("stage", $stage), + KeyValue::new("state", "active"), + ], + ); + + let result = $content; + + $metric.add( + -1, + &[ + KeyValue::new("stage", $stage), + KeyValue::new("state", "active"), + ], + ); + $metric.add( + 1, + &[ + KeyValue::new("stage", $next), + KeyValue::new("state", "queued"), + ], + ); + + result + } + }; +} + +// Make this less hacky +macro_rules! 
filter_result { + ($metric:ident, $stage:literal) => {|result| async { + if let Err(error) = &result { + error!(target: "indexer", "Failed to index repo: {}", error); + $metric.add( + -1, + &[ + KeyValue::new("stage", $stage), + KeyValue::new("state", "queued"), + ], + ); + return None; + } + result.ok() + }}; +} + +// async fn filter_result(result: anyhow::Result) -> Option { +// if let Err(error) = &result { +// error!(target: "indexer", "Failed to index repo: {}", error); +// } +// result.ok() +// } + pub async fn start_full_repo_indexer(db: Surreal) -> anyhow::Result<()> { let http_client = Client::new(); @@ -43,6 +109,11 @@ pub async fn start_full_repo_indexer(db: Surreal) -> anyhow::Result<()> { .with_description("Total number of indexed repos") .with_unit("repo") .build(); + let tracker = meter + .i64_up_down_counter("indexer.pipeline.location") + .with_description("Track the number of tasks in the pipeline") + .with_unit("repo") + .build(); let mut res = db .query("SELECT count() as c FROM li_did GROUP ALL;") @@ -55,64 +126,74 @@ pub async fn start_full_repo_indexer(db: Surreal) -> anyhow::Result<()> { repos_indexed.add(count as u64, &[]); RepoStream::new(OLDEST_USEFUL_ANCHOR.to_string(), &db) - .map(|did| async { did }) - .buffer_unordered(BUFFER_SIZE) - .map(|did| { + .map(|did| async { let db = &db; let http_client = &http_client; let item = PipelineItem::new(db, http_client, did); + + tracker.add( + 1, + &[ + KeyValue::new("stage", "check_indexed"), + KeyValue::new("state", "queued"), + ], + ); item }) - .map(|item| async { item.check_indexed().await }) .buffer_unordered(BUFFER_SIZE) - .filter_map(|result| async { - if let Err(error) = &result { - error!(target: "indexer", "Failed to index repo: {}", error); - } - result.ok() - }) - .map(|item| async { item.get_service().await }) + .map(stage!(tracker, "check_indexed", "get_service", item -> + item.check_indexed().await + )) .buffer_unordered(BUFFER_SIZE) - .filter_map(|result| async { - if let Err(error) = &result { - error!(target: "indexer", "Failed to index repo: {}", error); - } - result.ok() - }) - .map(|item| async { item.download_repo().await }) + .filter_map(filter_result!(tracker, "get_service")) + .map(stage!(tracker, "get_service", "download_repo", item -> + item.get_service().await + )) + .buffer_unordered(BUFFER_SIZE) + .filter_map(filter_result!(tracker, "download_repo")) + .map(stage!(tracker, "download_repo", "deserialize_repo", item -> + item.download_repo().await + )) .buffer_unordered(BUFFER_SIZE * DOWNLOAD_BUFFER_SIZE) - .filter_map(|result| async { - if let Err(error) = &result { - error!(target: "indexer", "Failed to index repo: {}", error); - } - result.ok() - }) - .map(|item| async { item.deserialize_repo().await }) + .filter_map(filter_result!(tracker, "deserialize_repo")) + .map(stage!(tracker, "deserialize_repo", "parse_repo", item -> + item.deserialize_repo().await + )) .buffer_unordered(BUFFER_SIZE) - .filter_map(|result| async { - if let Err(error) = &result { - error!(target: "indexer", "Failed to index repo: {}", error); - } - result.ok() - }) - .map(|item| async { item.files_to_updates().await }) + .filter_map(filter_result!(tracker, "files_to_updates")) + .map(stage!(tracker, "files_to_updates", "apply_updates", item -> + item.files_to_updates().await + )) .buffer_unordered(BUFFER_SIZE) - .filter_map(|result| async { - if let Err(error) = &result { - error!(target: "indexer", "Failed to index repo: {}", error); - } - result.ok() - }) - .map(|item| async { item.apply_updates().await }) + 
.filter_map(filter_result!(tracker, "apply_updates")) + .map(stage!(tracker, "apply_updates", "print_report", item -> + item.apply_updates().await + )) .buffer_unordered(BUFFER_SIZE) - .filter_map(|result| async { - if let Err(error) = &result { - error!(target: "indexer", "Failed to index repo: {}", error); - } - result.ok() - }) + .filter_map(filter_result!(tracker, "print_report")) .for_each(|x| async { + tracker.add( + -1, + &[ + KeyValue::new("stage", "print_report"), + KeyValue::new("state", "queued"), + ], + ); + tracker.add( + 1, + &[ + KeyValue::new("stage", "print_report"), + KeyValue::new("state", "active"), + ], + ); x.print_report().await; + tracker.add( + -1, + &[ + KeyValue::new("stage", "print_report"), + KeyValue::new("state", "active"), + ], + ); repos_indexed.add(1, &[]); }) .await; From fabb2582ca65c2bc530be362ae29873c17cfa13e Mon Sep 17 00:00:00 2001 From: Zebreus Date: Wed, 26 Feb 2025 16:54:07 +0100 Subject: [PATCH 27/75] Add server setup script --- setup.sh | 20 ++++++++++++++++++++ src/database/repo_indexer.rs | 1 + 2 files changed, 21 insertions(+) create mode 100644 setup.sh diff --git a/setup.sh b/setup.sh new file mode 100644 index 0000000..e1bad1b --- /dev/null +++ b/setup.sh @@ -0,0 +1,20 @@ +apt update -y +apt install -y curl bash git btop htop nano clang llvm openssl libssl-dev pkg-config + +curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs >rustup.sh +sh rustup.sh -y +rm rustup.sh +. "$HOME/.cargo/env" +rustup toolchain install nightly +rustup default nightly + +if ! git status; then + git clone https://github.com/zebreus/indexer-rust + cd indexer-rust +fi + +export OTEL_EXPORTER_OTLP_ENDPOINT="http://monitoring.indexer.skyfeedlol.lol:39291" +echo 'export OTEL_EXPORTER_OTLP_ENDPOINT="http://monitoring.indexer.skyfeedlol.lol:39291"' >~/.bashrc + +cargo build +echo 'Done! Run `./target/debug/indexer-rust --db "rocksdb:///root/rocks/db" --mode full` to start the indexer.' diff --git a/src/database/repo_indexer.rs b/src/database/repo_indexer.rs index 0160428..dd3182e 100644 --- a/src/database/repo_indexer.rs +++ b/src/database/repo_indexer.rs @@ -38,6 +38,7 @@ const DOWNLOAD_BUFFER_SIZE: usize = 6; macro_rules! 
stage { ($metric:ident, $stage:literal, $next:literal, $item:ident -> $content:expr) => { |$item| async { + // TODO: Dont create new keyvalues every time $metric.add( -1, &[ From fe9636026f2620bd80646f8b0dc0b3dc934f5d24 Mon Sep 17 00:00:00 2001 From: Zebreus Date: Wed, 26 Feb 2025 23:33:53 +0100 Subject: [PATCH 28/75] Changes --- src/database/repo_indexer.rs | 12 +++-- src/database/repo_indexer/index_repo.rs | 64 +++++++++++++++---------- src/observability.rs | 10 ++-- 3 files changed, 51 insertions(+), 35 deletions(-) diff --git a/src/database/repo_indexer.rs b/src/database/repo_indexer.rs index dd3182e..eea3f55 100644 --- a/src/database/repo_indexer.rs +++ b/src/database/repo_indexer.rs @@ -157,9 +157,11 @@ pub async fn start_full_repo_indexer(db: Surreal) -> anyhow::Result<()> { )) .buffer_unordered(BUFFER_SIZE * DOWNLOAD_BUFFER_SIZE) .filter_map(filter_result!(tracker, "deserialize_repo")) - .map(stage!(tracker, "deserialize_repo", "parse_repo", item -> - item.deserialize_repo().await - )) + .map( + stage!(tracker, "deserialize_repo", "files_to_updates", item -> + item.deserialize_repo().await + ), + ) .buffer_unordered(BUFFER_SIZE) .filter_map(filter_result!(tracker, "files_to_updates")) .map(stage!(tracker, "files_to_updates", "apply_updates", item -> @@ -168,7 +170,9 @@ pub async fn start_full_repo_indexer(db: Surreal) -> anyhow::Result<()> { .buffer_unordered(BUFFER_SIZE) .filter_map(filter_result!(tracker, "apply_updates")) .map(stage!(tracker, "apply_updates", "print_report", item -> - item.apply_updates().await + { + // println!("Items: {:?}", item.state.updates.len()); + item.apply_updates().await} )) .buffer_unordered(BUFFER_SIZE) .filter_map(filter_result!(tracker, "print_report")) diff --git a/src/database/repo_indexer/index_repo.rs b/src/database/repo_indexer/index_repo.rs index 53cd7e8..af1af43 100644 --- a/src/database/repo_indexer/index_repo.rs +++ b/src/database/repo_indexer/index_repo.rs @@ -4,14 +4,14 @@ use atrium_api::{ record::KnownRecord, types::string::{Did, RecordKey}, }; -use futures::TryStreamExt; +use futures::{stream::FuturesUnordered, TryStreamExt}; use hyper::body::Bytes; use ipld_core::cid::{Cid, CidGeneric}; use iroh_car::CarReader; use reqwest::Client; use serde::Deserialize; use serde_ipld_dagcbor::from_reader; -use std::{collections::BTreeMap, string::FromUtf8Error, sync::LazyLock, time::Duration}; +use std::{collections::HashMap, string::FromUtf8Error, sync::LazyLock, time::Duration}; use surrealdb::{engine::any::Any, Surreal}; use tokio::task::spawn_blocking; use tracing::{info, instrument, span, trace, warn, Level, Span}; @@ -68,7 +68,7 @@ pub struct NodeData { pub entries: Vec, } -struct DatabaseUpdate { +pub struct DatabaseUpdate { collection: String, rkey: RecordKey, record: KnownRecord, @@ -76,9 +76,9 @@ struct DatabaseUpdate { /// Insert a file into a map async fn insert_into_map( - mut files: BTreeMap>, + mut files: HashMap>, file: (CidGeneric<64>, Vec), -) -> anyhow::Result>> { +) -> anyhow::Result>> { let (cid, data) = file; files.insert(cid, data); Ok(files) @@ -87,7 +87,7 @@ async fn insert_into_map( /// Convert downloaded files into database updates. 
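The blocking conversion is the CPU-heavy step of this pipeline, and the TODO on files_to_updates below asks whether block_in_place would beat spawn_blocking. The difference, as a minimal sketch with hypothetical stand-in types (the real map goes from CIDs to raw CAR blocks):

    use tokio::task::block_in_place;

    type Files = std::collections::HashMap<u64, Vec<u8>>;
    type Updates = Vec<String>;

    fn convert_blocking(files: Files) -> Updates {
        files.into_values().map(|block| block.len().to_string()).collect()
    }

    async fn files_to_updates(files: Files) -> Updates {
        // block_in_place keeps the closure on the current worker thread and
        // first migrates that worker's queued tasks elsewhere, so it needs
        // no Send or 'static bounds and no extra thread hop; but it panics
        // on a current_thread runtime, which is why spawn_blocking stays
        // the safer default in a codebase that also spins up
        // single-threaded runtimes for the jetstream consumers.
        block_in_place(|| convert_blocking(files))
    }
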
Blocks the thread #[instrument(skip_all)] fn files_to_updates_blocking( - files: BTreeMap>, + files: HashMap>, ) -> Result, FromUtf8Error> { // TODO: Understand this logic and whether this can be done streaming let mut result = Vec::new(); @@ -172,13 +172,13 @@ async fn download_repo( /// Download the file for the given repo into a map #[instrument(skip_all)] -async fn deserialize_repo(bytes: Bytes) -> anyhow::Result>> { +async fn deserialize_repo(bytes: Bytes) -> anyhow::Result>> { // TODO: Benchmark CarReader. This is probably not the right place for parsing logic let car_reader = CarReader::new(bytes.as_ref()).await?; let files = car_reader .stream() .map_err(|e| e.into()) - .try_fold(BTreeMap::new(), insert_into_map) + .try_fold(HashMap::new(), insert_into_map) .await; files @@ -186,7 +186,7 @@ async fn deserialize_repo(bytes: Bytes) -> anyhow::Result> /// Convert downloaded files into database updates #[instrument(skip_all)] -async fn files_to_updates(files: BTreeMap>) -> anyhow::Result> { +async fn files_to_updates(files: HashMap>) -> anyhow::Result> { // TODO: Look into using block_in_place instead of spawn_blocking let result = spawn_blocking(|| files_to_updates_blocking(files)).await??; Ok(result) @@ -202,21 +202,33 @@ async fn apply_updates( ) -> anyhow::Result<()> { let did_key = crate::database::utils::did_to_key(did)?; - for update in updates { - let res = on_commit_event_createorupdate( - db, - Did::new(did.into()).unwrap(), - did_key.clone(), - update.collection, - update.rkey, - update.record, - ) - .await; - - if let Err(error) = res { - warn!("on_commit_event_createorupdate {} {}", error, did); - } + let futures: Vec<_> = updates + .into_iter() + .map(|update| { + let db = db.clone(); + let did_key = did_key.clone(); + let did = did.to_string(); + tokio::spawn(async move { + let res = on_commit_event_createorupdate( + &db, + Did::new(did.clone().into()).unwrap(), + did_key, + update.collection, + update.rkey, + update.record, + ) + .await; + + if let Err(error) = res { + warn!("on_commit_event_createorupdate {} {}", error, did); + } + }) + }) + .collect(); + for f in futures.into_iter() { + f.await; } + let _: Option = db .upsert(("li_did", did_key)) .content(LastIndexedTimestamp { @@ -274,12 +286,12 @@ pub struct WithRepo { pub struct WithFiles { now: std::time::Duration, - files: BTreeMap>, + files: HashMap>, } /// Has converted the files to update pub struct WithUpdates { now: std::time::Duration, - updates: Vec, + pub updates: Vec, } /// Updates have been applied pub struct Done {} @@ -289,7 +301,7 @@ pub struct PipelineItem<'a, State> { http_client: &'a Client, did: String, span: Span, - state: State, + pub state: State, } impl<'a> PipelineItem<'a, New> { diff --git a/src/observability.rs b/src/observability.rs index e3afdc0..4dceb6a 100644 --- a/src/observability.rs +++ b/src/observability.rs @@ -100,10 +100,10 @@ pub async fn init_observability() -> Arc { let tokio_console_layer = console_subscriber::spawn(); // Prints logs to stdout - let stdout_filter = EnvFilter::new("info").add_directive("opentelemetry=info".parse().unwrap()); - let stdout_layer = tracing_subscriber::fmt::layer() - .with_thread_names(true) - .with_filter(stdout_filter); + // let stdout_filter = EnvFilter::new("info").add_directive("opentelemetry=info".parse().unwrap()); + // let stdout_layer = tracing_subscriber::fmt::layer() + // .with_thread_names(true) + // .with_filter(stdout_filter); // Exports tracing traces to opentelemetry let tracing_filter = EnvFilter::new("info") @@ -120,7 +120,7 @@ 
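This patch silences the stdout layer by commenting it out, and a later patch gates the tokio-console layer through an always-on or always-off FilterFn. tracing-subscriber has a lighter idiom covering both cases: Layer is implemented for Option<L>, so a disabled layer can simply be passed as None. A sketch of that idiom, assuming tracing-subscriber 0.3's blanket impl:

    use tracing_subscriber::{fmt, prelude::*};

    fn init_tracing(stdout_enabled: bool) {
        // Option<L> is itself a Layer and None acts as a no-op,
        // so no filter or commented-out code is needed.
        let stdout_layer = stdout_enabled.then(|| fmt::layer().with_thread_names(true));
        tracing_subscriber::registry().with(stdout_layer).init();
    }
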
pub async fn init_observability() -> Arc { tracing_subscriber::registry() .with(tokio_console_layer) .with(otel_log_layer) - .with(stdout_layer) + // .with(stdout_layer) .with(tracing_opentelemetry::MetricsLayer::new( meter_provider.clone(), )) From c6c492b971ca7e414f298e315f176d21646ec388 Mon Sep 17 00:00:00 2001 From: Zebreus Date: Sun, 2 Mar 2025 15:18:36 +0100 Subject: [PATCH 29/75] Adjust queue size --- src/database/repo_indexer.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/database/repo_indexer.rs b/src/database/repo_indexer.rs index eea3f55..f42938d 100644 --- a/src/database/repo_indexer.rs +++ b/src/database/repo_indexer.rs @@ -30,9 +30,9 @@ pub struct LastIndexedTimestamp { /// An ID that was used before the earliest data we are interested in const OLDEST_USEFUL_ANCHOR: &str = "3juj4"; /// The size of the buffer between each pipeline stage in elements -const BUFFER_SIZE: usize = 30; +const BUFFER_SIZE: usize = 100; /// Buffer size multiplier for the download stage -const DOWNLOAD_BUFFER_SIZE: usize = 6; +const DOWNLOAD_BUFFER_SIZE: usize = 1; // Make this less hacky macro_rules! stage { From 8d41b31ffd0f7f17f4a7135853091680ba1f149a Mon Sep 17 00:00:00 2001 From: Zebreus Date: Sun, 2 Mar 2025 15:19:44 +0100 Subject: [PATCH 30/75] Set target-cpu to native --- .cargo/config.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.cargo/config.toml b/.cargo/config.toml index bff29e6..8df42a5 100644 --- a/.cargo/config.toml +++ b/.cargo/config.toml @@ -1,2 +1,2 @@ [build] -rustflags = ["--cfg", "tokio_unstable"] +rustflags = ["--cfg", "tokio_unstable", "-C", "target-cpu=native"] From 6d58208085d91e30f63aaa3578843120771212ae Mon Sep 17 00:00:00 2001 From: Zebreus Date: Sun, 2 Mar 2025 15:20:14 +0100 Subject: [PATCH 31/75] Reduce build time for release builds --- Cargo.toml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index da487a4..02870aa 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -65,8 +65,6 @@ sys-info = "0.9.1" sysinfo = "0.33.1" [profile.release] -lto = true +lto = "thin" strip = false opt-level = 3 -panic = 'abort' -codegen-units = 1 From 5b312d4b2fec838b0dfd47fd1aed38524f36f40e Mon Sep 17 00:00:00 2001 From: Zebreus Date: Sun, 2 Mar 2025 16:25:27 +0100 Subject: [PATCH 32/75] Make config global --- src/config.rs | 19 ++++++ src/database/mod.rs | 9 ++- src/database/repo_indexer.rs | 22 +++---- src/database/repo_indexer/index_repo.rs | 2 +- src/main.rs | 16 ++--- src/observability.rs | 86 ++++++++++++++----------- 6 files changed, 96 insertions(+), 58 deletions(-) diff --git a/src/config.rs b/src/config.rs index 166d327..82362cc 100644 --- a/src/config.rs +++ b/src/config.rs @@ -1,3 +1,5 @@ +use std::sync::LazyLock; + use clap::{ArgAction, Parser}; use colored::Colorize; use tracing::{info, level_filters::LevelFilter}; @@ -30,8 +32,25 @@ pub struct Args { /// Indexer Mode (jetstream only or full) #[arg(long, default_value = "jetstream")] pub mode: String, + /// Capacity of the surrealdb connection. 
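Making the parsed CLI flags reachable from anywhere rests on a lazily initialized global: the first dereference runs clap, every later one reuses the cached value. A self-contained sketch of the pattern, assuming clap's derive feature; it deliberately uses static rather than const, since a const LazyLock is inlined fresh at every use site and would re-parse the arguments on each access:

    use clap::Parser;
    use std::sync::LazyLock;

    #[derive(Parser)]
    struct Args {
        /// Capacity of the surrealdb connection; 0 means unbounded
        #[arg(long, default_value = "0")]
        surrealdb_capacity: usize,
    }

    // One shared cell, initialized on first access from any thread.
    static ARGS: LazyLock<Args> = LazyLock::new(Args::parse);

    fn main() {
        println!("capacity = {}", ARGS.surrealdb_capacity);
    }
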
0 means unbounded + #[arg(long, default_value = "0")] + pub surrealdb_capacity: usize, + /// Size of the buffer between each pipeline stage in elements + #[arg(long, default_value = "10")] + pub pipeline_buffer_size: usize, + /// Enable tokio console support + #[arg(long, default_value = "false")] + pub console: bool, + /// Enable opentelemetry tracing support + #[arg(long, default_value = "true")] + pub otel_tracing: bool, + /// Enable opentelemetry + #[arg(long, default_value = "true")] + pub otel: bool, } +pub const ARGS: LazyLock = LazyLock::new(|| parse_args()); + impl Args { /// Dump configuration to log pub fn dump(self: &Self) { diff --git a/src/database/mod.rs b/src/database/mod.rs index 45cc810..43c06e5 100644 --- a/src/database/mod.rs +++ b/src/database/mod.rs @@ -3,6 +3,8 @@ use definitions::{JetstreamCursor, Record}; use surrealdb::{engine::any::Any, opt::auth::Root, RecordId, Surreal}; use tracing::{debug, info}; +use crate::config::ARGS; + pub mod definitions; pub mod handlers; pub mod repo_indexer; @@ -10,13 +12,16 @@ mod utils; /// Connect to the database pub async fn connect( - db_endpoint: String, + db_endpoint: &str, username: &str, password: &str, ) -> anyhow::Result> { // connect to the database info!(target: "indexer", "Connecting to the database at {}", db_endpoint); - let db = surrealdb::engine::any::connect(db_endpoint).await?; + // let db = Surreal::new::<_>(db_endpoint).await?; + let db = surrealdb::engine::any::connect(db_endpoint) + .with_capacity(ARGS.surrealdb_capacity) + .await?; // sign in to the server diff --git a/src/database/repo_indexer.rs b/src/database/repo_indexer.rs index f42938d..92e4888 100644 --- a/src/database/repo_indexer.rs +++ b/src/database/repo_indexer.rs @@ -7,6 +7,8 @@ use serde::{Deserialize, Serialize}; use surrealdb::{engine::any::Any, Surreal}; use tracing::{error, warn}; +use crate::config::ARGS; + mod index_repo; mod repo_stream; @@ -29,10 +31,6 @@ pub struct LastIndexedTimestamp { /// An ID that was used before the earliest data we are interested in const OLDEST_USEFUL_ANCHOR: &str = "3juj4"; -/// The size of the buffer between each pipeline stage in elements -const BUFFER_SIZE: usize = 100; -/// Buffer size multiplier for the download stage -const DOWNLOAD_BUFFER_SIZE: usize = 1; // Make this less hacky macro_rules! 
stage { @@ -126,6 +124,8 @@ pub async fn start_full_repo_indexer(db: Surreal) -> anyhow::Result<()> { } repos_indexed.add(count as u64, &[]); + let buffer_size = ARGS.pipeline_buffer_size; + RepoStream::new(OLDEST_USEFUL_ANCHOR.to_string(), &db) .map(|did| async { let db = &db; @@ -141,40 +141,40 @@ pub async fn start_full_repo_indexer(db: Surreal) -> anyhow::Result<()> { ); item }) - .buffer_unordered(BUFFER_SIZE) + .buffer_unordered(buffer_size) .map(stage!(tracker, "check_indexed", "get_service", item -> item.check_indexed().await )) - .buffer_unordered(BUFFER_SIZE) + .buffer_unordered(buffer_size) .filter_map(filter_result!(tracker, "get_service")) .map(stage!(tracker, "get_service", "download_repo", item -> item.get_service().await )) - .buffer_unordered(BUFFER_SIZE) + .buffer_unordered(buffer_size) .filter_map(filter_result!(tracker, "download_repo")) .map(stage!(tracker, "download_repo", "deserialize_repo", item -> item.download_repo().await )) - .buffer_unordered(BUFFER_SIZE * DOWNLOAD_BUFFER_SIZE) + .buffer_unordered(buffer_size) .filter_map(filter_result!(tracker, "deserialize_repo")) .map( stage!(tracker, "deserialize_repo", "files_to_updates", item -> item.deserialize_repo().await ), ) - .buffer_unordered(BUFFER_SIZE) + .buffer_unordered(buffer_size) .filter_map(filter_result!(tracker, "files_to_updates")) .map(stage!(tracker, "files_to_updates", "apply_updates", item -> item.files_to_updates().await )) - .buffer_unordered(BUFFER_SIZE) + .buffer_unordered(buffer_size) .filter_map(filter_result!(tracker, "apply_updates")) .map(stage!(tracker, "apply_updates", "print_report", item -> { // println!("Items: {:?}", item.state.updates.len()); item.apply_updates().await} )) - .buffer_unordered(BUFFER_SIZE) + .buffer_unordered(buffer_size) .filter_map(filter_result!(tracker, "print_report")) .for_each(|x| async { tracker.add( diff --git a/src/database/repo_indexer/index_repo.rs b/src/database/repo_indexer/index_repo.rs index af1af43..5aa51ea 100644 --- a/src/database/repo_indexer/index_repo.rs +++ b/src/database/repo_indexer/index_repo.rs @@ -226,7 +226,7 @@ async fn apply_updates( }) .collect(); for f in futures.into_iter() { - f.await; + f.await?; } let _: Option = db diff --git a/src/main.rs b/src/main.rs index 36b047a..2848247 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,7 +1,7 @@ #![feature(type_changing_struct_update)] use anyhow::Context; -use config::Args; +use config::{Args, ARGS}; use database::repo_indexer::start_full_repo_indexer; use jetstream_consumer::attach_jetstream; use metrics_reporter::export_system_metrics; @@ -27,7 +27,7 @@ static GLOBAL: mimalloc::MiMalloc = mimalloc::MiMalloc; /// Entry point for the application fn main() { - let args = config::parse_args(); + let args = ARGS; args.dump(); // build async runtime @@ -60,7 +60,7 @@ fn main() { // launch the application default_provider().install_default().unwrap(); - let err = rt.block_on(application_main(args)); + let err = rt.block_on(application_main()); if let Err(e) = &err { error!(target: "indexer", "{:?}", e); exit(1); @@ -68,22 +68,22 @@ fn main() { } /// Asynchronous main function -async fn application_main(args: Args) -> anyhow::Result<()> { +async fn application_main() -> anyhow::Result<()> { let _otel_guard = init_observability().await; // connect to the database - let db = database::connect(args.db, &args.username, &args.password) + let db = database::connect(&ARGS.db, &ARGS.username, &ARGS.password) .await .context("Failed to connect to the database")?; let metrics_task = 
tokio::spawn(export_system_metrics()); - let jetstream_task = tokio::spawn(attach_jetstream(db.clone(), args.certificate.clone())); - if args.mode == "full" { + // let jetstream_task = tokio::spawn(attach_jetstream(db.clone(), args.certificate.clone())); + if ARGS.mode == "full" { start_full_repo_indexer(db.clone()).await?; } // TODO: To something smart if one of the tasks exits metrics_task.await??; - jetstream_task.await??; + // jetstream_task.await??; Ok(()) } diff --git a/src/observability.rs b/src/observability.rs index 4dceb6a..1446fb7 100644 --- a/src/observability.rs +++ b/src/observability.rs @@ -23,7 +23,11 @@ use std::{ }; use surrealdb::Uuid; use tokio::signal::ctrl_c; -use tracing_subscriber::{layer::SubscriberExt, util::SubscriberInitExt, EnvFilter, Layer}; +use tracing_subscriber::{ + filter::FilterFn, layer::SubscriberExt, util::SubscriberInitExt, EnvFilter, Layer, +}; + +use crate::config::ARGS; const RESOURCE: LazyLock = LazyLock::new(|| { let instance_id = Uuid::new_v4(); @@ -86,46 +90,56 @@ pub async fn init_observability() -> Arc { let meter_provider = init_meter(); let logger_provider = init_logger(); - // Exports logs to otel - let otel_log_filter = EnvFilter::new("info") - .add_directive("hyper=off".parse().unwrap()) - .add_directive("h2=off".parse().unwrap()) - .add_directive("opentelemetry=off".parse().unwrap()) - .add_directive("tonic=off".parse().unwrap()) - .add_directive("reqwest=off".parse().unwrap()); - let otel_log_layer = - OpenTelemetryTracingBridge::new(&logger_provider).with_filter(otel_log_filter); - // Exports tokio stats for tokio-console - let tokio_console_layer = console_subscriber::spawn(); + let tokio_console_enabled = ARGS.console; + let tokio_console_filter = FilterFn::new(move |_| tokio_console_enabled); + let tokio_console_layer = console_subscriber::spawn().with_filter(tokio_console_filter); // Prints logs to stdout - // let stdout_filter = EnvFilter::new("info").add_directive("opentelemetry=info".parse().unwrap()); - // let stdout_layer = tracing_subscriber::fmt::layer() - // .with_thread_names(true) - // .with_filter(stdout_filter); - - // Exports tracing traces to opentelemetry - let tracing_filter = EnvFilter::new("info") - .add_directive("hyper=off".parse().unwrap()) - .add_directive("h2=off".parse().unwrap()) - .add_directive("opentelemetry=off".parse().unwrap()) - .add_directive("tonic=off".parse().unwrap()) - .add_directive("reqwest=off".parse().unwrap()); - let tracer = tracer_provider.tracer("tracing-otel-subscriber"); - let tracing_layer = - tracing_opentelemetry::OpenTelemetryLayer::new(tracer).with_filter(tracing_filter); + let stdout_filter = EnvFilter::new("info").add_directive("opentelemetry=info".parse().unwrap()); + let stdout_layer = tracing_subscriber::fmt::layer() + .with_thread_names(true) + .with_filter(stdout_filter); // Add all layers - tracing_subscriber::registry() - .with(tokio_console_layer) - .with(otel_log_layer) - // .with(stdout_layer) - .with(tracing_opentelemetry::MetricsLayer::new( - meter_provider.clone(), - )) - .with(tracing_layer) - .init(); + let registry = tracing_subscriber::registry() + .with(stdout_layer) + .with(tokio_console_layer); + if ARGS.otel { + // Exports logs to otel + let otel_log_filter = EnvFilter::new("info") + .add_directive("hyper=off".parse().unwrap()) + .add_directive("h2=off".parse().unwrap()) + .add_directive("opentelemetry=off".parse().unwrap()) + .add_directive("tonic=off".parse().unwrap()) + .add_directive("reqwest=off".parse().unwrap()); + let otel_log_layer = + 
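/* Each exporter layer built here carries its own filter instead of relying on
   one global subscriber filter; turning hyper/h2/tonic/reqwest off per layer
   likely keeps the OTLP exporter's own HTTP traffic from being logged and
   re-exported in a loop. A minimal sketch of the per-layer filtering pattern
   with tracing-subscriber:

       use tracing_subscriber::{layer::SubscriberExt, util::SubscriberInitExt, EnvFilter, Layer};

       let quiet = EnvFilter::new("info").add_directive("hyper=off".parse().unwrap());
       let fmt_layer = tracing_subscriber::fmt::layer().with_filter(quiet);
       tracing_subscriber::registry().with(fmt_layer).init();
*/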
OpenTelemetryTracingBridge::new(&logger_provider).with_filter(otel_log_filter); + + let registry_with_otel = + registry + .with(otel_log_layer) + .with(tracing_opentelemetry::MetricsLayer::new( + meter_provider.clone(), + )); + if ARGS.otel_tracing { + // Exports tracing traces to opentelemetry + let tracing_filter = EnvFilter::new("info") + .add_directive("hyper=off".parse().unwrap()) + .add_directive("h2=off".parse().unwrap()) + .add_directive("opentelemetry=off".parse().unwrap()) + .add_directive("tonic=off".parse().unwrap()) + .add_directive("reqwest=off".parse().unwrap()); + let tracer = tracer_provider.tracer("tracing-otel-subscriber"); + let tracing_layer = + tracing_opentelemetry::OpenTelemetryLayer::new(tracer).with_filter(tracing_filter); + registry_with_otel.with(tracing_layer).init(); + } else { + registry_with_otel.init(); + }; + } else { + registry.init(); + }; // TODO: Replace this hacky mess with something less broken let guard = Arc::new(OtelGuard { From 36eac37299253e3adc4841630d1ce10ad40ba72f Mon Sep 17 00:00:00 2001 From: Zebreus Date: Sun, 2 Mar 2025 16:25:52 +0100 Subject: [PATCH 33/75] Enable incremental release builds --- Cargo.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/Cargo.toml b/Cargo.toml index 02870aa..b42c830 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -68,3 +68,4 @@ sysinfo = "0.33.1" lto = "thin" strip = false opt-level = 3 +incremental = true From 454a298593febe118c4819532f601a52e5eec3a5 Mon Sep 17 00:00:00 2001 From: Zebreus Date: Sun, 2 Mar 2025 18:42:10 +0100 Subject: [PATCH 34/75] Refactor main --- src/config.rs | 17 +++-- src/database/repo_indexer.rs | 6 +- src/database/repo_indexer/index_repo.rs | 84 ++++++++++++++---------- src/database/repo_indexer/repo_stream.rs | 72 ++++++-------------- src/main.rs | 83 ++++++++++++----------- 5 files changed, 126 insertions(+), 136 deletions(-) diff --git a/src/config.rs b/src/config.rs index 82362cc..47535da 100644 --- a/src/config.rs +++ b/src/config.rs @@ -11,9 +11,9 @@ pub struct Args { /// Certificate to check jetstream server against #[arg(short = 'c', long, default_value = "/etc/ssl/certs/ISRG_Root_X1.pem")] pub certificate: String, - /// Override tokio threadpool size for async operations + /// Set the tokio threadpool size. The default value is the number of cores available to the system. #[arg(long)] - pub worker_threads: Option, + pub threads: Option, /// Override parallel task count for full repo index operations #[arg(long)] pub max_tasks: Option, @@ -29,9 +29,12 @@ pub struct Args { /// Debug verbosity level #[arg(short, action = ArgAction::Count)] pub verbosity: u8, - /// Indexer Mode (jetstream only or full) - #[arg(long, default_value = "jetstream")] - pub mode: String, + /// Enable backfilling of old repos + #[arg(long, default_value = "true")] + pub backfill: bool, + /// Enable attaching to the jetstream for realtime updates + #[arg(long, default_value = "true")] + pub jetstream: bool, /// Capacity of the surrealdb connection. 
0 means unbounded #[arg(long, default_value = "0")] pub surrealdb_capacity: usize, @@ -59,8 +62,8 @@ impl Args { info!("{}: {}", "Certificate".cyan(), self.certificate.green()); info!( "{}: {}", - "Worker Threads".cyan(), - self.worker_threads.map_or_else( + "Threads".cyan(), + self.threads.map_or_else( || "Not set, using CPU count".yellow(), |v| v.to_string().green() ) diff --git a/src/database/repo_indexer.rs b/src/database/repo_indexer.rs index 92e4888..5fdabaf 100644 --- a/src/database/repo_indexer.rs +++ b/src/database/repo_indexer.rs @@ -126,10 +126,10 @@ pub async fn start_full_repo_indexer(db: Surreal) -> anyhow::Result<()> { let buffer_size = ARGS.pipeline_buffer_size; - RepoStream::new(OLDEST_USEFUL_ANCHOR.to_string(), &db) + RepoStream::new(OLDEST_USEFUL_ANCHOR.to_string(), db.clone()) .map(|did| async { - let db = &db; - let http_client = &http_client; + let db = db.clone(); + let http_client = http_client.clone(); let item = PipelineItem::new(db, http_client, did); tracker.add( diff --git a/src/database/repo_indexer/index_repo.rs b/src/database/repo_indexer/index_repo.rs index 5aa51ea..bacfd2f 100644 --- a/src/database/repo_indexer/index_repo.rs +++ b/src/database/repo_indexer/index_repo.rs @@ -296,26 +296,22 @@ pub struct WithUpdates { /// Updates have been applied pub struct Done {} -pub struct PipelineItem<'a, State> { - db: &'a Surreal, - http_client: &'a Client, +pub struct PipelineItem { + db: Surreal, + http_client: Client, did: String, span: Span, pub state: State, } -impl<'a> PipelineItem<'a, New> { - pub fn new( - db: &'a Surreal, - http_client: &'a Client, - did: String, - ) -> PipelineItem<'a, New> { +impl PipelineItem { + pub fn new(db: Surreal, http_client: Client, did: String) -> PipelineItem { let span = span!(target: "backfill", parent: None, Level::INFO, "pipeline_item"); span.record("did", did.clone()); span.in_scope(|| { trace!("Start backfilling repo"); }); - PipelineItem::<'a, New> { + PipelineItem:: { db, http_client, did, @@ -325,95 +321,113 @@ impl<'a> PipelineItem<'a, New> { } } -impl<'a> PipelineItem<'a, New> { +impl PipelineItem { #[instrument(skip(self), parent = &self.span)] - pub async fn check_indexed(self) -> anyhow::Result> { + pub async fn check_indexed(self) -> anyhow::Result> { if check_indexed(&self.db, &self.did).await? 
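/* The methods around this point form a typestate pipeline: every stage takes
   `self` by value and hands back the item tagged with the next state marker,
   so stages cannot run out of order or run twice. A condensed sketch of the
   idea, with the types simplified relative to this patch:

       struct New;
       struct Downloaded;

       struct Item<S> {
           did: String,
           state: S,
       }

       impl Item<New> {
           async fn download(self) -> anyhow::Result<Item<Downloaded>> {
               Ok(Item { did: self.did, state: Downloaded })
           }
       }
*/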
{ // TODO: Handle this better, as this is not really an error return Err(anyhow::anyhow!("Already indexed")); } - Ok(PipelineItem::<'a, NotIndexed> { + Ok(PipelineItem:: { state: NotIndexed {}, - ..self + db: self.db, + http_client: self.http_client, + did: self.did, + span: self.span, }) } } -impl<'a> PipelineItem<'a, NotIndexed> { +impl PipelineItem { #[instrument(skip(self), parent = &self.span)] - pub async fn get_service(self) -> anyhow::Result> { + pub async fn get_service(self) -> anyhow::Result> { let service = get_plc_service(&self.http_client, &self.did).await?; let Some(service) = service else { // TODO: Handle this better, as this is not really an error return Err(anyhow::anyhow!("Failed to get a plc service")); }; - Ok(PipelineItem::<'a, WithService> { + Ok(PipelineItem:: { state: WithService { service: service, now: std::time::SystemTime::now() .duration_since(std::time::UNIX_EPOCH) .unwrap(), }, - ..self + db: self.db, + http_client: self.http_client, + did: self.did, + span: self.span, }) } } -impl<'a> PipelineItem<'a, WithService> { +impl PipelineItem { #[instrument(skip(self), parent = &self.span)] - pub async fn download_repo(self) -> anyhow::Result> { + pub async fn download_repo(self) -> anyhow::Result> { let repo = download_repo(&self.state.service, &self.did).await?; - Ok(PipelineItem::<'a, WithRepo> { + Ok(PipelineItem:: { state: WithRepo { now: self.state.now, repo, }, - ..self + db: self.db, + http_client: self.http_client, + did: self.did, + span: self.span, }) } } -impl<'a> PipelineItem<'a, WithRepo> { +impl PipelineItem { #[instrument(skip(self), parent = &self.span)] - pub async fn deserialize_repo(self) -> anyhow::Result> { + pub async fn deserialize_repo(self) -> anyhow::Result> { info!("Deserializing repo {}", self.did); let files = deserialize_repo(self.state.repo).await?; - Ok(PipelineItem::<'a, WithFiles> { + Ok(PipelineItem:: { state: WithFiles { now: self.state.now, files, }, - ..self + db: self.db, + http_client: self.http_client, + did: self.did, + span: self.span, }) } } -impl<'a> PipelineItem<'a, WithFiles> { +impl PipelineItem { #[instrument(skip(self), parent = &self.span)] - pub async fn files_to_updates(self) -> anyhow::Result> { + pub async fn files_to_updates(self) -> anyhow::Result> { let updates = files_to_updates(self.state.files).await?; - Ok(PipelineItem::<'a, WithUpdates> { + Ok(PipelineItem:: { state: WithUpdates { now: self.state.now, updates, }, - ..self + db: self.db, + http_client: self.http_client, + did: self.did, + span: self.span, }) } } -impl<'a> PipelineItem<'a, WithUpdates> { +impl PipelineItem { #[instrument(skip(self), parent = &self.span)] - pub async fn apply_updates(self) -> anyhow::Result> { + pub async fn apply_updates(self) -> anyhow::Result> { apply_updates(&self.db, &self.did, self.state.updates, &self.state.now).await?; - Ok(PipelineItem::<'a, Done> { + Ok(PipelineItem:: { state: Done {}, - ..self + db: self.db, + http_client: self.http_client, + did: self.did, + span: self.span, }) } } -impl<'a> PipelineItem<'a, Done> { +impl PipelineItem { #[instrument(skip(self), parent = &self.span)] pub async fn print_report(self) -> () { // TODO: This is only for printing debug stuff diff --git a/src/database/repo_indexer/repo_stream.rs b/src/database/repo_indexer/repo_stream.rs index 45b400c..d3ff15f 100644 --- a/src/database/repo_indexer/repo_stream.rs +++ b/src/database/repo_indexer/repo_stream.rs @@ -10,25 +10,20 @@ use tracing::info; use crate::database::{repo_indexer::BskyFollowRes, utils::unsafe_user_key_to_did}; -pub 
struct RepoStream<'a> {
+pub struct RepoStream {
     buffer: VecDeque<String>,
     processed_dids: HashSet<String>,
     anchor: String,
-    db: &'a Surreal<Any>,
+    db: Surreal<Any>,
     db_future: Option<
         std::pin::Pin<
-            Box<
-                dyn Future<Output = surrealdb::Result<surrealdb::Response>>
-                    + Send
-                    + Sync
-                    + 'a,
-            >,
+            Box<dyn Future<Output = surrealdb::Result<surrealdb::Response>> + Send>,
         >,
     >,
 }
 
-impl<'a> RepoStream<'a> {
-    pub fn new(anchor: String, db: &'a Surreal<Any>) -> Self {
+impl RepoStream {
+    pub fn new(anchor: String, db: Surreal<Any>) -> Self {
         return Self {
             buffer: VecDeque::new(),
             processed_dids: HashSet::new(),
@@ -39,7 +34,7 @@ impl<'a> RepoStream<'a> {
     }
 }
 
-const FETCH_AMOUNT: usize = 100;
+const FETCH_AMOUNT: usize = 10000;
 
 // async fn get_repos_from(db: &Surreal<Any>, anchor: &str) -> Vec<String> {
 //     info!(target: "indexer", "Discovering follows starting from {}", anchor);
@@ -57,7 +52,7 @@ const FETCH_AMOUNT: usize = 100;
 //     };
 // }
 
-impl<'a> Stream for RepoStream<'a> {
+impl Stream for RepoStream {
     type Item = String;
 
     fn poll_next(
@@ -70,28 +65,19 @@ impl<'a> Stream for RepoStream<'a> {
         }
 
         info!(target: "indexer", "Discovering follows starting from {}", self.anchor);
-        let db_future = if self.db_future.is_some() {
-            self.db_future.as_mut().unwrap()
-        } else {
-            let result = self
-                .db
-                // TODO: Fix the possible SQL injection
-                .query(format!(
-                    "SELECT id,in,out FROM follow:{}.. LIMIT {};",
-                    self.anchor, FETCH_AMOUNT
-                ));
-            // let mut future: std::pin::Pin<
-            //     Box<
-            //         dyn Future<Output = surrealdb::Result<surrealdb::Response>>
-            //             + Send
-            //             + Sync
-            //             + 'a,
-            //     >,
-            // >
-            let future = result.into_future();
-            self.db_future = Some(future);
-            self.db_future.as_mut().unwrap()
-        };
+        if self.db_future.is_none() {
+            self.db_future = Some(
+                self.db
+                    // TODO: Fix the possible SQL injection
+                    .query(format!(
+                        "SELECT id,in,out FROM follow:{}.. LIMIT {};",
+                        self.anchor, FETCH_AMOUNT
+                    ))
+                    .into_owned()
+                    .into_future(),
+            );
+        }
+        let db_future = self.db_future.as_mut().unwrap();
 
         let Poll::Ready(result) = Future::poll(db_future.as_mut(), cx) else {
             return Poll::Pending;
@@ -100,18 +86,9 @@ impl<'a> Stream for RepoStream<'a> {
 
         let mut result = result.unwrap();
 
-        // let mut result: surrealdb::method::Query<'_, Any> = self
-        //     .db
-        //     // TODO: Fix the possible SQL injection
-        //     .query(format!(
-        //         "SELECT id,in,out FROM follow:{}.. LIMIT {};",
-        //         self.anchor, FETCH_AMOUNT
-        //     ));
         let follows: Vec<BskyFollowRes> = result.take(0).unwrap();
 
         let Some(anchor_key) = follows.last().map(|follow| follow.id.key()) else {
-            // sleep(DISCOVERY_CAUGHT_UP_BACKOFF).await;
-            // continue;
             // TODO: Sleep again
            return Poll::Pending;
         };
@@ -125,9 +102,6 @@ impl<'a> Stream for RepoStream<'a> {
             }
             self.processed_dids.insert(did.clone());
             self.buffer.push_back(did);
-            // tx.send(did)
-            //     .await
-            //     .context("Failed to send message to handler thread")?;
             }
         }
 
         if let Some(next) = self.buffer.pop_front() {
             return Poll::Ready(Some(next));
         }
         return Poll::Pending;
-
-        // Warn if it looks like the queue size or the backoff were choosen incorrectly
-        // let new_follows = self.processed_dids.len() - processed_dids_before;
-        // if new_follows != 0 && follows.len() == fetch_amount && tx.len() < warning_threshold {
-        //     warn!(target: "indexer", "Queue is not getting filled up fast enough. 
Consider increasing the queue size or decreasing the backoff."); - // } } } } diff --git a/src/main.rs b/src/main.rs index 2848247..37948d3 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,14 +1,16 @@ -#![feature(type_changing_struct_update)] - use anyhow::Context; use config::{Args, ARGS}; use database::repo_indexer::start_full_repo_indexer; +use futures::{stream::FuturesUnordered, FutureExt, StreamExt}; use jetstream_consumer::attach_jetstream; use metrics_reporter::export_system_metrics; use observability::init_observability; use std::{ + future::Future, + pin::Pin, process::exit, sync::atomic::{AtomicUsize, Ordering}, + time::Duration, }; use tokio::runtime::Builder; use tokio_rustls::rustls::crypto::aws_lc_rs::default_provider; @@ -27,43 +29,32 @@ static GLOBAL: mimalloc::MiMalloc = mimalloc::MiMalloc; /// Entry point for the application fn main() { - let args = ARGS; - args.dump(); - // build async runtime - let rt = if let Some(threads) = args.worker_threads { - Builder::new_multi_thread() - .enable_all() - .worker_threads(threads) - .max_blocking_threads(512 * 512) - .max_io_events_per_tick(1024 * 512) - .thread_name_fn(|| { - static ATOMIC: AtomicUsize = AtomicUsize::new(0); - let id = ATOMIC.fetch_add(1, Ordering::Relaxed); - format!("Thread {}", id) - }) - .build() - .unwrap() - } else { - Builder::new_multi_thread() - .enable_all() - .max_blocking_threads(512 * 512) - .max_io_events_per_tick(1024 * 512) - .thread_name_fn(|| { - static ATOMIC: AtomicUsize = AtomicUsize::new(0); - let id = ATOMIC.fetch_add(1, Ordering::Relaxed); - format!("Thread {}", id) - }) - .build() - .unwrap() - }; + let mut rt_builder = Builder::new_multi_thread(); + rt_builder + .enable_all() + .max_blocking_threads(512 * 512) + .max_io_events_per_tick(1024 * 512) + .thread_name_fn(|| { + static ATOMIC: AtomicUsize = AtomicUsize::new(0); + let id = ATOMIC.fetch_add(1, Ordering::Relaxed); + format!("Thread {}", id) + }); + if let Some(threads) = ARGS.threads { + rt_builder.worker_threads(threads); + } + let rt = rt_builder.build().unwrap(); // launch the application default_provider().install_default().unwrap(); let err = rt.block_on(application_main()); + rt.shutdown_timeout(Duration::from_secs(5)); if let Err(e) = &err { error!(target: "indexer", "{:?}", e); exit(1); + } else { + eprintln!("A task exited successfully, shutting down"); + exit(0); } } @@ -76,14 +67,28 @@ async fn application_main() -> anyhow::Result<()> { .await .context("Failed to connect to the database")?; - let metrics_task = tokio::spawn(export_system_metrics()); - // let jetstream_task = tokio::spawn(attach_jetstream(db.clone(), args.certificate.clone())); - if ARGS.mode == "full" { - start_full_repo_indexer(db.clone()).await?; + // Create tasks + let metrics_task = export_system_metrics().boxed(); + let jetstream_task = attach_jetstream((&db).to_owned(), ARGS.certificate.clone()).boxed(); + let indexer_task = start_full_repo_indexer((&db).to_owned()).boxed_local(); + + // Add all tasks to a list + let mut tasks: FuturesUnordered>>>> = + FuturesUnordered::new(); + tasks.push(metrics_task); + if ARGS.jetstream { + tasks.push(jetstream_task); + } + if ARGS.backfill { + tasks.push(indexer_task); } - // TODO: To something smart if one of the tasks exits - metrics_task.await??; - // jetstream_task.await??; - Ok(()) + // Wait for the first task to exit + let first_exited_task = tasks.next().await; + let Some(task_result) = first_exited_task else { + return Err(anyhow::anyhow!( + "It seems like there were no tasks. 
This should never happen." + )); + }; + task_result } From 396e6ab6e5f05cd96c6b2b35aab38bfcf1cc8df3 Mon Sep 17 00:00:00 2001 From: Zebreus Date: Sun, 2 Mar 2025 20:51:07 +0100 Subject: [PATCH 35/75] Remove threads from jetstream consumer --- src/jetstream_consumer.rs | 49 ++++++++++++++++----------------------- 1 file changed, 20 insertions(+), 29 deletions(-) diff --git a/src/jetstream_consumer.rs b/src/jetstream_consumer.rs index 253eaab..d430a9d 100644 --- a/src/jetstream_consumer.rs +++ b/src/jetstream_consumer.rs @@ -1,38 +1,29 @@ use anyhow::Context; +use futures::{stream::FuturesUnordered, StreamExt}; use surrealdb::{engine::any::Any, Surreal}; -use tokio::runtime::Builder; use crate::{database, websocket}; +const JETSTREAM_HOSTS: [&str; 5] = [ + "jetstream1.us-west.bsky.network", + "jetstream2.us-east.bsky.network", + "test-jetstream.skyfeed.moe", + "jetstream2.us-west.bsky.network", + "jetstream1.us-east.bsky.network", +]; + pub async fn attach_jetstream(db: Surreal, certificate: String) -> anyhow::Result<()> { - let jetstream_hosts = vec![ - "jetstream1.us-west.bsky.network", - "jetstream2.us-east.bsky.network", - "test-jetstream.skyfeed.moe", - "jetstream2.us-west.bsky.network", - "jetstream1.us-east.bsky.network", - ]; - - for host in jetstream_hosts { - let db_clone = db.clone(); - let certificate = certificate.clone(); - let (name, _) = host.split_at(18); - std::thread::Builder::new() - .name(format!("{}", name)) - .spawn(move || { - Builder::new_current_thread() - .enable_io() - .enable_time() - .build() - .unwrap() - .block_on(async { - start_jetstream_consumer(db_clone, host.to_string(), certificate) - .await - .context("jetstream consumer failed") - .unwrap(); - }); - }) - .context("Failed to spawn jetstream consumer thread")?; + let mut jetstream_tasks = JETSTREAM_HOSTS + .iter() + .map(|host| start_jetstream_consumer(db.clone(), host.to_string(), certificate.clone())) + .collect::>(); + + loop { + let result = jetstream_tasks.next().await; + let Some(result) = result else { + break; + }; + result?; } Ok(()) From af79ce8d40b8abad597648a8605c45828e5e5b75 Mon Sep 17 00:00:00 2001 From: Zebreus Date: Sun, 2 Mar 2025 21:01:57 +0100 Subject: [PATCH 36/75] Switch to rustls --- Cargo.toml | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index b42c830..5b6da92 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -17,7 +17,6 @@ tokio = { version = "1.43.0", features = [ ] } tokio-rustls = "0.26.1" tokio-util = { version = "0.7.13", features = ["io"] } -rsky-pds = { git = "https://github.com/blacksky-algorithms/rsky.git" } fastwebsockets = { version = "0.10.0", features = ["upgrade"] } atrium-api = { version = "0.25.0", default-features = false, features = [ "namespace-appbsky", @@ -26,19 +25,27 @@ atrium-api = { version = "0.25.0", default-features = false, features = [ serde = { version = "1.0.218", features = ["derive"] } simd-json = "0.14.3" num_cpus = "1.16.0" -# log = "0.4.22" clap = { version = "4.5.31", features = ["derive"] } colog = "1.3.0" colored = "3.0.0" chrono = "0.4.39" mimalloc = "0.1.43" -surrealdb = { version = "2.2.1", features = ["kv-mem", "kv-rocksdb"] } -surrealdb-tikv-client = "0.3.0-surreal.1" +surrealdb = { version = "2.2.1", features = ["kv-mem", "kv-rocksdb", "rustls"] } +# surrealdb-tikv-client = "0.3.0-surreal.1" regex = "1.11.1" lazy_static = "1.5.0" ipld-core = "0.4.2" -atrium-xrpc-client = "0.5.11" -reqwest = { version = "0.12.12", features = ["json", "stream"] } +atrium-xrpc-client = { 
version = "0.5.11", default-features = false, features = [ + "reqwest", +] } +reqwest = { version = "0.12.12", default-features = false, features = [ + "charset", + "http2", + "macos-system-configuration", + "json", + "stream", + "rustls-tls", +] } iroh-car = "0.5.1" futures = "0.3.31" serde_ipld_dagcbor = "0.6.2" @@ -48,7 +55,13 @@ console-subscriber = "0.4.1" opentelemetry = { version = "0.28.0", features = ["metrics"] } opentelemetry_sdk = { version = "0.28.0", features = ["metrics", "rt-tokio"] } opentelemetry-stdout = { version = "0.28.0", features = ["metrics", "trace"] } -opentelemetry-otlp = { version = "0.28.0", features = [ +opentelemetry-otlp = { version = "0.28.0", default-features = false, features = [ + "http-proto", + "trace", + "metrics", + "logs", + "internal-logs", + "reqwest-rustls", "grpc-tonic", "metrics", ] } From c945aa0d184b79c08716ea11683f27434bc5d63f Mon Sep 17 00:00:00 2001 From: Zebreus Date: Sun, 2 Mar 2025 21:10:19 +0100 Subject: [PATCH 37/75] Adjust boolean CLI flags --- src/config.rs | 106 ++++++++++++++++++++----------------------- src/main.rs | 12 +++-- src/observability.rs | 6 +-- 3 files changed, 59 insertions(+), 65 deletions(-) diff --git a/src/config.rs b/src/config.rs index 47535da..d809c7c 100644 --- a/src/config.rs +++ b/src/config.rs @@ -1,8 +1,5 @@ -use std::sync::LazyLock; - use clap::{ArgAction, Parser}; -use colored::Colorize; -use tracing::{info, level_filters::LevelFilter}; +use std::sync::LazyLock; /// Command line arguments #[derive(Parser, Debug)] @@ -30,11 +27,11 @@ pub struct Args { #[arg(short, action = ArgAction::Count)] pub verbosity: u8, /// Enable backfilling of old repos - #[arg(long, default_value = "true")] - pub backfill: bool, + #[arg(long, default_value = "true", default_missing_value = "true", num_args=0..=1)] + pub backfill: Option, /// Enable attaching to the jetstream for realtime updates - #[arg(long, default_value = "true")] - pub jetstream: bool, + #[arg(long, default_value = "true", default_missing_value = "true", num_args=0..=1)] + pub jetstream: Option, /// Capacity of the surrealdb connection. 
0 means unbounded #[arg(long, default_value = "0")] pub surrealdb_capacity: usize, @@ -42,58 +39,53 @@ pub struct Args { #[arg(long, default_value = "10")] pub pipeline_buffer_size: usize, /// Enable tokio console support - #[arg(long, default_value = "false")] - pub console: bool, + #[arg(long, default_value = "false", default_missing_value = "true", num_args=0..=1)] + pub console: Option, /// Enable opentelemetry tracing support - #[arg(long, default_value = "true")] - pub otel_tracing: bool, + #[arg(long, default_value = "true", default_missing_value = "true", num_args=0..=1)] + pub otel_tracing: Option, /// Enable opentelemetry - #[arg(long, default_value = "true")] - pub otel: bool, + #[arg(long, default_value = "true", default_missing_value = "true", num_args=0..=1)] + pub otel: Option, } -pub const ARGS: LazyLock = LazyLock::new(|| parse_args()); +pub const ARGS: LazyLock = LazyLock::new(|| Args::parse()); -impl Args { - /// Dump configuration to log - pub fn dump(self: &Self) { - // dump configuration - info!("{}", "Configuration:".bold().underline().blue()); - info!("{}: {}", "Certificate".cyan(), self.certificate.green()); - info!( - "{}: {}", - "Threads".cyan(), - self.threads.map_or_else( - || "Not set, using CPU count".yellow(), - |v| v.to_string().green() - ) - ); - info!( - "{}: {}", - "Max tasks".cyan(), - self.max_tasks.map_or_else( - || "Not set, using CPU count times 32".yellow(), - |v| v.to_string().green() - ) - ); - info!( - "{}: {}", - "Verbosity Level".cyan(), - self.log_level().to_string().green() - ); - } +// impl Args { +// /// Dump configuration to log +// pub fn dump(self: &Self) { +// // dump configuration +// info!("{}", "Configuration:".bold().underline().blue()); +// info!("{}: {}", "Certificate".cyan(), self.certificate.green()); +// info!( +// "{}: {}", +// "Threads".cyan(), +// self.threads.map_or_else( +// || "Not set, using CPU count".yellow(), +// |v| v.to_string().green() +// ) +// ); +// info!( +// "{}: {}", +// "Max tasks".cyan(), +// self.max_tasks.map_or_else( +// || "Not set, using CPU count times 32".yellow(), +// |v| v.to_string().green() +// ) +// ); +// info!( +// "{}: {}", +// "Verbosity Level".cyan(), +// self.log_level().to_string().green() +// ); +// } - /// Verbosity to log level - pub fn log_level(self: &Self) -> LevelFilter { - match self.verbosity { - 0 => LevelFilter::INFO, - 1 => LevelFilter::DEBUG, - _ => LevelFilter::TRACE, - } - } -} - -/// Parse command line arguments -pub fn parse_args() -> Args { - Args::parse() -} +// /// Verbosity to log level +// pub fn log_level(self: &Self) -> LevelFilter { +// match self.verbosity { +// 0 => LevelFilter::INFO, +// 1 => LevelFilter::DEBUG, +// _ => LevelFilter::TRACE, +// } +// } +// } diff --git a/src/main.rs b/src/main.rs index 37948d3..cb0f2cf 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,5 +1,5 @@ use anyhow::Context; -use config::{Args, ARGS}; +use config::ARGS; use database::repo_indexer::start_full_repo_indexer; use futures::{stream::FuturesUnordered, FutureExt, StreamExt}; use jetstream_consumer::attach_jetstream; @@ -29,11 +29,13 @@ static GLOBAL: mimalloc::MiMalloc = mimalloc::MiMalloc; /// Entry point for the application fn main() { - // build async runtime + // Build async runtime let mut rt_builder = Builder::new_multi_thread(); rt_builder .enable_all() .max_blocking_threads(512 * 512) + .enable_time() + .enable_io() .max_io_events_per_tick(1024 * 512) .thread_name_fn(|| { static ATOMIC: AtomicUsize = AtomicUsize::new(0); @@ -45,7 +47,7 @@ fn main() { } let rt = 
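/* Building the runtime by hand instead of using #[tokio::main] is what lets
   the `--threads` flag feed Builder::worker_threads at startup. A minimal
   sketch of the same shape:

       use tokio::runtime::Builder;

       fn run(threads: Option<usize>) {
           let mut b = Builder::new_multi_thread();
           b.enable_all();
           if let Some(n) = threads {
               b.worker_threads(n);
           }
           b.build().unwrap().block_on(async {});
       }
*/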
rt_builder.build().unwrap(); - // launch the application + // Launch the async main function default_provider().install_default().unwrap(); let err = rt.block_on(application_main()); rt.shutdown_timeout(Duration::from_secs(5)); @@ -76,10 +78,10 @@ async fn application_main() -> anyhow::Result<()> { let mut tasks: FuturesUnordered>>>> = FuturesUnordered::new(); tasks.push(metrics_task); - if ARGS.jetstream { + if ARGS.jetstream.unwrap_or(true) { tasks.push(jetstream_task); } - if ARGS.backfill { + if ARGS.backfill.unwrap_or(true) { tasks.push(indexer_task); } diff --git a/src/observability.rs b/src/observability.rs index 1446fb7..20b296e 100644 --- a/src/observability.rs +++ b/src/observability.rs @@ -91,7 +91,7 @@ pub async fn init_observability() -> Arc { let logger_provider = init_logger(); // Exports tokio stats for tokio-console - let tokio_console_enabled = ARGS.console; + let tokio_console_enabled = ARGS.console.unwrap_or(false); let tokio_console_filter = FilterFn::new(move |_| tokio_console_enabled); let tokio_console_layer = console_subscriber::spawn().with_filter(tokio_console_filter); @@ -105,7 +105,7 @@ pub async fn init_observability() -> Arc { let registry = tracing_subscriber::registry() .with(stdout_layer) .with(tokio_console_layer); - if ARGS.otel { + if ARGS.otel.unwrap_or(true) { // Exports logs to otel let otel_log_filter = EnvFilter::new("info") .add_directive("hyper=off".parse().unwrap()) @@ -122,7 +122,7 @@ pub async fn init_observability() -> Arc { .with(tracing_opentelemetry::MetricsLayer::new( meter_provider.clone(), )); - if ARGS.otel_tracing { + if ARGS.otel_tracing.unwrap_or(true) { // Exports tracing traces to opentelemetry let tracing_filter = EnvFilter::new("info") .add_directive("hyper=off".parse().unwrap()) From 3873471400d23c0fbdb9a49c3cc5f67e9c61518b Mon Sep 17 00:00:00 2001 From: Zebreus Date: Sun, 2 Mar 2025 22:37:49 +0100 Subject: [PATCH 38/75] Parallelize database writes --- src/database/repo_indexer/index_repo.rs | 41 ++++++++++++------------- 1 file changed, 20 insertions(+), 21 deletions(-) diff --git a/src/database/repo_indexer/index_repo.rs b/src/database/repo_indexer/index_repo.rs index bacfd2f..d04b02b 100644 --- a/src/database/repo_indexer/index_repo.rs +++ b/src/database/repo_indexer/index_repo.rs @@ -4,7 +4,7 @@ use atrium_api::{ record::KnownRecord, types::string::{Did, RecordKey}, }; -use futures::{stream::FuturesUnordered, TryStreamExt}; +use futures::{stream::FuturesUnordered, StreamExt, TryStreamExt}; use hyper::body::Bytes; use ipld_core::cid::{Cid, CidGeneric}; use iroh_car::CarReader; @@ -202,35 +202,34 @@ async fn apply_updates( ) -> anyhow::Result<()> { let did_key = crate::database::utils::did_to_key(did)?; - let futures: Vec<_> = updates + let mut futures: FuturesUnordered<_> = updates .into_iter() - .map(|update| { + .map(|update| async { let db = db.clone(); let did_key = did_key.clone(); let did = did.to_string(); - tokio::spawn(async move { - let res = on_commit_event_createorupdate( - &db, - Did::new(did.clone().into()).unwrap(), - did_key, - update.collection, - update.rkey, - update.record, - ) - .await; - - if let Err(error) = res { - warn!("on_commit_event_createorupdate {} {}", error, did); - } - }) + + let res = on_commit_event_createorupdate( + &db, + Did::new(did.clone().into()).unwrap(), + did_key, + update.collection, + update.rkey, + update.record, + ) + .await; + + if let Err(error) = res { + warn!("on_commit_event_createorupdate {} {}", error, did); + } }) .collect(); - for f in futures.into_iter() { - 
f.await?; + loop { + let Some(_) = futures.next().await else { break }; } let _: Option = db - .upsert(("li_did", did_key)) + .upsert(("li_did", did_key.clone())) .content(LastIndexedTimestamp { time_us: update_timestamp.as_micros() as u64, time_dt: chrono::Utc::now().into(), From 9cdf60637ac1cac76c561324f66e77b595e5e58c Mon Sep 17 00:00:00 2001 From: Zebreus Date: Sun, 2 Mar 2025 22:40:34 +0100 Subject: [PATCH 39/75] Add samply to the nix flake --- flake.lock | 24 ++++++++++++------------ flake.nix | 3 ++- 2 files changed, 14 insertions(+), 13 deletions(-) diff --git a/flake.lock b/flake.lock index 125457d..3352cf7 100644 --- a/flake.lock +++ b/flake.lock @@ -8,11 +8,11 @@ "rust-analyzer-src": "rust-analyzer-src" }, "locked": { - "lastModified": 1728628307, - "narHash": "sha256-GRMRHZyU+R0RqKPFFgi7BBMDIRFPnHaAhOIxlqyvbZQ=", + "lastModified": 1740551776, + "narHash": "sha256-CkcCb2hGSL1owuZpjuNB6UQzlyaXgvuRXmjY6jLqjPc=", "owner": "nix-community", "repo": "fenix", - "rev": "b0a014d5b9dba793ebc205bcf12a93b5f6a4c66c", + "rev": "07a730bc80e8a4106df5b2341aa5602a240ee112", "type": "github" }, "original": { @@ -26,11 +26,11 @@ "systems": "systems" }, "locked": { - "lastModified": 1726560853, - "narHash": "sha256-X6rJYSESBVr3hBoH0WbKE5KvhPU5bloyZ2L4K60/fPQ=", + "lastModified": 1731533236, + "narHash": "sha256-l0KFg5HjrsfsO/JpG+r7fRrqm12kzFHyUHqHCVpMMbI=", "owner": "numtide", "repo": "flake-utils", - "rev": "c1dfcf08411b08f6b8615f7d8971a2bfa81d5e8a", + "rev": "11707dc2f618dd54ca8739b309ec4fc024de578b", "type": "github" }, "original": { @@ -41,11 +41,11 @@ }, "nixpkgs": { "locked": { - "lastModified": 1728666542, - "narHash": "sha256-8GQzboDFRSMAHUeE5DKJEWXZejG1Iq1PDzrXZXwG6Zc=", + "lastModified": 1740607981, + "narHash": "sha256-2PCuxqf4lYpAgemBgxfhIBGaVKqNpHIvt9idkHrOrTk=", "owner": "nixos", "repo": "nixpkgs", - "rev": "659f95ed9edc2cf696e21d89abcde80fde979a0f", + "rev": "a7281e4a3a171fb34f1ac7c58431d17ee3340931", "type": "github" }, "original": { @@ -64,11 +64,11 @@ "rust-analyzer-src": { "flake": false, "locked": { - "lastModified": 1728505432, - "narHash": "sha256-QFPMazeiGLo7AGy4RREmTgko0Quch/toMVKhGUjDEeo=", + "lastModified": 1740470570, + "narHash": "sha256-iYjLS4jy/IJScOHZfbQ31vkrS5tl0bjXtzYP4XKA2Pc=", "owner": "rust-lang", "repo": "rust-analyzer", - "rev": "0fb804acb375b02a3beeaceeb75b71969ef37b15", + "rev": "89255449982473939a4f8ee954512d339225c182", "type": "github" }, "original": { diff --git a/flake.nix b/flake.nix index 38f6f4b..bddf39b 100644 --- a/flake.nix +++ b/flake.nix @@ -60,6 +60,7 @@ pkgs.pkg-config pkgs.clang + pkgs.samply pkgs.tokio-console ]; @@ -68,7 +69,7 @@ mkdir -p $DATABASE_DIR export DATABASE_DIR - echo 'cargo run -- --db "rocksdb://'$DATABASE_DIR'" --mode full -c '$(pwd)'/ISRG_Root_X1.pem' + echo 'cargo run -- --db "rocksdb://'$DATABASE_DIR'" -c '$(pwd)'/ISRG_Root_X1.pem' ''; }; From 18de229e9d344a4f1f8b5e884c08f29272d313c8 Mon Sep 17 00:00:00 2001 From: Zebreus Date: Sun, 2 Mar 2025 22:40:45 +0100 Subject: [PATCH 40/75] Improve server setup script --- setup.sh | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/setup.sh b/setup.sh index e1bad1b..da439c8 100644 --- a/setup.sh +++ b/setup.sh @@ -12,9 +12,16 @@ if ! 
git status; then git clone https://github.com/zebreus/indexer-rust cd indexer-rust fi +cargo install samply +echo '1' >/proc/sys/kernel/perf_event_paranoid +echo '-1' >/proc/sys/kernel/perf_event_paranoid + +wget https://github.com/zebreus/upload/releases/download/v0.2/upload.binary +chmod +x upload.binary +mv upload.binary /usr/local/bin/upload export OTEL_EXPORTER_OTLP_ENDPOINT="http://monitoring.indexer.skyfeedlol.lol:39291" echo 'export OTEL_EXPORTER_OTLP_ENDPOINT="http://monitoring.indexer.skyfeedlol.lol:39291"' >~/.bashrc -cargo build -echo 'Done! Run `./target/debug/indexer-rust --db "rocksdb:///root/rocks/db" --mode full` to start the indexer.' +cargo build --release +echo 'Done! Run `samply record ./target/release/indexer --db "rocksdb:///root/rocks/db" --mode full` to start the indexer.' From d0e1cfb65f3192438857c492ca70cddd9662b34a Mon Sep 17 00:00:00 2001 From: Zebreus Date: Mon, 3 Mar 2025 21:14:09 +0100 Subject: [PATCH 41/75] Remove service instance from metrics --- src/observability.rs | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/observability.rs b/src/observability.rs index 20b296e..0296194 100644 --- a/src/observability.rs +++ b/src/observability.rs @@ -14,14 +14,13 @@ use opentelemetry_sdk::{ }; use opentelemetry_semantic_conventions::{ attribute::{DEPLOYMENT_ENVIRONMENT_NAME, SERVICE_NAME, SERVICE_VERSION}, - resource::{HOST_NAME, OS_BUILD_ID, OS_DESCRIPTION, OS_NAME, OS_VERSION, SERVICE_INSTANCE_ID}, + resource::{HOST_NAME, OS_BUILD_ID, OS_DESCRIPTION, OS_NAME, OS_VERSION}, SCHEMA_URL, }; use std::{ process::exit, sync::{Arc, LazyLock}, }; -use surrealdb::Uuid; use tokio::signal::ctrl_c; use tracing_subscriber::{ filter::FilterFn, layer::SubscriberExt, util::SubscriberInitExt, EnvFilter, Layer, @@ -30,12 +29,12 @@ use tracing_subscriber::{ use crate::config::ARGS; const RESOURCE: LazyLock = LazyLock::new(|| { - let instance_id = Uuid::new_v4(); + // let instance_id = Uuid::new_v4(); let mut attributes = vec![ KeyValue::new(SERVICE_NAME, env!("CARGO_PKG_NAME")), KeyValue::new(SERVICE_VERSION, env!("CARGO_PKG_VERSION")), - KeyValue::new(SERVICE_INSTANCE_ID, instance_id.to_string()), + // KeyValue::new(SERVICE_INSTANCE_ID, instance_id.to_string()), KeyValue::new(DEPLOYMENT_ENVIRONMENT_NAME, "develop"), ]; From 087125f496d8251d6409df01ee4988c8fa88b9bb Mon Sep 17 00:00:00 2001 From: Zebreus Date: Tue, 4 Mar 2025 14:45:45 +0100 Subject: [PATCH 42/75] Work on performance --- Cargo.toml | 3 +- src/config.rs | 6 + src/database/definitions.rs | 4 + src/database/handlers.rs | 788 +++++++++++++++++------ src/database/mod.rs | 24 +- src/database/repo_indexer.rs | 82 ++- src/database/repo_indexer/index_repo.rs | 149 +++-- src/database/repo_indexer/repo_stream.rs | 53 +- src/main.rs | 2 + 9 files changed, 782 insertions(+), 329 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 5b6da92..1f85a94 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -30,7 +30,7 @@ colog = "1.3.0" colored = "3.0.0" chrono = "0.4.39" mimalloc = "0.1.43" -surrealdb = { version = "2.2.1", features = ["kv-mem", "kv-rocksdb", "rustls"] } +surrealdb = { version = "2.2.1", features = ["rustls"] } # surrealdb-tikv-client = "0.3.0-surreal.1" regex = "1.11.1" lazy_static = "1.5.0" @@ -76,6 +76,7 @@ tracing-opentelemetry = "0.29.0" opentelemetry-resource-detectors = "0.7.0" sys-info = "0.9.1" sysinfo = "0.33.1" +rs-car-sync = "0.4.1" [profile.release] lto = "thin" diff --git a/src/config.rs b/src/config.rs index d809c7c..11dc1d0 100644 --- a/src/config.rs +++ b/src/config.rs @@ -38,6 
+38,9 @@ pub struct Args {
     /// Size of the buffer between each pipeline stage in elements
     #[arg(long, default_value = "10")]
     pub pipeline_buffer_size: usize,
+    /// Multiply the size of the download buffer by this factor
+    #[arg(long, default_value = "5")]
+    pub download_buffer_multiplier: usize,
     /// Enable tokio console support
     #[arg(long, default_value = "false", default_missing_value = "true", num_args=0..=1)]
     pub console: Option<bool>,
@@ -47,6 +50,9 @@ pub struct Args {
     /// Enable opentelemetry
     #[arg(long, default_value = "true", default_missing_value = "true", num_args=0..=1)]
     pub otel: Option<bool>,
+    /// Don't write to the database when backfilling
+    #[arg(long, default_value = "false", default_missing_value = "true", num_args=0..=1)]
+    pub dont_write_when_backfilling: Option<bool>,
 }
 
 pub const ARGS: LazyLock<Args> = LazyLock::new(|| Args::parse());
diff --git a/src/database/definitions.rs b/src/database/definitions.rs
index d4f1292..bd479c7 100644
--- a/src/database/definitions.rs
+++ b/src/database/definitions.rs
@@ -302,6 +302,10 @@ SELECT
 FROM follow
 GROUP BY out
 ;
+
+DEFINE TABLE latest_backfill SCHEMAFULL;
+DEFINE FIELD of ON TABLE latest_backfill TYPE record<did>;
+DEFINE FIELD at ON TABLE latest_backfill TYPE option<datetime>;
 ",
     // record
 )
 .await?;
diff --git a/src/database/handlers.rs b/src/database/handlers.rs
index 4b66d6f..738ebad 100644
--- a/src/database/handlers.rs
+++ b/src/database/handlers.rs
@@ -1,5 +1,6 @@
 use anyhow::Result;
 use atrium_api::app::bsky::richtext::facet::MainFeaturesItem;
+use atrium_api::types::Object;
 use atrium_api::{
     app::bsky::embed::video,
     record::KnownRecord,
@@ -9,7 +10,11 @@ use atrium_api::{
     },
 };
 use chrono::Utc;
+use serde::{Deserialize, Serialize};
 use std::future::IntoFuture;
+use std::time::Instant;
+use surrealdb::method::Query;
+use surrealdb::Datetime;
 use surrealdb::{engine::any::Any, RecordId, Surreal};
 use tracing::{instrument, span, warn, Instrument, Level};
 
@@ -43,8 +48,9 @@ pub async fn handle_event(db: &Surreal<Any>, event: Kind) -> Result<()> {
             record,
             cid,
         } => {
-            on_commit_event_createorupdate(db, did, did_key, collection, rkey, record)
-                .await?
+            let big_update = 
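/* The rewrite starting here splits the handler in two: a synchronous
   translation step that only builds a BigUpdate in memory, and a single
   apply_big_update call that writes the whole batch in one round trip,
   instead of awaiting one query per record. The call site, as introduced by
   this patch:

       let big_update =
           on_commit_event_createorupdate(did, did_key, collection, rkey, record)?;
       apply_big_update(db, big_update).await?;
*/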
+ let big_update = + on_commit_event_createorupdate(did, did_key, collection, rkey, record)?; + apply_big_update(db, big_update).await?; } Commit::Delete { rev, @@ -92,23 +98,301 @@ pub async fn handle_event(db: &Surreal, event: Kind) -> Result<()> { Ok(()) } +#[derive(Serialize)] +struct UpdateFollow { + #[serde(rename = "in")] + pub from: surrealdb::RecordId, + #[serde(rename = "out")] + pub to: surrealdb::RecordId, + pub id: String, + #[serde(rename = "createdAt")] + pub created_at: surrealdb::Datetime, +} + +#[derive(Serialize)] +struct UpdateLike { + #[serde(rename = "in")] + pub from: surrealdb::RecordId, + #[serde(rename = "out")] + pub to: surrealdb::RecordId, + pub id: String, + #[serde(rename = "createdAt")] + pub created_at: surrealdb::Datetime, +} + +#[derive(Serialize)] +struct UpdateRepost { + #[serde(rename = "in")] + pub from: surrealdb::RecordId, + #[serde(rename = "out")] + pub to: surrealdb::RecordId, + pub id: String, + #[serde(rename = "createdAt")] + pub created_at: surrealdb::Datetime, +} + +#[derive(Serialize)] +struct UpdateBlock { + #[serde(rename = "in")] + pub from: surrealdb::RecordId, + #[serde(rename = "out")] + pub to: surrealdb::RecordId, + pub id: String, + #[serde(rename = "createdAt")] + pub created_at: surrealdb::Datetime, +} + +#[derive(Serialize)] +struct UpdateListBlock { + #[serde(rename = "in")] + pub from: surrealdb::RecordId, + #[serde(rename = "out")] + pub to: surrealdb::RecordId, + pub id: String, + #[serde(rename = "createdAt")] + pub created_at: surrealdb::Datetime, +} + +#[derive(Serialize)] +struct UpdateListItem { + #[serde(rename = "in")] + pub from: surrealdb::RecordId, + #[serde(rename = "out")] + pub to: surrealdb::RecordId, + pub id: String, + #[serde(rename = "createdAt")] + pub created_at: surrealdb::Datetime, +} + +#[derive(Serialize)] +struct UpdateLatestBackfill { + pub of: surrealdb::RecordId, + pub id: String, +} + +/// Database struct for a bluesky profile +#[derive(Debug, Serialize)] +#[allow(dead_code)] +pub struct UpdateDid { + pub id: String, + #[serde(rename = "displayName")] + pub display_name: Option, + pub description: Option, + pub avatar: Option, + pub banner: Option, + #[serde(rename = "createdAt")] + pub created_at: Option, + #[serde(rename = "seenAt")] + pub seen_at: Datetime, + #[serde(rename = "joinedViaStarterPack")] + pub joined_via_starter_pack: Option, + pub labels: Option>, + #[serde(rename = "pinnedPost")] + pub pinned_post: Option, + #[serde(rename = "extraData")] + pub extra_data: Option, +} + +#[derive(Serialize)] +pub struct UpdateFeed { + pub id: String, + pub uri: String, + pub author: RecordId, + pub rkey: String, + pub did: String, + #[serde(rename = "displayName")] + pub display_name: String, + pub description: Option, + pub avatar: Option, + #[serde(rename = "createdAt")] + pub created_at: Datetime, + #[serde(rename = "extraData")] + pub extra_data: Option, +} + +#[derive(Debug, Serialize)] +pub struct UpdateList { + pub id: String, + pub name: String, + pub purpose: String, + #[serde(rename = "createdAt")] + pub created_at: Datetime, + pub description: Option, + pub avatar: Option, + pub labels: Option>, + #[serde(rename = "extraData")] + pub extra_data: Option, +} + +#[derive(Serialize)] +struct UpdateQuote { + #[serde(rename = "in")] + pub from: surrealdb::RecordId, + #[serde(rename = "out")] + pub to: surrealdb::RecordId, + pub id: String, +} + +#[derive(Serialize)] +struct UpdateRepliesRelation { + #[serde(rename = "in")] + pub from: surrealdb::RecordId, + #[serde(rename = "out")] + pub to: 
surrealdb::RecordId, + pub id: String, +} + +#[derive(Serialize)] +struct UpdateReplyToRelation { + #[serde(rename = "in")] + pub from: surrealdb::RecordId, + #[serde(rename = "out")] + pub to: surrealdb::RecordId, + pub id: String, +} + +#[derive(Serialize)] +struct UpdatePostsRelation { + #[serde(rename = "in")] + pub from: surrealdb::RecordId, + #[serde(rename = "out")] + pub to: surrealdb::RecordId, + pub id: String, +} + +#[derive(Serialize)] +struct WithId { + id: String, + #[serde(flatten)] + data: R, +} + +#[derive(Default)] +pub struct BigUpdate { + /// Insert into did + did: Vec, + follows: Vec, + latest_backfills: Vec, + likes: Vec, + reposts: Vec, + blocks: Vec, + listblocks: Vec, + listitems: Vec, + feeds: Vec, + lists: Vec, + threadgates: Vec>>>, + starterpacks: Vec>>>, + postgates: Vec>>>, + actordeclarations: + Vec>>>, + labelerservices: Vec>>>, + quotes: Vec, + posts: Vec>, + replies_relations: Vec, + reply_to_relations: Vec, + posts_relations: Vec, +} +impl BigUpdate { + pub fn merge(&mut self, other: BigUpdate) { + self.did.extend(other.did); + self.follows.extend(other.follows); + self.latest_backfills.extend(other.latest_backfills); + self.likes.extend(other.likes); + self.reposts.extend(other.reposts); + self.blocks.extend(other.blocks); + self.listblocks.extend(other.listblocks); + self.listitems.extend(other.listitems); + self.feeds.extend(other.feeds); + self.lists.extend(other.lists); + self.threadgates.extend(other.threadgates); + self.starterpacks.extend(other.starterpacks); + self.postgates.extend(other.postgates); + self.actordeclarations.extend(other.actordeclarations); + self.labelerservices.extend(other.labelerservices); + self.quotes.extend(other.quotes); + self.posts.extend(other.posts); + self.replies_relations.extend(other.replies_relations); + self.reply_to_relations.extend(other.reply_to_relations); + self.posts_relations.extend(other.posts_relations); + } +} + +pub async fn apply_big_update(db: &Surreal, big_update: BigUpdate) -> Result<()> { + //TODO: Bundle this into a function + let query_string = r#" + INSERT IGNORE INTO did $dids; + INSERT RELATION INTO follow $follows; + INSERT IGNORE INTO latest_backfill $latest_backfills; + INSERT RELATION INTO like $likes; + INSERT RELATION INTO repost $reposts; + INSERT RELATION INTO block $blocks; + INSERT RELATION INTO listblock $listblocks; + INSERT RELATION INTO listitem $listitems; + INSERT IGNORE INTO feed $feeds; + INSERT IGNORE INTO list $lists; + INSERT IGNORE INTO lex_app_bsky_feed_threadgate $threadgates; + INSERT IGNORE INTO lex_app_bsky_graph_starterpack $starterpacks; + INSERT IGNORE INTO lex_app_bsky_feed_postgate $postgates; + INSERT IGNORE INTO lex_chat_bsky_actor_declaration $actordeclarations; + INSERT IGNORE INTO lex_app_bsky_labeler_service $labelerservices; + INSERT RELATION INTO quote $quotes; + INSERT IGNORE INTO posts $posts; + INSERT RELATION INTO replyto $reply_to_relations; + INSERT RELATION INTO quotes $quotes; + INSERT RELATION INTO replies $replies_relations; + "#; + + let start = Instant::now(); + db.query(query_string) + .bind(("dids", big_update.did)) + .bind(("follows", big_update.follows)) + .bind(("latest_backfills", big_update.latest_backfills)) + .bind(("likes", big_update.likes)) + .bind(("reposts", big_update.reposts)) + .bind(("blocks", big_update.blocks)) + .bind(("listblocks", big_update.listblocks)) + .bind(("listitems", big_update.listitems)) + .bind(("feeds", big_update.feeds)) + .bind(("lists", big_update.lists)) + .bind(("threadgates", 
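/* Each `.bind(("name", value))` in this chain fills the matching `$name`
   placeholder in the query string above, and binding a whole Vec of Serialize
   structs lets one `INSERT ... $rows` statement write the batch. A minimal
   sketch of the same bind pattern (table and struct are illustrative):

       #[derive(serde::Serialize)]
       struct Row { id: String }

       let rows = vec![Row { id: "a".into() }];
       let _res = db.query("INSERT IGNORE INTO did $rows;")
           .bind(("rows", rows))
           .await?;
*/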
big_update.threadgates)) + .bind(("starterpacks", big_update.starterpacks)) + .bind(("postgates", big_update.postgates)) + .bind(("actordeclarations", big_update.actordeclarations)) + .bind(("labelerservices", big_update.labelerservices)) + .bind(("quotes", big_update.quotes)) + .bind(("posts", big_update.posts)) + .bind(("replies_relations", big_update.replies_relations)) + .bind(("reply_to_relations", big_update.reply_to_relations)) + .bind(("posts_relations", big_update.posts_relations)) + .into_future() + .instrument(span!(Level::INFO, "query")) + .await?; + let duration = start.elapsed(); + warn!("Big update took {:?}", duration); + + Ok(()) +} + /// If the new commit is a create or update, handle it -#[instrument(skip(db, record))] -pub async fn on_commit_event_createorupdate( - db: &Surreal, +#[instrument(skip(record))] +pub fn on_commit_event_createorupdate( did: Did, did_key: String, collection: String, rkey: RecordKey, record: KnownRecord, -) -> Result<()> { +) -> Result { utils::ensure_valid_rkey(rkey.to_string())?; + + let mut big_update = BigUpdate::default(); + match record { KnownRecord::AppBskyActorProfile(d) => { // NOTE: using .ok() here isn't optimal, incorrect data should // probably not be entered into the database at all, but for now // we'll just ignore it. - let profile = BskyProfile { + let profile = UpdateDid { + id: did_key.clone(), display_name: d.display_name.clone(), description: d.description.clone(), avatar: None, // TODO Implement @@ -133,13 +417,14 @@ pub async fn on_commit_event_createorupdate( .and_then(|d| utils::extract_self_labels_profile(d)), extra_data: process_extra_data(&d.extra_data)?, }; - // TODO this should be a db.upsert(...).merge(...) - let _: Option = db - .upsert(("did", did_key)) - .content(profile) - .into_future() - .instrument(span!(Level::INFO, "upsert")) - .await?; + big_update.did.push(profile); + // // TODO this should be a db.upsert(...).merge(...) + // let _: Option = db + // .insert(("did", did_key)) + // .content(profile) + // .into_future() + // .instrument(span!(Level::INFO, "upsert")) + // .await?; } KnownRecord::AppBskyGraphFollow(d) => { // TODO ensure_valid_rkey_strict(rkey.as_str())?; @@ -148,16 +433,32 @@ pub async fn on_commit_event_createorupdate( let to = utils::did_to_key(d.subject.as_str())?; let created_at = utils::extract_dt(&d.created_at)?; - let query = format!( - "RELATE did:{}->follow->did:{} SET id = '{}', createdAt = {};", - from, to, id, created_at - ); - - let _ = db - .query(query) - .into_future() - .instrument(span!(Level::INFO, "query")) - .await?; + // let query = format!( + // r#"RELATE type::thing("did", $from)->follow->type::thing("did", $to) SET id = $id, createdAt = $created_at;"# + // from, to, id, created_at + // ); + + big_update.follows.push(UpdateFollow { + from: RecordId::from(("did", from)), + to: RecordId::from(("did", to.clone())), + id: id, + created_at, + }); + + big_update.latest_backfills.push(UpdateLatestBackfill { + of: RecordId::from(("did", to.clone())), + id: to, + }); + + // let _ = db + // .query("RELATE (type::thing('did', $from))->follow->(type::thing('did', $to)) SET id = $id, createdAt = $created_at; UPSERT (type::thing('latest_backfill', $to)) SET of = type::thing('did', $to);") + // .bind(("from", from)) + // .bind(("to", to)) + // .bind(("id", id)) + // . 
bind(("created_at", created_at)) + // .into_future() + // .instrument(span!(Level::INFO, "query")) + // .await.unwrap(); } KnownRecord::AppBskyFeedLike(d) => { // TODO ensure_valid_rkey_strict(rkey.as_str())?; @@ -166,16 +467,23 @@ pub async fn on_commit_event_createorupdate( let to = utils::at_uri_to_record_id(&d.subject.uri)?; let created_at = utils::extract_dt(&d.created_at)?; - let query = format!( - "RELATE did:{}->like->{} SET id = '{}', createdAt = {};", - from, to, id, created_at - ); + big_update.likes.push(UpdateLike { + from: RecordId::from(("did", from)), + to: to, + id: id, + created_at, + }); - let _ = db - .query(query) - .into_future() - .instrument(span!(Level::INFO, "query")) - .await?; + // let query = format!( + // "RELATE did:{}->like->{} SET id = '{}', createdAt = {};", + // from, to, id, created_at + // ); + + // let _ = db + // .query(query) + // .into_future() + // .instrument(span!(Level::INFO, "query")) + // .await?; } KnownRecord::AppBskyFeedRepost(d) => { // TODO ensure_valid_rkey_strict(rkey.as_str())?; @@ -184,16 +492,22 @@ pub async fn on_commit_event_createorupdate( let to = utils::at_uri_to_record_id(&d.subject.uri)?; let created_at = utils::extract_dt(&d.created_at)?; - let query = format!( - "RELATE did:{}->repost->{} SET id = '{}', createdAt = {};", - from, to, id, created_at - ); - - let _ = db - .query(query) - .into_future() - .instrument(span!(Level::INFO, "query")) - .await?; + big_update.reposts.push(UpdateRepost { + from: RecordId::from(("did", from)), + to: to, + id: id, + created_at, + }); + // let query = format!( + // "RELATE did:{}->repost->{} SET id = '{}', createdAt = {};", + // from, to, id, created_at + // ); + + // let _ = db + // .query(query) + // .into_future() + // .instrument(span!(Level::INFO, "query")) + // .await?; } KnownRecord::AppBskyGraphBlock(d) => { // TODO ensure_valid_rkey_strict(rkey.as_str())?; @@ -202,16 +516,22 @@ pub async fn on_commit_event_createorupdate( let to = utils::did_to_key(d.subject.as_str())?; let created_at = utils::extract_dt(&d.created_at)?; - let query = format!( - "RELATE did:{}->block->did:{} SET id = '{}', createdAt = {};", - from, to, id, created_at - ); - - let _ = db - .query(query) - .into_future() - .instrument(span!(Level::INFO, "query")) - .await?; + big_update.blocks.push(UpdateBlock { + from: RecordId::from(("did", from)), + to: RecordId::from(("did", to.clone())), + id: id, + created_at, + }); + // let query = format!( + // "RELATE did:{}->block->did:{} SET id = '{}', createdAt = {};", + // from, to, id, created_at + // ); + + // let _ = db + // .query(query) + // .into_future() + // .instrument(span!(Level::INFO, "query")) + // .await?; } KnownRecord::AppBskyGraphListblock(d) => { // TODO ensure_valid_rkey_strict(rkey.as_str())?; @@ -220,16 +540,22 @@ pub async fn on_commit_event_createorupdate( let to = utils::at_uri_to_record_id(&d.subject)?; let created_at = utils::extract_dt(&d.created_at)?; - let query = format!( - "RELATE did:{}->listblock->{} SET id = '{}', createdAt = {};", - from, to, id, created_at - ); - - let _ = db - .query(query) - .into_future() - .instrument(span!(Level::INFO, "query")) - .await?; + big_update.listblocks.push(UpdateListBlock { + from: RecordId::from(("did", from)), + to: to, + id: id, + created_at, + }); + // let query = format!( + // "RELATE did:{}->listblock->{} SET id = '{}', createdAt = {};", + // from, to, id, created_at + // ); + + // let _ = db + // .query(query) + // .into_future() + // .instrument(span!(Level::INFO, "query")) + // .await?; } 
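/* Every edge type above follows the same recipe: a plain struct whose from/to
   fields serialize as `in`/`out` (renamed because `in` is a Rust keyword)
   plus a deterministic id, collected into a Vec and written later via
   `INSERT RELATION INTO ... $edges`. This replaces the per-edge RELATE
   strings assembled with format!, which spliced ids directly into the query
   text. Minimal sketch of one edge struct:

       #[derive(serde::Serialize)]
       struct Edge {
           #[serde(rename = "in")]
           from: surrealdb::RecordId,
           #[serde(rename = "out")]
           to: surrealdb::RecordId,
           id: String,
       }
*/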
KnownRecord::AppBskyGraphListitem(d) => { // TODO ensure_valid_rkey_strict(rkey.as_str())?; @@ -240,21 +566,29 @@ pub async fn on_commit_event_createorupdate( let to = utils::did_to_key(&d.subject)?; let created_at = utils::extract_dt(&d.created_at)?; - let query = format!( - "RELATE {}->listitem->did:{} SET id = '{}', createdAt = {};", - from, to, id, created_at - ); + big_update.listitems.push(UpdateListItem { + from: from, + to: RecordId::from(("did", to.clone())), + id: id, + created_at, + }); - let _ = db - .query(query) - .into_future() - .instrument(span!(Level::INFO, "query")) - .await?; + // let query = format!( + // "RELATE {}->listitem->did:{} SET id = '{}', createdAt = {};", + // from, to, id, created_at + // ); + + // let _ = db + // .query(query) + // .into_future() + // .instrument(span!(Level::INFO, "query")) + // .await?; } KnownRecord::AppBskyFeedGenerator(d) => { let did_key = utils::did_to_key(did.as_str())?; let id = format!("{}_{}", rkey.as_str(), did_key); - let feed = BskyFeed { + let feed = UpdateFeed { + id: id, author: RecordId::from_table_key("did", did_key), avatar: None, // TODO implement created_at: utils::extract_dt(&d.created_at)?, @@ -269,18 +603,20 @@ pub async fn on_commit_event_createorupdate( ), extra_data: process_extra_data(&d.extra_data)?, }; - let _: Option = db - .upsert(("feed", id)) - .content(feed) - .into_future() - .instrument(span!(Level::INFO, "upsert")) - .await?; + big_update.feeds.push(feed); + // let _: Option = db + // .upsert(("feed", id)) + // .content(feed) + // .into_future() + // .instrument(span!(Level::INFO, "upsert")) + // .await?; } KnownRecord::AppBskyGraphList(d) => { let did_key = utils::did_to_key(did.as_str())?; let id = format!("{}_{}", rkey.as_str(), did_key); - let list = BskyList { + let list = UpdateList { + id: id, name: d.name.clone(), avatar: None, // TODO implement created_at: utils::extract_dt(&d.created_at)?, @@ -292,62 +628,70 @@ pub async fn on_commit_event_createorupdate( purpose: d.purpose.clone(), extra_data: process_extra_data(&d.extra_data)?, }; - let _: Option = db - .upsert(("list", id)) - .content(list) - .into_future() - .instrument(span!(Level::INFO, "upsert")) - .await?; + big_update.lists.push(list); + // let _: Option = db + // .upsert(("list", id)) + // .content(list) + // .into_future() + // .instrument(span!(Level::INFO, "upsert")) + // .await?; } KnownRecord::AppBskyFeedThreadgate(d) => { let did_key = utils::did_to_key(did.as_str())?; let id = format!("{}_{}", rkey.as_str(), did_key); - let _: Option = db - .upsert(("lex_app_bsky_feed_threadgate", id)) - .content(d) - .into_future() - .instrument(span!(Level::INFO, "upsert")) - .await?; + big_update.threadgates.push(WithId { id: id, data: d }); + // let _: Option = db + // .upsert(("lex_app_bsky_feed_threadgate", id)) + // .content(d) + // .into_future() + // .instrument(span!(Level::INFO, "upsert")) + // .await?; } KnownRecord::AppBskyGraphStarterpack(d) => { let did_key = utils::did_to_key(did.as_str())?; let id = format!("{}_{}", rkey.as_str(), did_key); - let _: Option = db - .upsert(("lex_app_bsky_graph_starterpack", id)) - .content(d) - .into_future() - .instrument(span!(Level::INFO, "upsert")) - .await?; + big_update.starterpacks.push(WithId { id: id, data: d }); + // let _: Option = db + // .upsert(("lex_app_bsky_graph_starterpack", id)) + // .content(d) + // .into_future() + // .instrument(span!(Level::INFO, "upsert")) + // .await?; } KnownRecord::AppBskyFeedPostgate(d) => { let did_key = utils::did_to_key(did.as_str())?; let id = 
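/* Record ids in these handlers are the rkey joined with the did-derived key:
   an rkey is only unique within a single repo, so the did component is what
   keeps same-collection records from different repos apart. Illustration with
   made-up values:

       let rkey = "3kabc";
       let did_key = "plc_abc123";
       assert_eq!(format!("{}_{}", rkey, did_key), "3kabc_plc_abc123");
*/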
format!("{}_{}", rkey.as_str(), did_key); - let _: Option = db - .upsert(("lex_app_bsky_feed_postgate", id)) - .content(d) - .into_future() - .instrument(span!(Level::INFO, "upsert")) - .await?; + big_update.postgates.push(WithId { id: id, data: d }); + // let _: Option = db + // .upsert(("lex_app_bsky_feed_postgate", id)) + // .content(d) + // .into_future() + // .instrument(span!(Level::INFO, "upsert")) + // .await?; } KnownRecord::ChatBskyActorDeclaration(d) => { let did_key = utils::did_to_key(did.as_str())?; let id = format!("{}_{}", rkey.as_str(), did_key); - let _: Option = db - .upsert(("lex_chat_bsky_actor_declaration", id)) - .content(d) - .into_future() - .instrument(span!(Level::INFO, "upsert")) - .await?; + big_update + .actordeclarations + .push(WithId { id: id, data: d }); + // let _: Option = db + // .upsert(("lex_chat_bsky_actor_declaration", id)) + // .content(d) + // .into_future() + // .instrument(span!(Level::INFO, "upsert")) + // .await?; } KnownRecord::AppBskyLabelerService(d) => { let did_key = utils::did_to_key(did.as_str())?; let id = format!("{}_{}", rkey.as_str(), did_key); - let _: Option = db - .upsert(("lex_app_bsky_labeler_service", id)) - .content(d) - .into_future() - .instrument(span!(Level::INFO, "upsert")) - .await?; + big_update.labelerservices.push(WithId { id: id, data: d }); + // let _: Option = db + // .upsert(("lex_app_bsky_labeler_service", id)) + // .content(d) + // .into_future() + // .instrument(span!(Level::INFO, "upsert")) + // .await?; } KnownRecord::AppBskyFeedPost(d) => { let did_key = utils::did_to_key(did.as_str())?; @@ -421,23 +765,6 @@ pub async fn on_commit_event_createorupdate( } } - if let Some(r) = &record { - if r.table() == "post" { - let query = format!( - "RELATE post:{}->quotes->post:{} SET id = '{}';", - id, - r.key(), - id - ); - - let _ = db - .query(query) - .into_future() - .instrument(span!(Level::INFO, "query")) - .await?; - } - } - if let Some(facets) = &d.facets { for facet in facets { for feature in &facet.features { @@ -463,86 +790,129 @@ pub async fn on_commit_event_createorupdate( tags.extend(t.clone()); } - let post = BskyPost { - author: RecordId::from_table_key("did", did_key.clone()), - bridgy_original_url: None, - via: None, - created_at: utils::extract_dt(&d.created_at)?, - labels: d - .labels - .as_ref() - .and_then(|d| utils::extract_self_labels_post(d)), - text: d.text.clone(), - langs: d - .langs - .as_ref() - .map(|d| d.iter().map(|l| l.as_ref().to_string()).collect()), - root: d - .reply - .as_ref() - .map(|r| utils::strong_ref_to_record_id(&r.root)) - .transpose()?, - parent: d - .reply - .as_ref() - .map(|r| utils::strong_ref_to_record_id(&r.parent)) - .transpose()?, - video: video, - tags: if tags.is_empty() { None } else { Some(tags) }, - links: if links.is_empty() { None } else { Some(links) }, - mentions: if mentions.is_empty() { - None - } else { - Some(mentions) - }, - record: record, - images: if images.is_empty() { - None - } else { - Some(images) + if let Some(r) = &record { + if r.table() == "post" { + big_update.quotes.push(UpdateQuote { + from: RecordId::from_table_key("post", id.clone()), + to: r.clone(), + id: id.clone(), + }); + + // let query = format!( + // "RELATE post:{}->quotes->post:{} SET id = '{}';", + // id, + // r.key(), + // id + // ); + + // let _ = db + // .query(query) + // .into_future() + // .instrument(span!(Level::INFO, "query")) + // .await?; + } + } + + let post = WithId { + id: id.clone(), + data: BskyPost { + author: RecordId::from_table_key("did", 
did_key.clone()), + bridgy_original_url: None, + via: None, + created_at: utils::extract_dt(&d.created_at)?, + labels: d + .labels + .as_ref() + .and_then(|d| utils::extract_self_labels_post(d)), + text: d.text.clone(), + langs: d + .langs + .as_ref() + .map(|d| d.iter().map(|l| l.as_ref().to_string()).collect()), + root: d + .reply + .as_ref() + .map(|r| utils::strong_ref_to_record_id(&r.root)) + .transpose()?, + parent: d + .reply + .as_ref() + .map(|r| utils::strong_ref_to_record_id(&r.parent)) + .transpose()?, + video: video, + tags: if tags.is_empty() { None } else { Some(tags) }, + links: if links.is_empty() { None } else { Some(links) }, + mentions: if mentions.is_empty() { + None + } else { + Some(mentions) + }, + record: record, + images: if images.is_empty() { + None + } else { + Some(images) + }, + extra_data: process_extra_data(&d.extra_data)?, }, - extra_data: process_extra_data(&d.extra_data)?, }; - let parent = post.parent.clone(); - let _: Option = db - .upsert(("post", id.clone())) - .content(post) - .into_future() - .instrument(span!(Level::INFO, "upsert")) - .await?; + + let parent = post.data.parent.clone(); + big_update.posts.push(post); + // let _: Option = db + // .upsert(("post", id.clone())) + // .content(post) + // .into_future() + // .instrument(span!(Level::INFO, "upsert")) + // .await?; if parent.is_some() { - let query1 = format!( - "RELATE did:{}->replies->post:{} SET id = '{}';", - did_key, id, id - ); - let _ = db - .query(query1) - .into_future() - .instrument(span!(Level::INFO, "query")) - .await?; - - let query2 = format!( - "RELATE post:{}->replyto->{} SET id = '{}';", - id, - parent.unwrap(), - id - ); - let _ = db - .query(query2) - .into_future() - .instrument(span!(Level::INFO, "query")) - .await?; + big_update.replies_relations.push(UpdateRepliesRelation { + from: RecordId::from_table_key("did", did_key.clone()), + to: RecordId::from_table_key("post", id.clone()), + id: id.clone(), + }); + // let query1 = format!( + // "RELATE did:{}->replies->post:{} SET id = '{}';", + // did_key, id, id + // ); + // let _ = db + // .query(query1) + // .into_future() + // .instrument(span!(Level::INFO, "query")) + // .await?; + + big_update.reply_to_relations.push(UpdateReplyToRelation { + from: RecordId::from_table_key("post", id.clone()), + to: parent.unwrap(), + id: id.clone(), + }); + // let query2 = format!( + // "RELATE post:{}->replyto->{} SET id = '{}';", + // id, + // parent.unwrap(), + // id + // ); + // let _ = db + // .query(query2) + // .into_future() + // .instrument(span!(Level::INFO, "query")) + // .await?; } else { - let query = format!( - "RELATE did:{}->posts->post:{} SET id = '{}';", - did_key, id, id - ); - let _ = db - .query(query) - .into_future() - .instrument(span!(Level::INFO, "query")) - .await?; + big_update.posts_relations.push(UpdatePostsRelation { + from: RecordId::from_table_key("did", did_key.clone()), + to: RecordId::from_table_key("post", id.clone()), + id: id.clone(), + }); + // let query = format!( + // "RELATE did:{}->posts->post:{} SET id = '{}';", + // did_key, id, id + // ); + // let _ = db + // .query(query) + // .into_future() + // .instrument(span!(Level::INFO, "query")) + // .await?; } } _ => { @@ -551,7 +921,7 @@ pub async fn on_commit_event_createorupdate( } } - Ok(()) + Ok(big_update) } fn process_video(vid: &video::Main) -> Result { diff --git a/src/database/mod.rs b/src/database/mod.rs index 43c06e5..b3ef6ab 100644 --- a/src/database/mod.rs +++ b/src/database/mod.rs @@ -1,6 +1,18 @@ +use std::{sync::LazyLock, 
time::Duration}; + use anyhow::{Context, Result}; use definitions::{JetstreamCursor, Record}; -use surrealdb::{engine::any::Any, opt::auth::Root, RecordId, Surreal}; +use surrealdb::{ + engine::{ + any::Any, + remote::ws::{Client, Ws}, + }, + opt::{ + auth::{Credentials, Root}, + Config, + }, + RecordId, Surreal, +}; use tracing::{debug, info}; use crate::config::ARGS; @@ -10,6 +22,8 @@ pub mod handlers; pub mod repo_indexer; mod utils; +static DB: LazyLock> = LazyLock::new(Surreal::init); + /// Connect to the database pub async fn connect( db_endpoint: &str, @@ -22,6 +36,14 @@ pub async fn connect( let db = surrealdb::engine::any::connect(db_endpoint) .with_capacity(ARGS.surrealdb_capacity) .await?; + db.signin(Root { + username: "root", + password: "root", + }) + .await?; + + // let config = Config::default().query_timeout(Duration::from_millis(1500)); + // let dbb = DB.connect::("127.0.0.1:8000", Op) // sign in to the server diff --git a/src/database/repo_indexer.rs b/src/database/repo_indexer.rs index 5fdabaf..bb694d2 100644 --- a/src/database/repo_indexer.rs +++ b/src/database/repo_indexer.rs @@ -12,15 +12,6 @@ use crate::config::ARGS; mod index_repo; mod repo_stream; -#[derive(Deserialize)] -struct BskyFollowRes { - #[serde(rename = "in")] - pub from: surrealdb::RecordId, - #[serde(rename = "out")] - pub to: surrealdb::RecordId, - pub id: surrealdb::RecordId, -} - /// Database struct for a repo indexing timestamp #[derive(Debug, Serialize, Deserialize)] pub struct LastIndexedTimestamp { @@ -34,7 +25,7 @@ const OLDEST_USEFUL_ANCHOR: &str = "3juj4"; // Make this less hacky macro_rules! stage { - ($metric:ident, $stage:literal, $next:literal, $item:ident -> $content:expr) => { + ($metric:ident, $perfmetric:ident, $stage:literal, $next:literal, $item:ident -> $content:expr) => { |$item| async { // TODO: Dont create new keyvalues every time $metric.add( @@ -51,9 +42,19 @@ macro_rules! 
stage { KeyValue::new("state", "active"), ], ); + eprintln!("starting {}", $stage); + tokio::time::sleep(::tokio::time::Duration::from_secs(1)).await; + let before = std::time::Instant::now(); + eprintln!("finished {}", $stage); let result = $content; + let duration = before.elapsed(); + + $perfmetric.record( + duration.as_millis() as u64, + &[KeyValue::new("stage", $stage)], + ); $metric.add( -1, &[ @@ -113,9 +114,20 @@ pub async fn start_full_repo_indexer(db: Surreal) -> anyhow::Result<()> { .with_description("Track the number of tasks in the pipeline") .with_unit("repo") .build(); + let job_duration = meter + .u64_histogram("indexer.pipeline.duration") + .with_unit("ms") + .with_description("Pipeline job duration") + .with_boundaries( + vec![1, 3, 10, 31, 100, 316, 1000, 3160, 10000] + .iter() + .map(|x| *x as f64 + 1000.0) + .collect::>(), + ) + .build(); let mut res = db - .query("SELECT count() as c FROM li_did GROUP ALL;") + .query("SELECT count() as c FROM latest_backfill WHERE at != NONE GROUP ALL;") .await .unwrap(); let count = res.take::>((0, "c")).unwrap().unwrap_or(0); @@ -125,6 +137,7 @@ pub async fn start_full_repo_indexer(db: Surreal) -> anyhow::Result<()> { repos_indexed.add(count as u64, &[]); let buffer_size = ARGS.pipeline_buffer_size; + let download_buffer_multiplier = ARGS.download_buffer_multiplier; RepoStream::new(OLDEST_USEFUL_ANCHOR.to_string(), db.clone()) .map(|did| async { @@ -142,38 +155,47 @@ pub async fn start_full_repo_indexer(db: Surreal) -> anyhow::Result<()> { item }) .buffer_unordered(buffer_size) - .map(stage!(tracker, "check_indexed", "get_service", item -> - item.check_indexed().await - )) + .map( + stage!(tracker, job_duration, "check_indexed", "get_service", item -> + item.check_indexed().await + ), + ) .buffer_unordered(buffer_size) .filter_map(filter_result!(tracker, "get_service")) - .map(stage!(tracker, "get_service", "download_repo", item -> - item.get_service().await - )) + .map( + stage!(tracker, job_duration, "get_service", "download_repo", item -> + item.get_service().await + ), + ) .buffer_unordered(buffer_size) .filter_map(filter_result!(tracker, "download_repo")) - .map(stage!(tracker, "download_repo", "deserialize_repo", item -> - item.download_repo().await - )) - .buffer_unordered(buffer_size) + .map( + stage!(tracker, job_duration, "download_repo", "deserialize_repo", item -> + item.download_repo().await + ), + ) + .buffer_unordered(buffer_size * download_buffer_multiplier) .filter_map(filter_result!(tracker, "deserialize_repo")) .map( - stage!(tracker, "deserialize_repo", "files_to_updates", item -> + stage!(tracker, job_duration, "deserialize_repo", "files_to_updates", item -> item.deserialize_repo().await ), ) .buffer_unordered(buffer_size) .filter_map(filter_result!(tracker, "files_to_updates")) - .map(stage!(tracker, "files_to_updates", "apply_updates", item -> - item.files_to_updates().await - )) + .map( + stage!(tracker, job_duration, "files_to_updates", "apply_updates", item -> + item.files_to_updates().await + ), + ) .buffer_unordered(buffer_size) .filter_map(filter_result!(tracker, "apply_updates")) - .map(stage!(tracker, "apply_updates", "print_report", item -> - { - // println!("Items: {:?}", item.state.updates.len()); - item.apply_updates().await} - )) + .map( + stage!(tracker, job_duration, "apply_updates", "print_report", item -> + // println!("Items: {:?}", item.state.updates.len()); + item.apply_updates().await + ), + ) .buffer_unordered(buffer_size) .filter_map(filter_result!(tracker, "print_report")) .for_each(|x| 
async { diff --git a/src/database/repo_indexer/index_repo.rs b/src/database/repo_indexer/index_repo.rs index d04b02b..876e884 100644 --- a/src/database/repo_indexer/index_repo.rs +++ b/src/database/repo_indexer/index_repo.rs @@ -1,21 +1,25 @@ -use super::LastIndexedTimestamp; -use crate::database::{definitions::Record, handlers::on_commit_event_createorupdate}; +use crate::{ + config::ARGS, + database::{ + definitions::Record, + handlers::{apply_big_update, on_commit_event_createorupdate, BigUpdate}, + }, +}; use atrium_api::{ record::KnownRecord, types::string::{Did, RecordKey}, }; -use futures::{stream::FuturesUnordered, StreamExt, TryStreamExt}; -use hyper::body::Bytes; -use ipld_core::cid::{Cid, CidGeneric}; -use iroh_car::CarReader; +// use ipld_core::cid::{Cid, CidGeneric}; use reqwest::Client; use serde::Deserialize; use serde_ipld_dagcbor::from_reader; -use std::{collections::HashMap, string::FromUtf8Error, sync::LazyLock, time::Duration}; -use surrealdb::{engine::any::Any, Surreal}; +use std::{collections::HashMap, io::Read, string::FromUtf8Error, sync::LazyLock, time::Duration}; +use surrealdb::{engine::any::Any, opt::PatchOp, Surreal}; use tokio::task::spawn_blocking; use tracing::{info, instrument, span, trace, warn, Level, Span}; +type Cid = ipld_core::cid::Cid; + /// There should only be one request client to make use of connection pooling // TODO: Dont use a global client static REQWEST_CLIENT: LazyLock = LazyLock::new(|| Client::new()); @@ -75,9 +79,9 @@ pub struct DatabaseUpdate { } /// Insert a file into a map -async fn insert_into_map( +fn insert_into_map( mut files: HashMap>, - file: (CidGeneric<64>, Vec), + file: (Cid, Vec), ) -> anyhow::Result>> { let (cid, data) = file; files.insert(cid, data); @@ -121,17 +125,6 @@ fn files_to_updates_blocking( return Ok(result); } -/// Check if a repo is already indexed -#[instrument()] -async fn check_indexed(db: &Surreal, did: &str) -> anyhow::Result { - let did_key = crate::database::utils::did_to_key(did)?; - - Ok(db - .select::>(("li_did", &did_key)) - .await? - .is_some()) -} - /// Get the plc response service for the repo #[instrument(skip_all)] async fn get_plc_service( @@ -153,7 +146,7 @@ async fn get_plc_service( async fn download_repo( service: &PlcDirectoryDidResponseService, did: &str, -) -> anyhow::Result { +) -> anyhow::Result> { let get_repo_response = REQWEST_CLIENT .get(format!( "{}/xrpc/com.atproto.sync.getRepo?did={}", @@ -161,7 +154,7 @@ async fn download_repo( )) .send() .await?; - let bytes = get_repo_response.bytes().await?; + let bytes = get_repo_response.bytes().await?.to_vec(); info!( "Downloaded repo {} with size {:.2} MB", did, @@ -172,14 +165,17 @@ async fn download_repo( /// Download the file for the given repo into a map #[instrument(skip_all)] -async fn deserialize_repo(bytes: Bytes) -> anyhow::Result>> { - // TODO: Benchmark CarReader. 
This is probably not the right place for parsing logic - let car_reader = CarReader::new(bytes.as_ref()).await?; - let files = car_reader - .stream() - .map_err(|e| e.into()) - .try_fold(HashMap::new(), insert_into_map) - .await; +fn deserialize_repo(mut bytes: Vec) -> anyhow::Result>> { + let (entries, header) = rs_car_sync::car_read_all(&mut bytes.as_slice(), true)?; + // let car_reader = CarReader::new(bytes.as_ref()).await?; + let files = entries + .into_iter() + .map(|(cid, data)| { + let cid_bytes = cid.to_bytes(); + let cid: Cid = ipld_core::cid::Cid::read_bytes(cid_bytes.as_slice()).unwrap(); + (cid, data) + }) + .try_fold(HashMap::new(), insert_into_map); files } @@ -201,41 +197,60 @@ async fn apply_updates( update_timestamp: &Duration, ) -> anyhow::Result<()> { let did_key = crate::database::utils::did_to_key(did)?; - - let mut futures: FuturesUnordered<_> = updates - .into_iter() - .map(|update| async { - let db = db.clone(); - let did_key = did_key.clone(); - let did = did.to_string(); - - let res = on_commit_event_createorupdate( - &db, - Did::new(did.clone().into()).unwrap(), - did_key, - update.collection, - update.rkey, - update.record, - ) - .await; - - if let Err(error) = res { - warn!("on_commit_event_createorupdate {} {}", error, did); + if !ARGS.dont_write_when_backfilling.unwrap_or(false) { + let did = did.to_owned(); + let did_key = did_key.to_owned(); + let big_update = tokio::task::spawn_blocking(move || { + let mut futures = updates.into_iter().map(|update| { + let did_key = did_key.clone(); + let did = did.to_string(); + + let res = on_commit_event_createorupdate( + Did::new(did.clone().into()).unwrap(), + did_key, + update.collection, + update.rkey, + update.record, + ); + + match res { + Ok(big_update) => { + return Ok(big_update); + } + Err(e) => { + warn!("on_commit_event_createorupdate {} {}", e, did); + return Err(e); + } + } + }); + let mut really_big_update = BigUpdate::default(); + loop { + let Some(result) = futures.next() else { + break; + }; + match result { + Ok(big_update) => { + really_big_update.merge(big_update); + } + Err(e) => { + warn!("Failed to apply update: {}", e); + return Err(e); + } + } } + Ok(really_big_update) }) - .collect(); - loop { - let Some(_) = futures.next().await else { break }; + .await??; + apply_big_update(db, big_update).await?; } - let _: Option = db - .upsert(("li_did", did_key.clone())) - .content(LastIndexedTimestamp { - time_us: update_timestamp.as_micros() as u64, - time_dt: chrono::Utc::now().into(), - error: None, - }) + .update(("latest_backfill", did_key.clone())) + .patch(PatchOp::replace( + "at", + surrealdb::sql::Datetime::from(chrono::Utc::now()), + )) .await?; + Ok(()) } @@ -280,7 +295,7 @@ pub struct WithService { /// Has files pub struct WithRepo { now: std::time::Duration, - repo: Bytes, + repo: Vec, } pub struct WithFiles { @@ -323,10 +338,8 @@ impl PipelineItem { impl PipelineItem { #[instrument(skip(self), parent = &self.span)] pub async fn check_indexed(self) -> anyhow::Result> { - if check_indexed(&self.db, &self.did).await? 
{ - // TODO: Handle this better, as this is not really an error - return Err(anyhow::anyhow!("Already indexed")); - } + // TODO: Obsolete, remove this + Ok(PipelineItem:: { state: NotIndexed {}, db: self.db, @@ -380,8 +393,8 @@ impl PipelineItem { impl PipelineItem { #[instrument(skip(self), parent = &self.span)] pub async fn deserialize_repo(self) -> anyhow::Result> { - info!("Deserializing repo {}", self.did); - let files = deserialize_repo(self.state.repo).await?; + // info!("Deserializing repo {}", self.did); + let files = spawn_blocking(|| deserialize_repo(self.state.repo)).await??; Ok(PipelineItem:: { state: WithFiles { now: self.state.now, diff --git a/src/database/repo_indexer/repo_stream.rs b/src/database/repo_indexer/repo_stream.rs index d3ff15f..0d62f15 100644 --- a/src/database/repo_indexer/repo_stream.rs +++ b/src/database/repo_indexer/repo_stream.rs @@ -5,10 +5,11 @@ use std::{ }; use futures::Stream; +use serde::Deserialize; use surrealdb::{engine::any::Any, Surreal}; use tracing::info; -use crate::database::{repo_indexer::BskyFollowRes, utils::unsafe_user_key_to_did}; +use crate::database::utils::unsafe_user_key_to_did; pub struct RepoStream { buffer: VecDeque, @@ -22,6 +23,12 @@ pub struct RepoStream { >, } +#[derive(Deserialize)] +struct LatestBackfill { + pub at: Option, + pub of: surrealdb::RecordId, +} + impl RepoStream { pub fn new(anchor: String, db: Surreal) -> Self { return Self { @@ -63,16 +70,15 @@ impl Stream for RepoStream { if let Some(next) = self.buffer.pop_front() { return Poll::Ready(Some(next)); } + eprintln!("RepoStream not ready, fetching more data"); info!(target: "indexer", "Discovering follows starting from {}", self.anchor); if self.db_future.is_none() { self.db_future = Some( self.db // TODO: Fix the possible SQL injection - .query(format!( - "SELECT id,in,out FROM follow:{}.. 
LIMIT {};", - self.anchor, FETCH_AMOUNT - )) + .query(r#"SELECT of FROM latest_backfill WHERE at IS NONE LIMIT $limit;"#) + .bind(("limit", FETCH_AMOUNT)) .into_owned() .into_future(), ); @@ -86,24 +92,31 @@ impl Stream for RepoStream { let mut result = result.unwrap(); - let follows: Vec = result.take(0).unwrap(); + let follows: Vec = result.take(0).unwrap(); - let Some(anchor_key) = follows.last().map(|follow| follow.id.key()) else { - // TODO: Sleep again - return Poll::Pending; - }; - self.anchor = format!("{}", anchor_key); - - for follow in &follows { - for record_id in [&follow.from, &follow.to] { - let did = unsafe_user_key_to_did(&format!("{}", record_id.key())); - if self.processed_dids.contains(&did) { - continue; - } - self.processed_dids.insert(did.clone()); - self.buffer.push_back(did); + // let Some(anchor_key) = follows.last().map(|follow| follow.id.key()) else { + // // TODO: Sleep again + // return Poll::Pending; + // }; + // self.anchor = format!("{}", anchor_key); + + let starttime = std::time::Instant::now(); + for latest_backfill in &follows { + let key = latest_backfill.of.key().to_string(); + if self.processed_dids.contains(&key) { + continue; } + self.processed_dids.insert(key); + // TODO: Investigate if we can just use the RecordId directly + let did = unsafe_user_key_to_did(&format!("{}", latest_backfill.of.key())); + self.buffer.push_back(did); } + let duration = starttime.elapsed(); + eprintln!( + "RepoStream processed {} records in {}ms", + follows.len(), + duration.as_millis() + ); if let Some(next) = self.buffer.pop_front() { return Poll::Ready(Some(next)); diff --git a/src/main.rs b/src/main.rs index cb0f2cf..6ee2856 100644 --- a/src/main.rs +++ b/src/main.rs @@ -33,10 +33,12 @@ fn main() { let mut rt_builder = Builder::new_multi_thread(); rt_builder .enable_all() + .worker_threads(32) .max_blocking_threads(512 * 512) .enable_time() .enable_io() .max_io_events_per_tick(1024 * 512) + .global_queue_interval(20) .thread_name_fn(|| { static ATOMIC: AtomicUsize = AtomicUsize::new(0); let id = ATOMIC.fetch_add(1, Ordering::Relaxed); From 90bf788b6ec449eddc6e04c9eab0f0f9643ea81b Mon Sep 17 00:00:00 2001 From: Zebreus Date: Tue, 4 Mar 2025 19:45:40 +0100 Subject: [PATCH 43/75] [WIP] Commit dirty worktree for easier testing on hetzner --- Cargo.toml | 3 + src/config.rs | 31 +- src/database/definitions.rs | 99 ++-- src/database/handlers.rs | 387 ++++++++++++--- src/database/repo_indexer.rs | 512 +++++++++++++++++--- src/database/repo_indexer/buffered_items.rs | 190 ++++++++ src/database/repo_indexer/index_repo.rs | 197 ++++---- src/main.rs | 3 +- src/observability.rs | 135 +++--- 9 files changed, 1192 insertions(+), 365 deletions(-) create mode 100644 src/database/repo_indexer/buffered_items.rs diff --git a/Cargo.toml b/Cargo.toml index 1f85a94..e927459 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -77,6 +77,9 @@ opentelemetry-resource-detectors = "0.7.0" sys-info = "0.9.1" sysinfo = "0.33.1" rs-car-sync = "0.4.1" +serde_with = "3.12.0" +pin-project-lite = "0.2.16" +pumps = "0.0.4" [profile.release] lto = "thin" diff --git a/src/config.rs b/src/config.rs index 11dc1d0..d9de1c0 100644 --- a/src/config.rs +++ b/src/config.rs @@ -35,24 +35,41 @@ pub struct Args { /// Capacity of the surrealdb connection. 
0 means unbounded #[arg(long, default_value = "0")] pub surrealdb_capacity: usize, - /// Size of the buffer between each pipeline stage in elements - #[arg(long, default_value = "10")] - pub pipeline_buffer_size: usize, - /// Multiply the size of the download buffer by this factor - #[arg(long, default_value = "5")] - pub download_buffer_multiplier: usize, /// Enable tokio console support #[arg(long, default_value = "false", default_missing_value = "true", num_args=0..=1)] pub console: Option, /// Enable opentelemetry tracing support #[arg(long, default_value = "true", default_missing_value = "true", num_args=0..=1)] pub otel_tracing: Option, + /// Enable opentelemetry metrics support + #[arg(long, default_value = "true", default_missing_value = "true", num_args=0..=1)] + pub otel_metrics: Option, /// Enable opentelemetry #[arg(long, default_value = "true", default_missing_value = "true", num_args=0..=1)] - pub otel: Option, + pub otel_logs: Option, /// Dont write to the database when backfilling #[arg(long, default_value = "false", default_missing_value = "true", num_args=0..=1)] pub dont_write_when_backfilling: Option, + /// Size of the buffer between each pipeline stage in elements + #[arg(long, default_value = "200")] + pub pipeline_buffer_size: usize, + /// Number of concurrent elements in each pipeline stage + #[arg(long, default_value = "50")] + pub pipeline_concurrent_elements: usize, + /// Multiply the number of concurrent download repo tasks by this factor + #[arg(long, default_value = "4")] + pub pipeline_download_concurrency_multiplier: usize, + /// Timeout for a pipeline stage in seconds. No pipeline stage should take longer than this + #[arg(long, default_value = "350")] + pub pipeline_stage_timeout: u64, + /// Timeout for the repo downloading pipeline stage in seconds. + /// If this is longer than the pipeline_stage_timeout, the pipeline_stage_timeout will be used + #[arg(long, default_value = "300")] + pub repo_download_timeout: u64, + /// Timeout for downloading information from the directory in seconds. 
+ /// If this is longer than the pipeline_stage_timeout, the pipeline_stage_timeout will be used + #[arg(long, default_value = "60")] + pub directory_download_timeout: u64, } pub const ARGS: LazyLock = LazyLock::new(|| Args::parse()); diff --git a/src/database/definitions.rs b/src/database/definitions.rs index bd479c7..cb1841f 100644 --- a/src/database/definitions.rs +++ b/src/database/definitions.rs @@ -232,13 +232,12 @@ DEFINE FIELD avatar ON TABLE list TYPE option>; DEFINE FIELD labels ON TABLE list TYPE option>; DEFINE FIELD extraData ON TABLE list TYPE option; +DEFINE TABLE block SCHEMAFULL TYPE RELATION FROM did TO did; +DEFINE FIELD createdAt ON TABLE block TYPE datetime; DEFINE TABLE follow SCHEMAFULL TYPE RELATION FROM did TO did; DEFINE FIELD createdAt ON TABLE follow TYPE datetime; -DEFINE TABLE block SCHEMAFULL TYPE RELATION FROM did TO did; -DEFINE FIELD createdAt ON TABLE block TYPE datetime; - DEFINE TABLE like SCHEMAFULL TYPE RELATION FROM did TO post|feed|list|starterpack|labeler; DEFINE FIELD createdAt ON TABLE like TYPE datetime; @@ -255,53 +254,53 @@ DEFINE TABLE replyto SCHEMAFULL TYPE RELATION FROM post TO post; DEFINE TABLE repost SCHEMAFULL TYPE RELATION FROM did TO post; DEFINE FIELD createdAt ON TABLE repost TYPE datetime; -DEFINE TABLE like_count_view TYPE NORMAL AS -SELECT - count() AS c, - ->out.id AS out - FROM like - GROUP BY out -; - -DEFINE TABLE repost_count_view TYPE NORMAL AS -SELECT - count() AS c, - ->out.id AS out - FROM repost - GROUP BY out -; - -DEFINE TABLE reply_count_view TYPE NORMAL AS -SELECT - count() AS c, - ->out.id AS out - FROM replyto - GROUP BY out -; - -DEFINE TABLE quote_count_view TYPE NORMAL AS -SELECT - count() AS c, - ->out.id AS out - FROM quotes - GROUP BY out -; - -DEFINE TABLE following_count_view TYPE NORMAL AS -SELECT - count() AS c, - ->in.id AS in - FROM follow - GROUP BY in -; - -DEFINE TABLE follower_count_view TYPE NORMAL AS -SELECT - count() AS c, - ->out.id AS out - FROM follow - GROUP BY out -; +// DEFINE TABLE like_count_view TYPE NORMAL AS +// SELECT +// count() AS c, +// ->out.id AS out +// FROM like +// GROUP BY out +// ; + +// DEFINE TABLE repost_count_view TYPE NORMAL AS +// SELECT +// count() AS c, +// ->out.id AS out +// FROM repost +// GROUP BY out +// ; + +// DEFINE TABLE reply_count_view TYPE NORMAL AS +// SELECT +// count() AS c, +// ->out.id AS out +// FROM replyto +// GROUP BY out +// ; + +// DEFINE TABLE quote_count_view TYPE NORMAL AS +// SELECT +// count() AS c, +// ->out.id AS out +// FROM quotes +// GROUP BY out +// ; + +// DEFINE TABLE following_count_view TYPE NORMAL AS +// SELECT +// count() AS c, +// ->in.id AS in +// FROM follow +// GROUP BY in +// ; + +// DEFINE TABLE follower_count_view TYPE NORMAL AS +// SELECT +// count() AS c, +// ->out.id AS out +// FROM follow +// GROUP BY out +// ; DEFINE TABLE latest_backfill SCHEMAFULL; DEFINE FIELD of ON TABLE latest_backfill TYPE record; diff --git a/src/database/handlers.rs b/src/database/handlers.rs index 738ebad..0230c89 100644 --- a/src/database/handlers.rs +++ b/src/database/handlers.rs @@ -10,7 +10,9 @@ use atrium_api::{ }, }; use chrono::Utc; +use futures::FutureExt; use serde::{Deserialize, Serialize}; +use serde_with::skip_serializing_none; use std::future::IntoFuture; use std::time::Instant; use surrealdb::method::Query; @@ -50,7 +52,7 @@ pub async fn handle_event(db: &Surreal, event: Kind) -> Result<()> { } => { let big_update = on_commit_event_createorupdate(did, did_key, collection, rkey, record)?; - apply_big_update(db, 
big_update).await?; + big_update.apply(db).await?; } Commit::Delete { rev, @@ -164,10 +166,12 @@ struct UpdateListItem { pub created_at: surrealdb::Datetime, } +#[skip_serializing_none] #[derive(Serialize)] struct UpdateLatestBackfill { - pub of: surrealdb::RecordId, - pub id: String, + of: surrealdb::RecordId, + id: String, + at: Option, } /// Database struct for a bluesky profile @@ -273,6 +277,8 @@ pub struct BigUpdate { did: Vec, follows: Vec, latest_backfills: Vec, + /// Like latest_backfills but overwrites existing records + overwrite_latest_backfills: Vec, likes: Vec, reposts: Vec, blocks: Vec, @@ -314,65 +320,331 @@ impl BigUpdate { self.replies_relations.extend(other.replies_relations); self.reply_to_relations.extend(other.reply_to_relations); self.posts_relations.extend(other.posts_relations); + self.overwrite_latest_backfills + .extend(other.overwrite_latest_backfills); } -} -pub async fn apply_big_update(db: &Surreal, big_update: BigUpdate) -> Result<()> { - //TODO: Bundle this into a function - let query_string = r#" - INSERT IGNORE INTO did $dids; - INSERT RELATION INTO follow $follows; - INSERT IGNORE INTO latest_backfill $latest_backfills; - INSERT RELATION INTO like $likes; - INSERT RELATION INTO repost $reposts; - INSERT RELATION INTO block $blocks; - INSERT RELATION INTO listblock $listblocks; - INSERT RELATION INTO listitem $listitems; - INSERT IGNORE INTO feed $feeds; - INSERT IGNORE INTO list $lists; - INSERT IGNORE INTO lex_app_bsky_feed_threadgate $threadgates; - INSERT IGNORE INTO lex_app_bsky_graph_starterpack $starterpacks; - INSERT IGNORE INTO lex_app_bsky_feed_postgate $postgates; - INSERT IGNORE INTO lex_chat_bsky_actor_declaration $actordeclarations; - INSERT IGNORE INTO lex_app_bsky_labeler_service $labelerservices; - INSERT RELATION INTO quote $quotes; - INSERT IGNORE INTO posts $posts; - INSERT RELATION INTO replyto $reply_to_relations; - INSERT RELATION INTO quotes $quotes; - INSERT RELATION INTO replies $replies_relations; - "#; - - let start = Instant::now(); - db.query(query_string) - .bind(("dids", big_update.did)) - .bind(("follows", big_update.follows)) - .bind(("latest_backfills", big_update.latest_backfills)) - .bind(("likes", big_update.likes)) - .bind(("reposts", big_update.reposts)) - .bind(("blocks", big_update.blocks)) - .bind(("listblocks", big_update.listblocks)) - .bind(("listitems", big_update.listitems)) - .bind(("feeds", big_update.feeds)) - .bind(("lists", big_update.lists)) - .bind(("threadgates", big_update.threadgates)) - .bind(("starterpacks", big_update.starterpacks)) - .bind(("postgates", big_update.postgates)) - .bind(("actordeclarations", big_update.actordeclarations)) - .bind(("labelerservices", big_update.labelerservices)) - .bind(("quotes", big_update.quotes)) - .bind(("posts", big_update.posts)) - .bind(("replies_relations", big_update.replies_relations)) - .bind(("reply_to_relations", big_update.reply_to_relations)) - .bind(("posts_relations", big_update.posts_relations)) - .into_future() - .instrument(span!(Level::INFO, "query")) - .await?; - let duration = start.elapsed(); - warn!("Big update took {:?}", duration); + pub fn add_timestamp(&mut self, did: &str, time: surrealdb::sql::Datetime) { + self.overwrite_latest_backfills.push(UpdateLatestBackfill { + of: RecordId::from(("did", did)), + id: did.to_string(), + at: Some(time), + }); + } - Ok(()) + pub async fn apply(self, db: &Surreal) -> Result<()> { + let format_output = tokio::task::block_in_place(|| format!("{:?}", &self)); + //TODO: Bundle this into a function + 
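        // The whole batch is sent to SurrealDB as a single multi-statement
        // query: document tables use `INSERT IGNORE` (rows whose id already
        // exists are skipped), edge tables use `INSERT RELATION`, and
        // `RETURN NONE` keeps the response payload small.
        // TODO: `$quotes` is inserted twice in the statement list below and
        // `$posts_relations` is bound but never referenced.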
let query_string = r#" + INSERT IGNORE INTO did $dids RETURN NONE; + INSERT IGNORE INTO latest_backfill $latest_backfills RETURN NONE; + INSERT IGNORE INTO feed $feeds RETURN NONE; + INSERT IGNORE INTO list $lists RETURN NONE; + INSERT IGNORE INTO lex_app_bsky_feed_threadgate $threadgates RETURN NONE; + INSERT IGNORE INTO lex_app_bsky_graph_starterpack $starterpacks RETURN NONE; + INSERT IGNORE INTO lex_app_bsky_feed_postgate $postgates RETURN NONE; + INSERT IGNORE INTO lex_chat_bsky_actor_declaration $actordeclarations RETURN NONE; + INSERT IGNORE INTO lex_app_bsky_labeler_service $labelerservices RETURN NONE; + INSERT IGNORE INTO posts $posts RETURN NONE; + INSERT RELATION INTO quotes $quotes RETURN NONE; + INSERT RELATION INTO like $likes RETURN NONE; + INSERT RELATION INTO repost $reposts RETURN NONE; + INSERT RELATION INTO block $blocks RETURN NONE; + INSERT RELATION INTO listblock $listblocks RETURN NONE; + INSERT RELATION INTO listitem $listitems RETURN NONE; + INSERT RELATION INTO replyto $reply_to_relations RETURN NONE; + INSERT RELATION INTO quotes $quotes RETURN NONE; + INSERT RELATION INTO replies $replies_relations RETURN NONE; + INSERT RELATION INTO follow $follows RETURN NONE; + INSERT INTO latest_backfill $overwrite_latest_backfill RETURN NONE; + "#; + + let before_update = Instant::now(); + let update = tokio::task::block_in_place(|| { + db.query(query_string) + .bind(("dids", self.did)) + .bind(("follows", self.follows)) + .bind(("latest_backfills", self.latest_backfills)) + .bind(("likes", self.likes)) + .bind(("reposts", self.reposts)) + .bind(("blocks", self.blocks)) + .bind(("listblocks", self.listblocks)) + .bind(("listitems", self.listitems)) + .bind(("feeds", self.feeds)) + .bind(("lists", self.lists)) + .bind(("threadgates", self.threadgates)) + .bind(("starterpacks", self.starterpacks)) + .bind(("postgates", self.postgates)) + .bind(("actordeclarations", self.actordeclarations)) + .bind(("labelerservices", self.labelerservices)) + .bind(("quotes", self.quotes)) + .bind(("posts", self.posts)) + .bind(("replies_relations", self.replies_relations)) + .bind(("reply_to_relations", self.reply_to_relations)) + .bind(("posts_relations", self.posts_relations)) + .bind(("overwrite_latest_backfill", self.overwrite_latest_backfills)) + .into_future() + .instrument(span!(Level::INFO, "query")) + }); + let duration = before_update.elapsed(); + let after_update = Instant::now(); + update.await?; + let update_duration = after_update.elapsed(); + eprintln!( + "Update creation took {}ms, execution took {}ms; update: {}", + duration.as_millis(), + update_duration.as_millis(), + format_output + ); + + Ok(()) + } } +impl core::fmt::Debug for BigUpdate { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + let did_size = self + .did + .iter() + .map(|e| serde_ipld_dagcbor::to_vec(e).unwrap().len()) + .sum::(); + let follows_size = self + .follows + .iter() + .map(|e| serde_ipld_dagcbor::to_vec(e).unwrap().len()) + .sum::(); + let latest_backfills_size = self + .latest_backfills + .iter() + .map(|e| serde_ipld_dagcbor::to_vec(e).unwrap().len()) + .sum::(); + let likes_size = self + .likes + .iter() + .map(|e| serde_ipld_dagcbor::to_vec(e).unwrap().len()) + .sum::(); + let reposts_size = self + .reposts + .iter() + .map(|e| serde_ipld_dagcbor::to_vec(e).unwrap().len()) + .sum::(); + let blocks_size = self + .blocks + .iter() + .map(|e| serde_ipld_dagcbor::to_vec(e).unwrap().len()) + .sum::(); + let listblocks_size = self + .listblocks + .iter() + .map(|e| 
serde_ipld_dagcbor::to_vec(e).unwrap().len()) + .sum::(); + let listitems_size = self + .listitems + .iter() + .map(|e| serde_ipld_dagcbor::to_vec(e).unwrap().len()) + .sum::(); + let feeds_size = self + .feeds + .iter() + .map(|e| serde_ipld_dagcbor::to_vec(e).unwrap().len()) + .sum::(); + let lists_size = self + .lists + .iter() + .map(|e| serde_ipld_dagcbor::to_vec(e).unwrap().len()) + .sum::(); + let threadgates_size = self + .threadgates + .iter() + .map(|e| serde_ipld_dagcbor::to_vec(e).unwrap().len()) + .sum::(); + let starterpacks_size = self + .starterpacks + .iter() + .map(|e| serde_ipld_dagcbor::to_vec(e).unwrap().len()) + .sum::(); + let postgates_size = self + .postgates + .iter() + .map(|e| serde_ipld_dagcbor::to_vec(e).unwrap().len()) + .sum::(); + let actordeclarations_size = self + .actordeclarations + .iter() + .map(|e| serde_ipld_dagcbor::to_vec(e).unwrap().len()) + .sum::(); + let labelerservices_size = self + .labelerservices + .iter() + .map(|e| serde_ipld_dagcbor::to_vec(e).unwrap().len()) + .sum::(); + let quotes_size = self + .quotes + .iter() + .map(|e| serde_ipld_dagcbor::to_vec(e).unwrap().len()) + .sum::(); + let posts_size = self + .posts + .iter() + .map(|e| serde_ipld_dagcbor::to_vec(e).unwrap().len()) + .sum::(); + let replies_relations_size = self + .replies_relations + .iter() + .map(|e| serde_ipld_dagcbor::to_vec(e).unwrap().len()) + .sum::(); + let reply_to_relations_size = self + .reply_to_relations + .iter() + .map(|e| serde_ipld_dagcbor::to_vec(e).unwrap().len()) + .sum::(); + let posts_relations_size = self + .posts_relations + .iter() + .map(|e| serde_ipld_dagcbor::to_vec(e).unwrap().len()) + .sum::(); + let overwrite_latest_backfills_size = self + .overwrite_latest_backfills + .iter() + .map(|e| serde_ipld_dagcbor::to_vec(e).unwrap().len()) + .sum::(); + let number_relations = self.follows.len() + + self.likes.len() + + self.reposts.len() + + self.blocks.len() + + self.listblocks.len() + + self.listitems.len() + + self.replies_relations.len() + + self.reply_to_relations.len() + + self.posts_relations.len() + + self.quotes.len(); + let number_inserts = self.did.len() + + self.latest_backfills.len() + + self.feeds.len() + + self.lists.len() + + self.threadgates.len() + + self.starterpacks.len() + + self.postgates.len() + + self.actordeclarations.len() + + self.labelerservices.len() + + self.posts.len() + + self.overwrite_latest_backfills.len(); + let number_total = number_relations + number_inserts; + let size_relations = replies_relations_size + + reply_to_relations_size + + posts_relations_size + + quotes_size + + likes_size + + reposts_size + + blocks_size + + listblocks_size + + listitems_size; + let size_inserts = did_size + + latest_backfills_size + + feeds_size + + lists_size + + threadgates_size + + starterpacks_size + + postgates_size + + actordeclarations_size + + labelerservices_size + + posts_size + + overwrite_latest_backfills_size; + let size_total = size_relations + size_inserts; + f.debug_struct("BigUpdate") + .field("updates", &number_total) + .field("updates_size_mb", &(size_total as f64 / 1024.0 / 1024.0)) + .field("number_relations", &number_relations) + .field("number_inserts", &number_inserts) + .field( + "size_relations_mb", + &(size_relations as f64 / 1024.0 / 1024.0), + ) + .field("size_inserts_mb", &(size_inserts as f64 / 1024.0 / 1024.0)) + .field("did", &self.did.len()) + .field("did_size_mb", &(did_size as f64 / 1024.0 / 1024.0)) + .field("follows", &self.follows.len()) + .field("follows_size_mb", &(follows_size as f64 / 
1024.0 / 1024.0)) + .field("latest_backfills", &self.latest_backfills.len()) + .field( + "latest_backfills_size_mb", + &(latest_backfills_size as f64 / 1024.0 / 1024.0), + ) + .field("likes", &self.likes.len()) + .field("likes_size_mb", &(likes_size as f64 / 1024.0 / 1024.0)) + .field("reposts", &self.reposts.len()) + .field("reposts_size_mb", &(reposts_size as f64 / 1024.0 / 1024.0)) + .field("blocks", &self.blocks.len()) + .field("blocks_size_mb", &(blocks_size as f64 / 1024.0 / 1024.0)) + .field("listblocks", &self.listblocks.len()) + .field( + "listblocks_size_mb", + &(listblocks_size as f64 / 1024.0 / 1024.0), + ) + .field("listitems", &self.listitems.len()) + .field( + "listitems_size_mb", + &(listitems_size as f64 / 1024.0 / 1024.0), + ) + .field("feeds", &self.feeds.len()) + .field("feeds_size_mb", &(feeds_size as f64 / 1024.0 / 1024.0)) + .field("lists", &self.lists.len()) + .field("lists_size_mb", &(lists_size as f64 / 1024.0 / 1024.0)) + .field("threadgates", &self.threadgates.len()) + .field( + "threadgates_size_mb", + &(threadgates_size as f64 / 1024.0 / 1024.0), + ) + .field("starterpacks", &self.starterpacks.len()) + .field( + "starterpacks_size_mb", + &(starterpacks_size as f64 / 1024.0 / 1024.0), + ) + .field("postgates", &self.postgates.len()) + .field( + "postgates_size_mb", + &(postgates_size as f64 / 1024.0 / 1024.0), + ) + .field("actordeclarations", &self.actordeclarations.len()) + .field( + "actordeclarations_size_mb", + &(actordeclarations_size as f64 / 1024.0 / 1024.0), + ) + .field("labelerservices", &self.labelerservices.len()) + .field( + "labelerservices_size_mb", + &(labelerservices_size as f64 / 1024.0 / 1024.0), + ) + .field("quotes", &self.quotes.len()) + .field("quotes_size_mb", &(quotes_size as f64 / 1024.0 / 1024.0)) + .field("posts", &self.posts.len()) + .field("posts_size_mb", &(posts_size as f64 / 1024.0 / 1024.0)) + .field("replies_relations", &self.replies_relations.len()) + .field( + "replies_relations_size_mb", + &(replies_relations_size as f64 / 1024.0 / 1024.0), + ) + .field("reply_to_relations", &self.reply_to_relations.len()) + .field( + "reply_to_relations_size_mb", + &(reply_to_relations_size as f64 / 1024.0 / 1024.0), + ) + .field("posts_relations", &self.posts_relations.len()) + .field( + "posts_relations_size_mb", + &(posts_relations_size as f64 / 1024.0 / 1024.0), + ) + .field( + "overwrite_latest_backfills", + &self.overwrite_latest_backfills.len(), + ) + .field( + "overwrite_latest_backfills_size_mb", + &(overwrite_latest_backfills_size as f64 / 1024.0 / 1024.0), + ) + .finish() + } +} /// If the new commit is a create or update, handle it #[instrument(skip(record))] pub fn on_commit_event_createorupdate( @@ -448,6 +720,7 @@ pub fn on_commit_event_createorupdate( big_update.latest_backfills.push(UpdateLatestBackfill { of: RecordId::from(("did", to.clone())), id: to, + at: None, }); // let _ = db diff --git a/src/database/repo_indexer.rs b/src/database/repo_indexer.rs index bb694d2..253a0a3 100644 --- a/src/database/repo_indexer.rs +++ b/src/database/repo_indexer.rs @@ -1,6 +1,9 @@ +use std::{os::unix::process, sync::LazyLock}; + use futures::StreamExt; use index_repo::PipelineItem; use opentelemetry::{global, KeyValue}; +use pumps::Concurrency; use repo_stream::RepoStream; use reqwest::Client; use serde::{Deserialize, Serialize}; @@ -9,6 +12,7 @@ use tracing::{error, warn}; use crate::config::ARGS; +// mod buffered_items; mod index_repo; mod repo_stream; @@ -47,7 +51,7 @@ macro_rules! 
stage { let before = std::time::Instant::now(); eprintln!("finished {}", $stage); - let result = $content; + let result = tokio::task::spawn($content).await?; let duration = before.elapsed(); @@ -75,6 +79,101 @@ macro_rules! stage { }; } +macro_rules! done { + ("done", $a:expr, $b:expr) => { + $a + }; + ($idk:literal, $a:expr, $b:expr) => { + $b + }; +} + +// Make this less hacky +macro_rules! pump_stage { + ($metric:ident, $perfmetric:ident, $stage:literal, $next:literal, $function:ident) => { + |x| async { + eprintln!("starting {}", $stage); + + // TODO: Dont create new keyvalues every time + $metric.add( + -1, + &[ + KeyValue::new("stage", $stage), + KeyValue::new("state", "queued"), + ], + ); + $metric.add( + 1, + &[ + KeyValue::new("stage", $stage), + KeyValue::new("state", "active"), + ], + ); + tokio::time::sleep(::tokio::time::Duration::from_secs(1)).await; + let before = std::time::Instant::now(); + let result = tokio::task::spawn(tokio::time::timeout( + tokio::time::Duration::from_secs(ARGS.pipeline_stage_timeout), + x.$function(), + )) + .await; + let duration = before.elapsed(); + eprintln!( + "pre finished {} in {:02}", + $stage, + duration.as_millis() as f64 / 1000.0 + ); + let Ok(result) = result else { + panic!("Spawn error in {}", $stage); + }; + let Ok(result) = result else { + panic!("Timeout in {}", $stage); + }; + + // $perfmetric.record( + // duration.as_millis() as u64, + // &[KeyValue::new("stage", $stage)], + // ); + $metric.add( + -1, + &[ + KeyValue::new("stage", $stage), + KeyValue::new("state", "active"), + ], + ); + + let result = match result { + Err(error) => { + eprintln!( + "failed {} in {:02}", + $stage, + duration.as_millis() as f64 / 1000.0 + ); + + // error!(target: "indexer", "Failed to index repo: {}", error); + return None; + } + Ok(result) => result, + }; + + if $next != "done" { + $metric.add( + 1, + &[ + KeyValue::new("stage", $next), + KeyValue::new("state", "queued"), + ], + ); + } + eprintln!( + "finished {} in {:02}", + $stage, + duration.as_millis() as f64 / 1000.0 + ); + return Some(result); + } + }; +} + // Make this less hacky macro_rules! filter_result { ($metric:ident, $stage:literal) => {|result| async { @@ -100,6 +199,14 @@ macro_rules! 
filter_result { // result.ok() // } +const tracker: LazyLock> = LazyLock::new(|| { + global::meter("indexer") + .i64_up_down_counter("indexer.pipeline.location") + .with_description("Track the number of tasks in the pipeline") + .with_unit("repo") + .build() +}); + pub async fn start_full_repo_indexer(db: Surreal) -> anyhow::Result<()> { let http_client = Client::new(); @@ -109,11 +216,11 @@ pub async fn start_full_repo_indexer(db: Surreal) -> anyhow::Result<()> { .with_description("Total number of indexed repos") .with_unit("repo") .build(); - let tracker = meter - .i64_up_down_counter("indexer.pipeline.location") - .with_description("Track the number of tasks in the pipeline") - .with_unit("repo") - .build(); + // let tracker: opentelemetry::metrics::UpDownCounter = meter + // .i64_up_down_counter("indexer.pipeline.location") + // .with_description("Track the number of tasks in the pipeline") + // .with_unit("repo") + // .build(); let job_duration = meter .u64_histogram("indexer.pipeline.duration") .with_unit("ms") @@ -137,94 +244,339 @@ pub async fn start_full_repo_indexer(db: Surreal) -> anyhow::Result<()> { repos_indexed.add(count as u64, &[]); let buffer_size = ARGS.pipeline_buffer_size; - let download_buffer_multiplier = ARGS.download_buffer_multiplier; + let download_concurrency_multiplier = ARGS.pipeline_download_concurrency_multiplier; + let concurrent_elements = ARGS.pipeline_concurrent_elements; - RepoStream::new(OLDEST_USEFUL_ANCHOR.to_string(), db.clone()) - .map(|did| async { - let db = db.clone(); - let http_client = http_client.clone(); - let item = PipelineItem::new(db, http_client, did); + let repo_stream = RepoStream::new(OLDEST_USEFUL_ANCHOR.to_string(), db.clone()); + let dids = repo_stream.map(move |x| (x.to_string(), db.clone(), http_client.clone())); - tracker.add( - 1, - &[ - KeyValue::new("stage", "check_indexed"), - KeyValue::new("state", "queued"), - ], - ); - item - }) - .buffer_unordered(buffer_size) + // let urls_list = vec!["1"; 1000]; + // let urls = urls_list + // .into_iter() + // .map(move |x| (x.to_string(), db.clone(), http_client.clone())); + + let (mut output_receiver, _join_handle) = pumps::Pipeline::from_stream(dids) .map( - stage!(tracker, job_duration, "check_indexed", "get_service", item -> - item.check_indexed().await - ), + |(did, db, http_client)| async { + let item = PipelineItem::new(db, http_client, did); + + tracker.add( + 1, + &[ + KeyValue::new("stage", "get_service"), + KeyValue::new("state", "queued"), + ], + ); + item + }, + Concurrency::concurrent_unordered(concurrent_elements), ) - .buffer_unordered(buffer_size) - .filter_map(filter_result!(tracker, "get_service")) - .map( - stage!(tracker, job_duration, "get_service", "download_repo", item -> - item.get_service().await + .backpressure(buffer_size) + .filter_map( + pump_stage!( + tracker, + job_duration, + "get_service", + "download_repo", + get_service ), + Concurrency::concurrent_unordered(concurrent_elements), ) - .buffer_unordered(buffer_size) - .filter_map(filter_result!(tracker, "download_repo")) - .map( - stage!(tracker, job_duration, "download_repo", "deserialize_repo", item -> - item.download_repo().await + .backpressure(buffer_size) + .filter_map( + pump_stage!( + tracker, + job_duration, + "download_repo", + "process_repo", + download_repo ), - ) - .buffer_unordered(buffer_size * download_buffer_multiplier) - .filter_map(filter_result!(tracker, "deserialize_repo")) - .map( - stage!(tracker, job_duration, "deserialize_repo", "files_to_updates", item -> - 
item.deserialize_repo().await + Concurrency::concurrent_unordered( + concurrent_elements * download_concurrency_multiplier, ), ) - .buffer_unordered(buffer_size) - .filter_map(filter_result!(tracker, "files_to_updates")) - .map( - stage!(tracker, job_duration, "files_to_updates", "apply_updates", item -> - item.files_to_updates().await + .backpressure(buffer_size) + .filter_map( + pump_stage!( + tracker, + job_duration, + "process_repo", + "apply_updates", + process_repo ), + Concurrency::concurrent_unordered(concurrent_elements), ) - .buffer_unordered(buffer_size) - .filter_map(filter_result!(tracker, "apply_updates")) - .map( - stage!(tracker, job_duration, "apply_updates", "print_report", item -> - // println!("Items: {:?}", item.state.updates.len()); - item.apply_updates().await + .backpressure(buffer_size) + .filter_map( + pump_stage!( + tracker, + job_duration, + "apply_updates", + "print_report", + apply_updates ), + Concurrency::concurrent_unordered(concurrent_elements), ) - .buffer_unordered(buffer_size) - .filter_map(filter_result!(tracker, "print_report")) - .for_each(|x| async { - tracker.add( - -1, - &[ - KeyValue::new("stage", "print_report"), - KeyValue::new("state", "queued"), - ], - ); - tracker.add( - 1, - &[ - KeyValue::new("stage", "print_report"), - KeyValue::new("state", "active"), - ], - ); - x.print_report().await; - tracker.add( - -1, - &[ - KeyValue::new("stage", "print_report"), - KeyValue::new("state", "active"), - ], - ); - repos_indexed.add(1, &[]); - }) - .await; + .backpressure(buffer_size) + .filter_map( + pump_stage!(tracker, job_duration, "print_report", "done", print_report), + Concurrency::concurrent_unordered(concurrent_elements), + ) + .backpressure(buffer_size) + // .map(download_heavy_resource, Concurrency::serial()) + // .filter_map(run_algorithm, Concurrency::concurrent_unordered(concurrent_elements)) + // .map(save_to_db, Concurrency::concurrent_unordered(100)) + .build(); + // join_handle.await; + let mut elements = 0; + loop { + let Some(_result) = output_receiver.recv().await else { + panic!("Done, this should not happen"); + }; + elements += 1; + repos_indexed.add(1, &[]); + eprintln!("Finished: {}", elements); + } + + // RepoStream::new(OLDEST_USEFUL_ANCHOR.to_string(), db.clone()) + // .map(|did| async { + // let db = db.clone(); + // let http_client = http_client.clone(); + // let item = PipelineItem::new(db, http_client, did); + + // tracker.add( + // 1, + // &[ + // KeyValue::new("stage", "get_service"), + // KeyValue::new("state", "queued"), + // ], + // ); + // item + // }) + // .buffer_unordered(buffer_size) + // .map( + // stage!(tracker, job_duration, "get_service", "download_repo", item -> + // item.get_service() + // ), + // ) + // .buffer_unordered(buffer_size) + // .filter_map(filter_result!(tracker, "download_repo")) + // .map( + // stage!(tracker, job_duration, "download_repo", "process_repo", item -> + // item.download_repo() + // ), + // ) + // .buffer_unordered(buffer_size * download_buffer_multiplier) + // .filter_map(filter_result!(tracker, "process_repo")) + // .map( + // stage!(tracker, job_duration, "process_repo", "apply_updates", item -> + // item.process_repo() + // ), + // ) + // .buffer_unordered(buffer_size) + // .filter_map(filter_result!(tracker, "apply_updates")) + // .map( + // stage!(tracker, job_duration, "apply_updates", "print_report", item -> + // // println!("Items: {:?}", item.state.updates.len()); + // item.apply_updates() + // ), + // ) + // .buffer_unordered(buffer_size) + // 
.filter_map(filter_result!(tracker, "print_report")) + // .for_each(|x| async { + // tracker.add( + // -1, + // &[ + // KeyValue::new("stage", "print_report"), + // KeyValue::new("state", "queued"), + // ], + // ); + // tracker.add( + // 1, + // &[ + // KeyValue::new("stage", "print_report"), + // KeyValue::new("state", "active"), + // ], + // ); + // x.print_report().await; + // tracker.add( + // -1, + // &[ + // KeyValue::new("stage", "print_report"), + // KeyValue::new("state", "active"), + // ], + // ); + // repos_indexed.add(1, &[]); + // }) + // .await; + + // RepoStream::new(OLDEST_USEFUL_ANCHOR.to_string(), db.clone()) + // .map(async |did| { + // let db = db.clone(); + // let http_client = http_client.clone(); + + // tokio::task::spawn(process_did(did, db, http_client)).await? + // }) + // .buffered(buffer_size) + // .filter_map(|result| async { + // if let Err(error) = &result { + // error!(target: "indexer", "Failed to index repo: {}", error); + + // return None; + // } + // result.ok() + // }) + // .for_each(|x| async { + // repos_indexed.add(1, &[]); + // }) + // .await; // panic!("Done, this should not happen"); Ok(()) } + +// pub async fn start_full_repo_indexer(db: Surreal) -> anyhow::Result<()> { +// let http_client = Client::new(); + +// let meter = global::meter("indexer"); +// let repos_indexed = meter +// .u64_counter("indexer.repos.indexed") +// .with_description("Total number of indexed repos") +// .with_unit("repo") +// .build(); +// let tracker = meter +// .i64_up_down_counter("indexer.pipeline.location") +// .with_description("Track the number of tasks in the pipeline") +// .with_unit("repo") +// .build(); +// let job_duration = meter +// .u64_histogram("indexer.pipeline.duration") +// .with_unit("ms") +// .with_description("Pipeline job duration") +// .with_boundaries( +// vec![1, 3, 10, 31, 100, 316, 1000, 3160, 10000] +// .iter() +// .map(|x| *x as f64 + 1000.0) +// .collect::>(), +// ) +// .build(); + +// let mut res = db +// .query("SELECT count() as c FROM latest_backfill WHERE at != NONE GROUP ALL;") +// .await +// .unwrap(); +// let count = res.take::>((0, "c")).unwrap().unwrap_or(0); +// if count == 0 { +// warn!("Started with 0 repos, this might be a bug"); +// } +// repos_indexed.add(count as u64, &[]); + +// let buffer_size = ARGS.pipeline_buffer_size; +// let download_buffer_multiplier = ARGS.download_buffer_multiplier; + +// RepoStream::new(OLDEST_USEFUL_ANCHOR.to_string(), db.clone()) +// .map(|did| async { +// let db = db.clone(); +// let http_client = http_client.clone(); +// let item = PipelineItem::new(db, http_client, did); + +// tracker.add( +// 1, +// &[ +// KeyValue::new("stage", "get_service"), +// KeyValue::new("state", "queued"), +// ], +// ); +// item +// }) +// .buffer_unordered(buffer_size) +// .map( +// stage!(tracker, job_duration, "get_service", "download_repo", item -> +// item.get_service() +// ), +// ) +// .buffer_unordered(buffer_size) +// .filter_map(filter_result!(tracker, "download_repo")) +// .map( +// stage!(tracker, job_duration, "download_repo", "process_repo", item -> +// item.download_repo() +// ), +// ) +// .buffer_unordered(buffer_size * download_buffer_multiplier) +// .filter_map(filter_result!(tracker, "process_repo")) +// .map( +// stage!(tracker, job_duration, "process_repo", "apply_updates", item -> +// item.process_repo() +// ), +// ) +// .buffer_unordered(buffer_size) +// .filter_map(filter_result!(tracker, "apply_updates")) +// .map( +// stage!(tracker, job_duration, "apply_updates", "print_report", item -> 
+// // println!("Items: {:?}", item.state.updates.len()); +// item.apply_updates() +// ), +// ) +// .buffer_unordered(buffer_size) +// .filter_map(filter_result!(tracker, "print_report")) +// .for_each(|x| async { +// tracker.add( +// -1, +// &[ +// KeyValue::new("stage", "print_report"), +// KeyValue::new("state", "queued"), +// ], +// ); +// tracker.add( +// 1, +// &[ +// KeyValue::new("stage", "print_report"), +// KeyValue::new("state", "active"), +// ], +// ); +// x.print_report().await; +// tracker.add( +// -1, +// &[ +// KeyValue::new("stage", "print_report"), +// KeyValue::new("state", "active"), +// ], +// ); +// repos_indexed.add(1, &[]); +// }) +// .await; + +// // RepoStream::new(OLDEST_USEFUL_ANCHOR.to_string(), db.clone()) +// // .map(async |did| { +// // let db = db.clone(); +// // let http_client = http_client.clone(); + +// // tokio::task::spawn(process_did(did, db, http_client)).await? +// // }) +// // .buffered(buffer_size) +// // .filter_map(|result| async { +// // if let Err(error) = &result { +// // error!(target: "indexer", "Failed to index repo: {}", error); + +// // return None; +// // } +// // result.ok() +// // }) +// // .for_each(|x| async { +// // repos_indexed.add(1, &[]); +// // }) +// // .await; + +// // panic!("Done, this should not happen"); +// Ok(()) +// } + +async fn process_did(did: String, db: Surreal, client: Client) -> anyhow::Result<()> { + let item = PipelineItem::new(db, client, did); + let item = item.get_service().await?; + let item = item.download_repo().await?; + let item = item.process_repo().await?; + let item = item.apply_updates().await?; + item.print_report().await; + Ok(()) +} diff --git a/src/database/repo_indexer/buffered_items.rs b/src/database/repo_indexer/buffered_items.rs new file mode 100644 index 0000000..b9014eb --- /dev/null +++ b/src/database/repo_indexer/buffered_items.rs @@ -0,0 +1,190 @@ +// use crate::stream::{Fuse, FuturesUnordered, StreamExt}; +use core::fmt; +use futures::stream::FuturesUnordered; +use futures::{Sink, Stream, StreamExt}; +use std::pin::Pin; +use std::task::Poll; +// use futures_core::task::{Context, Poll}; +use pin_project_lite::pin_project; +use std::future::Future; +// use std::stream::{FusedStream, Stream}; + +// pin_project! { +/// Stream for the [`buffer_unordered`](super::StreamExt::buffer_unordered) +/// method. +#[must_use = "streams do nothing unless polled"] +pub struct BufferItems +where + St: Stream, +{ + stream: St, + in_progress_queue: FuturesUnordered, + max: usize, +} +// } + +impl fmt::Debug for BufferItems +where + St: Stream + fmt::Debug, +{ + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("BufferItems") + .field("stream", &self.stream) + .field("in_progress_queue", &self.in_progress_queue) + .field("max", &self.max) + .finish() + } +} + +impl BufferItems +where + St: Stream, + St::Item: Future, +{ + pub fn new(stream: St, n: usize) -> Self { + Self { + stream: stream, + in_progress_queue: FuturesUnordered::new(), + max: n, + } + } + + // delegate_access_inner!(stream, St, (.)); +} + +impl Stream for BufferItems +where + St: Stream, + St::Item: Future, +{ + type Item = ::Output; + + fn poll_next( + mut self: std::pin::Pin<&mut Self>, + cx: &mut std::task::Context<'_>, + ) -> Poll> { + let mut this = self; + + // First up, try to spawn off as many futures as possible by filling up + // our queue of futures. 
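        // This mirrors the `BufferUnordered` adapter from the futures crate:
        // top up `in_progress_queue` until `max` futures are pending, then
        // poll the queue so items complete in whatever order they finish,
        // not input order. Note the upstream original wraps the inner stream
        // in `Fuse`; the `is_done()` call further down comes from there, so
        // this version assumes a fused (or at least fuse-like) inner stream.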
+ while this.in_progress_queue.len() < *this.max { + match this.stream.poll_next_unpin(cx) { + Poll::Ready(Some(fut)) => this.in_progress_queue.push(fut), + Poll::Ready(None) | Poll::Pending => break, + } + } + + // Attempt to pull the next value from the in_progress_queue + match this.in_progress_queue.poll_next_unpin(cx) { + x @ Poll::Pending | x @ Poll::Ready(Some(_)) => return x, + Poll::Ready(None) => {} + } + + // If more values are still coming from the stream, we're not done yet + if this.stream.is_done() { + Poll::Ready(None) + } else { + Poll::Pending + } + } + + fn size_hint(&self) -> (usize, Option) { + let queue_len = self.in_progress_queue.len(); + let (lower, upper) = self.stream.size_hint(); + let lower = lower.saturating_add(queue_len); + let upper = match upper { + Some(x) => x.checked_add(queue_len), + None => None, + }; + (lower, upper) + } +} + +// impl FusedStream for BufferItems +// where +// St: Stream, +// St::Item: Future, +// { +// fn is_terminated(&self) -> bool { +// self.in_progress_queue.is_terminated() && self.stream.is_terminated() +// } +// } + +// impl Stream for BufferItems +// where +// St: Stream, +// St::Item: Future, +// { +// type Item = ::Output; + +// fn poll_next( +// mut self: std::pin::Pin<&mut Self>, +// cx: &mut std::task::Context<'_>, +// ) -> Poll> { +// let mut this = self.project(); + +// // First up, try to spawn off as many futures as possible by filling up +// // our queue of futures. +// while this.in_progress_queue.len() < *this.max { +// match this.stream.as_mut().poll_next(cx) { +// Poll::Ready(Some(fut)) => this.in_progress_queue.push(fut), +// Poll::Ready(None) | Poll::Pending => break, +// } +// } + +// // Attempt to pull the next value from the in_progress_queue +// match this.in_progress_queue.poll_next_unpin(cx) { +// x @ Poll::Pending | x @ Poll::Ready(Some(_)) => return x, +// Poll::Ready(None) => {} +// } + +// // If more values are still coming from the stream, we're not done yet +// if this.stream.is_done() { +// Poll::Ready(None) +// } else { +// Poll::Pending +// } +// } + +// fn size_hint(&self) -> (usize, Option) { +// let queue_len = self.in_progress_queue.len(); +// let (lower, upper) = self.stream.size_hint(); +// let lower = lower.saturating_add(queue_len); +// let upper = match upper { +// Some(x) => x.checked_add(queue_len), +// None => None, +// }; +// (lower, upper) +// } +// } + +// Forwarding impl of Sink from the underlying stream +// impl Sink for BufferItems +// where +// S: Stream + Sink, +// S::Item: Future, +// { +// type Error = S::Error; + +// fn poll_ready( +// self: Pin<&mut Self>, +// cx: &mut core::task::Context<'_>, +// ) -> core::task::Poll> { +// self.project().stream.poll_ready(cx) +// } +// fn start_send(self: Pin<&mut Self>, item: Item) -> Result<(), Self::Error> { +// self.project().stream.start_send(item) +// } +// fn poll_flush( +// self: Pin<&mut Self>, +// cx: &mut core::task::Context<'_>, +// ) -> core::task::Poll> { +// self.project().stream.poll_flush(cx) +// } +// fn poll_close( +// self: Pin<&mut Self>, +// cx: &mut core::task::Context<'_>, +// ) -> core::task::Poll> { +// self.project().stream.poll_close(cx) +// } +// } diff --git a/src/database/repo_indexer/index_repo.rs b/src/database/repo_indexer/index_repo.rs index 876e884..0677091 100644 --- a/src/database/repo_indexer/index_repo.rs +++ b/src/database/repo_indexer/index_repo.rs @@ -2,7 +2,7 @@ use crate::{ config::ARGS, database::{ definitions::Record, - handlers::{apply_big_update, on_commit_event_createorupdate, 
BigUpdate}, + handlers::{on_commit_event_createorupdate, BigUpdate}, }, }; use atrium_api::{ @@ -13,8 +13,14 @@ use atrium_api::{ use reqwest::Client; use serde::Deserialize; use serde_ipld_dagcbor::from_reader; -use std::{collections::HashMap, io::Read, string::FromUtf8Error, sync::LazyLock, time::Duration}; -use surrealdb::{engine::any::Any, opt::PatchOp, Surreal}; +use std::{ + collections::HashMap, + io::Read, + string::FromUtf8Error, + sync::LazyLock, + time::{Duration, Instant}, +}; +use surrealdb::{engine::any::Any, opt::PatchOp, RecordId, Surreal}; use tokio::task::spawn_blocking; use tracing::{info, instrument, span, trace, warn, Level, Span}; @@ -133,6 +139,7 @@ async fn get_plc_service( ) -> anyhow::Result> { let resp = http_client .get(format!("https://plc.directory/{}", did)) + .timeout(Duration::from_secs(ARGS.directory_download_timeout)) .send() .await? .json::() @@ -152,6 +159,7 @@ async fn download_repo( "{}/xrpc/com.atproto.sync.getRepo?did={}", service.service_endpoint, did, )) + .timeout(tokio::time::Duration::from_secs(ARGS.repo_download_timeout)) .send() .await?; let bytes = get_repo_response.bytes().await?.to_vec(); @@ -180,78 +188,51 @@ fn deserialize_repo(mut bytes: Vec) -> anyhow::Result>> files } -/// Convert downloaded files into database updates -#[instrument(skip_all)] -async fn files_to_updates(files: HashMap>) -> anyhow::Result> { - // TODO: Look into using block_in_place instead of spawn_blocking - let result = spawn_blocking(|| files_to_updates_blocking(files)).await??; - Ok(result) -} - /// Apply updates to the database #[instrument(skip_all)] -async fn apply_updates( - db: &Surreal, - did: &str, - updates: Vec, - update_timestamp: &Duration, -) -> anyhow::Result<()> { +fn create_big_update(did: &str, updates: Vec) -> anyhow::Result { let did_key = crate::database::utils::did_to_key(did)?; - if !ARGS.dont_write_when_backfilling.unwrap_or(false) { - let did = did.to_owned(); - let did_key = did_key.to_owned(); - let big_update = tokio::task::spawn_blocking(move || { - let mut futures = updates.into_iter().map(|update| { - let did_key = did_key.clone(); - let did = did.to_string(); - - let res = on_commit_event_createorupdate( - Did::new(did.clone().into()).unwrap(), - did_key, - update.collection, - update.rkey, - update.record, - ); - - match res { - Ok(big_update) => { - return Ok(big_update); - } - Err(e) => { - warn!("on_commit_event_createorupdate {} {}", e, did); - return Err(e); - } - } - }); - let mut really_big_update = BigUpdate::default(); - loop { - let Some(result) = futures.next() else { - break; - }; - match result { - Ok(big_update) => { - really_big_update.merge(big_update); - } - Err(e) => { - warn!("Failed to apply update: {}", e); - return Err(e); - } - } + let did = did.to_owned(); + let did_key = did_key.to_owned(); + + let mut db_updates = updates.into_iter().map(|update| { + let did_key = did_key.clone(); + let did = did.to_string(); + + let res = on_commit_event_createorupdate( + Did::new(did.clone().into()).unwrap(), + did_key, + update.collection, + update.rkey, + update.record, + ); + + match res { + Ok(big_update) => { + return Ok(big_update); } - Ok(really_big_update) - }) - .await??; - apply_big_update(db, big_update).await?; + Err(e) => { + warn!("on_commit_event_createorupdate {} {}", e, did); + return Err(e); + } + } + }); + let mut really_big_update = BigUpdate::default(); + loop { + let Some(result) = db_updates.next() else { + break; + }; + match result { + Ok(big_update) => { + really_big_update.merge(big_update); + } 
+ Err(e) => { + warn!("Failed to apply update: {}", e); + return Err(e); + } + } } - let _: Option = db - .update(("latest_backfill", did_key.clone())) - .patch(PatchOp::replace( - "at", - surrealdb::sql::Datetime::from(chrono::Utc::now()), - )) - .await?; - - Ok(()) + Ok(really_big_update) } // /// Indexes the repo with the given DID (Decentralized Identifier) @@ -284,8 +265,6 @@ async fn apply_updates( /// No processing has been done on this item pub struct New {} -/// It was verified that the item is not indexed yet -pub struct NotIndexed {} /// Has a service pub struct WithService { service: PlcDirectoryDidResponseService, @@ -298,14 +277,10 @@ pub struct WithRepo { repo: Vec, } -pub struct WithFiles { - now: std::time::Duration, - files: HashMap>, -} /// Has converted the files to update pub struct WithUpdates { now: std::time::Duration, - pub updates: Vec, + pub update: BigUpdate, } /// Updates have been applied pub struct Done {} @@ -336,21 +311,6 @@ impl PipelineItem { } impl PipelineItem { - #[instrument(skip(self), parent = &self.span)] - pub async fn check_indexed(self) -> anyhow::Result> { - // TODO: Obsolete, remove this - - Ok(PipelineItem:: { - state: NotIndexed {}, - db: self.db, - http_client: self.http_client, - did: self.did, - span: self.span, - }) - } -} - -impl PipelineItem { #[instrument(skip(self), parent = &self.span)] pub async fn get_service(self) -> anyhow::Result> { let service = get_plc_service(&self.http_client, &self.did).await?; @@ -392,30 +352,24 @@ impl PipelineItem { impl PipelineItem { #[instrument(skip(self), parent = &self.span)] - pub async fn deserialize_repo(self) -> anyhow::Result> { + pub async fn process_repo(self) -> anyhow::Result> { // info!("Deserializing repo {}", self.did); - let files = spawn_blocking(|| deserialize_repo(self.state.repo)).await??; - Ok(PipelineItem:: { - state: WithFiles { - now: self.state.now, - files, - }, - db: self.db, - http_client: self.http_client, - did: self.did, - span: self.span, + let did = self.did.clone(); + let big_update = spawn_blocking(move || { + let files: HashMap, Vec> = + deserialize_repo(self.state.repo)?; + let updates = files_to_updates_blocking(files)?; + let mut big_update = create_big_update(&did, updates)?; + + big_update.add_timestamp(&did, surrealdb::sql::Datetime::from(chrono::Utc::now())); + Result::::Ok(big_update) }) - } -} + .await??; -impl PipelineItem { - #[instrument(skip(self), parent = &self.span)] - pub async fn files_to_updates(self) -> anyhow::Result> { - let updates = files_to_updates(self.state.files).await?; Ok(PipelineItem:: { state: WithUpdates { now: self.state.now, - updates, + update: big_update, }, db: self.db, http_client: self.http_client, @@ -428,7 +382,25 @@ impl PipelineItem { impl PipelineItem { #[instrument(skip(self), parent = &self.span)] pub async fn apply_updates(self) -> anyhow::Result> { - apply_updates(&self.db, &self.did, self.state.updates, &self.state.now).await?; + let start = Instant::now(); + + if !ARGS.dont_write_when_backfilling.unwrap_or(false) { + self.state.update.apply(&self.db).await?; + } else { + eprintln!("Skipping writing to the database and sleeping instead"); + std::thread::sleep(Duration::from_secs(1)); + } + let duration = start.elapsed(); + eprintln!("Big update took {:?}", duration); + warn!("Big update took {:?}", duration); + // let _: Option = &self + // .db + // .update(("latest_backfill", did_key.clone())) + // .patch(PatchOp::replace( + // "at", + // surrealdb::sql::Datetime::from(chrono::Utc::now()), + // )) + // .await?; 
Ok(PipelineItem:: { state: Done {}, db: self.db, @@ -441,8 +413,9 @@ impl PipelineItem { impl PipelineItem { #[instrument(skip(self), parent = &self.span)] - pub async fn print_report(self) -> () { + pub async fn print_report(self) -> anyhow::Result<()> { // TODO: This is only for printing debug stuff trace!("Indexed {}", self.did); + Ok(()) } } diff --git a/src/main.rs b/src/main.rs index 6ee2856..7daaec2 100644 --- a/src/main.rs +++ b/src/main.rs @@ -38,7 +38,8 @@ fn main() { .enable_time() .enable_io() .max_io_events_per_tick(1024 * 512) - .global_queue_interval(20) + .global_queue_interval(40) + .event_interval(20) .thread_name_fn(|| { static ATOMIC: AtomicUsize = AtomicUsize::new(0); let id = ATOMIC.fetch_add(1, Ordering::Relaxed); diff --git a/src/observability.rs b/src/observability.rs index 0296194..23e8c51 100644 --- a/src/observability.rs +++ b/src/observability.rs @@ -89,22 +89,22 @@ pub async fn init_observability() -> Arc { let meter_provider = init_meter(); let logger_provider = init_logger(); - // Exports tokio stats for tokio-console - let tokio_console_enabled = ARGS.console.unwrap_or(false); - let tokio_console_filter = FilterFn::new(move |_| tokio_console_enabled); - let tokio_console_layer = console_subscriber::spawn().with_filter(tokio_console_filter); + // // Exports tokio stats for tokio-console + // let tokio_console_enabled = ARGS.console.unwrap_or(false); + // let tokio_console_filter = FilterFn::new(move |_| tokio_console_enabled); + // let tokio_console_layer = console_subscriber::spawn().with_filter(tokio_console_filter); - // Prints logs to stdout - let stdout_filter = EnvFilter::new("info").add_directive("opentelemetry=info".parse().unwrap()); - let stdout_layer = tracing_subscriber::fmt::layer() - .with_thread_names(true) - .with_filter(stdout_filter); + // // Prints logs to stdout + // let stdout_filter = EnvFilter::new("info").add_directive("opentelemetry=info".parse().unwrap()); + // let stdout_layer = tracing_subscriber::fmt::layer() + // .with_thread_names(true) + // .with_filter(stdout_filter); // Add all layers - let registry = tracing_subscriber::registry() - .with(stdout_layer) - .with(tokio_console_layer); - if ARGS.otel.unwrap_or(true) { + let registry = tracing_subscriber::registry(); + // .with(stdout_layer) + // .with(tokio_console_layer); + if ARGS.otel_logs.unwrap_or(true) { // Exports logs to otel let otel_log_filter = EnvFilter::new("info") .add_directive("hyper=off".parse().unwrap()) @@ -121,6 +121,7 @@ pub async fn init_observability() -> Arc { .with(tracing_opentelemetry::MetricsLayer::new( meter_provider.clone(), )); + if ARGS.otel_tracing.unwrap_or(true) { // Exports tracing traces to opentelemetry let tracing_filter = EnvFilter::new("info") @@ -137,7 +138,21 @@ pub async fn init_observability() -> Arc { registry_with_otel.init(); }; } else { - registry.init(); + if ARGS.otel_tracing.unwrap_or(true) { + // Exports tracing traces to opentelemetry + let tracing_filter = EnvFilter::new("info") + .add_directive("hyper=off".parse().unwrap()) + .add_directive("h2=off".parse().unwrap()) + .add_directive("opentelemetry=off".parse().unwrap()) + .add_directive("tonic=off".parse().unwrap()) + .add_directive("reqwest=off".parse().unwrap()); + let tracer = tracer_provider.tracer("tracing-otel-subscriber"); + let tracing_layer = + tracing_opentelemetry::OpenTelemetryLayer::new(tracer).with_filter(tracing_filter); + registry.with(tracing_layer).init(); + } else { + registry.init(); + }; }; // TODO: Replace this hacky mess with something less broken 
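Aside on the branching above: every optional exporter currently doubles the number of registry combinations, which is what the later cleanup in PATCH 48 flattens. tracing-subscriber implements `Layer` for `Option<L>`, so a disabled layer can simply be passed as `None` and the composition stays linear. A minimal sketch of that pattern; the `init_tracing` function and the `stdout_enabled` flag are illustrative names, not part of this patch:

    use tracing_subscriber::{layer::SubscriberExt, util::SubscriberInitExt, EnvFilter, Layer};

    fn init_tracing(stdout_enabled: bool) {
        // `Layer` is implemented for `Option<L>`, so `None` means "layer disabled"
        // and no if/else tree over all enabled/disabled combinations is needed.
        let stdout_layer = stdout_enabled.then(|| {
            tracing_subscriber::fmt::layer()
                .with_thread_names(true)
                .with_filter(EnvFilter::new("info"))
        });

        tracing_subscriber::registry().with(stdout_layer).init();
    }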
@@ -147,64 +162,68 @@ pub async fn init_observability() -> Arc { logger_provider, }); let handler_otel_guard = guard.clone(); - tokio::spawn(async move { - ctrl_c().await.unwrap(); - eprintln!("Preparing for unclean exit"); - - handler_otel_guard.logger_provider.shutdown().unwrap(); - handler_otel_guard.meter_provider.shutdown().unwrap(); - handler_otel_guard.tracer_provider.shutdown().unwrap(); - tokio::time::sleep(std::time::Duration::from_secs(1)).await; - - eprintln!("Exiting"); - exit(1); - }); + tokio::task::Builder::new() + .name("Observability shutdown hook") + .spawn(async move { + ctrl_c().await.unwrap(); + eprintln!("Preparing for unclean exit"); + + handler_otel_guard.logger_provider.shutdown().unwrap(); + handler_otel_guard.meter_provider.shutdown().unwrap(); + handler_otel_guard.tracer_provider.shutdown().unwrap(); + tokio::time::sleep(std::time::Duration::from_secs(1)).await; + + eprintln!("Exiting"); + exit(1); + }) + .unwrap(); guard } fn init_logger() -> SdkLoggerProvider { - let otlp_log_exporter = LogExporter::builder().with_tonic().build().unwrap(); - let logger_provider = SdkLoggerProvider::builder() - .with_resource(RESOURCE.clone()) - .with_batch_exporter(otlp_log_exporter) - .build(); - logger_provider + let mut logger_provider = SdkLoggerProvider::builder().with_resource(RESOURCE.clone()); + if ARGS.otel_logs.unwrap_or(true) { + let otlp_log_exporter = LogExporter::builder().with_tonic().build().unwrap(); + logger_provider = logger_provider.with_batch_exporter(otlp_log_exporter); + }; + logger_provider.build() } fn init_meter() -> SdkMeterProvider { - let otlp_metric_exporter = MetricExporter::builder() - .with_tonic() - .with_temporality(opentelemetry_sdk::metrics::Temporality::Cumulative) - .build() - .unwrap(); - - let periodic_reader = PeriodicReader::builder(otlp_metric_exporter) - .with_interval(std::time::Duration::from_secs(5)) - .build(); - - let meter_provider = SdkMeterProvider::builder() - .with_resource(RESOURCE.clone()) - .with_reader(periodic_reader) - .build(); + let mut meter_provider_builder = SdkMeterProvider::builder().with_resource(RESOURCE.clone()); + if ARGS.otel_metrics.unwrap_or(true) { + let otlp_metric_exporter = MetricExporter::builder() + .with_tonic() + .with_temporality(opentelemetry_sdk::metrics::Temporality::Cumulative) + .build() + .unwrap(); + + let periodic_reader = PeriodicReader::builder(otlp_metric_exporter) + .with_interval(std::time::Duration::from_secs(5)) + .build(); + + meter_provider_builder = meter_provider_builder.with_reader(periodic_reader); + } + let meter_provider = meter_provider_builder.build(); global::set_meter_provider(meter_provider.clone()); - meter_provider } fn init_tracer() -> SdkTracerProvider { global::set_text_map_propagator(TraceContextPropagator::new()); + let mut tracer_provider_builder = SdkTracerProvider::builder().with_resource(RESOURCE.clone()); + if ARGS.otel_tracing.unwrap_or(true) { + let otlp_span_exporter = SpanExporter::builder().with_tonic().build().unwrap(); + + tracer_provider_builder = tracer_provider_builder + .with_sampler(Sampler::ParentBased(Box::new(Sampler::TraceIdRatioBased( + 1.0, + )))) + .with_id_generator(RandomIdGenerator::default()) + .with_batch_exporter(otlp_span_exporter); + } - let otlp_span_exporter = SpanExporter::builder().with_tonic().build().unwrap(); - - let tracer_provider = SdkTracerProvider::builder() - .with_resource(RESOURCE.clone()) - .with_sampler(Sampler::ParentBased(Box::new(Sampler::TraceIdRatioBased( - 1.0, - )))) - 
.with_id_generator(RandomIdGenerator::default()) - .with_batch_exporter(otlp_span_exporter) - // .with_simple_exporter(otlp_span_exporter) - .build(); + let tracer_provider = tracer_provider_builder.build(); global::set_tracer_provider(tracer_provider.clone()); tracer_provider From f385f7cbe618adb93503c252d54868011ba9bbcc Mon Sep 17 00:00:00 2001 From: Zebreus Date: Tue, 4 Mar 2025 20:47:11 +0100 Subject: [PATCH 44/75] Support multiple database connections --- src/config.rs | 4 ++-- src/database/repo_indexer.rs | 22 +++++++++++++++++++--- src/main.rs | 2 +- 3 files changed, 22 insertions(+), 6 deletions(-) diff --git a/src/config.rs b/src/config.rs index d9de1c0..21e70e3 100644 --- a/src/config.rs +++ b/src/config.rs @@ -15,8 +15,8 @@ pub struct Args { #[arg(long)] pub max_tasks: Option, /// Endpoint of the database server (including port and protocol) - #[arg(short = 'D', long, default_value = "rocksdb://path/to/surreal.db")] - pub db: String, + #[arg(short = 'D', long, num_args=1..=16)] + pub db: Vec, /// Username for the database server #[arg(short, long, default_value = "root")] pub username: String, diff --git a/src/database/repo_indexer.rs b/src/database/repo_indexer.rs index 253a0a3..fb5332e 100644 --- a/src/database/repo_indexer.rs +++ b/src/database/repo_indexer.rs @@ -1,6 +1,7 @@ -use std::{os::unix::process, sync::LazyLock}; +use std::{ops::Rem, os::unix::process, sync::LazyLock}; -use futures::StreamExt; +use atrium_api::com::atproto::repo; +use futures::{stream::FuturesUnordered, StreamExt}; use index_repo::PipelineItem; use opentelemetry::{global, KeyValue}; use pumps::Concurrency; @@ -12,6 +13,8 @@ use tracing::{error, warn}; use crate::config::ARGS; +use super::connect; + // mod buffered_items; mod index_repo; mod repo_stream; @@ -247,8 +250,21 @@ pub async fn start_full_repo_indexer(db: Surreal) -> anyhow::Result<()> { let download_concurrency_multiplier = ARGS.pipeline_download_concurrency_multiplier; let concurrent_elements = ARGS.pipeline_concurrent_elements; + let databases = ARGS + .db + .iter() + .map(|x| async { connect(x, &ARGS.username, &ARGS.password).await.unwrap() }) + .collect::>() + .collect::>() + .await; let repo_stream = RepoStream::new(OLDEST_USEFUL_ANCHOR.to_string(), db.clone()); - let dids = repo_stream.map(move |x| (x.to_string(), db.clone(), http_client.clone())); + let dids = repo_stream.enumerate().map(move |(id, x)| { + ( + x.to_string(), + databases.get(id.rem(databases.len())).unwrap().clone(), + http_client.clone(), + ) + }); // let urls_list = vec!["1"; 1000]; // let urls = urls_list diff --git a/src/main.rs b/src/main.rs index 7daaec2..af802a6 100644 --- a/src/main.rs +++ b/src/main.rs @@ -68,7 +68,7 @@ async fn application_main() -> anyhow::Result<()> { let _otel_guard = init_observability().await; // connect to the database - let db = database::connect(&ARGS.db, &ARGS.username, &ARGS.password) + let db = database::connect(&ARGS.db.first().unwrap(), &ARGS.username, &ARGS.password) .await .context("Failed to connect to the database")?; From f7cffa470768ba5256cde3a47b3f93ce39bdc72b Mon Sep 17 00:00:00 2001 From: Zebreus Date: Tue, 4 Mar 2025 21:41:50 +0100 Subject: [PATCH 45/75] Fix pipeline state metrics --- src/database/repo_indexer.rs | 145 ++++++++++++++++++----------------- 1 file changed, 73 insertions(+), 72 deletions(-) diff --git a/src/database/repo_indexer.rs b/src/database/repo_indexer.rs index fb5332e..e3d7c94 100644 --- a/src/database/repo_indexer.rs +++ b/src/database/repo_indexer.rs @@ -95,84 +95,85 @@ macro_rules! 
done { macro_rules! pump_stage { ($metric:ident, $perfmetric:ident, $stage:literal, $next:literal, $function:ident) => { |x| async { - eprintln!("starting {}", $stage); - - // TODO: Dont create new keyvalues every time - $metric.add( - -1, - &[ - KeyValue::new("stage", $stage), - KeyValue::new("state", "queued"), - ], - ); - $metric.add( - 1, - &[ - KeyValue::new("stage", $stage), - KeyValue::new("state", "active"), - ], - ); - tokio::time::sleep(::tokio::time::Duration::from_secs(1)).await; - let before = std::time::Instant::now(); - let result = tokio::task::spawn(tokio::time::timeout( - tokio::time::Duration::from_secs(ARGS.pipeline_stage_timeout), - x.$function(), - )) - .await; - let duration = before.elapsed(); - eprintln!( - "pre finished {} in {:02}", - $stage, - duration.as_millis() as f64 / 1000.0 - ); - let Ok(result) = result else { - panic!("Spawn error in {}", $stage); - }; - let Ok(result) = result else { - panic!("Timeout in {}", $stage); - }; - - // $perfmetric.record( - // duration.as_millis() as u64, - // &[KeyValue::new("stage", $stage)], - // ); - $metric.add( - -1, - &[ - KeyValue::new("stage", $stage), - KeyValue::new("state", "active"), - ], - ); - - let result = match result { - Err(error) => { - eprintln!( - "failed {} in {:02}", - $stage, - duration.as_millis() as f64 / 1000.0 - ); + tokio::task::spawn(async move { + eprintln!("starting {}", $stage); - // error!(target: "indexer", "Failed to index repo: {}", error); - return None; - } - Ok(result) => result, - }; - - if $next != "done" { + // TODO: Dont create new keyvalues every time $metric.add( - 1, + -1, &[ - KeyValue::new("stage", $next), + KeyValue::new("stage", $stage), KeyValue::new("state", "queued"), ], ); - } - eprintln!( - "finished {} in {:02}", - $stage, - duration.as_millis() as f64 / 1000.0 - ); - return Some(result); + $metric.add( + 1, + &[ + KeyValue::new("stage", $stage), + KeyValue::new("state", "active"), + ], + ); + tokio::time::sleep(::tokio::time::Duration::from_secs(1)).await; + let before = std::time::Instant::now(); + let result = tokio::time::timeout( + tokio::time::Duration::from_secs(ARGS.pipeline_stage_timeout), + x.$function(), + ) + .await; + let duration = before.elapsed(); + eprintln!( + "pre finished {} in {:02}", + $stage, + duration.as_millis() as f64 / 1000.0 + ); + let Ok(result) = result else { + panic!("Timeout in {}", $stage); + }; + + // $perfmetric.record( + // duration.as_millis() as u64, + // &[KeyValue::new("stage", $stage)], + // ); + $metric.add( + -1, + &[ + KeyValue::new("stage", $stage), + KeyValue::new("state", "active"), + ], + ); + + let result = match result { + Err(error) => { + eprintln!( + "failed {} in {:02}", + $stage, + duration.as_millis() as f64 / 1000.0 + ); + + // error!(target: "indexer", "Failed to index repo: {}", error); + return None; + } + Ok(result) => result, + }; + + if $next != "done" { + $metric.add( + 1, + &[ + KeyValue::new("stage", $next), + KeyValue::new("state", "queued"), + ], + ); + } + eprintln!( + "finished {} in {:02}", + $stage, + duration.as_millis() as f64 / 1000.0 + ); + return Some(result); + }) + .await + .expect("Failed to spawn task in a pump stage") } }; } From f49960f28ccc325c0dc1128a8ca6e389c5af167b Mon Sep 17 00:00:00 2001 From: Zebreus Date: Tue, 4 Mar 2025 22:20:44 +0100 Subject: [PATCH 46/75] Bundle all updates into a single transaction --- src/database/handlers.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/database/handlers.rs b/src/database/handlers.rs index 0230c89..9bce49d 100644 --- 
a/src/database/handlers.rs +++ b/src/database/handlers.rs @@ -336,6 +336,7 @@ impl BigUpdate { let format_output = tokio::task::block_in_place(|| format!("{:?}", &self)); //TODO: Bundle this into a function let query_string = r#" + BEGIN; INSERT IGNORE INTO did $dids RETURN NONE; INSERT IGNORE INTO latest_backfill $latest_backfills RETURN NONE; INSERT IGNORE INTO feed $feeds RETURN NONE; @@ -357,6 +358,7 @@ impl BigUpdate { INSERT RELATION INTO replies $replies_relations RETURN NONE; INSERT RELATION INTO follow $follows RETURN NONE; INSERT INTO latest_backfill $overwrite_latest_backfill RETURN NONE; + COMMIT; "#; let before_update = Instant::now(); From 9df85f0a54f9a8bbcfc769cb9bbf549331d99e3b Mon Sep 17 00:00:00 2001 From: Zebreus Date: Wed, 5 Mar 2025 11:41:09 +0100 Subject: [PATCH 47/75] Delete unused files --- .../repo_indexer/repo_stream_nofuture.rs | 146 ------------------ src/log.rs | 35 ----- 2 files changed, 181 deletions(-) delete mode 100644 src/database/repo_indexer/repo_stream_nofuture.rs delete mode 100644 src/log.rs diff --git a/src/database/repo_indexer/repo_stream_nofuture.rs b/src/database/repo_indexer/repo_stream_nofuture.rs deleted file mode 100644 index 4792629..0000000 --- a/src/database/repo_indexer/repo_stream_nofuture.rs +++ /dev/null @@ -1,146 +0,0 @@ -use std::{ - collections::{HashSet, VecDeque}, - future::{Future, IntoFuture}, - task::Poll, -}; - -use futures::Stream; -use surrealdb::{engine::any::Any, Surreal}; -use tracing::info; - -use crate::database::{repo_indexer::BskyFollowRes, utils::unsafe_user_key_to_did}; - -pub struct RepoStream<'a> { - buffer: VecDeque, - processed_dids: HashSet, - anchor: String, - db: &'a Surreal, - db_future: Option< - std::pin::Pin< - Box< - dyn Future> - + Send - + Sync - + 'a, - >, - >, - >, -} - -impl<'a> RepoStream<'a> { - pub fn new(anchor: String, db: &'a Surreal) -> Self { - return Self { - buffer: VecDeque::new(), - processed_dids: HashSet::new(), - anchor, - db, - db_future: None, - }; - } -} - -const FETCH_AMOUNT: usize = 100; - -// async fn get_repos_from(db: &Surreal, anchor: &str) -> Vec { -// info!(target: "indexer", "Discovering follows starting from {}", anchor); -// let mut result = db -// // TODO: Fix the possible SQL injection -// .query(format!( -// "SELECT id,in,out FROM follow:{}.. LIMIT {};", -// anchor, FETCH_AMOUNT -// )); -// let follows: Vec = result.take(0)?; - -// let Some(anchor_key) = follows.last().map(|follow| follow.id.key()) else { -// sleep(DISCOVERY_CAUGHT_UP_BACKOFF).await; -// continue; -// }; -// } - -impl<'a> Stream for RepoStream<'a> { - type Item = String; - - fn poll_next( - mut self: std::pin::Pin<&mut Self>, - cx: &mut std::task::Context<'_>, - ) -> std::task::Poll> { - loop { - if let Some(next) = self.buffer.pop_front() { - return Poll::Ready(Some(next)); - } - - info!(target: "indexer", "Discovering follows starting from {}", self.anchor); - let db_future = if self.db_future.is_some() { - self.db_future.as_mut().unwrap() - } else { - let mut result = self - .db - // TODO: Fix the possible SQL injection - .query(format!( - "SELECT id,in,out FROM follow:{}.. 
LIMIT {};", - self.anchor, FETCH_AMOUNT - )); - // let mut future: std::pin::Pin< - // Box< - // dyn Future> - // + Send - // + Sync - // + 'a, - // >, - // > - let mut future = result.into_future(); - self.db_future = Some(future); - self.db_future.as_mut().unwrap() - }; - - let Poll::Ready(result) = Future::poll(db_future.as_mut(), cx) else { - return Poll::Pending; - }; - self.db_future = None; - - let mut result = result.unwrap(); - - // let mut result: surrealdb::method::Query<'_, Any> = self - // .db - // // TODO: Fix the possible SQL injection - // .query(format!( - // "SELECT id,in,out FROM follow:{}.. LIMIT {};", - // self.anchor, FETCH_AMOUNT - // )); - let follows: Vec = result.take(0).unwrap(); - - let Some(anchor_key) = follows.last().map(|follow| follow.id.key()) else { - // sleep(DISCOVERY_CAUGHT_UP_BACKOFF).await; - // continue; - // TODO: Sleep again - return Poll::Pending; - }; - self.anchor = format!("{}", anchor_key); - - for follow in &follows { - for record_id in [&follow.from, &follow.to] { - let did = unsafe_user_key_to_did(&format!("{}", record_id.key())); - if self.processed_dids.contains(&did) { - continue; - } - self.processed_dids.insert(did.clone()); - self.buffer.push_back(did); - // tx.send(did) - // .await - // .context("Failed to send message to handler thread")?; - } - } - - if let Some(next) = self.buffer.pop_front() { - return Poll::Ready(Some(next)); - } - return Poll::Pending; - - // Warn if it looks like the queue size or the backoff were choosen incorrectly - // let new_follows = self.processed_dids.len() - processed_dids_before; - // if new_follows != 0 && follows.len() == fetch_amount && tx.len() < warning_threshold { - // warn!(target: "indexer", "Queue is not getting filled up fast enough. Consider increasing the queue size or decreasing the backoff."); - // } - } - } -} diff --git a/src/log.rs b/src/log.rs deleted file mode 100644 index 0226127..0000000 --- a/src/log.rs +++ /dev/null @@ -1,35 +0,0 @@ -use chrono::Local; -use colog::{format::CologStyle, formatter}; -use colored::Colorize; -use log::{Level, LevelFilter}; - -/// Custom log style for colog -struct LogStyle; - -impl CologStyle for LogStyle { - fn prefix_token(&self, level: &Level) -> String { - // convert log level to colored string - let prefix = match level { - Level::Error => "E".red(), - Level::Warn => "W".yellow(), - Level::Info => "*".green(), - Level::Debug => "D".blue(), - Level::Trace => "T".purple(), - }; - - // format current time - let time = Local::now().format("%d.%m.%Y %H:%M:%S"); - - // return formatted log prefix - format!("[{}] [{}]", time, prefix) - } -} - -/// Initialize the logging system -pub fn init(level: LevelFilter) { - colog::default_builder() - .filter_level(LevelFilter::Off) - .filter_module("indexer", level) - .format(formatter(LogStyle)) - .init(); -} From 366a8e403fca30cae7faae4d64045ca353f9ebde Mon Sep 17 00:00:00 2001 From: Zebreus Date: Wed, 5 Mar 2025 14:04:55 +0100 Subject: [PATCH 48/75] Clean up observability initialization --- src/observability.rs | 267 ++++---------------------- src/observability/otel_providers.rs | 283 ++++++++++++++++++++++++++++ 2 files changed, 322 insertions(+), 228 deletions(-) create mode 100644 src/observability/otel_providers.rs diff --git a/src/observability.rs b/src/observability.rs index 23e8c51..c0a5e9a 100644 --- a/src/observability.rs +++ b/src/observability.rs @@ -1,251 +1,62 @@ -use opentelemetry::{global, trace::TracerProvider as _, KeyValue}; -use 
opentelemetry_appender_tracing::layer::OpenTelemetryTracingBridge; -use opentelemetry_otlp::{LogExporter, MetricExporter, SpanExporter}; -use opentelemetry_resource_detectors::{ - HostResourceDetector, OsResourceDetector, ProcessResourceDetector, -}; -use opentelemetry_sdk::{ - logs::SdkLoggerProvider, - metrics::{PeriodicReader, SdkMeterProvider}, - propagation::TraceContextPropagator, - resource::EnvResourceDetector, - trace::{RandomIdGenerator, Sampler, SdkTracerProvider}, - Resource, -}; -use opentelemetry_semantic_conventions::{ - attribute::{DEPLOYMENT_ENVIRONMENT_NAME, SERVICE_NAME, SERVICE_VERSION}, - resource::{HOST_NAME, OS_BUILD_ID, OS_DESCRIPTION, OS_NAME, OS_VERSION}, - SCHEMA_URL, -}; -use std::{ - process::exit, - sync::{Arc, LazyLock}, -}; +use crate::config::ARGS; +use console_subscriber::ConsoleLayer; +use otel_providers::OtelProviders; +use std::{process::exit, sync::Arc}; use tokio::signal::ctrl_c; +use tracing::Subscriber; use tracing_subscriber::{ - filter::FilterFn, layer::SubscriberExt, util::SubscriberInitExt, EnvFilter, Layer, + layer::SubscriberExt, registry::LookupSpan, util::SubscriberInitExt, EnvFilter, Layer, }; -use crate::config::ARGS; - -const RESOURCE: LazyLock = LazyLock::new(|| { - // let instance_id = Uuid::new_v4(); +mod otel_providers; - let mut attributes = vec![ - KeyValue::new(SERVICE_NAME, env!("CARGO_PKG_NAME")), - KeyValue::new(SERVICE_VERSION, env!("CARGO_PKG_VERSION")), - // KeyValue::new(SERVICE_INSTANCE_ID, instance_id.to_string()), - KeyValue::new(DEPLOYMENT_ENVIRONMENT_NAME, "develop"), - ]; - - if let Ok(linux_sys_info) = sys_info::linux_os_release() { - if let Some(build_id) = linux_sys_info.build_id { - attributes.push(KeyValue::new(OS_BUILD_ID, build_id)); - } - if let Some(pretty_name) = linux_sys_info.pretty_name { - attributes.push(KeyValue::new(OS_DESCRIPTION, pretty_name)); - } - if let Some(name) = linux_sys_info.name { - attributes.push(KeyValue::new(OS_NAME, name)); - } - if let Some(version_id) = linux_sys_info.version_id { - attributes.push(KeyValue::new(OS_VERSION, version_id)); - } - } else { - if let Ok(os_version) = sys_info::os_release() { - attributes.push(KeyValue::new(OS_DESCRIPTION, os_version)); - } - if let Ok(os_name) = sys_info::os_type() { - attributes.push(KeyValue::new(OS_NAME, os_name)); - } +/// Layer for enabling tokio-console +pub fn tokio_console_layer() -> Option> +where + S: Subscriber + for<'span> LookupSpan<'span>, +{ + if !ARGS.console.unwrap_or(false) { + return None; } + Some(ConsoleLayer::builder().with_default_env().spawn()) +} - if let Ok(hostname) = sys_info::hostname() { - attributes.push(KeyValue::new(HOST_NAME, hostname)); - } - - Resource::builder() - .with_schema_url( - [ - KeyValue::new(SERVICE_NAME, env!("CARGO_PKG_NAME")), - KeyValue::new(SERVICE_VERSION, env!("CARGO_PKG_VERSION")), - KeyValue::new(DEPLOYMENT_ENVIRONMENT_NAME, "develop"), - ], - SCHEMA_URL, - ) - .with_attributes(attributes) - .with_detectors(&[ - Box::new(EnvResourceDetector::new()), - Box::new(HostResourceDetector::default()), - Box::new(ProcessResourceDetector), - Box::new(OsResourceDetector), - // Box::new(OsResourceDetector::new()), - ]) - .build() -}); - -pub async fn init_observability() -> Arc { - let tracer_provider = init_tracer(); - let meter_provider = init_meter(); - let logger_provider = init_logger(); - - // // Exports tokio stats for tokio-console - // let tokio_console_enabled = ARGS.console.unwrap_or(false); - // let tokio_console_filter = FilterFn::new(move |_| tokio_console_enabled); - // let 
tokio_console_layer = console_subscriber::spawn().with_filter(tokio_console_filter); - - // // Prints logs to stdout - // let stdout_filter = EnvFilter::new("info").add_directive("opentelemetry=info".parse().unwrap()); - // let stdout_layer = tracing_subscriber::fmt::layer() - // .with_thread_names(true) - // .with_filter(stdout_filter); - - // Add all layers - let registry = tracing_subscriber::registry(); - // .with(stdout_layer) - // .with(tokio_console_layer); - if ARGS.otel_logs.unwrap_or(true) { - // Exports logs to otel - let otel_log_filter = EnvFilter::new("info") - .add_directive("hyper=off".parse().unwrap()) - .add_directive("h2=off".parse().unwrap()) - .add_directive("opentelemetry=off".parse().unwrap()) - .add_directive("tonic=off".parse().unwrap()) - .add_directive("reqwest=off".parse().unwrap()); - let otel_log_layer = - OpenTelemetryTracingBridge::new(&logger_provider).with_filter(otel_log_filter); +/// Layer for stdout +pub fn stdout_layer() -> impl Layer +where + S: Subscriber + for<'span> LookupSpan<'span>, +{ + let stdout_filter = EnvFilter::new("info").add_directive("opentelemetry=info".parse().unwrap()); + let stdout_layer = tracing_subscriber::fmt::layer() + .with_thread_names(true) + .with_filter(stdout_filter); + Box::new(stdout_layer) +} - let registry_with_otel = - registry - .with(otel_log_layer) - .with(tracing_opentelemetry::MetricsLayer::new( - meter_provider.clone(), - )); +pub async fn init_observability() -> Arc { + let otel_providers = Arc::new(OtelProviders::new()); - if ARGS.otel_tracing.unwrap_or(true) { - // Exports tracing traces to opentelemetry - let tracing_filter = EnvFilter::new("info") - .add_directive("hyper=off".parse().unwrap()) - .add_directive("h2=off".parse().unwrap()) - .add_directive("opentelemetry=off".parse().unwrap()) - .add_directive("tonic=off".parse().unwrap()) - .add_directive("reqwest=off".parse().unwrap()); - let tracer = tracer_provider.tracer("tracing-otel-subscriber"); - let tracing_layer = - tracing_opentelemetry::OpenTelemetryLayer::new(tracer).with_filter(tracing_filter); - registry_with_otel.with(tracing_layer).init(); - } else { - registry_with_otel.init(); - }; - } else { - if ARGS.otel_tracing.unwrap_or(true) { - // Exports tracing traces to opentelemetry - let tracing_filter = EnvFilter::new("info") - .add_directive("hyper=off".parse().unwrap()) - .add_directive("h2=off".parse().unwrap()) - .add_directive("opentelemetry=off".parse().unwrap()) - .add_directive("tonic=off".parse().unwrap()) - .add_directive("reqwest=off".parse().unwrap()); - let tracer = tracer_provider.tracer("tracing-otel-subscriber"); - let tracing_layer = - tracing_opentelemetry::OpenTelemetryLayer::new(tracer).with_filter(tracing_filter); - registry.with(tracing_layer).init(); - } else { - registry.init(); - }; - }; + // Initialize the tracing subscribers + tracing_subscriber::registry() + .with(stdout_layer()) + .with(tokio_console_layer()) + .with(otel_providers.tracing_layers()) + .init(); - // TODO: Replace this hacky mess with something less broken - let guard = Arc::new(OtelGuard { - tracer_provider, - meter_provider, - logger_provider, - }); - let handler_otel_guard = guard.clone(); + let handler_otel_providers = otel_providers.clone(); tokio::task::Builder::new() .name("Observability shutdown hook") .spawn(async move { + // TODO: Properly manage application shutdown ctrl_c().await.unwrap(); eprintln!("Preparing for unclean exit"); - handler_otel_guard.logger_provider.shutdown().unwrap(); - 
handler_otel_guard.meter_provider.shutdown().unwrap(); - handler_otel_guard.tracer_provider.shutdown().unwrap(); + handler_otel_providers.shutdown(); tokio::time::sleep(std::time::Duration::from_secs(1)).await; eprintln!("Exiting"); exit(1); }) .unwrap(); - guard -} - -fn init_logger() -> SdkLoggerProvider { - let mut logger_provider = SdkLoggerProvider::builder().with_resource(RESOURCE.clone()); - if ARGS.otel_logs.unwrap_or(true) { - let otlp_log_exporter = LogExporter::builder().with_tonic().build().unwrap(); - logger_provider = logger_provider.with_batch_exporter(otlp_log_exporter); - }; - logger_provider.build() -} - -fn init_meter() -> SdkMeterProvider { - let mut meter_provider_builder = SdkMeterProvider::builder().with_resource(RESOURCE.clone()); - if ARGS.otel_metrics.unwrap_or(true) { - let otlp_metric_exporter = MetricExporter::builder() - .with_tonic() - .with_temporality(opentelemetry_sdk::metrics::Temporality::Cumulative) - .build() - .unwrap(); - - let periodic_reader = PeriodicReader::builder(otlp_metric_exporter) - .with_interval(std::time::Duration::from_secs(5)) - .build(); - - meter_provider_builder = meter_provider_builder.with_reader(periodic_reader); - } - let meter_provider = meter_provider_builder.build(); - global::set_meter_provider(meter_provider.clone()); - meter_provider -} - -fn init_tracer() -> SdkTracerProvider { - global::set_text_map_propagator(TraceContextPropagator::new()); - let mut tracer_provider_builder = SdkTracerProvider::builder().with_resource(RESOURCE.clone()); - if ARGS.otel_tracing.unwrap_or(true) { - let otlp_span_exporter = SpanExporter::builder().with_tonic().build().unwrap(); - - tracer_provider_builder = tracer_provider_builder - .with_sampler(Sampler::ParentBased(Box::new(Sampler::TraceIdRatioBased( - 1.0, - )))) - .with_id_generator(RandomIdGenerator::default()) - .with_batch_exporter(otlp_span_exporter); - } - - let tracer_provider = tracer_provider_builder.build(); - global::set_tracer_provider(tracer_provider.clone()); - - tracer_provider -} - -pub struct OtelGuard { - tracer_provider: SdkTracerProvider, - meter_provider: SdkMeterProvider, - logger_provider: SdkLoggerProvider, -} - -impl Drop for OtelGuard { - fn drop(&mut self) { - eprintln!("Shutting down observability"); - if let Err(err) = self.tracer_provider.shutdown() { - eprintln!("{err:?}"); - } - if let Err(err) = self.meter_provider.shutdown() { - eprintln!("{err:?}"); - } - if let Err(err) = self.logger_provider.shutdown() { - eprintln!("{err:?}"); - } - } + otel_providers } diff --git a/src/observability/otel_providers.rs b/src/observability/otel_providers.rs new file mode 100644 index 0000000..3c84fbb --- /dev/null +++ b/src/observability/otel_providers.rs @@ -0,0 +1,283 @@ +use crate::config::ARGS; +use opentelemetry::trace::TracerProvider; +use opentelemetry::{global, KeyValue}; +use opentelemetry_appender_tracing::layer::OpenTelemetryTracingBridge; +use opentelemetry_otlp::{LogExporter, MetricExporter, SpanExporter}; +use opentelemetry_resource_detectors::{ + HostResourceDetector, OsResourceDetector, ProcessResourceDetector, +}; +use opentelemetry_sdk::{ + logs::SdkLoggerProvider, + metrics::{PeriodicReader, SdkMeterProvider}, + propagation::TraceContextPropagator, + resource::EnvResourceDetector, + trace::{RandomIdGenerator, Sampler, SdkTracerProvider}, + Resource, +}; +use opentelemetry_semantic_conventions::{ + attribute::{DEPLOYMENT_ENVIRONMENT_NAME, SERVICE_NAME, SERVICE_VERSION}, + resource::{HOST_NAME, OS_BUILD_ID, OS_DESCRIPTION, OS_NAME, OS_VERSION}, + 
SCHEMA_URL,
+};
+use std::sync::{
+    atomic::{AtomicBool, Ordering},
+    LazyLock, Mutex,
+};
+use tracing::Subscriber;
+use tracing_subscriber::{registry::LookupSpan, EnvFilter, Layer};
+
+// `static` (not `const`) so the LazyLock is initialized once and shared
+static RESOURCE: LazyLock<Resource> = LazyLock::new(|| {
+    // let instance_id = Uuid::new_v4();
+
+    let mut attributes = vec![
+        KeyValue::new(SERVICE_NAME, env!("CARGO_PKG_NAME")),
+        KeyValue::new(SERVICE_VERSION, env!("CARGO_PKG_VERSION")),
+        // KeyValue::new(SERVICE_INSTANCE_ID, instance_id.to_string()),
+        KeyValue::new(DEPLOYMENT_ENVIRONMENT_NAME, "develop"),
+    ];
+
+    if let Ok(linux_sys_info) = sys_info::linux_os_release() {
+        if let Some(build_id) = linux_sys_info.build_id {
+            attributes.push(KeyValue::new(OS_BUILD_ID, build_id));
+        }
+        if let Some(pretty_name) = linux_sys_info.pretty_name {
+            attributes.push(KeyValue::new(OS_DESCRIPTION, pretty_name));
+        }
+        if let Some(name) = linux_sys_info.name {
+            attributes.push(KeyValue::new(OS_NAME, name));
+        }
+        if let Some(version_id) = linux_sys_info.version_id {
+            attributes.push(KeyValue::new(OS_VERSION, version_id));
+        }
+    } else {
+        if let Ok(os_version) = sys_info::os_release() {
+            attributes.push(KeyValue::new(OS_DESCRIPTION, os_version));
+        }
+        if let Ok(os_name) = sys_info::os_type() {
+            attributes.push(KeyValue::new(OS_NAME, os_name));
+        }
+    }
+
+    if let Ok(hostname) = sys_info::hostname() {
+        attributes.push(KeyValue::new(HOST_NAME, hostname));
+    }
+
+    Resource::builder()
+        .with_schema_url(
+            [
+                KeyValue::new(SERVICE_NAME, env!("CARGO_PKG_NAME")),
+                KeyValue::new(SERVICE_VERSION, env!("CARGO_PKG_VERSION")),
+                KeyValue::new(DEPLOYMENT_ENVIRONMENT_NAME, "develop"),
+            ],
+            SCHEMA_URL,
+        )
+        .with_attributes(attributes)
+        .with_detectors(&[
+            Box::new(EnvResourceDetector::new()),
+            Box::new(HostResourceDetector::default()),
+            Box::new(ProcessResourceDetector),
+            Box::new(OsResourceDetector),
+        ])
+        .build()
+});
+
+fn init_logger() -> Option<SdkLoggerProvider> {
+    if !ARGS.otel_logs.unwrap_or(true) {
+        return None;
+    }
+    let otlp_log_exporter = LogExporter::builder().with_tonic().build().unwrap();
+    let logger_provider = SdkLoggerProvider::builder()
+        .with_resource(RESOURCE.clone())
+        .with_batch_exporter(otlp_log_exporter)
+        .build();
+
+    Some(logger_provider)
+}
+
+fn init_meter() -> Option<SdkMeterProvider> {
+    if !ARGS.otel_metrics.unwrap_or(true) {
+        return None;
+    }
+    let otlp_metric_exporter = MetricExporter::builder()
+        .with_tonic()
+        .with_temporality(opentelemetry_sdk::metrics::Temporality::Cumulative)
+        .build()
+        .unwrap();
+
+    let periodic_reader = PeriodicReader::builder(otlp_metric_exporter)
+        .with_interval(std::time::Duration::from_secs(5))
+        .build();
+
+    let meter_provider = SdkMeterProvider::builder()
+        .with_resource(RESOURCE.clone())
+        .with_reader(periodic_reader)
+        .build();
+    global::set_meter_provider(meter_provider.clone());
+    Some(meter_provider)
+}
+
+fn init_tracer() -> Option<SdkTracerProvider> {
+    if !ARGS.otel_tracing.unwrap_or(true) {
+        return None;
+    }
+    global::set_text_map_propagator(TraceContextPropagator::new());
+    let otlp_span_exporter = SpanExporter::builder().with_tonic().build().unwrap();
+
+    let tracer_provider = SdkTracerProvider::builder()
+        .with_resource(RESOURCE.clone())
+        .with_sampler(Sampler::ParentBased(Box::new(Sampler::TraceIdRatioBased(
+            1.0,
+        ))))
+        .with_id_generator(RandomIdGenerator::default())
+        .with_batch_exporter(otlp_span_exporter)
+        .build();
+    global::set_tracer_provider(tracer_provider.clone());
+
+    Some(tracer_provider)
+}
+
+/// Manages the lifetimes of the opentelemetry providers
+pub struct OtelProviders {
+    tracer_provider: Option<SdkTracerProvider>,
+    meter_provider: Option<SdkMeterProvider>,
+    logger_provider: Option<SdkLoggerProvider>,
+    /// Flag to indicate if the observability providers have been shut down
+    shutdown: Mutex<bool>,
+}
+
+impl OtelProviders {
+    /// Create a new set of observability providers
+    ///
+    /// Will panic if called more than once to prevent double initialization
+    ///
+    /// The providers will be shut down automatically when the last reference to this struct is dropped
+    pub fn new() -> Self {
+        static ALREADY_INITIALIZED: AtomicBool = AtomicBool::new(false);
+        // `swap` returns the previous value, so this is true on every call after the first
+        if ALREADY_INITIALIZED.swap(true, Ordering::SeqCst) {
+            panic!("OtelProviders::new() called more than once");
+        }
+
+        let tracer_provider = init_tracer();
+        let meter_provider = init_meter();
+        let logger_provider = init_logger();
+
+        Self {
+            tracer_provider,
+            meter_provider,
+            logger_provider,
+            shutdown: Mutex::new(false),
+        }
+    }
+
+    /// Shut down the observability providers
+    ///
+    /// Does nothing if already shut down
+    pub fn shutdown(&self) {
+        let shutdown = self.shutdown.lock();
+        if shutdown.as_ref().map_or(false, |shutdown| **shutdown) {
+            // Already shut down
+            return;
+        }
+        eprintln!("Shutting down observability");
+        if let Some(tracer_provider) = &self.tracer_provider {
+            if let Err(err) = tracer_provider.shutdown() {
+                eprintln!("Error shutting down otel tracer: {err:?}");
+            }
+        }
+        if let Some(meter_provider) = &self.meter_provider {
+            if let Err(err) = meter_provider.shutdown() {
+                eprintln!("Error shutting down otel meter: {err:?}");
+            }
+        }
+        if let Some(logger_provider) = &self.logger_provider {
+            if let Err(err) = logger_provider.shutdown() {
+                eprintln!("Error shutting down otel logger: {err:?}");
+            }
+        }
+        if let Ok(mut shutdown) = shutdown {
+            // Mark as shut down
+            *shutdown = true;
+        }
+    }
+
+    /// Returns a layer that exports tracing spans to opentelemetry if otel-tracing is enabled
+    fn otel_tracer_layer<S>(&self) -> Option<impl Layer<S>>
+    where
+        S: Subscriber + Sync + Send + for<'span> LookupSpan<'span>,
+    {
+        let Some(tracer_provider) = &self.tracer_provider else {
+            return None;
+        };
+        // Exports tracing traces to opentelemetry
+        let tracing_filter = EnvFilter::new("info")
+            .add_directive("hyper=off".parse().unwrap())
+            .add_directive("h2=off".parse().unwrap())
+            .add_directive("opentelemetry=off".parse().unwrap())
+            .add_directive("tonic=off".parse().unwrap())
+            .add_directive("reqwest=off".parse().unwrap());
+        let tracer = tracer_provider.tracer("tracing-otel-subscriber");
+        let tracing_layer =
+            tracing_opentelemetry::OpenTelemetryLayer::new(tracer).with_filter(tracing_filter);
+        Some(tracing_layer)
+    }
+
+    /// Returns a layer that exports logs to opentelemetry if otel-logs is enabled
+    fn otel_logger_layer<S>(&self) -> Option<impl Layer<S>>
+    where
+        S: Subscriber + Sync + Send + for<'span> LookupSpan<'span>,
+    {
+        let Some(logger_provider) = &self.logger_provider else {
+            return None;
+        };
+        // Exports logs to otel
+        let otel_log_filter = EnvFilter::new("info")
+            .add_directive("hyper=off".parse().unwrap())
+            .add_directive("h2=off".parse().unwrap())
+            .add_directive("opentelemetry=off".parse().unwrap())
+            .add_directive("tonic=off".parse().unwrap())
+            .add_directive("reqwest=off".parse().unwrap());
+        let otel_log_layer =
+            OpenTelemetryTracingBridge::new(logger_provider).with_filter(otel_log_filter);
+
+        Some(otel_log_layer)
+    }
+
+    /// Returns a layer that exports tracing metrics to opentelemetry if otel-metrics is enabled
+    fn otel_metrics_layer<S>(&self) -> Option<impl Layer<S>>
+    where
+        S: Subscriber +
Sync + Send + for<'span> LookupSpan<'span>, + { + let Some(meter_provider) = &self.meter_provider else { + return None; + }; + + Some(tracing_opentelemetry::MetricsLayer::new( + meter_provider.clone(), + )) + } + + /// Get a tracing layer for otel logging, tracing, and metrics + pub fn tracing_layers(&self) -> impl Layer + where + S: Subscriber + Sync + Send + for<'span> LookupSpan<'span>, + { + let mut layers: Vec + Send + Sync + 'static>> = vec![]; + if let Some(tracer_layer) = self.otel_tracer_layer() { + layers.push(Box::new(tracer_layer)); + } + if let Some(logger_layer) = self.otel_logger_layer() { + layers.push(Box::new(logger_layer)); + } + if let Some(metrics_layer) = self.otel_metrics_layer() { + layers.push(Box::new(metrics_layer)); + } + return layers; + } +} + +impl Drop for OtelProviders { + fn drop(&mut self) { + self.shutdown(); + } +} From 49739216f56f03b5653717510ef61909d4b85df0 Mon Sep 17 00:00:00 2001 From: Zebreus Date: Wed, 5 Mar 2025 14:07:44 +0100 Subject: [PATCH 49/75] Remove dead code in repo indexer --- src/database/repo_indexer.rs | 363 +------------------- src/database/repo_indexer/buffered_items.rs | 190 ---------- 2 files changed, 11 insertions(+), 542 deletions(-) delete mode 100644 src/database/repo_indexer/buffered_items.rs diff --git a/src/database/repo_indexer.rs b/src/database/repo_indexer.rs index e3d7c94..f8c0118 100644 --- a/src/database/repo_indexer.rs +++ b/src/database/repo_indexer.rs @@ -1,6 +1,5 @@ -use std::{ops::Rem, os::unix::process, sync::LazyLock}; - -use atrium_api::com::atproto::repo; +use super::connect; +use crate::config::ARGS; use futures::{stream::FuturesUnordered, StreamExt}; use index_repo::PipelineItem; use opentelemetry::{global, KeyValue}; @@ -8,14 +7,10 @@ use pumps::Concurrency; use repo_stream::RepoStream; use reqwest::Client; use serde::{Deserialize, Serialize}; +use std::{ops::Rem, sync::LazyLock}; use surrealdb::{engine::any::Any, Surreal}; -use tracing::{error, warn}; +use tracing::warn; -use crate::config::ARGS; - -use super::connect; - -// mod buffered_items; mod index_repo; mod repo_stream; @@ -30,67 +25,6 @@ pub struct LastIndexedTimestamp { /// An ID that was used before the earliest data we are interested in const OLDEST_USEFUL_ANCHOR: &str = "3juj4"; -// Make this less hacky -macro_rules! stage { - ($metric:ident, $perfmetric:ident, $stage:literal, $next:literal, $item:ident -> $content:expr) => { - |$item| async { - // TODO: Dont create new keyvalues every time - $metric.add( - -1, - &[ - KeyValue::new("stage", $stage), - KeyValue::new("state", "queued"), - ], - ); - $metric.add( - 1, - &[ - KeyValue::new("stage", $stage), - KeyValue::new("state", "active"), - ], - ); - eprintln!("starting {}", $stage); - tokio::time::sleep(::tokio::time::Duration::from_secs(1)).await; - let before = std::time::Instant::now(); - eprintln!("finished {}", $stage); - - let result = tokio::task::spawn($content).await?; - - let duration = before.elapsed(); - - $perfmetric.record( - duration.as_millis() as u64, - &[KeyValue::new("stage", $stage)], - ); - $metric.add( - -1, - &[ - KeyValue::new("stage", $stage), - KeyValue::new("state", "active"), - ], - ); - $metric.add( - 1, - &[ - KeyValue::new("stage", $next), - KeyValue::new("state", "queued"), - ], - ); - - result - } - }; -} - -macro_rules! done { - ("done", $a:expr, $b:expr) => { - $a - }; - ($idk:literal, $a:expr, $b:expr) => { - $b - }; -} - // Make this less hacky macro_rules! 
pump_stage { ($metric:ident, $perfmetric:ident, $stage:literal, $next:literal, $function:ident) => { @@ -178,32 +112,7 @@ macro_rules! pump_stage { }; } -// Make this less hacky -macro_rules! filter_result { - ($metric:ident, $stage:literal) => {|result| async { - if let Err(error) = &result { - error!(target: "indexer", "Failed to index repo: {}", error); - $metric.add( - -1, - &[ - KeyValue::new("stage", $stage), - KeyValue::new("state", "queued"), - ], - ); - return None; - } - result.ok() - }}; -} - -// async fn filter_result(result: anyhow::Result) -> Option { -// if let Err(error) = &result { -// error!(target: "indexer", "Failed to index repo: {}", error); -// } -// result.ok() -// } - -const tracker: LazyLock> = LazyLock::new(|| { +const TRACKER: LazyLock> = LazyLock::new(|| { global::meter("indexer") .i64_up_down_counter("indexer.pipeline.location") .with_description("Track the number of tasks in the pipeline") @@ -220,11 +129,6 @@ pub async fn start_full_repo_indexer(db: Surreal) -> anyhow::Result<()> { .with_description("Total number of indexed repos") .with_unit("repo") .build(); - // let tracker: opentelemetry::metrics::UpDownCounter = meter - // .i64_up_down_counter("indexer.pipeline.location") - // .with_description("Track the number of tasks in the pipeline") - // .with_unit("repo") - // .build(); let job_duration = meter .u64_histogram("indexer.pipeline.duration") .with_unit("ms") @@ -267,17 +171,12 @@ pub async fn start_full_repo_indexer(db: Surreal) -> anyhow::Result<()> { ) }); - // let urls_list = vec!["1"; 1000]; - // let urls = urls_list - // .into_iter() - // .map(move |x| (x.to_string(), db.clone(), http_client.clone())); - let (mut output_receiver, _join_handle) = pumps::Pipeline::from_stream(dids) .map( |(did, db, http_client)| async { let item = PipelineItem::new(db, http_client, did); - tracker.add( + TRACKER.add( 1, &[ KeyValue::new("stage", "get_service"), @@ -291,7 +190,7 @@ pub async fn start_full_repo_indexer(db: Surreal) -> anyhow::Result<()> { .backpressure(buffer_size) .filter_map( pump_stage!( - tracker, + TRACKER, job_duration, "get_service", "download_repo", @@ -302,7 +201,7 @@ pub async fn start_full_repo_indexer(db: Surreal) -> anyhow::Result<()> { .backpressure(buffer_size) .filter_map( pump_stage!( - tracker, + TRACKER, job_duration, "download_repo", "process_repo", @@ -315,7 +214,7 @@ pub async fn start_full_repo_indexer(db: Surreal) -> anyhow::Result<()> { .backpressure(buffer_size) .filter_map( pump_stage!( - tracker, + TRACKER, job_duration, "process_repo", "apply_updates", @@ -326,7 +225,7 @@ pub async fn start_full_repo_indexer(db: Surreal) -> anyhow::Result<()> { .backpressure(buffer_size) .filter_map( pump_stage!( - tracker, + TRACKER, job_duration, "apply_updates", "print_report", @@ -336,7 +235,7 @@ pub async fn start_full_repo_indexer(db: Surreal) -> anyhow::Result<()> { ) .backpressure(buffer_size) .filter_map( - pump_stage!(tracker, job_duration, "print_report", "done", print_report), + pump_stage!(TRACKER, job_duration, "print_report", "done", print_report), Concurrency::concurrent_unordered(concurrent_elements), ) .backpressure(buffer_size) @@ -355,245 +254,5 @@ pub async fn start_full_repo_indexer(db: Surreal) -> anyhow::Result<()> { eprintln!("Finished: {}", elements); } - // RepoStream::new(OLDEST_USEFUL_ANCHOR.to_string(), db.clone()) - // .map(|did| async { - // let db = db.clone(); - // let http_client = http_client.clone(); - // let item = PipelineItem::new(db, http_client, did); - - // tracker.add( - // 1, - // &[ - // 
KeyValue::new("stage", "get_service"), - // KeyValue::new("state", "queued"), - // ], - // ); - // item - // }) - // .buffer_unordered(buffer_size) - // .map( - // stage!(tracker, job_duration, "get_service", "download_repo", item -> - // item.get_service() - // ), - // ) - // .buffer_unordered(buffer_size) - // .filter_map(filter_result!(tracker, "download_repo")) - // .map( - // stage!(tracker, job_duration, "download_repo", "process_repo", item -> - // item.download_repo() - // ), - // ) - // .buffer_unordered(buffer_size * download_buffer_multiplier) - // .filter_map(filter_result!(tracker, "process_repo")) - // .map( - // stage!(tracker, job_duration, "process_repo", "apply_updates", item -> - // item.process_repo() - // ), - // ) - // .buffer_unordered(buffer_size) - // .filter_map(filter_result!(tracker, "apply_updates")) - // .map( - // stage!(tracker, job_duration, "apply_updates", "print_report", item -> - // // println!("Items: {:?}", item.state.updates.len()); - // item.apply_updates() - // ), - // ) - // .buffer_unordered(buffer_size) - // .filter_map(filter_result!(tracker, "print_report")) - // .for_each(|x| async { - // tracker.add( - // -1, - // &[ - // KeyValue::new("stage", "print_report"), - // KeyValue::new("state", "queued"), - // ], - // ); - // tracker.add( - // 1, - // &[ - // KeyValue::new("stage", "print_report"), - // KeyValue::new("state", "active"), - // ], - // ); - // x.print_report().await; - // tracker.add( - // -1, - // &[ - // KeyValue::new("stage", "print_report"), - // KeyValue::new("state", "active"), - // ], - // ); - // repos_indexed.add(1, &[]); - // }) - // .await; - - // RepoStream::new(OLDEST_USEFUL_ANCHOR.to_string(), db.clone()) - // .map(async |did| { - // let db = db.clone(); - // let http_client = http_client.clone(); - - // tokio::task::spawn(process_did(did, db, http_client)).await? 
- // }) - // .buffered(buffer_size) - // .filter_map(|result| async { - // if let Err(error) = &result { - // error!(target: "indexer", "Failed to index repo: {}", error); - - // return None; - // } - // result.ok() - // }) - // .for_each(|x| async { - // repos_indexed.add(1, &[]); - // }) - // .await; - - // panic!("Done, this should not happen"); - Ok(()) -} - -// pub async fn start_full_repo_indexer(db: Surreal) -> anyhow::Result<()> { -// let http_client = Client::new(); - -// let meter = global::meter("indexer"); -// let repos_indexed = meter -// .u64_counter("indexer.repos.indexed") -// .with_description("Total number of indexed repos") -// .with_unit("repo") -// .build(); -// let tracker = meter -// .i64_up_down_counter("indexer.pipeline.location") -// .with_description("Track the number of tasks in the pipeline") -// .with_unit("repo") -// .build(); -// let job_duration = meter -// .u64_histogram("indexer.pipeline.duration") -// .with_unit("ms") -// .with_description("Pipeline job duration") -// .with_boundaries( -// vec![1, 3, 10, 31, 100, 316, 1000, 3160, 10000] -// .iter() -// .map(|x| *x as f64 + 1000.0) -// .collect::>(), -// ) -// .build(); - -// let mut res = db -// .query("SELECT count() as c FROM latest_backfill WHERE at != NONE GROUP ALL;") -// .await -// .unwrap(); -// let count = res.take::>((0, "c")).unwrap().unwrap_or(0); -// if count == 0 { -// warn!("Started with 0 repos, this might be a bug"); -// } -// repos_indexed.add(count as u64, &[]); - -// let buffer_size = ARGS.pipeline_buffer_size; -// let download_buffer_multiplier = ARGS.download_buffer_multiplier; - -// RepoStream::new(OLDEST_USEFUL_ANCHOR.to_string(), db.clone()) -// .map(|did| async { -// let db = db.clone(); -// let http_client = http_client.clone(); -// let item = PipelineItem::new(db, http_client, did); - -// tracker.add( -// 1, -// &[ -// KeyValue::new("stage", "get_service"), -// KeyValue::new("state", "queued"), -// ], -// ); -// item -// }) -// .buffer_unordered(buffer_size) -// .map( -// stage!(tracker, job_duration, "get_service", "download_repo", item -> -// item.get_service() -// ), -// ) -// .buffer_unordered(buffer_size) -// .filter_map(filter_result!(tracker, "download_repo")) -// .map( -// stage!(tracker, job_duration, "download_repo", "process_repo", item -> -// item.download_repo() -// ), -// ) -// .buffer_unordered(buffer_size * download_buffer_multiplier) -// .filter_map(filter_result!(tracker, "process_repo")) -// .map( -// stage!(tracker, job_duration, "process_repo", "apply_updates", item -> -// item.process_repo() -// ), -// ) -// .buffer_unordered(buffer_size) -// .filter_map(filter_result!(tracker, "apply_updates")) -// .map( -// stage!(tracker, job_duration, "apply_updates", "print_report", item -> -// // println!("Items: {:?}", item.state.updates.len()); -// item.apply_updates() -// ), -// ) -// .buffer_unordered(buffer_size) -// .filter_map(filter_result!(tracker, "print_report")) -// .for_each(|x| async { -// tracker.add( -// -1, -// &[ -// KeyValue::new("stage", "print_report"), -// KeyValue::new("state", "queued"), -// ], -// ); -// tracker.add( -// 1, -// &[ -// KeyValue::new("stage", "print_report"), -// KeyValue::new("state", "active"), -// ], -// ); -// x.print_report().await; -// tracker.add( -// -1, -// &[ -// KeyValue::new("stage", "print_report"), -// KeyValue::new("state", "active"), -// ], -// ); -// repos_indexed.add(1, &[]); -// }) -// .await; - -// // RepoStream::new(OLDEST_USEFUL_ANCHOR.to_string(), db.clone()) -// // .map(async |did| { -// // let db = 
db.clone(); -// // let http_client = http_client.clone(); - -// // tokio::task::spawn(process_did(did, db, http_client)).await? -// // }) -// // .buffered(buffer_size) -// // .filter_map(|result| async { -// // if let Err(error) = &result { -// // error!(target: "indexer", "Failed to index repo: {}", error); - -// // return None; -// // } -// // result.ok() -// // }) -// // .for_each(|x| async { -// // repos_indexed.add(1, &[]); -// // }) -// // .await; - -// // panic!("Done, this should not happen"); -// Ok(()) -// } - -async fn process_did(did: String, db: Surreal, client: Client) -> anyhow::Result<()> { - let item = PipelineItem::new(db, client, did); - let item = item.get_service().await?; - let item = item.download_repo().await?; - let item = item.process_repo().await?; - let item = item.apply_updates().await?; - item.print_report().await; Ok(()) } diff --git a/src/database/repo_indexer/buffered_items.rs b/src/database/repo_indexer/buffered_items.rs deleted file mode 100644 index b9014eb..0000000 --- a/src/database/repo_indexer/buffered_items.rs +++ /dev/null @@ -1,190 +0,0 @@ -// use crate::stream::{Fuse, FuturesUnordered, StreamExt}; -use core::fmt; -use futures::stream::FuturesUnordered; -use futures::{Sink, Stream, StreamExt}; -use std::pin::Pin; -use std::task::Poll; -// use futures_core::task::{Context, Poll}; -use pin_project_lite::pin_project; -use std::future::Future; -// use std::stream::{FusedStream, Stream}; - -// pin_project! { -/// Stream for the [`buffer_unordered`](super::StreamExt::buffer_unordered) -/// method. -#[must_use = "streams do nothing unless polled"] -pub struct BufferItems -where - St: Stream, -{ - stream: St, - in_progress_queue: FuturesUnordered, - max: usize, -} -// } - -impl fmt::Debug for BufferItems -where - St: Stream + fmt::Debug, -{ - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.debug_struct("BufferItems") - .field("stream", &self.stream) - .field("in_progress_queue", &self.in_progress_queue) - .field("max", &self.max) - .finish() - } -} - -impl BufferItems -where - St: Stream, - St::Item: Future, -{ - pub fn new(stream: St, n: usize) -> Self { - Self { - stream: stream, - in_progress_queue: FuturesUnordered::new(), - max: n, - } - } - - // delegate_access_inner!(stream, St, (.)); -} - -impl Stream for BufferItems -where - St: Stream, - St::Item: Future, -{ - type Item = ::Output; - - fn poll_next( - mut self: std::pin::Pin<&mut Self>, - cx: &mut std::task::Context<'_>, - ) -> Poll> { - let mut this = self; - - // First up, try to spawn off as many futures as possible by filling up - // our queue of futures. 
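
(Aside: the hand-rolled BufferItems deleted in this file reimplements the stock
`buffer_unordered` combinator from the `futures` crate; a minimal usage sketch,
with the input stream and the concurrency limit of 8 as assumptions:)

    use futures::{stream, StreamExt};

    async fn demo() -> Vec<u32> {
        stream::iter(1u32..=4)
            .map(|n| async move { n * 2 }) // a stream whose items are futures
            .buffer_unordered(8)           // poll up to 8 of them concurrently
            .collect::<Vec<_>>()           // yields in completion order
            .await
    }
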
- while this.in_progress_queue.len() < *this.max { - match this.stream.poll_next_unpin(cx) { - Poll::Ready(Some(fut)) => this.in_progress_queue.push(fut), - Poll::Ready(None) | Poll::Pending => break, - } - } - - // Attempt to pull the next value from the in_progress_queue - match this.in_progress_queue.poll_next_unpin(cx) { - x @ Poll::Pending | x @ Poll::Ready(Some(_)) => return x, - Poll::Ready(None) => {} - } - - // If more values are still coming from the stream, we're not done yet - if this.stream.is_done() { - Poll::Ready(None) - } else { - Poll::Pending - } - } - - fn size_hint(&self) -> (usize, Option) { - let queue_len = self.in_progress_queue.len(); - let (lower, upper) = self.stream.size_hint(); - let lower = lower.saturating_add(queue_len); - let upper = match upper { - Some(x) => x.checked_add(queue_len), - None => None, - }; - (lower, upper) - } -} - -// impl FusedStream for BufferItems -// where -// St: Stream, -// St::Item: Future, -// { -// fn is_terminated(&self) -> bool { -// self.in_progress_queue.is_terminated() && self.stream.is_terminated() -// } -// } - -// impl Stream for BufferItems -// where -// St: Stream, -// St::Item: Future, -// { -// type Item = ::Output; - -// fn poll_next( -// mut self: std::pin::Pin<&mut Self>, -// cx: &mut std::task::Context<'_>, -// ) -> Poll> { -// let mut this = self.project(); - -// // First up, try to spawn off as many futures as possible by filling up -// // our queue of futures. -// while this.in_progress_queue.len() < *this.max { -// match this.stream.as_mut().poll_next(cx) { -// Poll::Ready(Some(fut)) => this.in_progress_queue.push(fut), -// Poll::Ready(None) | Poll::Pending => break, -// } -// } - -// // Attempt to pull the next value from the in_progress_queue -// match this.in_progress_queue.poll_next_unpin(cx) { -// x @ Poll::Pending | x @ Poll::Ready(Some(_)) => return x, -// Poll::Ready(None) => {} -// } - -// // If more values are still coming from the stream, we're not done yet -// if this.stream.is_done() { -// Poll::Ready(None) -// } else { -// Poll::Pending -// } -// } - -// fn size_hint(&self) -> (usize, Option) { -// let queue_len = self.in_progress_queue.len(); -// let (lower, upper) = self.stream.size_hint(); -// let lower = lower.saturating_add(queue_len); -// let upper = match upper { -// Some(x) => x.checked_add(queue_len), -// None => None, -// }; -// (lower, upper) -// } -// } - -// Forwarding impl of Sink from the underlying stream -// impl Sink for BufferItems -// where -// S: Stream + Sink, -// S::Item: Future, -// { -// type Error = S::Error; - -// fn poll_ready( -// self: Pin<&mut Self>, -// cx: &mut core::task::Context<'_>, -// ) -> core::task::Poll> { -// self.project().stream.poll_ready(cx) -// } -// fn start_send(self: Pin<&mut Self>, item: Item) -> Result<(), Self::Error> { -// self.project().stream.start_send(item) -// } -// fn poll_flush( -// self: Pin<&mut Self>, -// cx: &mut core::task::Context<'_>, -// ) -> core::task::Poll> { -// self.project().stream.poll_flush(cx) -// } -// fn poll_close( -// self: Pin<&mut Self>, -// cx: &mut core::task::Context<'_>, -// ) -> core::task::Poll> { -// self.project().stream.poll_close(cx) -// } -// } From 4d43e0f169578bddc43d8f120207000fa0bbab6d Mon Sep 17 00:00:00 2001 From: Zebreus Date: Wed, 5 Mar 2025 16:44:45 +0100 Subject: [PATCH 50/75] Fix logging initialization --- src/observability/otel_providers.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/observability/otel_providers.rs b/src/observability/otel_providers.rs index 
3c84fbb..4ce948e 100644
--- a/src/observability/otel_providers.rs
+++ b/src/observability/otel_providers.rs
@@ -154,7 +154,7 @@ impl OtelProviders {
     /// The providers will be shutdown automatically when the last reference to this struct is dropped
     pub fn new() -> Self {
         static ALREADY_INITIALIZED: AtomicBool = AtomicBool::new(false);
-        if !ALREADY_INITIALIZED.fetch_and(true, Ordering::SeqCst) {
+        if ALREADY_INITIALIZED.fetch_and(true, Ordering::SeqCst) {
             panic!("OtelProviders::new() called more than once");
         }

From 403625cb55d4f07a4d51c2e3c2f7d99bb6052632 Mon Sep 17 00:00:00 2001
From: Zebreus
Date: Wed, 5 Mar 2025 17:50:00 +0100
Subject: [PATCH 51/75] Rework pipeline using generics

---
 src/database/repo_indexer.rs             | 232 +++-------------------
 src/database/repo_indexer/index_repo.rs  | 180 ++++++++++++++---
 src/database/repo_indexer/pipeline.rs    | 243 +++++++++++++++++++++++
 src/database/repo_indexer/repo_stream.rs |   6 +-
 4 files changed, 422 insertions(+), 239 deletions(-)
 create mode 100644 src/database/repo_indexer/pipeline.rs

diff --git a/src/database/repo_indexer.rs b/src/database/repo_indexer.rs
index f8c0118..69e43f2 100644
--- a/src/database/repo_indexer.rs
+++ b/src/database/repo_indexer.rs
@@ -2,159 +2,32 @@ use super::connect;
 use crate::config::ARGS;
 use futures::{stream::FuturesUnordered, StreamExt};
 use index_repo::PipelineItem;
-use opentelemetry::{global, KeyValue};
-use pumps::Concurrency;
+use pipeline::{create_stage, next_stage};
 use repo_stream::RepoStream;
 use reqwest::Client;
-use serde::{Deserialize, Serialize};
-use std::{ops::Rem, sync::LazyLock};
+use std::ops::Rem;
 use surrealdb::{engine::any::Any, Surreal};
-use tracing::warn;
+use tracing::error;

 mod index_repo;
+mod pipeline;
 mod repo_stream;

-/// Database struct for a repo indexing timestamp
-#[derive(Debug, Serialize, Deserialize)]
-pub struct LastIndexedTimestamp {
-    pub time_us: u64,
-    pub time_dt: surrealdb::Datetime,
-    pub error: Option<String>,
-}
-
-/// An ID that was used before the earliest data we are interested in
-const OLDEST_USEFUL_ANCHOR: &str = "3juj4";
-
-// Make this less hacky
-macro_rules! 
pump_stage { - ($metric:ident, $perfmetric:ident, $stage:literal, $next:literal, $function:ident) => { - |x| async { - tokio::task::spawn(async move { - eprintln!("starting {}", $stage); - - // TODO: Dont create new keyvalues every time - $metric.add( - -1, - &[ - KeyValue::new("stage", $stage), - KeyValue::new("state", "queued"), - ], - ); - $metric.add( - 1, - &[ - KeyValue::new("stage", $stage), - KeyValue::new("state", "active"), - ], - ); - tokio::time::sleep(::tokio::time::Duration::from_secs(1)).await; - let before = std::time::Instant::now(); - let result = tokio::time::timeout( - tokio::time::Duration::from_secs(ARGS.pipeline_stage_timeout), - x.$function(), - ) - .await; - let duration = before.elapsed(); - eprintln!( - "pre finished {} in {:02}", - $stage, - duration.as_millis() as f64 / 1000.0 - ); - let Ok(result) = result else { - panic!("Timeout in {}", $stage); - }; - - // $perfmetric.record( - // duration.as_millis() as u64, - // &[KeyValue::new("stage", $stage)], - // ); - $metric.add( - -1, - &[ - KeyValue::new("stage", $stage), - KeyValue::new("state", "active"), - ], - ); - - let result = match result { - Err(error) => { - eprintln!( - "failed {} in {:02}", - $stage, - duration.as_millis() as f64 / 1000.0 - ); - - // error!(target: "indexer", "Failed to index repo: {}", error); - return None; - } - Ok(result) => result, - }; - - if $next != "done" { - $metric.add( - 1, - &[ - KeyValue::new("stage", $next), - KeyValue::new("state", "queued"), - ], - ); - } - eprintln!( - "finished {} in {:02}", - $stage, - duration.as_millis() as f64 / 1000.0 - ); - return Some(result); - }) - .await - .expect("Failed to spawn task in a pump stage") - } +macro_rules! unordered { + ($concurrency:expr) => { + pumps::Concurrency::concurrent_unordered($concurrency) }; } -const TRACKER: LazyLock> = LazyLock::new(|| { - global::meter("indexer") - .i64_up_down_counter("indexer.pipeline.location") - .with_description("Track the number of tasks in the pipeline") - .with_unit("repo") - .build() -}); - pub async fn start_full_repo_indexer(db: Surreal) -> anyhow::Result<()> { let http_client = Client::new(); - let meter = global::meter("indexer"); - let repos_indexed = meter - .u64_counter("indexer.repos.indexed") - .with_description("Total number of indexed repos") - .with_unit("repo") - .build(); - let job_duration = meter - .u64_histogram("indexer.pipeline.duration") - .with_unit("ms") - .with_description("Pipeline job duration") - .with_boundaries( - vec![1, 3, 10, 31, 100, 316, 1000, 3160, 10000] - .iter() - .map(|x| *x as f64 + 1000.0) - .collect::>(), - ) - .build(); - - let mut res = db - .query("SELECT count() as c FROM latest_backfill WHERE at != NONE GROUP ALL;") - .await - .unwrap(); - let count = res.take::>((0, "c")).unwrap().unwrap_or(0); - if count == 0 { - warn!("Started with 0 repos, this might be a bug"); - } - repos_indexed.add(count as u64, &[]); - let buffer_size = ARGS.pipeline_buffer_size; let download_concurrency_multiplier = ARGS.pipeline_download_concurrency_multiplier; let concurrent_elements = ARGS.pipeline_concurrent_elements; + let download_concurrent_elements = concurrent_elements * download_concurrency_multiplier; + // Connect to all supplied databases. 
The writes will be distributed across them let databases = ARGS .db .iter() @@ -162,8 +35,9 @@ pub async fn start_full_repo_indexer(db: Surreal) -> anyhow::Result<()> { .collect::>() .collect::>() .await; - let repo_stream = RepoStream::new(OLDEST_USEFUL_ANCHOR.to_string(), db.clone()); - let dids = repo_stream.enumerate().map(move |(id, x)| { + + // Create a stream of dids + captured database and http client + let dids = RepoStream::new(db.clone()).enumerate().map(move |(id, x)| { ( x.to_string(), databases.get(id.rem(databases.len())).unwrap().clone(), @@ -171,88 +45,28 @@ pub async fn start_full_repo_indexer(db: Surreal) -> anyhow::Result<()> { ) }); + // Create the processing pipeline let (mut output_receiver, _join_handle) = pumps::Pipeline::from_stream(dids) - .map( - |(did, db, http_client)| async { - let item = PipelineItem::new(db, http_client, did); - - TRACKER.add( - 1, - &[ - KeyValue::new("stage", "get_service"), - KeyValue::new("state", "queued"), - ], - ); - item - }, - Concurrency::concurrent_unordered(concurrent_elements), - ) - .backpressure(buffer_size) .filter_map( - pump_stage!( - TRACKER, - job_duration, - "get_service", - "download_repo", - get_service - ), - Concurrency::concurrent_unordered(concurrent_elements), + create_stage(|(did, db, http_client)| PipelineItem::new(db, http_client, did)), + unordered!(concurrent_elements), ) .backpressure(buffer_size) - .filter_map( - pump_stage!( - TRACKER, - job_duration, - "download_repo", - "process_repo", - download_repo - ), - Concurrency::concurrent_unordered( - concurrent_elements * download_concurrency_multiplier, - ), - ) + .filter_map(next_stage(), unordered!(concurrent_elements)) .backpressure(buffer_size) - .filter_map( - pump_stage!( - TRACKER, - job_duration, - "process_repo", - "apply_updates", - process_repo - ), - Concurrency::concurrent_unordered(concurrent_elements), - ) + .filter_map(next_stage(), unordered!(download_concurrent_elements)) .backpressure(buffer_size) - .filter_map( - pump_stage!( - TRACKER, - job_duration, - "apply_updates", - "print_report", - apply_updates - ), - Concurrency::concurrent_unordered(concurrent_elements), - ) + .filter_map(next_stage(), unordered!(concurrent_elements)) .backpressure(buffer_size) - .filter_map( - pump_stage!(TRACKER, job_duration, "print_report", "done", print_report), - Concurrency::concurrent_unordered(concurrent_elements), - ) + .filter_map(next_stage(), unordered!(concurrent_elements)) .backpressure(buffer_size) - // .map(download_heavy_resource, Concurrency::serial()) - // .filter_map(run_algorithm, Concurrency::concurrent_unordered(concurrent_elements)) - // .map(save_to_db, Concurrency::concurrent_unordered(100)) .build(); - // join_handle.await; - let mut elements = 0; + + // Process items loop { let Some(_result) = output_receiver.recv().await else { - panic!("Done, this should not happen"); + error!("Backfill pipeline ran out of items. This should never happen."); + panic!("Backfill pipeline ran out of items. 
This should never happen."); }; - elements += 1; - repos_indexed.add(1, &[]); - eprintln!("Finished: {}", elements); } - - Ok(()) } diff --git a/src/database/repo_indexer/index_repo.rs b/src/database/repo_indexer/index_repo.rs index 0677091..daccd81 100644 --- a/src/database/repo_indexer/index_repo.rs +++ b/src/database/repo_indexer/index_repo.rs @@ -3,6 +3,7 @@ use crate::{ database::{ definitions::Record, handlers::{on_commit_event_createorupdate, BigUpdate}, + repo_indexer::pipeline::NoNextStage, }, }; use atrium_api::{ @@ -15,6 +16,7 @@ use serde::Deserialize; use serde_ipld_dagcbor::from_reader; use std::{ collections::HashMap, + future::Future, io::Read, string::FromUtf8Error, sync::LazyLock, @@ -24,6 +26,8 @@ use surrealdb::{engine::any::Any, opt::PatchOp, RecordId, Surreal}; use tokio::task::spawn_blocking; use tracing::{info, instrument, span, trace, warn, Level, Span}; +use super::pipeline::Stage; + type Cid = ipld_core::cid::Cid; /// There should only be one request client to make use of connection pooling @@ -282,8 +286,6 @@ pub struct WithUpdates { now: std::time::Duration, pub update: BigUpdate, } -/// Updates have been applied -pub struct Done {} pub struct PipelineItem { db: Surreal, @@ -310,29 +312,157 @@ impl PipelineItem { } } -impl PipelineItem { - #[instrument(skip(self), parent = &self.span)] - pub async fn get_service(self) -> anyhow::Result> { - let service = get_plc_service(&self.http_client, &self.did).await?; - let Some(service) = service else { - // TODO: Handle this better, as this is not really an error - return Err(anyhow::anyhow!("Failed to get a plc service")); +impl Stage for PipelineItem { + type Output = PipelineItem; + const NAME: &str = "download_information"; + // type F = Future> + Send + Sync + 'static; + + fn run(self) -> impl Future> + Send + Sync + 'static { + return async { + let service = get_plc_service(&self.http_client, &self.did).await?; + let Some(service) = service else { + // TODO: Handle this better, as this is not really an error + return Err(anyhow::anyhow!("Failed to get a plc service")); + }; + Ok(PipelineItem:: { + state: WithService { + service: service, + now: std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap(), + }, + db: self.db, + http_client: self.http_client, + did: self.did, + span: self.span, + }) + }; + } +} + +impl Stage for PipelineItem { + type Output = PipelineItem; + const NAME: &str = "download_repo"; + // type F = Future> + Send + Sync + 'static; + + fn run(self) -> impl Future> + Send + Sync + 'static { + return async move { + let repo = download_repo(&self.state.service, &self.did).await?; + Ok(PipelineItem:: { + state: WithRepo { + now: self.state.now, + repo, + }, + db: self.db, + http_client: self.http_client, + did: self.did, + span: self.span, + }) + }; + } +} + +impl Stage for PipelineItem { + type Output = PipelineItem; + const NAME: &str = "process_repo"; + // type F = Future> + Send + Sync + 'static; + + fn run(self) -> impl Future> + Send + Sync + 'static { + return async move { + // info!("Deserializing repo {}", self.did); + let did = self.did.clone(); + let big_update = spawn_blocking(move || { + let files: HashMap, Vec> = + deserialize_repo(self.state.repo)?; + let updates = files_to_updates_blocking(files)?; + let mut big_update = create_big_update(&did, updates)?; + + big_update.add_timestamp(&did, surrealdb::sql::Datetime::from(chrono::Utc::now())); + Result::::Ok(big_update) + }) + .await??; + + Ok(PipelineItem:: { + state: WithUpdates { + now: self.state.now, + 
update: big_update, + }, + db: self.db, + http_client: self.http_client, + did: self.did, + span: self.span, + }) }; - Ok(PipelineItem:: { - state: WithService { - service: service, - now: std::time::SystemTime::now() - .duration_since(std::time::UNIX_EPOCH) - .unwrap(), - }, - db: self.db, - http_client: self.http_client, - did: self.did, - span: self.span, - }) } } +impl Stage for PipelineItem { + type Output = NoNextStage; + const NAME: &str = "apply_updates"; + // type F = Future> + Send + Sync + 'static; + + fn run(self) -> impl Future> + Send + Sync + 'static { + return async move { + let start = Instant::now(); + + if !ARGS.dont_write_when_backfilling.unwrap_or(false) { + self.state.update.apply(&self.db).await?; + } else { + eprintln!("Skipping writing to the database and sleeping instead"); + std::thread::sleep(Duration::from_secs(1)); + } + let duration = start.elapsed(); + eprintln!("Big update took {:?}", duration); + warn!("Big update took {:?}", duration); + // let _: Option = &self + // .db + // .update(("latest_backfill", did_key.clone())) + // .patch(PatchOp::replace( + // "at", + // surrealdb::sql::Datetime::from(chrono::Utc::now()), + // )) + // .await?; + Ok(NoNextStage {}) + }; + } +} + +// impl Stage for PipelineItem { +// type Output = PipelineItem; +// const NAME: &str = "done"; +// // type F = Future> + Send + Sync + 'static; + +// fn run(self) -> impl Future> + Send + Sync + 'static { +// return async move { +// trace!("Indexed {}", self.did); +// Ok(self) +// }; +// } +// } + +// impl PipelineItem { +// #[instrument(skip(self), parent = &self.span)] +// pub async fn get_service(self) -> anyhow::Result> { +// let service = get_plc_service(&self.http_client, &self.did).await?; +// let Some(service) = service else { +// // TODO: Handle this better, as this is not really an error +// return Err(anyhow::anyhow!("Failed to get a plc service")); +// }; +// Ok(PipelineItem:: { +// state: WithService { +// service: service, +// now: std::time::SystemTime::now() +// .duration_since(std::time::UNIX_EPOCH) +// .unwrap(), +// }, +// db: self.db, +// http_client: self.http_client, +// did: self.did, +// span: self.span, +// }) +// } +// } + impl PipelineItem { #[instrument(skip(self), parent = &self.span)] pub async fn download_repo(self) -> anyhow::Result> { @@ -381,7 +511,7 @@ impl PipelineItem { impl PipelineItem { #[instrument(skip(self), parent = &self.span)] - pub async fn apply_updates(self) -> anyhow::Result> { + pub async fn apply_updates(self) -> anyhow::Result> { let start = Instant::now(); if !ARGS.dont_write_when_backfilling.unwrap_or(false) { @@ -401,8 +531,8 @@ impl PipelineItem { // surrealdb::sql::Datetime::from(chrono::Utc::now()), // )) // .await?; - Ok(PipelineItem:: { - state: Done {}, + Ok(PipelineItem:: { + state: NoNextStage {}, db: self.db, http_client: self.http_client, did: self.did, @@ -411,7 +541,7 @@ impl PipelineItem { } } -impl PipelineItem { +impl PipelineItem { #[instrument(skip(self), parent = &self.span)] pub async fn print_report(self) -> anyhow::Result<()> { // TODO: This is only for printing debug stuff diff --git a/src/database/repo_indexer/pipeline.rs b/src/database/repo_indexer/pipeline.rs new file mode 100644 index 0000000..c3e5c0f --- /dev/null +++ b/src/database/repo_indexer/pipeline.rs @@ -0,0 +1,243 @@ +use crate::config::ARGS; +use futures::FutureExt; +use opentelemetry::{ + global, + metrics::{Counter, Histogram, UpDownCounter}, + KeyValue, +}; +use std::{ + future::Future, + marker::PhantomData, + pin::Pin, + sync::{Arc, 
LazyLock}, +}; +use tracing::{error, trace}; + +pub struct NoNextStage {} +pub trait NextStage { + const NAME: &'static str; + const DONE: bool; +} +impl NextStage for T { + const NAME: &'static str = T::NAME; + const DONE: bool = false; +} +impl NextStage for NoNextStage { + const NAME: &'static str = "Done"; + const DONE: bool = true; +} + +pub trait Stage { + type Output: NextStage + Sync + Send + 'static; + const NAME: &'static str; + const FIRST: bool = false; + fn run(self) -> impl Future> + Send + Sync + 'static; +} + +pub struct FirstStage< + I: Sync + Send + 'static, + O: Stage + Sync + Send + 'static, + F: Fn(I) -> O + Sync + Send + 'static, +> { + a: I, + b: PhantomData, + f: Arc, +} +impl< + I: Sync + Send + 'static, + O: Stage + Sync + Send + 'static, + F: Fn(I) -> O + Sync + Send + 'static, + > Stage for FirstStage +{ + type Output = O; + const NAME: &'static str = "First"; + const FIRST: bool = true; + fn run(self) -> impl Future> + Send + Sync + 'static { + async move { Ok((self.f)(self.a)) } + } +} + +pub fn create_stage< + I: Sync + Send + 'static, + O: Stage + Sync + Send + 'static, + F: Fn(I) -> O + Sync + Send + 'static, +>( + f: F, +) -> impl Fn(I) -> Pin> + Send + 'static>> { + let next_stage_fn = next_stage::>(); + let boxedfn = Arc::new(f); + + return move |x| { + let first_stage = FirstStage:: { + a: x, + b: PhantomData, + f: boxedfn.clone(), + }; + return (next_stage_fn)(first_stage); + }; +} + +pub fn next_stage( +) -> impl Fn(FROM) -> Pin> + Send + 'static>> +where + FROM: Send + Sync + 'static + Stage, + FROM::Output: Send + Sync + 'static, +{ + static TRACKER: LazyLock> = LazyLock::new(|| { + global::meter("indexer") + .i64_up_down_counter("indexer.pipeline.location") + .with_description("Track the number of tasks in the pipeline") + .with_unit("tasks") + .build() + }); + static RUNTIME_METRIC: LazyLock> = LazyLock::new(|| { + global::meter("indexer") + .u64_histogram("indexer.pipeline.duration") + .with_unit("ms") + .with_description("Pipeline job duration") + .with_boundaries(vec![ + 0.0, 5.0, 10.0, 25.0, 50.0, 75.0, 100.0, 250.0, 500.0, 750.0, 1000.0, 2500.0, + 5000.0, 7500.0, 10000.0, 25000.0, 50000.0, 75000.0, 100000.0, 250000.0, 500000.0, + 750000.0, 1000000.0, 2500000.0, + ]) + .build() + }); + static COMPLETED: LazyLock> = LazyLock::new(|| { + global::meter("indexer") + .u64_counter("indexer.pipeline.completed") + .with_description("Pipelines finished") + .with_unit("tasks") + .build() + }); + static FAILED: LazyLock> = LazyLock::new(|| { + global::meter("indexer") + .u64_counter("indexer.pipeline.failed") + .with_description("Pipelines failed") + .with_unit("tasks") + .build() + }); + return |x: FROM| { + async move { + tokio::task::spawn(async move { + // Move from queued to active + if !FROM::FIRST { + TRACKER.add( + -1, + &[ + KeyValue::new("stage", FROM::NAME), + KeyValue::new("state", "queued"), + ], + ); + } + TRACKER.add( + 1, + &[ + KeyValue::new("stage", FROM::NAME), + KeyValue::new("state", "active"), + ], + ); + + // Run the stage + let before = std::time::Instant::now(); + let result = tokio::time::timeout( + tokio::time::Duration::from_secs(ARGS.pipeline_stage_timeout), + x.run(), + ) + .await; + let duration = before.elapsed(); + + // Move away from active + TRACKER.add( + -1, + &[ + KeyValue::new("stage", FROM::NAME), + KeyValue::new("state", "active"), + ], + ); + + // Check if the stage timed out + let Ok(result) = result else { + error!( + "Pipeline stage {} timed out in {:02}. 
Please adjust the timeout", + FROM::NAME, + duration.as_millis() as u64 + ); + FAILED.add( + 1, + &[ + KeyValue::new("stage", FROM::NAME), + KeyValue::new("reason", "timeout"), + ], + ); + RUNTIME_METRIC.record( + duration.as_millis() as u64, + &[ + KeyValue::new("stage", FROM::NAME), + KeyValue::new("result", "timeout"), + ], + ); + return None; + }; + + // Check if the stage failed + let result = match result { + Err(error) => { + error!( + "Pipeline stage {} failed in {:02} with error: {}", + FROM::NAME, + duration.as_millis() as f64 / 1000.0, + error + ); + FAILED.add( + 1, + &[ + KeyValue::new("stage", FROM::NAME), + KeyValue::new("reason", "error"), + ], + ); + RUNTIME_METRIC.record( + duration.as_millis() as u64, + &[ + KeyValue::new("stage", FROM::NAME), + KeyValue::new("result", "error"), + ], + ); + // error!(target: "indexer", "Failed to index repo: {}", error); + return None; + } + Ok(result) => result, + }; + + // If we are done, we track as a completed pipeline. Otherwise track as queued for the next stage. + if !FROM::Output::DONE { + TRACKER.add( + 1, + &[ + KeyValue::new("stage", FROM::Output::NAME), + KeyValue::new("state", "queued"), + ], + ); + } else { + COMPLETED.add(1, &[]); + } + RUNTIME_METRIC.record( + duration.as_millis() as u64, + &[ + KeyValue::new("stage", FROM::NAME), + KeyValue::new("result", "ok"), + ], + ); + trace!( + "Pipeline stage {} finished in {:02}", + FROM::NAME, + duration.as_millis() as f64 / 1000.0 + ); + + return Some(result); + }) + .await + .expect("Failed to spawn task in a pump stage. This is a hard error and means that something is wrong with your system. Maybe go buy a bigger machine or something?") + } + .boxed() + }; +} diff --git a/src/database/repo_indexer/repo_stream.rs b/src/database/repo_indexer/repo_stream.rs index 0d62f15..3e3317f 100644 --- a/src/database/repo_indexer/repo_stream.rs +++ b/src/database/repo_indexer/repo_stream.rs @@ -7,14 +7,12 @@ use std::{ use futures::Stream; use serde::Deserialize; use surrealdb::{engine::any::Any, Surreal}; -use tracing::info; use crate::database::utils::unsafe_user_key_to_did; pub struct RepoStream { buffer: VecDeque, processed_dids: HashSet, - anchor: String, db: Surreal, db_future: Option< std::pin::Pin< @@ -30,11 +28,10 @@ struct LatestBackfill { } impl RepoStream { - pub fn new(anchor: String, db: Surreal) -> Self { + pub fn new(db: Surreal) -> Self { return Self { buffer: VecDeque::new(), processed_dids: HashSet::new(), - anchor, db, db_future: None, }; @@ -72,7 +69,6 @@ impl Stream for RepoStream { } eprintln!("RepoStream not ready, fetching more data"); - info!(target: "indexer", "Discovering follows starting from {}", self.anchor); if self.db_future.is_none() { self.db_future = Some( self.db From d95ba1c2ea42346c0b42624c815e1302b769f702 Mon Sep 17 00:00:00 2001 From: Zebreus Date: Wed, 5 Mar 2025 20:38:22 +0100 Subject: [PATCH 52/75] Refactor index_repo --- src/database/repo_indexer.rs | 4 +- src/database/repo_indexer/index_repo.rs | 593 +++++++----------------- src/database/repo_indexer/pipeline.rs | 16 +- 3 files changed, 165 insertions(+), 448 deletions(-) diff --git a/src/database/repo_indexer.rs b/src/database/repo_indexer.rs index 69e43f2..3c81229 100644 --- a/src/database/repo_indexer.rs +++ b/src/database/repo_indexer.rs @@ -1,7 +1,7 @@ use super::connect; use crate::config::ARGS; use futures::{stream::FuturesUnordered, StreamExt}; -use index_repo::PipelineItem; +use index_repo::DownloadService; use pipeline::{create_stage, next_stage}; use repo_stream::RepoStream; use 
reqwest::Client;
@@ -48,7 +48,7 @@ pub async fn start_full_repo_indexer(db: Surreal<Any>) -> anyhow::Result<()> {
     // Create the processing pipeline
     let (mut output_receiver, _join_handle) = pumps::Pipeline::from_stream(dids)
         .filter_map(
-            create_stage(|(did, db, http_client)| PipelineItem::new(db, http_client, did)),
+            create_stage(|(did, db, http_client)| DownloadService::new(db, http_client, did)),
             unordered!(concurrent_elements),
         )
         .backpressure(buffer_size)
diff --git a/src/database/repo_indexer/index_repo.rs b/src/database/repo_indexer/index_repo.rs
index daccd81..b2b4c2e 100644
--- a/src/database/repo_indexer/index_repo.rs
+++ b/src/database/repo_indexer/index_repo.rs
@@ -1,7 +1,7 @@
+use super::pipeline::Stage;
 use crate::{
     config::ARGS,
     database::{
-        definitions::Record,
         handlers::{on_commit_event_createorupdate, BigUpdate},
         repo_indexer::pipeline::NoNextStage,
     },
@@ -10,29 +10,14 @@ use atrium_api::{
     record::KnownRecord,
     types::string::{Did, RecordKey},
 };
-// use ipld_core::cid::{Cid, CidGeneric};
+use ipld_core::cid::Cid;
 use reqwest::Client;
 use serde::Deserialize;
 use serde_ipld_dagcbor::from_reader;
-use std::{
-    collections::HashMap,
-    future::Future,
-    io::Read,
-    string::FromUtf8Error,
-    sync::LazyLock,
-    time::{Duration, Instant},
-};
-use surrealdb::{engine::any::Any, opt::PatchOp, RecordId, Surreal};
+use std::{collections::HashMap, time::Duration};
+use surrealdb::{engine::any::Any, Surreal};
 use tokio::task::spawn_blocking;
-use tracing::{info, instrument, span, trace, warn, Level, Span};
-
-use super::pipeline::Stage;
-
-type Cid = ipld_core::cid::Cid;
-
-/// There should only be one request client to make use of connection pooling
-// TODO: Dont use a global client
-static REQWEST_CLIENT: LazyLock<Client> = LazyLock::new(|| Client::new());
+use tracing::{instrument, span, trace, warn, Level, Span};

 #[derive(Deserialize, Debug)]
 #[allow(dead_code)]
@@ -82,470 +67,202 @@ pub struct NodeData {
     pub entries: Vec<TreeEntry>,
 }

-pub struct DatabaseUpdate {
-    collection: String,
-    rkey: RecordKey,
-    record: KnownRecord,
-}
-
-/// Insert a file into a map
-fn insert_into_map(
-    mut files: HashMap<Cid, Vec<u8>>,
-    file: (Cid, Vec<u8>),
-) -> anyhow::Result<HashMap<Cid, Vec<u8>>> {
-    let (cid, data) = file;
-    files.insert(cid, data);
-    Ok(files)
-}
-
-/// Convert downloaded files into database updates. 
Blocks the thread +/// Convert downloaded files into a database update #[instrument(skip_all)] -fn files_to_updates_blocking( - files: HashMap>, -) -> Result, FromUtf8Error> { - // TODO: Understand this logic and whether this can be done streaming - let mut result = Vec::new(); - for file in &files { - let Ok(node_data) = from_reader::(&file.1[..]) else { - continue; - }; - let mut key = "".to_string(); - for entry in node_data.entries { - let k = String::from_utf8(entry.key_suffix)?; - key = format!("{}{}", key.split_at(entry.prefix_len as usize).0, k); - - let Some(block) = files.get(&entry.value) else { - continue; - }; - - let Ok(record) = from_reader::(&block[..]) else { - continue; - }; - - let mut parts = key.split("/"); - - let update = DatabaseUpdate { - collection: parts.next().unwrap().to_string(), - rkey: RecordKey::new(parts.next().unwrap().to_string()).unwrap(), - record, - }; - result.push(update); - } - } - return Ok(result); -} - -/// Get the plc response service for the repo -#[instrument(skip_all)] -async fn get_plc_service( - http_client: &Client, - did: &str, -) -> anyhow::Result> { - let resp = http_client - .get(format!("https://plc.directory/{}", did)) - .timeout(Duration::from_secs(ARGS.directory_download_timeout)) - .send() - .await? - .json::() - .await?; - let service = resp.service.into_iter().next(); - Ok(service) -} - -/// Download a repo from the given service -#[instrument(skip_all)] -async fn download_repo( - service: &PlcDirectoryDidResponseService, - did: &str, -) -> anyhow::Result> { - let get_repo_response = REQWEST_CLIENT - .get(format!( - "{}/xrpc/com.atproto.sync.getRepo?did={}", - service.service_endpoint, did, - )) - .timeout(tokio::time::Duration::from_secs(ARGS.repo_download_timeout)) - .send() - .await?; - let bytes = get_repo_response.bytes().await?.to_vec(); - info!( - "Downloaded repo {} with size {:.2} MB", - did, - bytes.len() as f64 / (1000.0 * 1000.0) - ); - return Ok(bytes); -} +fn convert_repo_to_update( + repo: Vec, + did: &String, + retrieval_time: surrealdb::sql::Datetime, +) -> anyhow::Result { + // Deserialize CAR file + let (entries, _) = rs_car_sync::car_read_all(&mut repo.as_slice(), true)?; -/// Download the file for the given repo into a map -#[instrument(skip_all)] -fn deserialize_repo(mut bytes: Vec) -> anyhow::Result>> { - let (entries, header) = rs_car_sync::car_read_all(&mut bytes.as_slice(), true)?; - // let car_reader = CarReader::new(bytes.as_ref()).await?; + // Store the entries in a hashmap for easier access let files = entries .into_iter() - .map(|(cid, data)| { - let cid_bytes = cid.to_bytes(); - let cid: Cid = ipld_core::cid::Cid::read_bytes(cid_bytes.as_slice()).unwrap(); - (cid, data) + .try_fold(HashMap::new(), |mut files, (cid, data)| { + let cid = Cid::read_bytes(cid.to_bytes().as_slice()).unwrap(); + files.insert(cid, data); + anyhow::Result::>>::Ok(files) + })?; + + // Create references to the files and the did, so we can use them in the closure + let files_ref = &files; + let did_key = &crate::database::utils::did_to_key(&did)?; + + let mut update = files_ref + .iter() + // Convert to NodeData + .filter_map(|(_, data)| from_reader::(&data[..]).ok()) + // Convert to Updates + .flat_map(|node_data| { + // TODO: Understand this logic + let mut key = "".to_string(); + node_data.entries.into_iter().filter_map(move |entry| { + let k = match String::from_utf8(entry.key_suffix) { + Ok(k) => k, + Err(e) => return Some(Err(anyhow::Error::from(e))), + }; + key = format!("{}{}", key.split_at(entry.prefix_len as 
usize).0, k); + + let block = files_ref.get(&entry.value)?; + let record = from_reader::(&block[..]).ok()?; + let mut parts = key.split("/"); + + let collection = parts.next()?.to_string(); + let rkey = RecordKey::new(parts.next()?.to_string()).ok()?; + let update = on_commit_event_createorupdate( + Did::new(did.clone().into()).unwrap(), + did_key.clone(), + collection, + rkey, + record, + ); + Some(update) + }) }) - .try_fold(HashMap::new(), insert_into_map); - - files -} - -/// Apply updates to the database -#[instrument(skip_all)] -fn create_big_update(did: &str, updates: Vec) -> anyhow::Result { - let did_key = crate::database::utils::did_to_key(did)?; - let did = did.to_owned(); - let did_key = did_key.to_owned(); - - let mut db_updates = updates.into_iter().map(|update| { - let did_key = did_key.clone(); - let did = did.to_string(); + // Merge the updates + .try_fold(BigUpdate::default(), |mut acc, update| { + acc.merge(update?); + anyhow::Result::::Ok(acc) + })?; - let res = on_commit_event_createorupdate( - Did::new(did.clone().into()).unwrap(), - did_key, - update.collection, - update.rkey, - update.record, - ); + // Add the timestamp of when we retrieved the repo to the update + update.add_timestamp(&did, retrieval_time); - match res { - Ok(big_update) => { - return Ok(big_update); - } - Err(e) => { - warn!("on_commit_event_createorupdate {} {}", e, did); - return Err(e); - } - } - }); - let mut really_big_update = BigUpdate::default(); - loop { - let Some(result) = db_updates.next() else { - break; - }; - match result { - Ok(big_update) => { - really_big_update.merge(big_update); - } - Err(e) => { - warn!("Failed to apply update: {}", e); - return Err(e); - } - } - } - Ok(really_big_update) + Ok(update) } -// /// Indexes the repo with the given DID (Decentralized Identifier) -// async fn index_repo(db: &Surreal, http_client: &Client, did: &String) -> anyhow::Result<()> { -// { -// if check_indexed(&db, &did).await? { -// return Ok(()); -// } -// } - -// let now = std::time::SystemTime::now() -// .duration_since(std::time::UNIX_EPOCH) -// .unwrap(); - -// let service = { -// let Some(service) = get_plc_service(&http_client, &did).await? else { -// return Ok(()); -// }; -// service -// }; - -// let repo = { download_repo(&service, &did).await? }; -// let files = { deserialize_repo(repo).await? }; - -// let updates = { files_to_updates(files).await? }; -// let update_result = { apply_updates(&db, &did, updates, &now).await? 
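
(A worked sketch of the key decompression above: each MST entry stores only
`prefix_len`, the number of bytes it shares with the previous key, plus a
suffix, so full keys are rebuilt incrementally. The names and sample data
below are illustrative, not part of the patch:)

    /// Rebuild full record keys from (prefix_len, key_suffix) pairs.
    fn rebuild_keys(entries: &[(usize, &str)]) -> Vec<String> {
        let mut key = String::new();
        entries
            .iter()
            .map(|(prefix_len, suffix)| {
                key = format!("{}{}", &key[..*prefix_len], suffix);
                key.clone()
            })
            .collect()
    }

    // rebuild_keys(&[(0, "app.bsky.feed.like/3jui7"), (14, "post/3juj4")])
    //   == ["app.bsky.feed.like/3jui7", "app.bsky.feed.post/3juj4"]
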
}; -// Ok(()) -// } - -/// No processing has been done on this item -pub struct New {} +pub struct CommonState { + db: Surreal, + http_client: Client, + did: String, + span: Span, +} -/// Has a service -pub struct WithService { +/// First pipeline stage +pub struct DownloadService { + common: CommonState, +} +/// Second pipeline stage +pub struct DownloadRepo { + common: CommonState, service: PlcDirectoryDidResponseService, - // TODO: Figure out why now is created this early - now: std::time::Duration, } -/// Has files -pub struct WithRepo { - now: std::time::Duration, +/// Third pipeline stage +pub struct ProcessRepo { + common: CommonState, repo: Vec, + retrieval_time: surrealdb::sql::Datetime, } - -/// Has converted the files to update -pub struct WithUpdates { - now: std::time::Duration, - pub update: BigUpdate, -} - -pub struct PipelineItem { - db: Surreal, - http_client: Client, - did: String, - span: Span, - pub state: State, +/// Fourth pipeline stage +pub struct ApplyUpdates { + common: CommonState, + update: BigUpdate, } -impl PipelineItem { - pub fn new(db: Surreal, http_client: Client, did: String) -> PipelineItem { +impl DownloadService { + pub fn new(db: Surreal, http_client: Client, did: String) -> DownloadService { let span = span!(target: "backfill", parent: None, Level::INFO, "pipeline_item"); span.record("did", did.clone()); span.in_scope(|| { trace!("Start backfilling repo"); }); - PipelineItem:: { - db, - http_client, - did, - span, - state: New {}, + DownloadService { + common: CommonState { + db, + http_client, + did, + span, + }, } } } -impl Stage for PipelineItem { - type Output = PipelineItem; +impl Stage for DownloadService { + type Next = DownloadRepo; const NAME: &str = "download_information"; - // type F = Future> + Send + Sync + 'static; - fn run(self) -> impl Future> + Send + Sync + 'static { - return async { - let service = get_plc_service(&self.http_client, &self.did).await?; - let Some(service) = service else { - // TODO: Handle this better, as this is not really an error - return Err(anyhow::anyhow!("Failed to get a plc service")); - }; - Ok(PipelineItem:: { - state: WithService { - service: service, - now: std::time::SystemTime::now() - .duration_since(std::time::UNIX_EPOCH) - .unwrap(), - }, - db: self.db, - http_client: self.http_client, - did: self.did, - span: self.span, - }) - }; + async fn run(self) -> anyhow::Result { + let resp = self + .common + .http_client + .get(format!("https://plc.directory/{}", self.common.did)) + .timeout(Duration::from_secs(ARGS.directory_download_timeout)) + .send() + .await? 
+ .json::() + .await?; + let service = resp.service.into_iter().next().ok_or(anyhow::anyhow!( + "Failed to get a plc service for {}", + self.common.did + ))?; + Ok(DownloadRepo { + service: service, + common: self.common, + }) } } -impl Stage for PipelineItem { - type Output = PipelineItem; +impl Stage for DownloadRepo { + type Next = ProcessRepo; const NAME: &str = "download_repo"; - // type F = Future> + Send + Sync + 'static; - fn run(self) -> impl Future> + Send + Sync + 'static { - return async move { - let repo = download_repo(&self.state.service, &self.did).await?; - Ok(PipelineItem:: { - state: WithRepo { - now: self.state.now, - repo, - }, - db: self.db, - http_client: self.http_client, - did: self.did, - span: self.span, - }) - }; + async fn run(self) -> anyhow::Result { + let retrival_time = surrealdb::sql::Datetime::from(chrono::Utc::now()); + let get_repo_response = self + .common + .http_client + .get(format!( + "{}/xrpc/com.atproto.sync.getRepo?did={}", + self.service.service_endpoint, self.common.did, + )) + .timeout(tokio::time::Duration::from_secs(ARGS.repo_download_timeout)) + .send() + .await?; + let repo: Vec = get_repo_response.bytes().await?.into(); + trace!( + "Downloaded repo {} with size {:.2} MB", + self.common.did, + repo.len() as f64 / (1000.0 * 1000.0) + ); + Ok(ProcessRepo { + repo, + common: self.common, + retrieval_time: retrival_time, + }) } } -impl Stage for PipelineItem { - type Output = PipelineItem; +impl Stage for ProcessRepo { + type Next = ApplyUpdates; const NAME: &str = "process_repo"; - // type F = Future> + Send + Sync + 'static; - - fn run(self) -> impl Future> + Send + Sync + 'static { - return async move { - // info!("Deserializing repo {}", self.did); - let did = self.did.clone(); - let big_update = spawn_blocking(move || { - let files: HashMap, Vec> = - deserialize_repo(self.state.repo)?; - let updates = files_to_updates_blocking(files)?; - let mut big_update = create_big_update(&did, updates)?; - big_update.add_timestamp(&did, surrealdb::sql::Datetime::from(chrono::Utc::now())); - Result::::Ok(big_update) - }) - .await??; + async fn run(self) -> anyhow::Result { + let did = self.common.did.clone(); + let big_update = + spawn_blocking(move || convert_repo_to_update(self.repo, &did, self.retrieval_time)) + .await??; - Ok(PipelineItem:: { - state: WithUpdates { - now: self.state.now, - update: big_update, - }, - db: self.db, - http_client: self.http_client, - did: self.did, - span: self.span, - }) - }; + Ok(ApplyUpdates { + update: big_update, + common: self.common, + }) } } -impl Stage for PipelineItem { - type Output = NoNextStage; +impl Stage for ApplyUpdates { + type Next = NoNextStage; const NAME: &str = "apply_updates"; // type F = Future> + Send + Sync + 'static; - fn run(self) -> impl Future> + Send + Sync + 'static { - return async move { - let start = Instant::now(); - - if !ARGS.dont_write_when_backfilling.unwrap_or(false) { - self.state.update.apply(&self.db).await?; - } else { - eprintln!("Skipping writing to the database and sleeping instead"); - std::thread::sleep(Duration::from_secs(1)); - } - let duration = start.elapsed(); - eprintln!("Big update took {:?}", duration); - warn!("Big update took {:?}", duration); - // let _: Option = &self - // .db - // .update(("latest_backfill", did_key.clone())) - // .patch(PatchOp::replace( - // "at", - // surrealdb::sql::Datetime::from(chrono::Utc::now()), - // )) - // .await?; - Ok(NoNextStage {}) - }; - } -} - -// impl Stage for PipelineItem { -// type Output = PipelineItem; -// const 
NAME: &str = "done"; -// // type F = Future> + Send + Sync + 'static; - -// fn run(self) -> impl Future> + Send + Sync + 'static { -// return async move { -// trace!("Indexed {}", self.did); -// Ok(self) -// }; -// } -// } - -// impl PipelineItem { -// #[instrument(skip(self), parent = &self.span)] -// pub async fn get_service(self) -> anyhow::Result> { -// let service = get_plc_service(&self.http_client, &self.did).await?; -// let Some(service) = service else { -// // TODO: Handle this better, as this is not really an error -// return Err(anyhow::anyhow!("Failed to get a plc service")); -// }; -// Ok(PipelineItem:: { -// state: WithService { -// service: service, -// now: std::time::SystemTime::now() -// .duration_since(std::time::UNIX_EPOCH) -// .unwrap(), -// }, -// db: self.db, -// http_client: self.http_client, -// did: self.did, -// span: self.span, -// }) -// } -// } - -impl PipelineItem { - #[instrument(skip(self), parent = &self.span)] - pub async fn download_repo(self) -> anyhow::Result> { - let repo = download_repo(&self.state.service, &self.did).await?; - Ok(PipelineItem:: { - state: WithRepo { - now: self.state.now, - repo, - }, - db: self.db, - http_client: self.http_client, - did: self.did, - span: self.span, - }) - } -} - -impl PipelineItem { - #[instrument(skip(self), parent = &self.span)] - pub async fn process_repo(self) -> anyhow::Result> { - // info!("Deserializing repo {}", self.did); - let did = self.did.clone(); - let big_update = spawn_blocking(move || { - let files: HashMap, Vec> = - deserialize_repo(self.state.repo)?; - let updates = files_to_updates_blocking(files)?; - let mut big_update = create_big_update(&did, updates)?; - - big_update.add_timestamp(&did, surrealdb::sql::Datetime::from(chrono::Utc::now())); - Result::::Ok(big_update) - }) - .await??; - - Ok(PipelineItem:: { - state: WithUpdates { - now: self.state.now, - update: big_update, - }, - db: self.db, - http_client: self.http_client, - did: self.did, - span: self.span, - }) - } -} - -impl PipelineItem { - #[instrument(skip(self), parent = &self.span)] - pub async fn apply_updates(self) -> anyhow::Result> { - let start = Instant::now(); - + async fn run(self) -> anyhow::Result { if !ARGS.dont_write_when_backfilling.unwrap_or(false) { - self.state.update.apply(&self.db).await?; + self.update.apply(&self.common.db, "backfill").await?; } else { - eprintln!("Skipping writing to the database and sleeping instead"); - std::thread::sleep(Duration::from_secs(1)); + warn!("Skipping writing to the database and sleeping instead"); + std::thread::sleep(Duration::from_secs(2)); } - let duration = start.elapsed(); - eprintln!("Big update took {:?}", duration); - warn!("Big update took {:?}", duration); - // let _: Option = &self - // .db - // .update(("latest_backfill", did_key.clone())) - // .patch(PatchOp::replace( - // "at", - // surrealdb::sql::Datetime::from(chrono::Utc::now()), - // )) - // .await?; - Ok(PipelineItem:: { - state: NoNextStage {}, - db: self.db, - http_client: self.http_client, - did: self.did, - span: self.span, - }) - } -} - -impl PipelineItem { - #[instrument(skip(self), parent = &self.span)] - pub async fn print_report(self) -> anyhow::Result<()> { - // TODO: This is only for printing debug stuff - trace!("Indexed {}", self.did); - Ok(()) + Ok(NoNextStage {}) } } diff --git a/src/database/repo_indexer/pipeline.rs b/src/database/repo_indexer/pipeline.rs index c3e5c0f..b096b6e 100644 --- a/src/database/repo_indexer/pipeline.rs +++ b/src/database/repo_indexer/pipeline.rs @@ -28,10 +28,10 @@ 
impl NextStage for NoNextStage { } pub trait Stage { - type Output: NextStage + Sync + Send + 'static; + type Next: NextStage + Sync + Send + 'static; const NAME: &'static str; const FIRST: bool = false; - fn run(self) -> impl Future> + Send + Sync + 'static; + fn run(self) -> impl Future> + Send + Sync + 'static; } pub struct FirstStage< @@ -49,10 +49,10 @@ impl< F: Fn(I) -> O + Sync + Send + 'static, > Stage for FirstStage { - type Output = O; + type Next = O; const NAME: &'static str = "First"; const FIRST: bool = true; - fn run(self) -> impl Future> + Send + Sync + 'static { + fn run(self) -> impl Future> + Send + Sync + 'static { async move { Ok((self.f)(self.a)) } } } @@ -78,10 +78,10 @@ pub fn create_stage< } pub fn next_stage( -) -> impl Fn(FROM) -> Pin> + Send + 'static>> +) -> impl Fn(FROM) -> Pin> + Send + 'static>> where FROM: Send + Sync + 'static + Stage, - FROM::Output: Send + Sync + 'static, + FROM::Next: Send + Sync + 'static, { static TRACKER: LazyLock> = LazyLock::new(|| { global::meter("indexer") @@ -209,11 +209,11 @@ where }; // If we are done, we track as a completed pipeline. Otherwise track as queued for the next stage. - if !FROM::Output::DONE { + if !FROM::Next::DONE { TRACKER.add( 1, &[ - KeyValue::new("stage", FROM::Output::NAME), + KeyValue::new("stage", FROM::Next::NAME), KeyValue::new("state", "queued"), ], ); From 9ab7bd05ebae9a652ecb8a57d1d428afecb427d9 Mon Sep 17 00:00:00 2001 From: Zebreus Date: Wed, 5 Mar 2025 20:38:40 +0100 Subject: [PATCH 53/75] Add metrics for database updates --- src/database/handlers.rs | 623 +++++++++++++++++++++++---------------- 1 file changed, 372 insertions(+), 251 deletions(-) diff --git a/src/database/handlers.rs b/src/database/handlers.rs index 9bce49d..8dd24b3 100644 --- a/src/database/handlers.rs +++ b/src/database/handlers.rs @@ -11,14 +11,17 @@ use atrium_api::{ }; use chrono::Utc; use futures::FutureExt; +use opentelemetry::metrics::{Counter, Histogram}; +use opentelemetry::{global, Key, KeyValue}; use serde::{Deserialize, Serialize}; use serde_with::skip_serializing_none; use std::future::IntoFuture; +use std::sync::LazyLock; use std::time::Instant; use surrealdb::method::Query; use surrealdb::Datetime; use surrealdb::{engine::any::Any, RecordId, Surreal}; -use tracing::{instrument, span, warn, Instrument, Level}; +use tracing::{debug, instrument, span, trace, warn, Instrument, Level}; use crate::websocket::events::{Commit, Kind}; @@ -52,7 +55,7 @@ pub async fn handle_event(db: &Surreal, event: Kind) -> Result<()> { } => { let big_update = on_commit_event_createorupdate(did, did_key, collection, rkey, record)?; - big_update.apply(db).await?; + big_update.apply(db, "jetstream").await?; } Commit::Delete { rev, @@ -271,6 +274,176 @@ struct WithId { data: R, } +static QUERY_DURATION_METRIC: LazyLock> = LazyLock::new(|| { + global::meter("indexer") + .u64_histogram("indexer.database.insert_duration") + .with_unit("ms") + .with_description("Big update duration") + .with_boundaries(vec![ + 0.0, 5.0, 10.0, 25.0, 50.0, 75.0, 100.0, 250.0, 500.0, 750.0, 1000.0, 2500.0, 5000.0, + 7500.0, 10000.0, 25000.0, 50000.0, 75000.0, 100000.0, 250000.0, 500000.0, 750000.0, + 1000000.0, 2500000.0, + ]) + .build() +}); +static INSERTED_ROWS_METRIC: LazyLock> = LazyLock::new(|| { + global::meter("indexer") + .u64_counter("indexer.database.inserted_elements") + .with_unit("rows") + .with_description("Inserted or updated rows") + .build() +}); +static INSERTED_SIZE_METRIC: LazyLock> = LazyLock::new(|| { + global::meter("indexer") + 
.u64_counter("indexer.database.inserted_bytes") + .with_unit("By") + .with_description("Inserted or updated bytes (approximation)") + .build() +}); +static TRANSACTIONS_METRIC: LazyLock> = LazyLock::new(|| { + global::meter("indexer") + .u64_counter("indexer.database.transactions") + .with_unit("By") + .with_description("Number of transactions") + .build() +}); + +struct BigUpdateInfoRow { + count: u64, + size: u64, +} +impl core::fmt::Debug for BigUpdateInfoRow { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + f.debug_map() + .entry(&"count", &self.count) + .entry(&"mB", &(self.size as f64 / 1024.0 / 1024.0)) + .finish() + } +} + +struct BigUpdateInfo { + // Info about individual tables + did: BigUpdateInfoRow, + follows: BigUpdateInfoRow, + latest_backfills: BigUpdateInfoRow, + likes: BigUpdateInfoRow, + reposts: BigUpdateInfoRow, + blocks: BigUpdateInfoRow, + listblocks: BigUpdateInfoRow, + listitems: BigUpdateInfoRow, + feeds: BigUpdateInfoRow, + lists: BigUpdateInfoRow, + threadgates: BigUpdateInfoRow, + starterpacks: BigUpdateInfoRow, + postgates: BigUpdateInfoRow, + actordeclarations: BigUpdateInfoRow, + labelerservices: BigUpdateInfoRow, + quotes: BigUpdateInfoRow, + posts: BigUpdateInfoRow, + replies_relations: BigUpdateInfoRow, + reply_to_relations: BigUpdateInfoRow, + posts_relations: BigUpdateInfoRow, + overwrite_latest_backfills: BigUpdateInfoRow, +} + +impl BigUpdateInfo { + fn all_relations(&self) -> BigUpdateInfoRow { + BigUpdateInfoRow { + count: self.likes.count + + self.reposts.count + + self.blocks.count + + self.listblocks.count + + self.listitems.count + + self.replies_relations.count + + self.reply_to_relations.count + + self.posts_relations.count + + self.quotes.count + + self.follows.count, + size: self.likes.size + + self.reposts.size + + self.blocks.size + + self.listblocks.size + + self.listitems.size + + self.replies_relations.size + + self.reply_to_relations.size + + self.posts_relations.size + + self.quotes.size + + self.follows.size, + } + } + fn all_tables(&self) -> BigUpdateInfoRow { + BigUpdateInfoRow { + count: self.did.count + + self.feeds.count + + self.lists.count + + self.threadgates.count + + self.starterpacks.count + + self.postgates.count + + self.actordeclarations.count + + self.labelerservices.count + + self.posts.count, + size: self.did.size + + self.feeds.size + + self.lists.size + + self.threadgates.size + + self.starterpacks.size + + self.postgates.size + + self.actordeclarations.size + + self.labelerservices.size + + self.posts.size, + } + } + fn all(&self) -> BigUpdateInfoRow { + BigUpdateInfoRow { + count: self.all_relations().count + self.all_tables().count, + size: self.all_relations().size + self.all_tables().size, + } + } + + fn record_metrics(&self, source: &str) { + INSERTED_ROWS_METRIC.add( + self.all().count, + &[KeyValue::new("source", source.to_string())], + ); + INSERTED_SIZE_METRIC.add( + self.all().size, + &[KeyValue::new("source", source.to_string())], + ); + TRANSACTIONS_METRIC.add(1, &[]); + } +} + +impl core::fmt::Debug for BigUpdateInfo { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + f.debug_map() + .entry(&"did", &self.did) + .entry(&"follows", &self.follows) + .entry(&"latest_backfills", &self.latest_backfills) + .entry(&"likes", &self.likes) + .entry(&"reposts", &self.reposts) + .entry(&"blocks", &self.blocks) + .entry(&"listblocks", &self.listblocks) + .entry(&"listitems", &self.listitems) + .entry(&"feeds", &self.feeds) + .entry(&"lists", &self.lists) + 
.entry(&"threadgates", &self.threadgates) + .entry(&"starterpacks", &self.starterpacks) + .entry(&"postgates", &self.postgates) + .entry(&"actordeclarations", &self.actordeclarations) + .entry(&"labelerservices", &self.labelerservices) + .entry(&"quotes", &self.quotes) + .entry(&"posts", &self.posts) + .entry(&"replies_relations", &self.replies_relations) + .entry(&"reply_to_relations", &self.reply_to_relations) + .entry(&"posts_relations", &self.posts_relations) + .entry( + &"overwrite_latest_backfills", + &self.overwrite_latest_backfills, + ) + .finish() + } +} + #[derive(Default)] pub struct BigUpdate { /// Insert into did @@ -332,9 +505,15 @@ impl BigUpdate { }); } - pub async fn apply(self, db: &Surreal) -> Result<()> { - let format_output = tokio::task::block_in_place(|| format!("{:?}", &self)); - //TODO: Bundle this into a function + /// Apply this update to the database + /// + /// `source` is a string describing the source of the update, used for metrics + pub async fn apply(self, db: &Surreal, source: &str) -> Result<()> { + let start = Instant::now(); + // Convert the update to a string for logging later + let info = tokio::task::block_in_place(|| self.create_info()); + + // Create the query string let query_string = r#" BEGIN; INSERT IGNORE INTO did $dids RETURN NONE; @@ -361,7 +540,7 @@ impl BigUpdate { COMMIT; "#; - let before_update = Instant::now(); + // Create the update query. Does not take that long; ~50ms for 30000 rows let update = tokio::task::block_in_place(|| { db.query(query_string) .bind(("dids", self.did)) @@ -388,265 +567,207 @@ impl BigUpdate { .into_future() .instrument(span!(Level::INFO, "query")) }); - let duration = before_update.elapsed(); + + let preparation_duration = start.elapsed(); let after_update = Instant::now(); update.await?; let update_duration = after_update.elapsed(); - eprintln!( - "Update creation took {}ms, execution took {}ms; update: {}", - duration.as_millis(), + QUERY_DURATION_METRIC.record(update_duration.as_millis() as u64, &[]); + info.record_metrics(source); + + trace!( + "Applied updated: {} elements, {}MB, {:03}ms preparation, {:03}ms applying", + info.all().count, + info.all().size as f64 / 1024.0 / 1024.0, + preparation_duration.as_millis(), update_duration.as_millis(), - format_output ); + debug!("Detailed infos: {:?}", info); Ok(()) } + + fn create_info(&self) -> BigUpdateInfo { + BigUpdateInfo { + did: BigUpdateInfoRow { + count: self.did.len() as u64, + size: self + .did + .iter() + .map(|e| serde_ipld_dagcbor::to_vec(e).unwrap().len() as u64) + .sum(), + }, + follows: BigUpdateInfoRow { + count: self.follows.len() as u64, + size: self + .follows + .iter() + .map(|e| serde_ipld_dagcbor::to_vec(e).unwrap().len() as u64) + .sum(), + }, + latest_backfills: BigUpdateInfoRow { + count: self.latest_backfills.len() as u64, + size: self + .latest_backfills + .iter() + .map(|e| serde_ipld_dagcbor::to_vec(e).unwrap().len() as u64) + .sum(), + }, + likes: BigUpdateInfoRow { + count: self.likes.len() as u64, + size: self + .likes + .iter() + .map(|e| serde_ipld_dagcbor::to_vec(e).unwrap().len() as u64) + .sum(), + }, + reposts: BigUpdateInfoRow { + count: self.reposts.len() as u64, + size: self + .reposts + .iter() + .map(|e| serde_ipld_dagcbor::to_vec(e).unwrap().len() as u64) + .sum(), + }, + blocks: BigUpdateInfoRow { + count: self.blocks.len() as u64, + size: self + .blocks + .iter() + .map(|e| serde_ipld_dagcbor::to_vec(e).unwrap().len() as u64) + .sum(), + }, + listblocks: BigUpdateInfoRow { + count: self.listblocks.len() as u64, 
+ size: self + .listblocks + .iter() + .map(|e| serde_ipld_dagcbor::to_vec(e).unwrap().len() as u64) + .sum(), + }, + listitems: BigUpdateInfoRow { + count: self.listitems.len() as u64, + size: self + .listitems + .iter() + .map(|e| serde_ipld_dagcbor::to_vec(e).unwrap().len() as u64) + .sum(), + }, + feeds: BigUpdateInfoRow { + count: self.feeds.len() as u64, + size: self + .feeds + .iter() + .map(|e| serde_ipld_dagcbor::to_vec(e).unwrap().len() as u64) + .sum(), + }, + lists: BigUpdateInfoRow { + count: self.lists.len() as u64, + size: self + .lists + .iter() + .map(|e| serde_ipld_dagcbor::to_vec(e).unwrap().len() as u64) + .sum(), + }, + threadgates: BigUpdateInfoRow { + count: self.threadgates.len() as u64, + size: self + .threadgates + .iter() + .map(|e| serde_ipld_dagcbor::to_vec(e).unwrap().len() as u64) + .sum(), + }, + starterpacks: BigUpdateInfoRow { + count: self.starterpacks.len() as u64, + size: self + .starterpacks + .iter() + .map(|e| serde_ipld_dagcbor::to_vec(e).unwrap().len() as u64) + .sum(), + }, + postgates: BigUpdateInfoRow { + count: self.postgates.len() as u64, + size: self + .postgates + .iter() + .map(|e| serde_ipld_dagcbor::to_vec(e).unwrap().len() as u64) + .sum(), + }, + actordeclarations: BigUpdateInfoRow { + count: self.actordeclarations.len() as u64, + size: self + .actordeclarations + .iter() + .map(|e| serde_ipld_dagcbor::to_vec(e).unwrap().len() as u64) + .sum(), + }, + labelerservices: BigUpdateInfoRow { + count: self.labelerservices.len() as u64, + size: self + .labelerservices + .iter() + .map(|e| serde_ipld_dagcbor::to_vec(e).unwrap().len() as u64) + .sum(), + }, + quotes: BigUpdateInfoRow { + count: self.quotes.len() as u64, + size: self + .quotes + .iter() + .map(|e| serde_ipld_dagcbor::to_vec(e).unwrap().len() as u64) + .sum(), + }, + posts: BigUpdateInfoRow { + count: self.posts.len() as u64, + size: self + .posts + .iter() + .map(|e| serde_ipld_dagcbor::to_vec(e).unwrap().len() as u64) + .sum(), + }, + replies_relations: BigUpdateInfoRow { + count: self.replies_relations.len() as u64, + size: self + .replies_relations + .iter() + .map(|e| serde_ipld_dagcbor::to_vec(e).unwrap().len() as u64) + .sum(), + }, + reply_to_relations: BigUpdateInfoRow { + count: self.reply_to_relations.len() as u64, + size: self + .reply_to_relations + .iter() + .map(|e| serde_ipld_dagcbor::to_vec(e).unwrap().len() as u64) + .sum(), + }, + posts_relations: BigUpdateInfoRow { + count: self.posts_relations.len() as u64, + size: self + .posts_relations + .iter() + .map(|e| serde_ipld_dagcbor::to_vec(e).unwrap().len() as u64) + .sum(), + }, + overwrite_latest_backfills: BigUpdateInfoRow { + count: self.overwrite_latest_backfills.len() as u64, + size: self + .overwrite_latest_backfills + .iter() + .map(|e| serde_ipld_dagcbor::to_vec(e).unwrap().len() as u64) + .sum(), + }, + } + } } impl core::fmt::Debug for BigUpdate { fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { - let did_size = self - .did - .iter() - .map(|e| serde_ipld_dagcbor::to_vec(e).unwrap().len()) - .sum::(); - let follows_size = self - .follows - .iter() - .map(|e| serde_ipld_dagcbor::to_vec(e).unwrap().len()) - .sum::(); - let latest_backfills_size = self - .latest_backfills - .iter() - .map(|e| serde_ipld_dagcbor::to_vec(e).unwrap().len()) - .sum::(); - let likes_size = self - .likes - .iter() - .map(|e| serde_ipld_dagcbor::to_vec(e).unwrap().len()) - .sum::(); - let reposts_size = self - .reposts - .iter() - .map(|e| serde_ipld_dagcbor::to_vec(e).unwrap().len()) - .sum::(); - let 
blocks_size = self - .blocks - .iter() - .map(|e| serde_ipld_dagcbor::to_vec(e).unwrap().len()) - .sum::(); - let listblocks_size = self - .listblocks - .iter() - .map(|e| serde_ipld_dagcbor::to_vec(e).unwrap().len()) - .sum::(); - let listitems_size = self - .listitems - .iter() - .map(|e| serde_ipld_dagcbor::to_vec(e).unwrap().len()) - .sum::(); - let feeds_size = self - .feeds - .iter() - .map(|e| serde_ipld_dagcbor::to_vec(e).unwrap().len()) - .sum::(); - let lists_size = self - .lists - .iter() - .map(|e| serde_ipld_dagcbor::to_vec(e).unwrap().len()) - .sum::(); - let threadgates_size = self - .threadgates - .iter() - .map(|e| serde_ipld_dagcbor::to_vec(e).unwrap().len()) - .sum::(); - let starterpacks_size = self - .starterpacks - .iter() - .map(|e| serde_ipld_dagcbor::to_vec(e).unwrap().len()) - .sum::(); - let postgates_size = self - .postgates - .iter() - .map(|e| serde_ipld_dagcbor::to_vec(e).unwrap().len()) - .sum::(); - let actordeclarations_size = self - .actordeclarations - .iter() - .map(|e| serde_ipld_dagcbor::to_vec(e).unwrap().len()) - .sum::(); - let labelerservices_size = self - .labelerservices - .iter() - .map(|e| serde_ipld_dagcbor::to_vec(e).unwrap().len()) - .sum::(); - let quotes_size = self - .quotes - .iter() - .map(|e| serde_ipld_dagcbor::to_vec(e).unwrap().len()) - .sum::(); - let posts_size = self - .posts - .iter() - .map(|e| serde_ipld_dagcbor::to_vec(e).unwrap().len()) - .sum::(); - let replies_relations_size = self - .replies_relations - .iter() - .map(|e| serde_ipld_dagcbor::to_vec(e).unwrap().len()) - .sum::(); - let reply_to_relations_size = self - .reply_to_relations - .iter() - .map(|e| serde_ipld_dagcbor::to_vec(e).unwrap().len()) - .sum::(); - let posts_relations_size = self - .posts_relations - .iter() - .map(|e| serde_ipld_dagcbor::to_vec(e).unwrap().len()) - .sum::(); - let overwrite_latest_backfills_size = self - .overwrite_latest_backfills - .iter() - .map(|e| serde_ipld_dagcbor::to_vec(e).unwrap().len()) - .sum::(); - let number_relations = self.follows.len() - + self.likes.len() - + self.reposts.len() - + self.blocks.len() - + self.listblocks.len() - + self.listitems.len() - + self.replies_relations.len() - + self.reply_to_relations.len() - + self.posts_relations.len() - + self.quotes.len(); - let number_inserts = self.did.len() - + self.latest_backfills.len() - + self.feeds.len() - + self.lists.len() - + self.threadgates.len() - + self.starterpacks.len() - + self.postgates.len() - + self.actordeclarations.len() - + self.labelerservices.len() - + self.posts.len() - + self.overwrite_latest_backfills.len(); - let number_total = number_relations + number_inserts; - let size_relations = replies_relations_size - + reply_to_relations_size - + posts_relations_size - + quotes_size - + likes_size - + reposts_size - + blocks_size - + listblocks_size - + listitems_size; - let size_inserts = did_size - + latest_backfills_size - + feeds_size - + lists_size - + threadgates_size - + starterpacks_size - + postgates_size - + actordeclarations_size - + labelerservices_size - + posts_size - + overwrite_latest_backfills_size; - let size_total = size_relations + size_inserts; - f.debug_struct("BigUpdate") - .field("updates", &number_total) - .field("updates_size_mb", &(size_total as f64 / 1024.0 / 1024.0)) - .field("number_relations", &number_relations) - .field("number_inserts", &number_inserts) - .field( - "size_relations_mb", - &(size_relations as f64 / 1024.0 / 1024.0), - ) - .field("size_inserts_mb", &(size_inserts as f64 / 1024.0 / 1024.0)) - .field("did", 
&self.did.len()) - .field("did_size_mb", &(did_size as f64 / 1024.0 / 1024.0)) - .field("follows", &self.follows.len()) - .field("follows_size_mb", &(follows_size as f64 / 1024.0 / 1024.0)) - .field("latest_backfills", &self.latest_backfills.len()) - .field( - "latest_backfills_size_mb", - &(latest_backfills_size as f64 / 1024.0 / 1024.0), - ) - .field("likes", &self.likes.len()) - .field("likes_size_mb", &(likes_size as f64 / 1024.0 / 1024.0)) - .field("reposts", &self.reposts.len()) - .field("reposts_size_mb", &(reposts_size as f64 / 1024.0 / 1024.0)) - .field("blocks", &self.blocks.len()) - .field("blocks_size_mb", &(blocks_size as f64 / 1024.0 / 1024.0)) - .field("listblocks", &self.listblocks.len()) - .field( - "listblocks_size_mb", - &(listblocks_size as f64 / 1024.0 / 1024.0), - ) - .field("listitems", &self.listitems.len()) - .field( - "listitems_size_mb", - &(listitems_size as f64 / 1024.0 / 1024.0), - ) - .field("feeds", &self.feeds.len()) - .field("feeds_size_mb", &(feeds_size as f64 / 1024.0 / 1024.0)) - .field("lists", &self.lists.len()) - .field("lists_size_mb", &(lists_size as f64 / 1024.0 / 1024.0)) - .field("threadgates", &self.threadgates.len()) - .field( - "threadgates_size_mb", - &(threadgates_size as f64 / 1024.0 / 1024.0), - ) - .field("starterpacks", &self.starterpacks.len()) - .field( - "starterpacks_size_mb", - &(starterpacks_size as f64 / 1024.0 / 1024.0), - ) - .field("postgates", &self.postgates.len()) - .field( - "postgates_size_mb", - &(postgates_size as f64 / 1024.0 / 1024.0), - ) - .field("actordeclarations", &self.actordeclarations.len()) - .field( - "actordeclarations_size_mb", - &(actordeclarations_size as f64 / 1024.0 / 1024.0), - ) - .field("labelerservices", &self.labelerservices.len()) - .field( - "labelerservices_size_mb", - &(labelerservices_size as f64 / 1024.0 / 1024.0), - ) - .field("quotes", &self.quotes.len()) - .field("quotes_size_mb", &(quotes_size as f64 / 1024.0 / 1024.0)) - .field("posts", &self.posts.len()) - .field("posts_size_mb", &(posts_size as f64 / 1024.0 / 1024.0)) - .field("replies_relations", &self.replies_relations.len()) - .field( - "replies_relations_size_mb", - &(replies_relations_size as f64 / 1024.0 / 1024.0), - ) - .field("reply_to_relations", &self.reply_to_relations.len()) - .field( - "reply_to_relations_size_mb", - &(reply_to_relations_size as f64 / 1024.0 / 1024.0), - ) - .field("posts_relations", &self.posts_relations.len()) - .field( - "posts_relations_size_mb", - &(posts_relations_size as f64 / 1024.0 / 1024.0), - ) - .field( - "overwrite_latest_backfills", - &self.overwrite_latest_backfills.len(), - ) - .field( - "overwrite_latest_backfills_size_mb", - &(overwrite_latest_backfills_size as f64 / 1024.0 / 1024.0), - ) - .finish() + let info = self.create_info(); + info.fmt(f) } } + /// If the new commit is a create or update, handle it #[instrument(skip(record))] pub fn on_commit_event_createorupdate( From a0616c2263fca64496019a23b6f222ed14e3e92a Mon Sep 17 00:00:00 2001 From: Zebreus Date: Wed, 5 Mar 2025 21:24:41 +0100 Subject: [PATCH 54/75] Add tracing to the pipeline steps --- src/database/repo_indexer/index_repo.rs | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/database/repo_indexer/index_repo.rs b/src/database/repo_indexer/index_repo.rs index b2b4c2e..e66d944 100644 --- a/src/database/repo_indexer/index_repo.rs +++ b/src/database/repo_indexer/index_repo.rs @@ -133,6 +133,7 @@ fn convert_repo_to_update( Ok(update) } +#[derive(Debug)] pub struct CommonState { db: Surreal, 
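// The `#[instrument]` attributes added to the stage impls below open one
// tracing span per stage run and parent it to the long-lived per-repo span
// carried in the shared state. A reduced sketch of that annotation shape;
// `Job` and `run_stage` are hypothetical and stand in for the real stage
// types:
use tracing::{instrument, Span};

struct Job {
    did: String,
    span: Span,
}

#[instrument(skip(job), fields(did = %job.did), parent = job.span.clone())]
async fn run_stage(job: &Job) -> anyhow::Result<()> {
    tracing::debug!("runs inside a child of the per-repo span");
    Ok(())
}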
http_client: Client, @@ -141,21 +142,25 @@ pub struct CommonState { } /// First pipeline stage +#[derive(Debug)] pub struct DownloadService { common: CommonState, } /// Second pipeline stage +#[derive(Debug)] pub struct DownloadRepo { common: CommonState, service: PlcDirectoryDidResponseService, } /// Third pipeline stage +#[derive(Debug)] pub struct ProcessRepo { common: CommonState, repo: Vec, retrieval_time: surrealdb::sql::Datetime, } /// Fourth pipeline stage +#[derive(Debug)] pub struct ApplyUpdates { common: CommonState, update: BigUpdate, @@ -183,6 +188,7 @@ impl Stage for DownloadService { type Next = DownloadRepo; const NAME: &str = "download_information"; + #[instrument(skip(self), fields(did = self.common.did), parent = self.common.span.clone())] async fn run(self) -> anyhow::Result { let resp = self .common @@ -208,6 +214,7 @@ impl Stage for DownloadRepo { type Next = ProcessRepo; const NAME: &str = "download_repo"; + #[instrument(skip(self), fields(did = self.common.did), parent = self.common.span.clone())] async fn run(self) -> anyhow::Result { let retrival_time = surrealdb::sql::Datetime::from(chrono::Utc::now()); let get_repo_response = self @@ -238,6 +245,7 @@ impl Stage for ProcessRepo { type Next = ApplyUpdates; const NAME: &str = "process_repo"; + #[instrument(skip(self), fields(did = self.common.did), parent = self.common.span.clone())] async fn run(self) -> anyhow::Result { let did = self.common.did.clone(); let big_update = @@ -254,8 +262,8 @@ impl Stage for ProcessRepo { impl Stage for ApplyUpdates { type Next = NoNextStage; const NAME: &str = "apply_updates"; - // type F = Future> + Send + Sync + 'static; + #[instrument(skip(self), fields(did = self.common.did), parent = self.common.span.clone())] async fn run(self) -> anyhow::Result { if !ARGS.dont_write_when_backfilling.unwrap_or(false) { self.update.apply(&self.common.db, "backfill").await?; From 6dcacade0b841ad7038ba28522bbf4646611df79 Mon Sep 17 00:00:00 2001 From: Zebreus Date: Wed, 5 Mar 2025 21:25:05 +0100 Subject: [PATCH 55/75] Refactor RepoStream --- src/config.rs | 3 + src/database/repo_indexer/repo_stream.rs | 83 ++++++++++-------------- 2 files changed, 39 insertions(+), 47 deletions(-) diff --git a/src/config.rs b/src/config.rs index 21e70e3..621ea68 100644 --- a/src/config.rs +++ b/src/config.rs @@ -70,6 +70,9 @@ pub struct Args { /// If this is longer than the pipeline_stage_timeout, the pipeline_stage_timeout will be used #[arg(long, default_value = "60")] pub directory_download_timeout: u64, + /// Number of DIDs the RepoStream should prefetch + #[arg(long, default_value = "10000")] + pub repo_stream_buffer_size: usize, } pub const ARGS: LazyLock = LazyLock::new(|| Args::parse()); diff --git a/src/database/repo_indexer/repo_stream.rs b/src/database/repo_indexer/repo_stream.rs index 3e3317f..ab86b9d 100644 --- a/src/database/repo_indexer/repo_stream.rs +++ b/src/database/repo_indexer/repo_stream.rs @@ -1,14 +1,13 @@ +use crate::{config::ARGS, database::utils::unsafe_user_key_to_did}; +use futures::Stream; +use serde::Deserialize; use std::{ collections::{HashSet, VecDeque}, future::{Future, IntoFuture}, task::Poll, }; - -use futures::Stream; -use serde::Deserialize; use surrealdb::{engine::any::Any, Surreal}; - -use crate::database::utils::unsafe_user_key_to_did; +use tracing::{error, trace}; pub struct RepoStream { buffer: VecDeque, @@ -21,6 +20,7 @@ pub struct RepoStream { >, } +#[allow(dead_code)] #[derive(Deserialize)] struct LatestBackfill { pub at: Option, @@ -38,24 +38,6 @@ impl RepoStream { 
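// The `poll_next` refactor in the hunk below follows a standard shape: drain
// a local buffer first, otherwise poll a stored refill future, then loop so a
// freshly filled buffer can yield immediately. A stripped-down sketch of that
// shape; `fetch_batch` is a hypothetical stand-in for the SurrealDB query,
// and unlike the indexer (which retries) this sketch ends the stream on an
// empty batch:
use futures::Stream;
use std::collections::VecDeque;
use std::future::Future;
use std::pin::Pin;
use std::task::{Context, Poll};

async fn fetch_batch() -> Vec<String> {
    Vec::new() // stand-in for the paged database query
}

struct Buffered {
    buffer: VecDeque<String>,
    refill: Option<Pin<Box<dyn Future<Output = Vec<String>> + Send>>>,
}

impl Stream for Buffered {
    type Item = String;

    fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<String>> {
        // Every field is Unpin, so the fields can be reached directly.
        let this = self.get_mut();
        loop {
            if let Some(next) = this.buffer.pop_front() {
                return Poll::Ready(Some(next));
            }
            let fut = this.refill.get_or_insert_with(|| Box::pin(fetch_batch()));
            let batch = match fut.as_mut().poll(cx) {
                Poll::Ready(batch) => batch,
                Poll::Pending => return Poll::Pending,
            };
            this.refill = None;
            if batch.is_empty() {
                return Poll::Ready(None);
            }
            this.buffer.extend(batch);
        }
    }
}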
} } -const FETCH_AMOUNT: usize = 10000; - -// async fn get_repos_from(db: &Surreal, anchor: &str) -> Vec { -// info!(target: "indexer", "Discovering follows starting from {}", anchor); -// let mut result = db -// // TODO: Fix the possible SQL injection -// .query(format!( -// "SELECT id,in,out FROM follow:{}.. LIMIT {};", -// anchor, FETCH_AMOUNT -// )); -// let follows: Vec = result.take(0)?; - -// let Some(anchor_key) = follows.last().map(|follow| follow.id.key()) else { -// sleep(DISCOVERY_CAUGHT_UP_BACKOFF).await; -// continue; -// }; -// } - impl Stream for RepoStream { type Item = String; @@ -67,34 +49,44 @@ impl Stream for RepoStream { if let Some(next) = self.buffer.pop_front() { return Poll::Ready(Some(next)); } - eprintln!("RepoStream not ready, fetching more data"); - - if self.db_future.is_none() { - self.db_future = Some( - self.db + trace!("RepoStream buffer empty, fetching more data"); + + // Get a running query or create a new db query + let db_future = match &mut self.db_future { + Some(db_future) => db_future, + _ => { + let db_future = self + .db // TODO: Fix the possible SQL injection - .query(r#"SELECT of FROM latest_backfill WHERE at IS NONE LIMIT $limit;"#) - .bind(("limit", FETCH_AMOUNT)) + .query("SELECT of FROM latest_backfill WHERE at IS NONE LIMIT $limit;") + .bind(("limit", ARGS.repo_stream_buffer_size)) .into_owned() - .into_future(), - ); - } - let db_future = self.db_future.as_mut().unwrap(); + .into_future(); + self.db_future = Some(db_future); + self.db_future.as_mut().unwrap() + } + }; let Poll::Ready(result) = Future::poll(db_future.as_mut(), cx) else { return Poll::Pending; }; self.db_future = None; - let mut result = result.unwrap(); - - let follows: Vec = result.take(0).unwrap(); + let mut result = match result { + Ok(result) => result, + Err(err) => { + error!("RepoStream error: {:?}", err); + continue; + } + }; - // let Some(anchor_key) = follows.last().map(|follow| follow.id.key()) else { - // // TODO: Sleep again - // return Poll::Pending; - // }; - // self.anchor = format!("{}", anchor_key); + let follows: Vec = match result.take(0) { + Ok(follows) => follows, + Err(err) => { + error!("RepoStream database error: {:?}", err); + continue; + } + }; let starttime = std::time::Instant::now(); for latest_backfill in &follows { @@ -108,16 +100,13 @@ impl Stream for RepoStream { self.buffer.push_back(did); } let duration = starttime.elapsed(); - eprintln!( + trace!( "RepoStream processed {} records in {}ms", follows.len(), duration.as_millis() ); - if let Some(next) = self.buffer.pop_front() { - return Poll::Ready(Some(next)); - } - return Poll::Pending; + // Loop to see if we can return a value now } } } From a0f92f045ac563c675ca1e658db9de616bee1cfa Mon Sep 17 00:00:00 2001 From: Zebreus Date: Wed, 5 Mar 2025 21:27:36 +0100 Subject: [PATCH 56/75] Clean up database/mod.ts --- src/database/mod.rs | 37 ++++++------------------------------ src/database/repo_indexer.rs | 18 ++++++++++-------- src/main.rs | 4 ++-- 3 files changed, 18 insertions(+), 41 deletions(-) diff --git a/src/database/mod.rs b/src/database/mod.rs index b3ef6ab..aa7f8d3 100644 --- a/src/database/mod.rs +++ b/src/database/mod.rs @@ -1,52 +1,27 @@ -use std::{sync::LazyLock, time::Duration}; - +use crate::config::ARGS; use anyhow::{Context, Result}; use definitions::{JetstreamCursor, Record}; -use surrealdb::{ - engine::{ - any::Any, - remote::ws::{Client, Ws}, - }, - opt::{ - auth::{Credentials, Root}, - Config, - }, - RecordId, Surreal, -}; -use tracing::{debug, info}; - -use 
crate::config::ARGS; +use surrealdb::{engine::any::Any, opt::auth::Root, RecordId, Surreal}; +use tracing::info; pub mod definitions; pub mod handlers; pub mod repo_indexer; mod utils; -static DB: LazyLock> = LazyLock::new(Surreal::init); - /// Connect to the database -pub async fn connect( - db_endpoint: &str, - username: &str, - password: &str, -) -> anyhow::Result> { +pub async fn connect(db_endpoint: &str) -> anyhow::Result> { // connect to the database info!(target: "indexer", "Connecting to the database at {}", db_endpoint); - // let db = Surreal::new::<_>(db_endpoint).await?; let db = surrealdb::engine::any::connect(db_endpoint) .with_capacity(ARGS.surrealdb_capacity) .await?; db.signin(Root { - username: "root", - password: "root", + username: &ARGS.username, + password: &ARGS.password, }) .await?; - // let config = Config::default().query_timeout(Duration::from_millis(1500)); - // let dbb = DB.connect::("127.0.0.1:8000", Op) - - // sign in to the server - definitions::init(&db) .await .context("Failed to initialize database schema")?; diff --git a/src/database/repo_indexer.rs b/src/database/repo_indexer.rs index 3c81229..55b684b 100644 --- a/src/database/repo_indexer.rs +++ b/src/database/repo_indexer.rs @@ -31,19 +31,21 @@ pub async fn start_full_repo_indexer(db: Surreal) -> anyhow::Result<()> { let databases = ARGS .db .iter() - .map(|x| async { connect(x, &ARGS.username, &ARGS.password).await.unwrap() }) + .map(|endpoint| async { connect(endpoint).await.unwrap() }) .collect::>() .collect::>() .await; // Create a stream of dids + captured database and http client - let dids = RepoStream::new(db.clone()).enumerate().map(move |(id, x)| { - ( - x.to_string(), - databases.get(id.rem(databases.len())).unwrap().clone(), - http_client.clone(), - ) - }); + let dids = RepoStream::new(db.clone()) + .enumerate() + .map(move |(id, did)| { + ( + did, + databases.get(id.rem(databases.len())).unwrap().clone(), + http_client.clone(), + ) + }); // Create the processing pipeline let (mut output_receiver, _join_handle) = pumps::Pipeline::from_stream(dids) diff --git a/src/main.rs b/src/main.rs index af802a6..10d48ef 100644 --- a/src/main.rs +++ b/src/main.rs @@ -67,8 +67,8 @@ fn main() { async fn application_main() -> anyhow::Result<()> { let _otel_guard = init_observability().await; - // connect to the database - let db = database::connect(&ARGS.db.first().unwrap(), &ARGS.username, &ARGS.password) + // Connect to the database + let db = database::connect(&ARGS.db.first().unwrap()) .await .context("Failed to connect to the database")?; From f4701b85e9cac14238f3fed37c0f66e6c9b591a7 Mon Sep 17 00:00:00 2001 From: Zebreus Date: Wed, 5 Mar 2025 21:40:33 +0100 Subject: [PATCH 57/75] Split BigUpdate into its own file --- src/database/big_update.rs | 1112 +++++++++++++++++++ src/database/definitions.rs | 2 +- src/database/handlers.rs | 1295 +---------------------- src/database/mod.rs | 1 + src/database/repo_indexer/index_repo.rs | 4 +- 5 files changed, 1126 insertions(+), 1288 deletions(-) create mode 100644 src/database/big_update.rs diff --git a/src/database/big_update.rs b/src/database/big_update.rs new file mode 100644 index 0000000..ec4660a --- /dev/null +++ b/src/database/big_update.rs @@ -0,0 +1,1112 @@ +use super::{ + definitions::{ + BskyPost, BskyPostImage, BskyPostMediaAspectRatio, BskyPostVideo, BskyPostVideoBlob, + }, + utils::{self, at_uri_to_record_id, blob_ref_to_record_id, did_to_key}, +}; +use anyhow::Result; +use atrium_api::app::bsky::richtext::facet::MainFeaturesItem; +use 
atrium_api::types::Object; +use atrium_api::{ + app::bsky::embed::video, + record::KnownRecord, + types::{ + string::{Did, RecordKey}, + Blob, BlobRef, + }, +}; +use chrono::Utc; +use opentelemetry::metrics::{Counter, Histogram}; +use opentelemetry::{global, KeyValue}; +use serde::Serialize; +use serde_with::skip_serializing_none; +use std::future::IntoFuture; +use std::sync::LazyLock; +use std::time::Instant; +use surrealdb::Datetime; +use surrealdb::{engine::any::Any, RecordId, Surreal}; +use tracing::{debug, instrument, span, trace, warn, Instrument, Level}; + +#[derive(Serialize)] +struct UpdateFollow { + #[serde(rename = "in")] + pub from: surrealdb::RecordId, + #[serde(rename = "out")] + pub to: surrealdb::RecordId, + pub id: String, + #[serde(rename = "createdAt")] + pub created_at: surrealdb::Datetime, +} + +#[derive(Serialize)] +struct UpdateLike { + #[serde(rename = "in")] + pub from: surrealdb::RecordId, + #[serde(rename = "out")] + pub to: surrealdb::RecordId, + pub id: String, + #[serde(rename = "createdAt")] + pub created_at: surrealdb::Datetime, +} + +#[derive(Serialize)] +struct UpdateRepost { + #[serde(rename = "in")] + pub from: surrealdb::RecordId, + #[serde(rename = "out")] + pub to: surrealdb::RecordId, + pub id: String, + #[serde(rename = "createdAt")] + pub created_at: surrealdb::Datetime, +} + +#[derive(Serialize)] +struct UpdateBlock { + #[serde(rename = "in")] + pub from: surrealdb::RecordId, + #[serde(rename = "out")] + pub to: surrealdb::RecordId, + pub id: String, + #[serde(rename = "createdAt")] + pub created_at: surrealdb::Datetime, +} + +#[derive(Serialize)] +struct UpdateListBlock { + #[serde(rename = "in")] + pub from: surrealdb::RecordId, + #[serde(rename = "out")] + pub to: surrealdb::RecordId, + pub id: String, + #[serde(rename = "createdAt")] + pub created_at: surrealdb::Datetime, +} + +#[derive(Serialize)] +struct UpdateListItem { + #[serde(rename = "in")] + pub from: surrealdb::RecordId, + #[serde(rename = "out")] + pub to: surrealdb::RecordId, + pub id: String, + #[serde(rename = "createdAt")] + pub created_at: surrealdb::Datetime, +} + +#[skip_serializing_none] +#[derive(Serialize)] +struct UpdateLatestBackfill { + of: surrealdb::RecordId, + id: String, + at: Option, +} + +/// Database struct for a bluesky profile +#[derive(Debug, Serialize)] +#[allow(dead_code)] +pub struct UpdateDid { + pub id: String, + #[serde(rename = "displayName")] + pub display_name: Option, + pub description: Option, + pub avatar: Option, + pub banner: Option, + #[serde(rename = "createdAt")] + pub created_at: Option, + #[serde(rename = "seenAt")] + pub seen_at: Datetime, + #[serde(rename = "joinedViaStarterPack")] + pub joined_via_starter_pack: Option, + pub labels: Option>, + #[serde(rename = "pinnedPost")] + pub pinned_post: Option, + #[serde(rename = "extraData")] + pub extra_data: Option, +} + +#[derive(Serialize)] +pub struct UpdateFeed { + pub id: String, + pub uri: String, + pub author: RecordId, + pub rkey: String, + pub did: String, + #[serde(rename = "displayName")] + pub display_name: String, + pub description: Option, + pub avatar: Option, + #[serde(rename = "createdAt")] + pub created_at: Datetime, + #[serde(rename = "extraData")] + pub extra_data: Option, +} + +#[derive(Debug, Serialize)] +pub struct UpdateList { + pub id: String, + pub name: String, + pub purpose: String, + #[serde(rename = "createdAt")] + pub created_at: Datetime, + pub description: Option, + pub avatar: Option, + pub labels: Option>, + #[serde(rename = "extraData")] + pub extra_data: 
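// The row structs in this file share one serde recipe so that SurrealDB sees
// the column names it expects: `in` is a Rust keyword, so the edge endpoints
// are stored as `from`/`to` and renamed on serialization, camelCase columns
// are mapped explicitly, and `skip_serializing_none` omits absent optionals
// instead of writing nulls. A minimal sketch with a hypothetical edge type:
use serde::Serialize;
use serde_with::skip_serializing_none;

#[skip_serializing_none]
#[derive(Serialize)]
struct ExampleEdge {
    #[serde(rename = "in")]
    from: surrealdb::RecordId,
    #[serde(rename = "out")]
    to: surrealdb::RecordId,
    id: String,
    #[serde(rename = "createdAt")]
    created_at: Option<surrealdb::Datetime>,
}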
Option, +} + +#[derive(Serialize)] +struct UpdateQuote { + #[serde(rename = "in")] + pub from: surrealdb::RecordId, + #[serde(rename = "out")] + pub to: surrealdb::RecordId, + pub id: String, +} + +#[derive(Serialize)] +struct UpdateRepliesRelation { + #[serde(rename = "in")] + pub from: surrealdb::RecordId, + #[serde(rename = "out")] + pub to: surrealdb::RecordId, + pub id: String, +} + +#[derive(Serialize)] +struct UpdateReplyToRelation { + #[serde(rename = "in")] + pub from: surrealdb::RecordId, + #[serde(rename = "out")] + pub to: surrealdb::RecordId, + pub id: String, +} + +#[derive(Serialize)] +struct UpdatePostsRelation { + #[serde(rename = "in")] + pub from: surrealdb::RecordId, + #[serde(rename = "out")] + pub to: surrealdb::RecordId, + pub id: String, +} + +#[derive(Serialize)] +struct WithId { + id: String, + #[serde(flatten)] + data: R, +} + +static QUERY_DURATION_METRIC: LazyLock> = LazyLock::new(|| { + global::meter("indexer") + .u64_histogram("indexer.database.insert_duration") + .with_unit("ms") + .with_description("Big update duration") + .with_boundaries(vec![ + 0.0, 5.0, 10.0, 25.0, 50.0, 75.0, 100.0, 250.0, 500.0, 750.0, 1000.0, 2500.0, 5000.0, + 7500.0, 10000.0, 25000.0, 50000.0, 75000.0, 100000.0, 250000.0, 500000.0, 750000.0, + 1000000.0, 2500000.0, + ]) + .build() +}); +static INSERTED_ROWS_METRIC: LazyLock> = LazyLock::new(|| { + global::meter("indexer") + .u64_counter("indexer.database.inserted_elements") + .with_unit("rows") + .with_description("Inserted or updated rows") + .build() +}); +static INSERTED_SIZE_METRIC: LazyLock> = LazyLock::new(|| { + global::meter("indexer") + .u64_counter("indexer.database.inserted_bytes") + .with_unit("By") + .with_description("Inserted or updated bytes (approximation)") + .build() +}); +static TRANSACTIONS_METRIC: LazyLock> = LazyLock::new(|| { + global::meter("indexer") + .u64_counter("indexer.database.transactions") + .with_unit("By") + .with_description("Number of transactions") + .build() +}); + +struct BigUpdateInfoRow { + count: u64, + size: u64, +} +impl core::fmt::Debug for BigUpdateInfoRow { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + f.debug_map() + .entry(&"count", &self.count) + .entry(&"mB", &(self.size as f64 / 1024.0 / 1024.0)) + .finish() + } +} + +struct BigUpdateInfo { + // Info about individual tables + did: BigUpdateInfoRow, + follows: BigUpdateInfoRow, + latest_backfills: BigUpdateInfoRow, + likes: BigUpdateInfoRow, + reposts: BigUpdateInfoRow, + blocks: BigUpdateInfoRow, + listblocks: BigUpdateInfoRow, + listitems: BigUpdateInfoRow, + feeds: BigUpdateInfoRow, + lists: BigUpdateInfoRow, + threadgates: BigUpdateInfoRow, + starterpacks: BigUpdateInfoRow, + postgates: BigUpdateInfoRow, + actordeclarations: BigUpdateInfoRow, + labelerservices: BigUpdateInfoRow, + quotes: BigUpdateInfoRow, + posts: BigUpdateInfoRow, + replies_relations: BigUpdateInfoRow, + reply_to_relations: BigUpdateInfoRow, + posts_relations: BigUpdateInfoRow, + overwrite_latest_backfills: BigUpdateInfoRow, +} + +impl BigUpdateInfo { + fn all_relations(&self) -> BigUpdateInfoRow { + BigUpdateInfoRow { + count: self.likes.count + + self.reposts.count + + self.blocks.count + + self.listblocks.count + + self.listitems.count + + self.replies_relations.count + + self.reply_to_relations.count + + self.posts_relations.count + + self.quotes.count + + self.follows.count, + size: self.likes.size + + self.reposts.size + + self.blocks.size + + self.listblocks.size + + self.listitems.size + + self.replies_relations.size + + 
self.reply_to_relations.size + + self.posts_relations.size + + self.quotes.size + + self.follows.size, + } + } + fn all_tables(&self) -> BigUpdateInfoRow { + BigUpdateInfoRow { + count: self.did.count + + self.feeds.count + + self.lists.count + + self.threadgates.count + + self.starterpacks.count + + self.postgates.count + + self.actordeclarations.count + + self.labelerservices.count + + self.posts.count, + size: self.did.size + + self.feeds.size + + self.lists.size + + self.threadgates.size + + self.starterpacks.size + + self.postgates.size + + self.actordeclarations.size + + self.labelerservices.size + + self.posts.size, + } + } + fn all(&self) -> BigUpdateInfoRow { + BigUpdateInfoRow { + count: self.all_relations().count + self.all_tables().count, + size: self.all_relations().size + self.all_tables().size, + } + } + + fn record_metrics(&self, source: &str) { + INSERTED_ROWS_METRIC.add( + self.all().count, + &[KeyValue::new("source", source.to_string())], + ); + INSERTED_SIZE_METRIC.add( + self.all().size, + &[KeyValue::new("source", source.to_string())], + ); + TRANSACTIONS_METRIC.add(1, &[]); + } +} + +impl core::fmt::Debug for BigUpdateInfo { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + f.debug_map() + .entry(&"did", &self.did) + .entry(&"follows", &self.follows) + .entry(&"latest_backfills", &self.latest_backfills) + .entry(&"likes", &self.likes) + .entry(&"reposts", &self.reposts) + .entry(&"blocks", &self.blocks) + .entry(&"listblocks", &self.listblocks) + .entry(&"listitems", &self.listitems) + .entry(&"feeds", &self.feeds) + .entry(&"lists", &self.lists) + .entry(&"threadgates", &self.threadgates) + .entry(&"starterpacks", &self.starterpacks) + .entry(&"postgates", &self.postgates) + .entry(&"actordeclarations", &self.actordeclarations) + .entry(&"labelerservices", &self.labelerservices) + .entry(&"quotes", &self.quotes) + .entry(&"posts", &self.posts) + .entry(&"replies_relations", &self.replies_relations) + .entry(&"reply_to_relations", &self.reply_to_relations) + .entry(&"posts_relations", &self.posts_relations) + .entry( + &"overwrite_latest_backfills", + &self.overwrite_latest_backfills, + ) + .finish() + } +} + +#[derive(Default)] +pub struct BigUpdate { + /// Insert into did + did: Vec, + follows: Vec, + latest_backfills: Vec, + /// Like latest_backfills but overwrites existing records + overwrite_latest_backfills: Vec, + likes: Vec, + reposts: Vec, + blocks: Vec, + listblocks: Vec, + listitems: Vec, + feeds: Vec, + lists: Vec, + threadgates: Vec>>>, + starterpacks: Vec>>>, + postgates: Vec>>>, + actordeclarations: + Vec>>>, + labelerservices: Vec>>>, + quotes: Vec, + posts: Vec>, + replies_relations: Vec, + reply_to_relations: Vec, + posts_relations: Vec, +} +impl BigUpdate { + pub fn merge(&mut self, other: BigUpdate) { + self.did.extend(other.did); + self.follows.extend(other.follows); + self.latest_backfills.extend(other.latest_backfills); + self.likes.extend(other.likes); + self.reposts.extend(other.reposts); + self.blocks.extend(other.blocks); + self.listblocks.extend(other.listblocks); + self.listitems.extend(other.listitems); + self.feeds.extend(other.feeds); + self.lists.extend(other.lists); + self.threadgates.extend(other.threadgates); + self.starterpacks.extend(other.starterpacks); + self.postgates.extend(other.postgates); + self.actordeclarations.extend(other.actordeclarations); + self.labelerservices.extend(other.labelerservices); + self.quotes.extend(other.quotes); + self.posts.extend(other.posts); + 
self.replies_relations.extend(other.replies_relations);
+        self.reply_to_relations.extend(other.reply_to_relations);
+        self.posts_relations.extend(other.posts_relations);
+        self.overwrite_latest_backfills
+            .extend(other.overwrite_latest_backfills);
+    }
+
+    pub fn add_timestamp(&mut self, did: &str, time: surrealdb::sql::Datetime) {
+        self.overwrite_latest_backfills.push(UpdateLatestBackfill {
+            of: RecordId::from(("did", did)),
+            id: did.to_string(),
+            at: Some(time),
+        });
+    }
+
+    /// Apply this update to the database
+    ///
+    /// `source` is a string describing the source of the update, used for metrics
+    pub async fn apply(self, db: &Surreal<Any>, source: &str) -> Result<()> {
+        let start = Instant::now();
+        // Gather per-table row counts and sizes for logging and metrics later
+        let info = tokio::task::block_in_place(|| self.create_info());
+
+        // Create the query string
+        let query_string = r#"
+            BEGIN;
+            INSERT IGNORE INTO did $dids RETURN NONE;
+            INSERT IGNORE INTO latest_backfill $latest_backfills RETURN NONE;
+            INSERT IGNORE INTO feed $feeds RETURN NONE;
+            INSERT IGNORE INTO list $lists RETURN NONE;
+            INSERT IGNORE INTO lex_app_bsky_feed_threadgate $threadgates RETURN NONE;
+            INSERT IGNORE INTO lex_app_bsky_graph_starterpack $starterpacks RETURN NONE;
+            INSERT IGNORE INTO lex_app_bsky_feed_postgate $postgates RETURN NONE;
+            INSERT IGNORE INTO lex_chat_bsky_actor_declaration $actordeclarations RETURN NONE;
+            INSERT IGNORE INTO lex_app_bsky_labeler_service $labelerservices RETURN NONE;
+            INSERT IGNORE INTO posts $posts RETURN NONE;
+            INSERT RELATION INTO quotes $quotes RETURN NONE;
+            INSERT RELATION INTO like $likes RETURN NONE;
+            INSERT RELATION INTO repost $reposts RETURN NONE;
+            INSERT RELATION INTO block $blocks RETURN NONE;
+            INSERT RELATION INTO listblock $listblocks RETURN NONE;
+            INSERT RELATION INTO listitem $listitems RETURN NONE;
+            INSERT RELATION INTO replyto $reply_to_relations RETURN NONE;
+            INSERT RELATION INTO replies $replies_relations RETURN NONE;
+            INSERT RELATION INTO follow $follows RETURN NONE;
+            INSERT INTO latest_backfill $overwrite_latest_backfill RETURN NONE;
+            COMMIT;
+        "#;
+
+        // Create the update query.
Does not take that long; ~50ms for 30000 rows + let update = tokio::task::block_in_place(|| { + db.query(query_string) + .bind(("dids", self.did)) + .bind(("follows", self.follows)) + .bind(("latest_backfills", self.latest_backfills)) + .bind(("likes", self.likes)) + .bind(("reposts", self.reposts)) + .bind(("blocks", self.blocks)) + .bind(("listblocks", self.listblocks)) + .bind(("listitems", self.listitems)) + .bind(("feeds", self.feeds)) + .bind(("lists", self.lists)) + .bind(("threadgates", self.threadgates)) + .bind(("starterpacks", self.starterpacks)) + .bind(("postgates", self.postgates)) + .bind(("actordeclarations", self.actordeclarations)) + .bind(("labelerservices", self.labelerservices)) + .bind(("quotes", self.quotes)) + .bind(("posts", self.posts)) + .bind(("replies_relations", self.replies_relations)) + .bind(("reply_to_relations", self.reply_to_relations)) + .bind(("posts_relations", self.posts_relations)) + .bind(("overwrite_latest_backfill", self.overwrite_latest_backfills)) + .into_future() + .instrument(span!(Level::INFO, "query")) + }); + + let preparation_duration = start.elapsed(); + let after_update = Instant::now(); + update.await?; + let update_duration = after_update.elapsed(); + QUERY_DURATION_METRIC.record(update_duration.as_millis() as u64, &[]); + info.record_metrics(source); + + trace!( + "Applied updated: {} elements, {}MB, {:03}ms preparation, {:03}ms applying", + info.all().count, + info.all().size as f64 / 1024.0 / 1024.0, + preparation_duration.as_millis(), + update_duration.as_millis(), + ); + debug!("Detailed infos: {:?}", info); + + Ok(()) + } + + fn create_info(&self) -> BigUpdateInfo { + BigUpdateInfo { + did: BigUpdateInfoRow { + count: self.did.len() as u64, + size: self + .did + .iter() + .map(|e| serde_ipld_dagcbor::to_vec(e).unwrap().len() as u64) + .sum(), + }, + follows: BigUpdateInfoRow { + count: self.follows.len() as u64, + size: self + .follows + .iter() + .map(|e| serde_ipld_dagcbor::to_vec(e).unwrap().len() as u64) + .sum(), + }, + latest_backfills: BigUpdateInfoRow { + count: self.latest_backfills.len() as u64, + size: self + .latest_backfills + .iter() + .map(|e| serde_ipld_dagcbor::to_vec(e).unwrap().len() as u64) + .sum(), + }, + likes: BigUpdateInfoRow { + count: self.likes.len() as u64, + size: self + .likes + .iter() + .map(|e| serde_ipld_dagcbor::to_vec(e).unwrap().len() as u64) + .sum(), + }, + reposts: BigUpdateInfoRow { + count: self.reposts.len() as u64, + size: self + .reposts + .iter() + .map(|e| serde_ipld_dagcbor::to_vec(e).unwrap().len() as u64) + .sum(), + }, + blocks: BigUpdateInfoRow { + count: self.blocks.len() as u64, + size: self + .blocks + .iter() + .map(|e| serde_ipld_dagcbor::to_vec(e).unwrap().len() as u64) + .sum(), + }, + listblocks: BigUpdateInfoRow { + count: self.listblocks.len() as u64, + size: self + .listblocks + .iter() + .map(|e| serde_ipld_dagcbor::to_vec(e).unwrap().len() as u64) + .sum(), + }, + listitems: BigUpdateInfoRow { + count: self.listitems.len() as u64, + size: self + .listitems + .iter() + .map(|e| serde_ipld_dagcbor::to_vec(e).unwrap().len() as u64) + .sum(), + }, + feeds: BigUpdateInfoRow { + count: self.feeds.len() as u64, + size: self + .feeds + .iter() + .map(|e| serde_ipld_dagcbor::to_vec(e).unwrap().len() as u64) + .sum(), + }, + lists: BigUpdateInfoRow { + count: self.lists.len() as u64, + size: self + .lists + .iter() + .map(|e| serde_ipld_dagcbor::to_vec(e).unwrap().len() as u64) + .sum(), + }, + threadgates: BigUpdateInfoRow { + count: self.threadgates.len() as u64, + size: 
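// Condensed, the write path above is: build the query and serialize the bind
// parameters inside `block_in_place` (CPU-bound work, so Tokio parks other
// tasks elsewhere first; this requires the multi-threaded runtime), then
// await the resulting future on the reactor. A reduced sketch against a
// single hypothetical table:
use serde::Serialize;
use std::future::IntoFuture;
use surrealdb::{engine::any::Any, Surreal};

#[derive(Serialize)]
struct Row {
    id: String,
    value: u64,
}

async fn bulk_insert(db: &Surreal<Any>, rows: Vec<Row>) -> anyhow::Result<()> {
    let query = tokio::task::block_in_place(|| {
        db.query("BEGIN; INSERT IGNORE INTO example $rows RETURN NONE; COMMIT;")
            .bind(("rows", rows))
            .into_future()
    });
    query.await?;
    Ok(())
}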
self + .threadgates + .iter() + .map(|e| serde_ipld_dagcbor::to_vec(e).unwrap().len() as u64) + .sum(), + }, + starterpacks: BigUpdateInfoRow { + count: self.starterpacks.len() as u64, + size: self + .starterpacks + .iter() + .map(|e| serde_ipld_dagcbor::to_vec(e).unwrap().len() as u64) + .sum(), + }, + postgates: BigUpdateInfoRow { + count: self.postgates.len() as u64, + size: self + .postgates + .iter() + .map(|e| serde_ipld_dagcbor::to_vec(e).unwrap().len() as u64) + .sum(), + }, + actordeclarations: BigUpdateInfoRow { + count: self.actordeclarations.len() as u64, + size: self + .actordeclarations + .iter() + .map(|e| serde_ipld_dagcbor::to_vec(e).unwrap().len() as u64) + .sum(), + }, + labelerservices: BigUpdateInfoRow { + count: self.labelerservices.len() as u64, + size: self + .labelerservices + .iter() + .map(|e| serde_ipld_dagcbor::to_vec(e).unwrap().len() as u64) + .sum(), + }, + quotes: BigUpdateInfoRow { + count: self.quotes.len() as u64, + size: self + .quotes + .iter() + .map(|e| serde_ipld_dagcbor::to_vec(e).unwrap().len() as u64) + .sum(), + }, + posts: BigUpdateInfoRow { + count: self.posts.len() as u64, + size: self + .posts + .iter() + .map(|e| serde_ipld_dagcbor::to_vec(e).unwrap().len() as u64) + .sum(), + }, + replies_relations: BigUpdateInfoRow { + count: self.replies_relations.len() as u64, + size: self + .replies_relations + .iter() + .map(|e| serde_ipld_dagcbor::to_vec(e).unwrap().len() as u64) + .sum(), + }, + reply_to_relations: BigUpdateInfoRow { + count: self.reply_to_relations.len() as u64, + size: self + .reply_to_relations + .iter() + .map(|e| serde_ipld_dagcbor::to_vec(e).unwrap().len() as u64) + .sum(), + }, + posts_relations: BigUpdateInfoRow { + count: self.posts_relations.len() as u64, + size: self + .posts_relations + .iter() + .map(|e| serde_ipld_dagcbor::to_vec(e).unwrap().len() as u64) + .sum(), + }, + overwrite_latest_backfills: BigUpdateInfoRow { + count: self.overwrite_latest_backfills.len() as u64, + size: self + .overwrite_latest_backfills + .iter() + .map(|e| serde_ipld_dagcbor::to_vec(e).unwrap().len() as u64) + .sum(), + }, + } + } +} + +impl core::fmt::Debug for BigUpdate { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + let info = self.create_info(); + info.fmt(f) + } +} + +/// If the new commit is a create or update, handle it +#[instrument(skip(record))] +pub fn create_big_update( + did: Did, + did_key: String, + collection: String, + rkey: RecordKey, + record: KnownRecord, +) -> Result { + utils::ensure_valid_rkey(rkey.to_string())?; + + let mut big_update = BigUpdate::default(); + + match record { + KnownRecord::AppBskyActorProfile(d) => { + // NOTE: using .ok() here isn't optimal, incorrect data should + // probably not be entered into the database at all, but for now + // we'll just ignore it. 
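// One detail worth noting before the per-record handlers below: relation and
// record ids are derived deterministically as "<rkey>_<did key>", so replaying
// the same commit (Jetstream reconnect, backfill overlap) produces the same
// id again instead of a fresh row, which is what lets the batched INSERT
// IGNORE writes drop duplicates. Illustration only:
fn relation_id(rkey: &str, did_key: &str) -> String {
    format!("{}_{}", rkey, did_key)
}
// e.g. relation_id("3kabc123", "some_did_key") == "3kabc123_some_did_key"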
+ let profile = UpdateDid { + id: did_key.clone(), + display_name: d.display_name.clone(), + description: d.description.clone(), + avatar: None, // TODO Implement + banner: None, // TODO Implement + created_at: d + .created_at + .as_ref() + .and_then(|dt| utils::extract_dt(dt).ok()), + seen_at: Utc::now().into(), + joined_via_starter_pack: d + .joined_via_starter_pack + .as_ref() + .and_then(|d| utils::strong_ref_to_record_id(d).ok()), + // TODO if strong_ref_to_record_id fails, it should return an error result instead of being empty + pinned_post: d + .pinned_post + .as_ref() + .and_then(|d| utils::strong_ref_to_record_id(d).ok()), + labels: d + .labels + .as_ref() + .and_then(|d| utils::extract_self_labels_profile(d)), + extra_data: process_extra_data(&d.extra_data)?, + }; + big_update.did.push(profile); + } + KnownRecord::AppBskyGraphFollow(d) => { + // TODO ensure_valid_rkey_strict(rkey.as_str())?; + let from = utils::did_to_key(did.as_str())?; + let id = format!("{}_{}", rkey.as_str(), from); + let to = utils::did_to_key(d.subject.as_str())?; + let created_at = utils::extract_dt(&d.created_at)?; + + big_update.follows.push(UpdateFollow { + from: RecordId::from(("did", from)), + to: RecordId::from(("did", to.clone())), + id: id, + created_at, + }); + + big_update.latest_backfills.push(UpdateLatestBackfill { + of: RecordId::from(("did", to.clone())), + id: to, + at: None, + }); + } + KnownRecord::AppBskyFeedLike(d) => { + // TODO ensure_valid_rkey_strict(rkey.as_str())?; + let from = utils::did_to_key(did.as_str())?; + let id = format!("{}_{}", rkey.as_str(), from); + let to = utils::at_uri_to_record_id(&d.subject.uri)?; + let created_at = utils::extract_dt(&d.created_at)?; + + big_update.likes.push(UpdateLike { + from: RecordId::from(("did", from)), + to: to, + id: id, + created_at, + }); + } + KnownRecord::AppBskyFeedRepost(d) => { + // TODO ensure_valid_rkey_strict(rkey.as_str())?; + let from = utils::did_to_key(did.as_str())?; + let id = format!("{}_{}", rkey.as_str(), from); + let to = utils::at_uri_to_record_id(&d.subject.uri)?; + let created_at = utils::extract_dt(&d.created_at)?; + + big_update.reposts.push(UpdateRepost { + from: RecordId::from(("did", from)), + to: to, + id: id, + created_at, + }); + } + KnownRecord::AppBskyGraphBlock(d) => { + // TODO ensure_valid_rkey_strict(rkey.as_str())?; + let from = utils::did_to_key(did.as_str())?; + let id = format!("{}_{}", rkey.as_str(), from); + let to = utils::did_to_key(d.subject.as_str())?; + let created_at = utils::extract_dt(&d.created_at)?; + + big_update.blocks.push(UpdateBlock { + from: RecordId::from(("did", from)), + to: RecordId::from(("did", to.clone())), + id: id, + created_at, + }); + } + KnownRecord::AppBskyGraphListblock(d) => { + // TODO ensure_valid_rkey_strict(rkey.as_str())?; + let from = utils::did_to_key(did.as_str())?; + let id = format!("{}_{}", rkey.as_str(), from); + let to = utils::at_uri_to_record_id(&d.subject)?; + let created_at = utils::extract_dt(&d.created_at)?; + + big_update.listblocks.push(UpdateListBlock { + from: RecordId::from(("did", from)), + to: to, + id: id, + created_at, + }); + } + KnownRecord::AppBskyGraphListitem(d) => { + // TODO ensure_valid_rkey_strict(rkey.as_str())?; + let from = utils::did_to_key(did.as_str())?; + let id = format!("{}_{}", rkey.as_str(), from); + + let from = utils::at_uri_to_record_id(&d.list)?; + let to = utils::did_to_key(&d.subject)?; + let created_at = utils::extract_dt(&d.created_at)?; + + big_update.listitems.push(UpdateListItem { + from: from, + to: 
RecordId::from(("did", to.clone())), + id: id, + created_at, + }); + } + KnownRecord::AppBskyFeedGenerator(d) => { + let did_key = utils::did_to_key(did.as_str())?; + let id = format!("{}_{}", rkey.as_str(), did_key); + let feed = UpdateFeed { + id: id, + author: RecordId::from_table_key("did", did_key), + avatar: None, // TODO implement + created_at: utils::extract_dt(&d.created_at)?, + description: d.description.clone(), + did: d.did.to_string(), + display_name: d.display_name.clone(), + rkey: rkey.to_string(), + uri: format!( + "at://{}/app.bsky.feed.generator/{}", + did.as_str(), + rkey.as_str() + ), + extra_data: process_extra_data(&d.extra_data)?, + }; + big_update.feeds.push(feed); + } + KnownRecord::AppBskyGraphList(d) => { + let did_key = utils::did_to_key(did.as_str())?; + let id = format!("{}_{}", rkey.as_str(), did_key); + + let list = UpdateList { + id: id, + name: d.name.clone(), + avatar: None, // TODO implement + created_at: utils::extract_dt(&d.created_at)?, + description: d.description.clone(), + labels: d + .labels + .as_ref() + .and_then(|d| utils::extract_self_labels_list(d)), + purpose: d.purpose.clone(), + extra_data: process_extra_data(&d.extra_data)?, + }; + big_update.lists.push(list); + } + KnownRecord::AppBskyFeedThreadgate(d) => { + let did_key = utils::did_to_key(did.as_str())?; + let id = format!("{}_{}", rkey.as_str(), did_key); + big_update.threadgates.push(WithId { id: id, data: d }); + } + KnownRecord::AppBskyGraphStarterpack(d) => { + let did_key = utils::did_to_key(did.as_str())?; + let id = format!("{}_{}", rkey.as_str(), did_key); + big_update.starterpacks.push(WithId { id: id, data: d }); + } + KnownRecord::AppBskyFeedPostgate(d) => { + let did_key = utils::did_to_key(did.as_str())?; + let id = format!("{}_{}", rkey.as_str(), did_key); + big_update.postgates.push(WithId { id: id, data: d }); + } + KnownRecord::ChatBskyActorDeclaration(d) => { + let did_key = utils::did_to_key(did.as_str())?; + let id = format!("{}_{}", rkey.as_str(), did_key); + big_update + .actordeclarations + .push(WithId { id: id, data: d }); + } + KnownRecord::AppBskyLabelerService(d) => { + let did_key = utils::did_to_key(did.as_str())?; + let id = format!("{}_{}", rkey.as_str(), did_key); + big_update.labelerservices.push(WithId { id: id, data: d }); + } + KnownRecord::AppBskyFeedPost(d) => { + let did_key = utils::did_to_key(did.as_str())?; + let id = format!("{}_{}", rkey.as_str(), did_key); + + let mut images: Vec = vec![]; + let mut links: Vec = vec![]; + let mut mentions: Vec = vec![]; + let mut record: Option = None; + let mut tags: Vec = vec![]; + let mut video: Option = None; + + let mut post_images: Vec = vec![]; + + match &d.embed { + Some(d) => { + match d { + atrium_api::types::Union::Refs(e) => { + match e { + atrium_api::app::bsky::feed::post::RecordEmbedRefs::AppBskyEmbedExternalMain(m)=>{ + // TODO index preview too + links.push(m.external.uri.clone()); + }, + atrium_api::app::bsky::feed::post::RecordEmbedRefs::AppBskyEmbedImagesMain(m) => { + post_images=m.images.clone(); + }, + atrium_api::app::bsky::feed::post::RecordEmbedRefs::AppBskyEmbedVideoMain(m) => { + video = Some(process_video(m)?); + }, + atrium_api::app::bsky::feed::post::RecordEmbedRefs::AppBskyEmbedRecordMain(m) => { + record = Some(at_uri_to_record_id(&m.record.uri)?); + }, + atrium_api::app::bsky::feed::post::RecordEmbedRefs::AppBskyEmbedRecordWithMediaMain(m) => { + record = Some(at_uri_to_record_id(&m.record.record.uri)?); + + match &m.media{ + atrium_api::types::Union::Refs(r)=>match r{ + 
atrium_api::app::bsky::embed::record_with_media::MainMediaRefs::AppBskyEmbedExternalMain(m)=>{ + // TODO index preview too + links.push(m.external.uri.clone()); + } + atrium_api::app::bsky::embed::record_with_media::MainMediaRefs::AppBskyEmbedImagesMain(m)=>{ + post_images=m.images.clone(); + } + atrium_api::app::bsky::embed::record_with_media::MainMediaRefs::AppBskyEmbedVideoMain(m)=>{ + + video = Some(process_video(m)?); + } + } + atrium_api::types::Union::Unknown(_)=>{} + } + }, + } + } + atrium_api::types::Union::Unknown(_) => {} + } + } + None => {} + }; + + if !post_images.is_empty() { + for i in post_images { + images.push(BskyPostImage { + alt: i.alt.clone(), + blob: blob_ref_to_record_id(&i.image), // TODO store blob details + aspect_ratio: i.aspect_ratio.as_ref().map(|a| BskyPostMediaAspectRatio { + height: a.height.into(), + width: a.width.into(), + }), + }) + } + } + + if let Some(facets) = &d.facets { + for facet in facets { + for feature in &facet.features { + match feature { + atrium_api::types::Union::Refs(refs) => match refs { + MainFeaturesItem::Mention(m) => { + mentions.push(("did", did_to_key(m.did.as_str())?).into()); + } + MainFeaturesItem::Link(l) => { + links.push(l.uri.clone()); + } + MainFeaturesItem::Tag(t) => { + tags.push(t.tag.clone()); + } + }, + atrium_api::types::Union::Unknown(_) => {} + } + } + } + } + + if let Some(t) = &d.tags { + tags.extend(t.clone()); + } + + if let Some(r) = &record { + if r.table() == "post" { + big_update.quotes.push(UpdateQuote { + from: RecordId::from_table_key("post", id.clone()), + to: r.clone(), + id: id.clone(), + }); + } + } + + let post = WithId { + id: id.clone(), + data: BskyPost { + author: RecordId::from_table_key("did", did_key.clone()), + bridgy_original_url: None, + via: None, + created_at: utils::extract_dt(&d.created_at)?, + labels: d + .labels + .as_ref() + .and_then(|d| utils::extract_self_labels_post(d)), + text: d.text.clone(), + langs: d + .langs + .as_ref() + .map(|d| d.iter().map(|l| l.as_ref().to_string()).collect()), + root: d + .reply + .as_ref() + .map(|r| utils::strong_ref_to_record_id(&r.root)) + .transpose()?, + parent: d + .reply + .as_ref() + .map(|r| utils::strong_ref_to_record_id(&r.parent)) + .transpose()?, + video: video, + tags: if tags.is_empty() { None } else { Some(tags) }, + links: if links.is_empty() { None } else { Some(links) }, + mentions: if mentions.is_empty() { + None + } else { + Some(mentions) + }, + record: record, + images: if images.is_empty() { + None + } else { + Some(images) + }, + extra_data: process_extra_data(&d.extra_data)?, + }, + }; + + let parent = post.data.parent.clone(); + big_update.posts.push(post); + + if parent.is_some() { + big_update.replies_relations.push(UpdateRepliesRelation { + from: RecordId::from_table_key("did", did_key.clone()), + to: RecordId::from_table_key("post", id.clone()), + id: id.clone(), + }); + + big_update.reply_to_relations.push(UpdateReplyToRelation { + from: RecordId::from_table_key("post", id.clone()), + to: parent.unwrap(), + id: id.clone(), + }); + } else { + big_update.posts_relations.push(UpdatePostsRelation { + from: RecordId::from_table_key("did", did_key.clone()), + to: RecordId::from_table_key("post", id.clone()), + id: id.clone(), + }); + } + } + _ => { + warn!(target: "indexer", "ignored create_or_update {} {} {}", + did.as_str(), collection, rkey.as_str()); + } + } + + Ok(big_update) +} + +fn process_video(vid: &video::Main) -> Result { + let blob = extract_video_blob(&vid.video)?; + let v = BskyPostVideo { + alt: 
vid.alt.clone(),
+        aspect_ratio: vid.aspect_ratio.clone().map(|a| BskyPostMediaAspectRatio {
+            height: a.height.into(),
+            width: a.width.into(),
+        }),
+        blob: BskyPostVideoBlob {
+            cid: blob.r#ref.0.to_string(),
+            media_type: blob.mime_type,
+            size: blob.size as u64,
+        },
+        captions: None, // TODO implement
+    };
+    Ok(v)
+}
+fn extract_video_blob(blob: &BlobRef) -> Result<Blob> {
+    match blob {
+        atrium_api::types::BlobRef::Typed(a) => match a {
+            atrium_api::types::TypedBlobRef::Blob(b) => Ok(b.clone()),
+        },
+        atrium_api::types::BlobRef::Untyped(_) => anyhow::bail!("Invalid blob ref type"),
+    }
+}
+
+fn process_extra_data(ipld: &ipld_core::ipld::Ipld) -> Result<Option<String>> {
+    let str = simd_json::serde::to_string(ipld)?;
+    Ok(if str == "{}" { None } else { Some(str) })
+}
diff --git a/src/database/definitions.rs b/src/database/definitions.rs
index cb1841f..16547f8 100644
--- a/src/database/definitions.rs
+++ b/src/database/definitions.rs
@@ -1,7 +1,7 @@
 use anyhow::Context;
 use serde::{Deserialize, Serialize};
 use surrealdb::{engine::any::Any, Datetime, RecordId, Surreal};
-use tracing::{debug, info};
+use tracing::debug;
 
 /// Database struct for a bluesky profile
 #[derive(Debug, Serialize)]
diff --git a/src/database/handlers.rs b/src/database/handlers.rs
index 8dd24b3..3647ae5 100644
--- a/src/database/handlers.rs
+++ b/src/database/handlers.rs
@@ -1,38 +1,13 @@
-use anyhow::Result;
-use atrium_api::app::bsky::richtext::facet::MainFeaturesItem;
-use atrium_api::types::Object;
-use atrium_api::{
-    app::bsky::embed::video,
-    record::KnownRecord,
-    types::{
-        string::{Did, RecordKey},
-        Blob, BlobRef,
-    },
-};
-use chrono::Utc;
-use futures::FutureExt;
-use opentelemetry::metrics::{Counter, Histogram};
-use opentelemetry::{global, Key, KeyValue};
-use serde::{Deserialize, Serialize};
-use serde_with::skip_serializing_none;
-use std::future::IntoFuture;
-use std::sync::LazyLock;
-use std::time::Instant;
-use surrealdb::method::Query;
-use surrealdb::Datetime;
-use surrealdb::{engine::any::Any, RecordId, Surreal};
-use tracing::{debug, instrument, span, trace, warn, Instrument, Level};
-
-use crate::websocket::events::{Commit, Kind};
-
+use super::big_update::create_big_update;
 use super::{
-    definitions::{
-        BskyFeed, BskyList, BskyPost, BskyPostImage, BskyPostMediaAspectRatio, BskyPostVideo,
-        BskyPostVideoBlob, BskyProfile, JetstreamAccountEvent, JetstreamIdentityEvent, Record,
-    },
-    delete_record,
-    utils::{self, at_uri_to_record_id, blob_ref_to_record_id, did_to_key},
+    definitions::{JetstreamAccountEvent, JetstreamIdentityEvent, Record},
+    delete_record, utils,
 };
+use crate::websocket::events::{Commit, Kind};
+use anyhow::Result;
+use atrium_api::types::string::{Did, RecordKey};
+use surrealdb::{engine::any::Any, Surreal};
+use tracing::warn;
 
 /// Handle a new websocket event on the database
 pub async fn handle_event(db: &Surreal<Any>, event: Kind) -> Result<()> {
@@ -47,14 +22,12 @@ pub async fn handle_event(db: &Surreal<Any>, event: Kind) -> Result<()> {
     match commit {
         Commit::CreateOrUpdate {
-            rev,
             collection,
             rkey,
             record,
-            cid,
+            ..
} => { - let big_update = - on_commit_event_createorupdate(did, did_key, collection, rkey, record)?; + let big_update = create_big_update(did, did_key, collection, rkey, record)?; big_update.apply(db, "jetstream").await?; } Commit::Delete { @@ -103,1249 +76,6 @@ pub async fn handle_event(db: &Surreal, event: Kind) -> Result<()> { Ok(()) } -#[derive(Serialize)] -struct UpdateFollow { - #[serde(rename = "in")] - pub from: surrealdb::RecordId, - #[serde(rename = "out")] - pub to: surrealdb::RecordId, - pub id: String, - #[serde(rename = "createdAt")] - pub created_at: surrealdb::Datetime, -} - -#[derive(Serialize)] -struct UpdateLike { - #[serde(rename = "in")] - pub from: surrealdb::RecordId, - #[serde(rename = "out")] - pub to: surrealdb::RecordId, - pub id: String, - #[serde(rename = "createdAt")] - pub created_at: surrealdb::Datetime, -} - -#[derive(Serialize)] -struct UpdateRepost { - #[serde(rename = "in")] - pub from: surrealdb::RecordId, - #[serde(rename = "out")] - pub to: surrealdb::RecordId, - pub id: String, - #[serde(rename = "createdAt")] - pub created_at: surrealdb::Datetime, -} - -#[derive(Serialize)] -struct UpdateBlock { - #[serde(rename = "in")] - pub from: surrealdb::RecordId, - #[serde(rename = "out")] - pub to: surrealdb::RecordId, - pub id: String, - #[serde(rename = "createdAt")] - pub created_at: surrealdb::Datetime, -} - -#[derive(Serialize)] -struct UpdateListBlock { - #[serde(rename = "in")] - pub from: surrealdb::RecordId, - #[serde(rename = "out")] - pub to: surrealdb::RecordId, - pub id: String, - #[serde(rename = "createdAt")] - pub created_at: surrealdb::Datetime, -} - -#[derive(Serialize)] -struct UpdateListItem { - #[serde(rename = "in")] - pub from: surrealdb::RecordId, - #[serde(rename = "out")] - pub to: surrealdb::RecordId, - pub id: String, - #[serde(rename = "createdAt")] - pub created_at: surrealdb::Datetime, -} - -#[skip_serializing_none] -#[derive(Serialize)] -struct UpdateLatestBackfill { - of: surrealdb::RecordId, - id: String, - at: Option, -} - -/// Database struct for a bluesky profile -#[derive(Debug, Serialize)] -#[allow(dead_code)] -pub struct UpdateDid { - pub id: String, - #[serde(rename = "displayName")] - pub display_name: Option, - pub description: Option, - pub avatar: Option, - pub banner: Option, - #[serde(rename = "createdAt")] - pub created_at: Option, - #[serde(rename = "seenAt")] - pub seen_at: Datetime, - #[serde(rename = "joinedViaStarterPack")] - pub joined_via_starter_pack: Option, - pub labels: Option>, - #[serde(rename = "pinnedPost")] - pub pinned_post: Option, - #[serde(rename = "extraData")] - pub extra_data: Option, -} - -#[derive(Serialize)] -pub struct UpdateFeed { - pub id: String, - pub uri: String, - pub author: RecordId, - pub rkey: String, - pub did: String, - #[serde(rename = "displayName")] - pub display_name: String, - pub description: Option, - pub avatar: Option, - #[serde(rename = "createdAt")] - pub created_at: Datetime, - #[serde(rename = "extraData")] - pub extra_data: Option, -} - -#[derive(Debug, Serialize)] -pub struct UpdateList { - pub id: String, - pub name: String, - pub purpose: String, - #[serde(rename = "createdAt")] - pub created_at: Datetime, - pub description: Option, - pub avatar: Option, - pub labels: Option>, - #[serde(rename = "extraData")] - pub extra_data: Option, -} - -#[derive(Serialize)] -struct UpdateQuote { - #[serde(rename = "in")] - pub from: surrealdb::RecordId, - #[serde(rename = "out")] - pub to: surrealdb::RecordId, - pub id: String, -} - -#[derive(Serialize)] -struct 
UpdateRepliesRelation { - #[serde(rename = "in")] - pub from: surrealdb::RecordId, - #[serde(rename = "out")] - pub to: surrealdb::RecordId, - pub id: String, -} - -#[derive(Serialize)] -struct UpdateReplyToRelation { - #[serde(rename = "in")] - pub from: surrealdb::RecordId, - #[serde(rename = "out")] - pub to: surrealdb::RecordId, - pub id: String, -} - -#[derive(Serialize)] -struct UpdatePostsRelation { - #[serde(rename = "in")] - pub from: surrealdb::RecordId, - #[serde(rename = "out")] - pub to: surrealdb::RecordId, - pub id: String, -} - -#[derive(Serialize)] -struct WithId { - id: String, - #[serde(flatten)] - data: R, -} - -static QUERY_DURATION_METRIC: LazyLock> = LazyLock::new(|| { - global::meter("indexer") - .u64_histogram("indexer.database.insert_duration") - .with_unit("ms") - .with_description("Big update duration") - .with_boundaries(vec![ - 0.0, 5.0, 10.0, 25.0, 50.0, 75.0, 100.0, 250.0, 500.0, 750.0, 1000.0, 2500.0, 5000.0, - 7500.0, 10000.0, 25000.0, 50000.0, 75000.0, 100000.0, 250000.0, 500000.0, 750000.0, - 1000000.0, 2500000.0, - ]) - .build() -}); -static INSERTED_ROWS_METRIC: LazyLock> = LazyLock::new(|| { - global::meter("indexer") - .u64_counter("indexer.database.inserted_elements") - .with_unit("rows") - .with_description("Inserted or updated rows") - .build() -}); -static INSERTED_SIZE_METRIC: LazyLock> = LazyLock::new(|| { - global::meter("indexer") - .u64_counter("indexer.database.inserted_bytes") - .with_unit("By") - .with_description("Inserted or updated bytes (approximation)") - .build() -}); -static TRANSACTIONS_METRIC: LazyLock> = LazyLock::new(|| { - global::meter("indexer") - .u64_counter("indexer.database.transactions") - .with_unit("By") - .with_description("Number of transactions") - .build() -}); - -struct BigUpdateInfoRow { - count: u64, - size: u64, -} -impl core::fmt::Debug for BigUpdateInfoRow { - fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { - f.debug_map() - .entry(&"count", &self.count) - .entry(&"mB", &(self.size as f64 / 1024.0 / 1024.0)) - .finish() - } -} - -struct BigUpdateInfo { - // Info about individual tables - did: BigUpdateInfoRow, - follows: BigUpdateInfoRow, - latest_backfills: BigUpdateInfoRow, - likes: BigUpdateInfoRow, - reposts: BigUpdateInfoRow, - blocks: BigUpdateInfoRow, - listblocks: BigUpdateInfoRow, - listitems: BigUpdateInfoRow, - feeds: BigUpdateInfoRow, - lists: BigUpdateInfoRow, - threadgates: BigUpdateInfoRow, - starterpacks: BigUpdateInfoRow, - postgates: BigUpdateInfoRow, - actordeclarations: BigUpdateInfoRow, - labelerservices: BigUpdateInfoRow, - quotes: BigUpdateInfoRow, - posts: BigUpdateInfoRow, - replies_relations: BigUpdateInfoRow, - reply_to_relations: BigUpdateInfoRow, - posts_relations: BigUpdateInfoRow, - overwrite_latest_backfills: BigUpdateInfoRow, -} - -impl BigUpdateInfo { - fn all_relations(&self) -> BigUpdateInfoRow { - BigUpdateInfoRow { - count: self.likes.count - + self.reposts.count - + self.blocks.count - + self.listblocks.count - + self.listitems.count - + self.replies_relations.count - + self.reply_to_relations.count - + self.posts_relations.count - + self.quotes.count - + self.follows.count, - size: self.likes.size - + self.reposts.size - + self.blocks.size - + self.listblocks.size - + self.listitems.size - + self.replies_relations.size - + self.reply_to_relations.size - + self.posts_relations.size - + self.quotes.size - + self.follows.size, - } - } - fn all_tables(&self) -> BigUpdateInfoRow { - BigUpdateInfoRow { - count: self.did.count - + self.feeds.count - + 
self.lists.count - + self.threadgates.count - + self.starterpacks.count - + self.postgates.count - + self.actordeclarations.count - + self.labelerservices.count - + self.posts.count, - size: self.did.size - + self.feeds.size - + self.lists.size - + self.threadgates.size - + self.starterpacks.size - + self.postgates.size - + self.actordeclarations.size - + self.labelerservices.size - + self.posts.size, - } - } - fn all(&self) -> BigUpdateInfoRow { - BigUpdateInfoRow { - count: self.all_relations().count + self.all_tables().count, - size: self.all_relations().size + self.all_tables().size, - } - } - - fn record_metrics(&self, source: &str) { - INSERTED_ROWS_METRIC.add( - self.all().count, - &[KeyValue::new("source", source.to_string())], - ); - INSERTED_SIZE_METRIC.add( - self.all().size, - &[KeyValue::new("source", source.to_string())], - ); - TRANSACTIONS_METRIC.add(1, &[]); - } -} - -impl core::fmt::Debug for BigUpdateInfo { - fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { - f.debug_map() - .entry(&"did", &self.did) - .entry(&"follows", &self.follows) - .entry(&"latest_backfills", &self.latest_backfills) - .entry(&"likes", &self.likes) - .entry(&"reposts", &self.reposts) - .entry(&"blocks", &self.blocks) - .entry(&"listblocks", &self.listblocks) - .entry(&"listitems", &self.listitems) - .entry(&"feeds", &self.feeds) - .entry(&"lists", &self.lists) - .entry(&"threadgates", &self.threadgates) - .entry(&"starterpacks", &self.starterpacks) - .entry(&"postgates", &self.postgates) - .entry(&"actordeclarations", &self.actordeclarations) - .entry(&"labelerservices", &self.labelerservices) - .entry(&"quotes", &self.quotes) - .entry(&"posts", &self.posts) - .entry(&"replies_relations", &self.replies_relations) - .entry(&"reply_to_relations", &self.reply_to_relations) - .entry(&"posts_relations", &self.posts_relations) - .entry( - &"overwrite_latest_backfills", - &self.overwrite_latest_backfills, - ) - .finish() - } -} - -#[derive(Default)] -pub struct BigUpdate { - /// Insert into did - did: Vec, - follows: Vec, - latest_backfills: Vec, - /// Like latest_backfills but overwrites existing records - overwrite_latest_backfills: Vec, - likes: Vec, - reposts: Vec, - blocks: Vec, - listblocks: Vec, - listitems: Vec, - feeds: Vec, - lists: Vec, - threadgates: Vec>>>, - starterpacks: Vec>>>, - postgates: Vec>>>, - actordeclarations: - Vec>>>, - labelerservices: Vec>>>, - quotes: Vec, - posts: Vec>, - replies_relations: Vec, - reply_to_relations: Vec, - posts_relations: Vec, -} -impl BigUpdate { - pub fn merge(&mut self, other: BigUpdate) { - self.did.extend(other.did); - self.follows.extend(other.follows); - self.latest_backfills.extend(other.latest_backfills); - self.likes.extend(other.likes); - self.reposts.extend(other.reposts); - self.blocks.extend(other.blocks); - self.listblocks.extend(other.listblocks); - self.listitems.extend(other.listitems); - self.feeds.extend(other.feeds); - self.lists.extend(other.lists); - self.threadgates.extend(other.threadgates); - self.starterpacks.extend(other.starterpacks); - self.postgates.extend(other.postgates); - self.actordeclarations.extend(other.actordeclarations); - self.labelerservices.extend(other.labelerservices); - self.quotes.extend(other.quotes); - self.posts.extend(other.posts); - self.replies_relations.extend(other.replies_relations); - self.reply_to_relations.extend(other.reply_to_relations); - self.posts_relations.extend(other.posts_relations); - self.overwrite_latest_backfills - .extend(other.overwrite_latest_backfills); - } - 
- pub fn add_timestamp(&mut self, did: &str, time: surrealdb::sql::Datetime) { - self.overwrite_latest_backfills.push(UpdateLatestBackfill { - of: RecordId::from(("did", did)), - id: did.to_string(), - at: Some(time), - }); - } - - /// Apply this update to the database - /// - /// `source` is a string describing the source of the update, used for metrics - pub async fn apply(self, db: &Surreal, source: &str) -> Result<()> { - let start = Instant::now(); - // Convert the update to a string for logging later - let info = tokio::task::block_in_place(|| self.create_info()); - - // Create the query string - let query_string = r#" - BEGIN; - INSERT IGNORE INTO did $dids RETURN NONE; - INSERT IGNORE INTO latest_backfill $latest_backfills RETURN NONE; - INSERT IGNORE INTO feed $feeds RETURN NONE; - INSERT IGNORE INTO list $lists RETURN NONE; - INSERT IGNORE INTO lex_app_bsky_feed_threadgate $threadgates RETURN NONE; - INSERT IGNORE INTO lex_app_bsky_graph_starterpack $starterpacks RETURN NONE; - INSERT IGNORE INTO lex_app_bsky_feed_postgate $postgates RETURN NONE; - INSERT IGNORE INTO lex_chat_bsky_actor_declaration $actordeclarations RETURN NONE; - INSERT IGNORE INTO lex_app_bsky_labeler_service $labelerservices RETURN NONE; - INSERT IGNORE INTO posts $posts RETURN NONE; - INSERT RELATION INTO quotes $quotes RETURN NONE; - INSERT RELATION INTO like $likes RETURN NONE; - INSERT RELATION INTO repost $reposts RETURN NONE; - INSERT RELATION INTO block $blocks RETURN NONE; - INSERT RELATION INTO listblock $listblocks RETURN NONE; - INSERT RELATION INTO listitem $listitems RETURN NONE; - INSERT RELATION INTO replyto $reply_to_relations RETURN NONE; - INSERT RELATION INTO quotes $quotes RETURN NONE; - INSERT RELATION INTO replies $replies_relations RETURN NONE; - INSERT RELATION INTO follow $follows RETURN NONE; - INSERT INTO latest_backfill $overwrite_latest_backfill RETURN NONE; - COMMIT; - "#; - - // Create the update query. 
Does not take that long; ~50ms for 30000 rows - let update = tokio::task::block_in_place(|| { - db.query(query_string) - .bind(("dids", self.did)) - .bind(("follows", self.follows)) - .bind(("latest_backfills", self.latest_backfills)) - .bind(("likes", self.likes)) - .bind(("reposts", self.reposts)) - .bind(("blocks", self.blocks)) - .bind(("listblocks", self.listblocks)) - .bind(("listitems", self.listitems)) - .bind(("feeds", self.feeds)) - .bind(("lists", self.lists)) - .bind(("threadgates", self.threadgates)) - .bind(("starterpacks", self.starterpacks)) - .bind(("postgates", self.postgates)) - .bind(("actordeclarations", self.actordeclarations)) - .bind(("labelerservices", self.labelerservices)) - .bind(("quotes", self.quotes)) - .bind(("posts", self.posts)) - .bind(("replies_relations", self.replies_relations)) - .bind(("reply_to_relations", self.reply_to_relations)) - .bind(("posts_relations", self.posts_relations)) - .bind(("overwrite_latest_backfill", self.overwrite_latest_backfills)) - .into_future() - .instrument(span!(Level::INFO, "query")) - }); - - let preparation_duration = start.elapsed(); - let after_update = Instant::now(); - update.await?; - let update_duration = after_update.elapsed(); - QUERY_DURATION_METRIC.record(update_duration.as_millis() as u64, &[]); - info.record_metrics(source); - - trace!( - "Applied updated: {} elements, {}MB, {:03}ms preparation, {:03}ms applying", - info.all().count, - info.all().size as f64 / 1024.0 / 1024.0, - preparation_duration.as_millis(), - update_duration.as_millis(), - ); - debug!("Detailed infos: {:?}", info); - - Ok(()) - } - - fn create_info(&self) -> BigUpdateInfo { - BigUpdateInfo { - did: BigUpdateInfoRow { - count: self.did.len() as u64, - size: self - .did - .iter() - .map(|e| serde_ipld_dagcbor::to_vec(e).unwrap().len() as u64) - .sum(), - }, - follows: BigUpdateInfoRow { - count: self.follows.len() as u64, - size: self - .follows - .iter() - .map(|e| serde_ipld_dagcbor::to_vec(e).unwrap().len() as u64) - .sum(), - }, - latest_backfills: BigUpdateInfoRow { - count: self.latest_backfills.len() as u64, - size: self - .latest_backfills - .iter() - .map(|e| serde_ipld_dagcbor::to_vec(e).unwrap().len() as u64) - .sum(), - }, - likes: BigUpdateInfoRow { - count: self.likes.len() as u64, - size: self - .likes - .iter() - .map(|e| serde_ipld_dagcbor::to_vec(e).unwrap().len() as u64) - .sum(), - }, - reposts: BigUpdateInfoRow { - count: self.reposts.len() as u64, - size: self - .reposts - .iter() - .map(|e| serde_ipld_dagcbor::to_vec(e).unwrap().len() as u64) - .sum(), - }, - blocks: BigUpdateInfoRow { - count: self.blocks.len() as u64, - size: self - .blocks - .iter() - .map(|e| serde_ipld_dagcbor::to_vec(e).unwrap().len() as u64) - .sum(), - }, - listblocks: BigUpdateInfoRow { - count: self.listblocks.len() as u64, - size: self - .listblocks - .iter() - .map(|e| serde_ipld_dagcbor::to_vec(e).unwrap().len() as u64) - .sum(), - }, - listitems: BigUpdateInfoRow { - count: self.listitems.len() as u64, - size: self - .listitems - .iter() - .map(|e| serde_ipld_dagcbor::to_vec(e).unwrap().len() as u64) - .sum(), - }, - feeds: BigUpdateInfoRow { - count: self.feeds.len() as u64, - size: self - .feeds - .iter() - .map(|e| serde_ipld_dagcbor::to_vec(e).unwrap().len() as u64) - .sum(), - }, - lists: BigUpdateInfoRow { - count: self.lists.len() as u64, - size: self - .lists - .iter() - .map(|e| serde_ipld_dagcbor::to_vec(e).unwrap().len() as u64) - .sum(), - }, - threadgates: BigUpdateInfoRow { - count: self.threadgates.len() as u64, - size: 
self - .threadgates - .iter() - .map(|e| serde_ipld_dagcbor::to_vec(e).unwrap().len() as u64) - .sum(), - }, - starterpacks: BigUpdateInfoRow { - count: self.starterpacks.len() as u64, - size: self - .starterpacks - .iter() - .map(|e| serde_ipld_dagcbor::to_vec(e).unwrap().len() as u64) - .sum(), - }, - postgates: BigUpdateInfoRow { - count: self.postgates.len() as u64, - size: self - .postgates - .iter() - .map(|e| serde_ipld_dagcbor::to_vec(e).unwrap().len() as u64) - .sum(), - }, - actordeclarations: BigUpdateInfoRow { - count: self.actordeclarations.len() as u64, - size: self - .actordeclarations - .iter() - .map(|e| serde_ipld_dagcbor::to_vec(e).unwrap().len() as u64) - .sum(), - }, - labelerservices: BigUpdateInfoRow { - count: self.labelerservices.len() as u64, - size: self - .labelerservices - .iter() - .map(|e| serde_ipld_dagcbor::to_vec(e).unwrap().len() as u64) - .sum(), - }, - quotes: BigUpdateInfoRow { - count: self.quotes.len() as u64, - size: self - .quotes - .iter() - .map(|e| serde_ipld_dagcbor::to_vec(e).unwrap().len() as u64) - .sum(), - }, - posts: BigUpdateInfoRow { - count: self.posts.len() as u64, - size: self - .posts - .iter() - .map(|e| serde_ipld_dagcbor::to_vec(e).unwrap().len() as u64) - .sum(), - }, - replies_relations: BigUpdateInfoRow { - count: self.replies_relations.len() as u64, - size: self - .replies_relations - .iter() - .map(|e| serde_ipld_dagcbor::to_vec(e).unwrap().len() as u64) - .sum(), - }, - reply_to_relations: BigUpdateInfoRow { - count: self.reply_to_relations.len() as u64, - size: self - .reply_to_relations - .iter() - .map(|e| serde_ipld_dagcbor::to_vec(e).unwrap().len() as u64) - .sum(), - }, - posts_relations: BigUpdateInfoRow { - count: self.posts_relations.len() as u64, - size: self - .posts_relations - .iter() - .map(|e| serde_ipld_dagcbor::to_vec(e).unwrap().len() as u64) - .sum(), - }, - overwrite_latest_backfills: BigUpdateInfoRow { - count: self.overwrite_latest_backfills.len() as u64, - size: self - .overwrite_latest_backfills - .iter() - .map(|e| serde_ipld_dagcbor::to_vec(e).unwrap().len() as u64) - .sum(), - }, - } - } -} - -impl core::fmt::Debug for BigUpdate { - fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { - let info = self.create_info(); - info.fmt(f) - } -} - -/// If the new commit is a create or update, handle it -#[instrument(skip(record))] -pub fn on_commit_event_createorupdate( - did: Did, - did_key: String, - collection: String, - rkey: RecordKey, - record: KnownRecord, -) -> Result { - utils::ensure_valid_rkey(rkey.to_string())?; - - let mut big_update = BigUpdate::default(); - - match record { - KnownRecord::AppBskyActorProfile(d) => { - // NOTE: using .ok() here isn't optimal, incorrect data should - // probably not be entered into the database at all, but for now - // we'll just ignore it. 
- let profile = UpdateDid { - id: did_key.clone(), - display_name: d.display_name.clone(), - description: d.description.clone(), - avatar: None, // TODO Implement - banner: None, // TODO Implement - created_at: d - .created_at - .as_ref() - .and_then(|dt| utils::extract_dt(dt).ok()), - seen_at: Utc::now().into(), - joined_via_starter_pack: d - .joined_via_starter_pack - .as_ref() - .and_then(|d| utils::strong_ref_to_record_id(d).ok()), - // TODO if strong_ref_to_record_id fails, it should return an error result instead of being empty - pinned_post: d - .pinned_post - .as_ref() - .and_then(|d| utils::strong_ref_to_record_id(d).ok()), - labels: d - .labels - .as_ref() - .and_then(|d| utils::extract_self_labels_profile(d)), - extra_data: process_extra_data(&d.extra_data)?, - }; - big_update.did.push(profile); - // // TODO this should be a db.upsert(...).merge(...) - // let _: Option = db - // .insert(("did", did_key)) - // .content(profile) - // .into_future() - // .instrument(span!(Level::INFO, "upsert")) - // .await?; - } - KnownRecord::AppBskyGraphFollow(d) => { - // TODO ensure_valid_rkey_strict(rkey.as_str())?; - let from = utils::did_to_key(did.as_str())?; - let id = format!("{}_{}", rkey.as_str(), from); - let to = utils::did_to_key(d.subject.as_str())?; - let created_at = utils::extract_dt(&d.created_at)?; - - // let query = format!( - // r#"RELATE type::thing("did", $from)->follow->type::thing("did", $to) SET id = $id, createdAt = $created_at;"# - // from, to, id, created_at - // ); - - big_update.follows.push(UpdateFollow { - from: RecordId::from(("did", from)), - to: RecordId::from(("did", to.clone())), - id: id, - created_at, - }); - - big_update.latest_backfills.push(UpdateLatestBackfill { - of: RecordId::from(("did", to.clone())), - id: to, - at: None, - }); - - // let _ = db - // .query("RELATE (type::thing('did', $from))->follow->(type::thing('did', $to)) SET id = $id, createdAt = $created_at; UPSERT (type::thing('latest_backfill', $to)) SET of = type::thing('did', $to);") - // .bind(("from", from)) - // .bind(("to", to)) - // .bind(("id", id)) - // . 
bind(("created_at", created_at)) - // .into_future() - // .instrument(span!(Level::INFO, "query")) - // .await.unwrap(); - } - KnownRecord::AppBskyFeedLike(d) => { - // TODO ensure_valid_rkey_strict(rkey.as_str())?; - let from = utils::did_to_key(did.as_str())?; - let id = format!("{}_{}", rkey.as_str(), from); - let to = utils::at_uri_to_record_id(&d.subject.uri)?; - let created_at = utils::extract_dt(&d.created_at)?; - - big_update.likes.push(UpdateLike { - from: RecordId::from(("did", from)), - to: to, - id: id, - created_at, - }); - - // let query = format!( - // "RELATE did:{}->like->{} SET id = '{}', createdAt = {};", - // from, to, id, created_at - // ); - - // let _ = db - // .query(query) - // .into_future() - // .instrument(span!(Level::INFO, "query")) - // .await?; - } - KnownRecord::AppBskyFeedRepost(d) => { - // TODO ensure_valid_rkey_strict(rkey.as_str())?; - let from = utils::did_to_key(did.as_str())?; - let id = format!("{}_{}", rkey.as_str(), from); - let to = utils::at_uri_to_record_id(&d.subject.uri)?; - let created_at = utils::extract_dt(&d.created_at)?; - - big_update.reposts.push(UpdateRepost { - from: RecordId::from(("did", from)), - to: to, - id: id, - created_at, - }); - // let query = format!( - // "RELATE did:{}->repost->{} SET id = '{}', createdAt = {};", - // from, to, id, created_at - // ); - - // let _ = db - // .query(query) - // .into_future() - // .instrument(span!(Level::INFO, "query")) - // .await?; - } - KnownRecord::AppBskyGraphBlock(d) => { - // TODO ensure_valid_rkey_strict(rkey.as_str())?; - let from = utils::did_to_key(did.as_str())?; - let id = format!("{}_{}", rkey.as_str(), from); - let to = utils::did_to_key(d.subject.as_str())?; - let created_at = utils::extract_dt(&d.created_at)?; - - big_update.blocks.push(UpdateBlock { - from: RecordId::from(("did", from)), - to: RecordId::from(("did", to.clone())), - id: id, - created_at, - }); - // let query = format!( - // "RELATE did:{}->block->did:{} SET id = '{}', createdAt = {};", - // from, to, id, created_at - // ); - - // let _ = db - // .query(query) - // .into_future() - // .instrument(span!(Level::INFO, "query")) - // .await?; - } - KnownRecord::AppBskyGraphListblock(d) => { - // TODO ensure_valid_rkey_strict(rkey.as_str())?; - let from = utils::did_to_key(did.as_str())?; - let id = format!("{}_{}", rkey.as_str(), from); - let to = utils::at_uri_to_record_id(&d.subject)?; - let created_at = utils::extract_dt(&d.created_at)?; - - big_update.listblocks.push(UpdateListBlock { - from: RecordId::from(("did", from)), - to: to, - id: id, - created_at, - }); - // let query = format!( - // "RELATE did:{}->listblock->{} SET id = '{}', createdAt = {};", - // from, to, id, created_at - // ); - - // let _ = db - // .query(query) - // .into_future() - // .instrument(span!(Level::INFO, "query")) - // .await?; - } - KnownRecord::AppBskyGraphListitem(d) => { - // TODO ensure_valid_rkey_strict(rkey.as_str())?; - let from = utils::did_to_key(did.as_str())?; - let id = format!("{}_{}", rkey.as_str(), from); - - let from = utils::at_uri_to_record_id(&d.list)?; - let to = utils::did_to_key(&d.subject)?; - let created_at = utils::extract_dt(&d.created_at)?; - - big_update.listitems.push(UpdateListItem { - from: from, - to: RecordId::from(("did", to.clone())), - id: id, - created_at, - }); - - // let query = format!( - // "RELATE {}->listitem->did:{} SET id = '{}', createdAt = {};", - // from, to, id, created_at - // ); - - // let _ = db - // .query(query) - // .into_future() - // .instrument(span!(Level::INFO, 
"query")) - // .await?; - } - KnownRecord::AppBskyFeedGenerator(d) => { - let did_key = utils::did_to_key(did.as_str())?; - let id = format!("{}_{}", rkey.as_str(), did_key); - let feed = UpdateFeed { - id: id, - author: RecordId::from_table_key("did", did_key), - avatar: None, // TODO implement - created_at: utils::extract_dt(&d.created_at)?, - description: d.description.clone(), - did: d.did.to_string(), - display_name: d.display_name.clone(), - rkey: rkey.to_string(), - uri: format!( - "at://{}/app.bsky.feed.generator/{}", - did.as_str(), - rkey.as_str() - ), - extra_data: process_extra_data(&d.extra_data)?, - }; - big_update.feeds.push(feed); - // let _: Option = db - // .upsert(("feed", id)) - // .content(feed) - // .into_future() - // .instrument(span!(Level::INFO, "upsert")) - // .await?; - } - KnownRecord::AppBskyGraphList(d) => { - let did_key = utils::did_to_key(did.as_str())?; - let id = format!("{}_{}", rkey.as_str(), did_key); - - let list = UpdateList { - id: id, - name: d.name.clone(), - avatar: None, // TODO implement - created_at: utils::extract_dt(&d.created_at)?, - description: d.description.clone(), - labels: d - .labels - .as_ref() - .and_then(|d| utils::extract_self_labels_list(d)), - purpose: d.purpose.clone(), - extra_data: process_extra_data(&d.extra_data)?, - }; - big_update.lists.push(list); - // let _: Option = db - // .upsert(("list", id)) - // .content(list) - // .into_future() - // .instrument(span!(Level::INFO, "upsert")) - // .await?; - } - KnownRecord::AppBskyFeedThreadgate(d) => { - let did_key = utils::did_to_key(did.as_str())?; - let id = format!("{}_{}", rkey.as_str(), did_key); - big_update.threadgates.push(WithId { id: id, data: d }); - // let _: Option = db - // .upsert(("lex_app_bsky_feed_threadgate", id)) - // .content(d) - // .into_future() - // .instrument(span!(Level::INFO, "upsert")) - // .await?; - } - KnownRecord::AppBskyGraphStarterpack(d) => { - let did_key = utils::did_to_key(did.as_str())?; - let id = format!("{}_{}", rkey.as_str(), did_key); - big_update.starterpacks.push(WithId { id: id, data: d }); - // let _: Option = db - // .upsert(("lex_app_bsky_graph_starterpack", id)) - // .content(d) - // .into_future() - // .instrument(span!(Level::INFO, "upsert")) - // .await?; - } - KnownRecord::AppBskyFeedPostgate(d) => { - let did_key = utils::did_to_key(did.as_str())?; - let id = format!("{}_{}", rkey.as_str(), did_key); - big_update.postgates.push(WithId { id: id, data: d }); - // let _: Option = db - // .upsert(("lex_app_bsky_feed_postgate", id)) - // .content(d) - // .into_future() - // .instrument(span!(Level::INFO, "upsert")) - // .await?; - } - KnownRecord::ChatBskyActorDeclaration(d) => { - let did_key = utils::did_to_key(did.as_str())?; - let id = format!("{}_{}", rkey.as_str(), did_key); - big_update - .actordeclarations - .push(WithId { id: id, data: d }); - // let _: Option = db - // .upsert(("lex_chat_bsky_actor_declaration", id)) - // .content(d) - // .into_future() - // .instrument(span!(Level::INFO, "upsert")) - // .await?; - } - KnownRecord::AppBskyLabelerService(d) => { - let did_key = utils::did_to_key(did.as_str())?; - let id = format!("{}_{}", rkey.as_str(), did_key); - big_update.labelerservices.push(WithId { id: id, data: d }); - // let _: Option = db - // .upsert(("lex_app_bsky_labeler_service", id)) - // .content(d) - // .into_future() - // .instrument(span!(Level::INFO, "upsert")) - // .await?; - } - KnownRecord::AppBskyFeedPost(d) => { - let did_key = utils::did_to_key(did.as_str())?; - let id = format!("{}_{}", 
rkey.as_str(), did_key); - - let mut images: Vec = vec![]; - let mut links: Vec = vec![]; - let mut mentions: Vec = vec![]; - let mut record: Option = None; - let mut tags: Vec = vec![]; - let mut video: Option = None; - - let mut post_images: Vec = vec![]; - - match &d.embed { - Some(d) => { - match d { - atrium_api::types::Union::Refs(e) => { - match e { - atrium_api::app::bsky::feed::post::RecordEmbedRefs::AppBskyEmbedExternalMain(m)=>{ - // TODO index preview too - links.push(m.external.uri.clone()); - }, - atrium_api::app::bsky::feed::post::RecordEmbedRefs::AppBskyEmbedImagesMain(m) => { - post_images=m.images.clone(); - }, - atrium_api::app::bsky::feed::post::RecordEmbedRefs::AppBskyEmbedVideoMain(m) => { - video = Some(process_video(m)?); - }, - atrium_api::app::bsky::feed::post::RecordEmbedRefs::AppBskyEmbedRecordMain(m) => { - record = Some(at_uri_to_record_id(&m.record.uri)?); - }, - atrium_api::app::bsky::feed::post::RecordEmbedRefs::AppBskyEmbedRecordWithMediaMain(m) => { - record = Some(at_uri_to_record_id(&m.record.record.uri)?); - - match &m.media{ - atrium_api::types::Union::Refs(r)=>match r{ - atrium_api::app::bsky::embed::record_with_media::MainMediaRefs::AppBskyEmbedExternalMain(m)=>{ - // TODO index preview too - links.push(m.external.uri.clone()); - } - atrium_api::app::bsky::embed::record_with_media::MainMediaRefs::AppBskyEmbedImagesMain(m)=>{ - post_images=m.images.clone(); - } - atrium_api::app::bsky::embed::record_with_media::MainMediaRefs::AppBskyEmbedVideoMain(m)=>{ - - video = Some(process_video(m)?); - } - } - atrium_api::types::Union::Unknown(_)=>{} - } - }, - } - } - atrium_api::types::Union::Unknown(_) => {} - } - } - None => {} - }; - - if !post_images.is_empty() { - for i in post_images { - images.push(BskyPostImage { - alt: i.alt.clone(), - blob: blob_ref_to_record_id(&i.image), // TODO store blob details - aspect_ratio: i.aspect_ratio.as_ref().map(|a| BskyPostMediaAspectRatio { - height: a.height.into(), - width: a.width.into(), - }), - }) - } - } - - if let Some(facets) = &d.facets { - for facet in facets { - for feature in &facet.features { - match feature { - atrium_api::types::Union::Refs(refs) => match refs { - MainFeaturesItem::Mention(m) => { - mentions.push(("did", did_to_key(m.did.as_str())?).into()); - } - MainFeaturesItem::Link(l) => { - links.push(l.uri.clone()); - } - MainFeaturesItem::Tag(t) => { - tags.push(t.tag.clone()); - } - }, - atrium_api::types::Union::Unknown(_) => {} - } - } - } - } - - if let Some(t) = &d.tags { - tags.extend(t.clone()); - } - - if let Some(r) = &record { - if r.table() == "post" { - big_update.quotes.push(UpdateQuote { - from: RecordId::from_table_key("post", id.clone()), - to: r.clone(), - id: id.clone(), - }); - - // let query = format!( - // "RELATE post:{}->quotes->post:{} SET id = '{}';", - // id, - // r.key(), - // id - // ); - - // let _ = db - // .query(query) - // .into_future() - // .instrument(span!(Level::INFO, "query")) - // .await?; - } - } - - let post = WithId { - id: id.clone(), - data: BskyPost { - author: RecordId::from_table_key("did", did_key.clone()), - bridgy_original_url: None, - via: None, - created_at: utils::extract_dt(&d.created_at)?, - labels: d - .labels - .as_ref() - .and_then(|d| utils::extract_self_labels_post(d)), - text: d.text.clone(), - langs: d - .langs - .as_ref() - .map(|d| d.iter().map(|l| l.as_ref().to_string()).collect()), - root: d - .reply - .as_ref() - .map(|r| utils::strong_ref_to_record_id(&r.root)) - .transpose()?, - parent: d - .reply - .as_ref() - .map(|r| 
utils::strong_ref_to_record_id(&r.parent)) - .transpose()?, - video: video, - tags: if tags.is_empty() { None } else { Some(tags) }, - links: if links.is_empty() { None } else { Some(links) }, - mentions: if mentions.is_empty() { - None - } else { - Some(mentions) - }, - record: record, - images: if images.is_empty() { - None - } else { - Some(images) - }, - extra_data: process_extra_data(&d.extra_data)?, - }, - }; - - let parent = post.data.parent.clone(); - big_update.posts.push(post); - // let _: Option = db - // .upsert(("post", id.clone())) - // .content(post) - // .into_future() - // .instrument(span!(Level::INFO, "upsert")) - // .await?; - - if parent.is_some() { - big_update.replies_relations.push(UpdateRepliesRelation { - from: RecordId::from_table_key("did", did_key.clone()), - to: RecordId::from_table_key("post", id.clone()), - id: id.clone(), - }); - // let query1 = format!( - // "RELATE did:{}->replies->post:{} SET id = '{}';", - // did_key, id, id - // ); - // let _ = db - // .query(query1) - // .into_future() - // .instrument(span!(Level::INFO, "query")) - // .await?; - - big_update.reply_to_relations.push(UpdateReplyToRelation { - from: RecordId::from_table_key("post", id.clone()), - to: parent.unwrap(), - id: id.clone(), - }); - // let query2 = format!( - // "RELATE post:{}->replyto->{} SET id = '{}';", - // id, - // parent.unwrap(), - // id - // ); - // let _ = db - // .query(query2) - // .into_future() - // .instrument(span!(Level::INFO, "query")) - // .await?; - } else { - big_update.posts_relations.push(UpdatePostsRelation { - from: RecordId::from_table_key("did", did_key.clone()), - to: RecordId::from_table_key("post", id.clone()), - id: id.clone(), - }); - // let query = format!( - // "RELATE did:{}->posts->post:{} SET id = '{}';", - // did_key, id, id - // ); - // let _ = db - // .query(query) - // .into_future() - // .instrument(span!(Level::INFO, "query")) - // .await?; - } - } - _ => { - warn!(target: "indexer", "ignored create_or_update {} {} {}", - did.as_str(), collection, rkey.as_str()); - } - } - - Ok(big_update) -} - -fn process_video(vid: &video::Main) -> Result { - let blob = extract_video_blob(&vid.video)?; - let v = BskyPostVideo { - alt: vid.alt.clone(), - aspect_ratio: vid.aspect_ratio.clone().map(|a| BskyPostMediaAspectRatio { - height: a.height.into(), - width: a.width.into(), - }), - blob: BskyPostVideoBlob { - cid: blob.r#ref.0.to_string(), - media_type: blob.mime_type, - size: blob.size as u64, - }, - captions: None, // TODO implement - }; - Ok(v) -} -fn extract_video_blob(blob: &BlobRef) -> Result { - match blob { - atrium_api::types::BlobRef::Typed(a) => match a { - atrium_api::types::TypedBlobRef::Blob(b) => Ok(b.clone()), - }, - atrium_api::types::BlobRef::Untyped(_) => anyhow::bail!("Invalid blob ref type"), - } -} - /// If the new commit is a delete, handle it async fn on_commit_event_delete( db: &Surreal, @@ -1412,8 +142,3 @@ async fn on_commit_event_delete( Ok(()) } - -fn process_extra_data(ipld: &ipld_core::ipld::Ipld) -> Result> { - let str = simd_json::serde::to_string(ipld)?; - Ok(if str == "{}" { None } else { Some(str) }) -} diff --git a/src/database/mod.rs b/src/database/mod.rs index aa7f8d3..4e92b05 100644 --- a/src/database/mod.rs +++ b/src/database/mod.rs @@ -4,6 +4,7 @@ use definitions::{JetstreamCursor, Record}; use surrealdb::{engine::any::Any, opt::auth::Root, RecordId, Surreal}; use tracing::info; +pub mod big_update; pub mod definitions; pub mod handlers; pub mod repo_indexer; diff --git 
a/src/database/repo_indexer/index_repo.rs b/src/database/repo_indexer/index_repo.rs
index e66d944..17dd1be 100644
--- a/src/database/repo_indexer/index_repo.rs
+++ b/src/database/repo_indexer/index_repo.rs
@@ -2,7 +2,7 @@ use super::pipeline::Stage;
 use crate::{
     config::ARGS,
     database::{
-        handlers::{on_commit_event_createorupdate, BigUpdate},
+        big_update::{create_big_update, BigUpdate},
         repo_indexer::pipeline::NoNextStage,
     },
 };
@@ -111,7 +111,7 @@ fn convert_repo_to_update(
         let collection = parts.next()?.to_string();
         let rkey = RecordKey::new(parts.next()?.to_string()).ok()?;

-        let update = on_commit_event_createorupdate(
+        let update = create_big_update(
             Did::new(did.clone().into()).unwrap(),
             did_key.clone(),
             collection,

From 8eb1c2163327774aba21fdfdbdd81e7f1cf90376 Mon Sep 17 00:00:00 2001
From: Zebreus
Date: Wed, 5 Mar 2025 21:42:09 +0100
Subject: [PATCH 58/75] Add SurrealDB to the setup script

---
 setup.sh | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/setup.sh b/setup.sh
index da439c8..3de29bf 100644
--- a/setup.sh
+++ b/setup.sh
@@ -11,6 +11,7 @@ rustup default nightly
 if ! git status; then
   git clone https://github.com/zebreus/indexer-rust
   cd indexer-rust
+  git checkout testing
 fi
 cargo install samply
 echo '1' >/proc/sys/kernel/perf_event_paranoid
@@ -20,6 +21,9 @@ wget https://github.com/zebreus/upload/releases/download/v0.2/upload.binary
 chmod +x upload.binary
 mv upload.binary /usr/local/bin/upload
 
+curl -sSf https://tiup-mirrors.pingcap.com/install.sh | sh
+curl -sSf https://install.surrealdb.com | sh
+
 export OTEL_EXPORTER_OTLP_ENDPOINT="http://monitoring.indexer.skyfeedlol.lol:39291"
 echo 'export OTEL_EXPORTER_OTLP_ENDPOINT="http://monitoring.indexer.skyfeedlol.lol:39291"' >~/.bashrc

From b9b3fdcb08c083c8a861c0fb9c090242bb00ab19 Mon Sep 17 00:00:00 2001
From: Zebreus
Date: Wed, 5 Mar 2025 21:43:31 +0100
Subject: [PATCH 59/75] Apply automatic clippy fixes

---
 src/config.rs | 2 +-
 src/database/big_update.rs | 119 +++++++++++------------
 src/database/handlers.rs | 2 +-
 src/database/repo_indexer/index_repo.rs | 8 +-
 src/database/repo_indexer/pipeline.rs | 16 ++-
 src/database/repo_indexer/repo_stream.rs | 4 +-
 src/database/utils.rs | 2 +-
 src/main.rs | 6 +-
 src/observability/otel_providers.rs | 4 +-
 src/websocket/conn.rs | 2 +-
 src/websocket/events.rs | 2 +-
 src/websocket/mod.rs | 6 +-
 12 files changed, 84 insertions(+), 89 deletions(-)

diff --git a/src/config.rs b/src/config.rs
index 621ea68..45ba5d3 100644
--- a/src/config.rs
+++ b/src/config.rs
@@ -75,7 +75,7 @@ pub struct Args {
     pub repo_stream_buffer_size: usize,
 }
 
-pub const ARGS: LazyLock = LazyLock::new(|| Args::parse());
+pub const ARGS: LazyLock = LazyLock::new(Args::parse);
 
 // impl Args {
 //     /// Dump configuration to log
diff --git a/src/database/big_update.rs b/src/database/big_update.rs
index ec4660a..061c972 100644
--- a/src/database/big_update.rs
+++ b/src/database/big_update.rs
@@ -733,7 +733,7 @@ pub fn create_big_update(
             labels: d
                 .labels
                 .as_ref()
-                .and_then(|d| utils::extract_self_labels_profile(d)),
+                .and_then(utils::extract_self_labels_profile),
             extra_data: process_extra_data(&d.extra_data)?,
         };
         big_update.did.push(profile);
@@ -748,7 +748,7 @@ pub fn create_big_update(
         big_update.follows.push(UpdateFollow {
             from: RecordId::from(("did", from)),
             to: RecordId::from(("did", to.clone())),
-            id: id,
+            id,
             created_at,
         });
 
@@ -767,8 +767,8 @@ pub fn create_big_update(
 
         big_update.likes.push(UpdateLike {
             from: RecordId::from(("did", from)),
-            to: to,
-            id: id,
+            to,
+            id,
             created_at,
         });
     }
@@ -781,8 +781,8 @@
pub fn create_big_update( big_update.reposts.push(UpdateRepost { from: RecordId::from(("did", from)), - to: to, - id: id, + to, + id, created_at, }); } @@ -796,7 +796,7 @@ pub fn create_big_update( big_update.blocks.push(UpdateBlock { from: RecordId::from(("did", from)), to: RecordId::from(("did", to.clone())), - id: id, + id, created_at, }); } @@ -809,8 +809,8 @@ pub fn create_big_update( big_update.listblocks.push(UpdateListBlock { from: RecordId::from(("did", from)), - to: to, - id: id, + to, + id, created_at, }); } @@ -824,9 +824,9 @@ pub fn create_big_update( let created_at = utils::extract_dt(&d.created_at)?; big_update.listitems.push(UpdateListItem { - from: from, + from, to: RecordId::from(("did", to.clone())), - id: id, + id, created_at, }); } @@ -834,7 +834,7 @@ pub fn create_big_update( let did_key = utils::did_to_key(did.as_str())?; let id = format!("{}_{}", rkey.as_str(), did_key); let feed = UpdateFeed { - id: id, + id, author: RecordId::from_table_key("did", did_key), avatar: None, // TODO implement created_at: utils::extract_dt(&d.created_at)?, @@ -856,7 +856,7 @@ pub fn create_big_update( let id = format!("{}_{}", rkey.as_str(), did_key); let list = UpdateList { - id: id, + id, name: d.name.clone(), avatar: None, // TODO implement created_at: utils::extract_dt(&d.created_at)?, @@ -864,7 +864,7 @@ pub fn create_big_update( labels: d .labels .as_ref() - .and_then(|d| utils::extract_self_labels_list(d)), + .and_then(utils::extract_self_labels_list), purpose: d.purpose.clone(), extra_data: process_extra_data(&d.extra_data)?, }; @@ -873,29 +873,29 @@ pub fn create_big_update( KnownRecord::AppBskyFeedThreadgate(d) => { let did_key = utils::did_to_key(did.as_str())?; let id = format!("{}_{}", rkey.as_str(), did_key); - big_update.threadgates.push(WithId { id: id, data: d }); + big_update.threadgates.push(WithId { id, data: d }); } KnownRecord::AppBskyGraphStarterpack(d) => { let did_key = utils::did_to_key(did.as_str())?; let id = format!("{}_{}", rkey.as_str(), did_key); - big_update.starterpacks.push(WithId { id: id, data: d }); + big_update.starterpacks.push(WithId { id, data: d }); } KnownRecord::AppBskyFeedPostgate(d) => { let did_key = utils::did_to_key(did.as_str())?; let id = format!("{}_{}", rkey.as_str(), did_key); - big_update.postgates.push(WithId { id: id, data: d }); + big_update.postgates.push(WithId { id, data: d }); } KnownRecord::ChatBskyActorDeclaration(d) => { let did_key = utils::did_to_key(did.as_str())?; let id = format!("{}_{}", rkey.as_str(), did_key); big_update .actordeclarations - .push(WithId { id: id, data: d }); + .push(WithId { id, data: d }); } KnownRecord::AppBskyLabelerService(d) => { let did_key = utils::did_to_key(did.as_str())?; let id = format!("{}_{}", rkey.as_str(), did_key); - big_update.labelerservices.push(WithId { id: id, data: d }); + big_update.labelerservices.push(WithId { id, data: d }); } KnownRecord::AppBskyFeedPost(d) => { let did_key = utils::did_to_key(did.as_str())?; @@ -910,50 +910,47 @@ pub fn create_big_update( let mut post_images: Vec = vec![]; - match &d.embed { - Some(d) => { - match d { - atrium_api::types::Union::Refs(e) => { - match e { - atrium_api::app::bsky::feed::post::RecordEmbedRefs::AppBskyEmbedExternalMain(m)=>{ - // TODO index preview too - links.push(m.external.uri.clone()); - }, - atrium_api::app::bsky::feed::post::RecordEmbedRefs::AppBskyEmbedImagesMain(m) => { - post_images=m.images.clone(); - }, - atrium_api::app::bsky::feed::post::RecordEmbedRefs::AppBskyEmbedVideoMain(m) => { - video = 
Some(process_video(m)?); - }, - atrium_api::app::bsky::feed::post::RecordEmbedRefs::AppBskyEmbedRecordMain(m) => { - record = Some(at_uri_to_record_id(&m.record.uri)?); - }, - atrium_api::app::bsky::feed::post::RecordEmbedRefs::AppBskyEmbedRecordWithMediaMain(m) => { - record = Some(at_uri_to_record_id(&m.record.record.uri)?); + if let Some(d) = &d.embed { + match d { + atrium_api::types::Union::Refs(e) => { + match e { + atrium_api::app::bsky::feed::post::RecordEmbedRefs::AppBskyEmbedExternalMain(m)=>{ + // TODO index preview too + links.push(m.external.uri.clone()); + }, + atrium_api::app::bsky::feed::post::RecordEmbedRefs::AppBskyEmbedImagesMain(m) => { + post_images=m.images.clone(); + }, + atrium_api::app::bsky::feed::post::RecordEmbedRefs::AppBskyEmbedVideoMain(m) => { + video = Some(process_video(m)?); + }, + atrium_api::app::bsky::feed::post::RecordEmbedRefs::AppBskyEmbedRecordMain(m) => { + record = Some(at_uri_to_record_id(&m.record.uri)?); + }, + atrium_api::app::bsky::feed::post::RecordEmbedRefs::AppBskyEmbedRecordWithMediaMain(m) => { + record = Some(at_uri_to_record_id(&m.record.record.uri)?); - match &m.media{ - atrium_api::types::Union::Refs(r)=>match r{ - atrium_api::app::bsky::embed::record_with_media::MainMediaRefs::AppBskyEmbedExternalMain(m)=>{ - // TODO index preview too - links.push(m.external.uri.clone()); - } - atrium_api::app::bsky::embed::record_with_media::MainMediaRefs::AppBskyEmbedImagesMain(m)=>{ - post_images=m.images.clone(); - } - atrium_api::app::bsky::embed::record_with_media::MainMediaRefs::AppBskyEmbedVideoMain(m)=>{ + match &m.media{ + atrium_api::types::Union::Refs(r)=>match r{ + atrium_api::app::bsky::embed::record_with_media::MainMediaRefs::AppBskyEmbedExternalMain(m)=>{ + // TODO index preview too + links.push(m.external.uri.clone()); + } + atrium_api::app::bsky::embed::record_with_media::MainMediaRefs::AppBskyEmbedImagesMain(m)=>{ + post_images=m.images.clone(); + } + atrium_api::app::bsky::embed::record_with_media::MainMediaRefs::AppBskyEmbedVideoMain(m)=>{ - video = Some(process_video(m)?); - } - } - atrium_api::types::Union::Unknown(_)=>{} + video = Some(process_video(m)?); } - }, - } - } - atrium_api::types::Union::Unknown(_) => {} + } + atrium_api::types::Union::Unknown(_)=>{} + } + }, + } } + atrium_api::types::Union::Unknown(_) => {} } - None => {} }; if !post_images.is_empty() { @@ -1014,7 +1011,7 @@ pub fn create_big_update( labels: d .labels .as_ref() - .and_then(|d| utils::extract_self_labels_post(d)), + .and_then(utils::extract_self_labels_post), text: d.text.clone(), langs: d .langs @@ -1030,7 +1027,7 @@ pub fn create_big_update( .as_ref() .map(|r| utils::strong_ref_to_record_id(&r.parent)) .transpose()?, - video: video, + video, tags: if tags.is_empty() { None } else { Some(tags) }, links: if links.is_empty() { None } else { Some(links) }, mentions: if mentions.is_empty() { @@ -1038,7 +1035,7 @@ pub fn create_big_update( } else { Some(mentions) }, - record: record, + record, images: if images.is_empty() { None } else { diff --git a/src/database/handlers.rs b/src/database/handlers.rs index 3647ae5..0dd5a60 100644 --- a/src/database/handlers.rs +++ b/src/database/handlers.rs @@ -106,7 +106,7 @@ async fn on_commit_event_delete( delete_record(db, "listblock", &id).await?; } "app.bsky.feed.post" => { - for table in vec!["post", "posts", "replies", "replyto", "quotes"] { + for table in ["post", "posts", "replies", "replyto", "quotes"] { delete_record(db, table, &id).await?; } } diff --git a/src/database/repo_indexer/index_repo.rs 
b/src/database/repo_indexer/index_repo.rs index 17dd1be..ccb0836 100644 --- a/src/database/repo_indexer/index_repo.rs +++ b/src/database/repo_indexer/index_repo.rs @@ -88,7 +88,7 @@ fn convert_repo_to_update( // Create references to the files and the did, so we can use them in the closure let files_ref = &files; - let did_key = &crate::database::utils::did_to_key(&did)?; + let did_key = &crate::database::utils::did_to_key(did)?; let mut update = files_ref .iter() @@ -112,7 +112,7 @@ fn convert_repo_to_update( let collection = parts.next()?.to_string(); let rkey = RecordKey::new(parts.next()?.to_string()).ok()?; let update = create_big_update( - Did::new(did.clone().into()).unwrap(), + Did::new(did.clone()).unwrap(), did_key.clone(), collection, rkey, @@ -128,7 +128,7 @@ fn convert_repo_to_update( })?; // Add the timestamp of when we retrieved the repo to the update - update.add_timestamp(&did, retrieval_time); + update.add_timestamp(did, retrieval_time); Ok(update) } @@ -204,7 +204,7 @@ impl Stage for DownloadService { self.common.did ))?; Ok(DownloadRepo { - service: service, + service, common: self.common, }) } diff --git a/src/database/repo_indexer/pipeline.rs b/src/database/repo_indexer/pipeline.rs index b096b6e..6dc5f35 100644 --- a/src/database/repo_indexer/pipeline.rs +++ b/src/database/repo_indexer/pipeline.rs @@ -52,9 +52,7 @@ impl< type Next = O; const NAME: &'static str = "First"; const FIRST: bool = true; - fn run(self) -> impl Future> + Send + Sync + 'static { - async move { Ok((self.f)(self.a)) } - } + async fn run(self) -> anyhow::Result { Ok((self.f)(self.a)) } } pub fn create_stage< @@ -67,14 +65,14 @@ pub fn create_stage< let next_stage_fn = next_stage::>(); let boxedfn = Arc::new(f); - return move |x| { + move |x| { let first_stage = FirstStage:: { a: x, b: PhantomData, f: boxedfn.clone(), }; - return (next_stage_fn)(first_stage); - }; + (next_stage_fn)(first_stage) + } } pub fn next_stage( @@ -116,7 +114,7 @@ where .with_unit("tasks") .build() }); - return |x: FROM| { + |x: FROM| { async move { tokio::task::spawn(async move { // Move from queued to active @@ -233,11 +231,11 @@ where duration.as_millis() as f64 / 1000.0 ); - return Some(result); + Some(result) }) .await .expect("Failed to spawn task in a pump stage. This is a hard error and means that something is wrong with your system. Maybe go buy a bigger machine or something?") } .boxed() - }; + } } diff --git a/src/database/repo_indexer/repo_stream.rs b/src/database/repo_indexer/repo_stream.rs index ab86b9d..a2f5ba1 100644 --- a/src/database/repo_indexer/repo_stream.rs +++ b/src/database/repo_indexer/repo_stream.rs @@ -29,12 +29,12 @@ struct LatestBackfill { impl RepoStream { pub fn new(db: Surreal) -> Self { - return Self { + Self { buffer: VecDeque::new(), processed_dids: HashSet::new(), db, db_future: None, - }; + } } } diff --git a/src/database/utils.rs b/src/database/utils.rs index d0c4668..861da59 100644 --- a/src/database/utils.rs +++ b/src/database/utils.rs @@ -101,7 +101,7 @@ pub fn unsafe_user_key_to_did(key: &str) -> String { /// Converts a strong ref to a record ID pub fn strong_ref_to_record_id(sr: &Main) -> Result { - Ok(at_uri_to_record_id(&sr.uri).context("Unable to convert strong ref to record id")?) 
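+    // clippy's needless_question_mark: Ok(expr?) is redundant here, so the
+    // Result from context() is returned directly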
+ at_uri_to_record_id(&sr.uri).context("Unable to convert strong ref to record id") } /// Converts an AT URI to a record ID diff --git a/src/main.rs b/src/main.rs index 10d48ef..b3c8709 100644 --- a/src/main.rs +++ b/src/main.rs @@ -68,14 +68,14 @@ async fn application_main() -> anyhow::Result<()> { let _otel_guard = init_observability().await; // Connect to the database - let db = database::connect(&ARGS.db.first().unwrap()) + let db = database::connect(ARGS.db.first().unwrap()) .await .context("Failed to connect to the database")?; // Create tasks let metrics_task = export_system_metrics().boxed(); - let jetstream_task = attach_jetstream((&db).to_owned(), ARGS.certificate.clone()).boxed(); - let indexer_task = start_full_repo_indexer((&db).to_owned()).boxed_local(); + let jetstream_task = attach_jetstream(db.to_owned(), ARGS.certificate.clone()).boxed(); + let indexer_task = start_full_repo_indexer(db.to_owned()).boxed_local(); // Add all tasks to a list let mut tasks: FuturesUnordered>>>> = diff --git a/src/observability/otel_providers.rs b/src/observability/otel_providers.rs index 4ce948e..79c8470 100644 --- a/src/observability/otel_providers.rs +++ b/src/observability/otel_providers.rs @@ -175,7 +175,7 @@ impl OtelProviders { /// Does nothing if already shutdown pub fn shutdown(&self) { let shutdown = self.shutdown.lock(); - if shutdown.as_ref().map_or(false, |shutdown| **shutdown) { + if shutdown.as_ref().is_ok_and(|shutdown| **shutdown) { // Already shutdown return; } @@ -272,7 +272,7 @@ impl OtelProviders { if let Some(metrics_layer) = self.otel_metrics_layer() { layers.push(Box::new(metrics_layer)); } - return layers; + layers } } diff --git a/src/websocket/conn.rs b/src/websocket/conn.rs index 9d6c4b4..ab99af7 100644 --- a/src/websocket/conn.rs +++ b/src/websocket/conn.rs @@ -54,7 +54,7 @@ pub async fn connect_tls( let uri = format!( "wss://{}/subscribe?maxMessageSizeBytes=1048576{}", host, - cursor.map_or_else(|| String::new(), |c| format!("&cursor={}", c)) + cursor.map_or_else(String::new, |c| format!("&cursor={}", c)) ); info!(target: "indexer", "Connecting to {}", uri); diff --git a/src/websocket/events.rs b/src/websocket/events.rs index 318aa5c..6482f0c 100644 --- a/src/websocket/events.rs +++ b/src/websocket/events.rs @@ -69,5 +69,5 @@ pub enum Kind { /// Parse an event from a string pub fn parse_event(mut msg: String) -> anyhow::Result { - Ok(unsafe { simd_json::from_str(msg.as_mut_str()) }.context("Failed to parse event")?) 
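+    // the Ok(...?) wrapper was redundant; return the Result from context() directly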
+ unsafe { simd_json::from_str(msg.as_mut_str()) }.context("Failed to parse event") } diff --git a/src/websocket/mod.rs b/src/websocket/mod.rs index 61f810b..7427e6e 100644 --- a/src/websocket/mod.rs +++ b/src/websocket/mod.rs @@ -81,7 +81,7 @@ pub async fn start( loop { // get current cursor let cursor = { - let c = (&state).cursor.load(Ordering::Relaxed) as u64; + let c = state.cursor.load(Ordering::Relaxed); if c == 0 { None } else { @@ -109,7 +109,7 @@ pub async fn start( // rewind cursor by 10 seconds { const REWIND_TIME: u64 = 10_000_000; // 10 seconds in microseconds - let cursor = (&state).cursor.fetch_sub(REWIND_TIME, Ordering::Relaxed); + let cursor = state.cursor.fetch_sub(REWIND_TIME, Ordering::Relaxed); info!(target: "indexer", "Rewinding cursor by 10 seconds: {} -> {}", cursor, cursor - REWIND_TIME); } @@ -154,7 +154,7 @@ async fn manage_ws( let text = String::from_utf8(msg.payload.to_vec()) .context("Failed to decode text message")?; - let res = handler::handle_message(&state, text, update_cursor).await; + let res = handler::handle_message(state, text, update_cursor).await; if res.is_err() { warn!("error while handling {}", res.unwrap_err()); From f256aa5f59c23ffc101e7487987a8f3d88dfab9c Mon Sep 17 00:00:00 2001 From: Zebreus Date: Wed, 5 Mar 2025 21:54:52 +0100 Subject: [PATCH 60/75] Fix remaining clippy issues --- src/config.rs | 2 +- src/database/handlers.rs | 6 +++--- src/database/repo_indexer/index_repo.rs | 4 ++-- src/database/repo_indexer/pipeline.rs | 8 +++++--- src/database/repo_indexer/repo_stream.rs | 9 +++------ src/database/utils.rs | 14 +++++++------- src/main.rs | 13 +++++-------- src/observability/otel_providers.rs | 5 +---- src/websocket/events.rs | 6 +++--- src/websocket/handler.rs | 6 +++--- 10 files changed, 33 insertions(+), 40 deletions(-) diff --git a/src/config.rs b/src/config.rs index 45ba5d3..43e916a 100644 --- a/src/config.rs +++ b/src/config.rs @@ -75,7 +75,7 @@ pub struct Args { pub repo_stream_buffer_size: usize, } -pub const ARGS: LazyLock = LazyLock::new(Args::parse); +pub static ARGS: LazyLock = LazyLock::new(Args::parse); // impl Args { // /// Dump configuration to log diff --git a/src/database/handlers.rs b/src/database/handlers.rs index 0dd5a60..6806026 100644 --- a/src/database/handlers.rs +++ b/src/database/handlers.rs @@ -13,7 +13,7 @@ use tracing::warn; pub async fn handle_event(db: &Surreal, event: Kind) -> Result<()> { // Handle event types match event { - Kind::CommitEvent { + Kind::Commit { did, time_us, commit, @@ -39,7 +39,7 @@ pub async fn handle_event(db: &Surreal, event: Kind) -> Result<()> { } } } - Kind::IdentityEvent { + Kind::Identity { did, time_us, identity, @@ -55,7 +55,7 @@ pub async fn handle_event(db: &Surreal, event: Kind) -> Result<()> { }) .await?; } - Kind::KeyEvent { + Kind::Key { did, time_us, account, diff --git a/src/database/repo_indexer/index_repo.rs b/src/database/repo_indexer/index_repo.rs index ccb0836..84874a6 100644 --- a/src/database/repo_indexer/index_repo.rs +++ b/src/database/repo_indexer/index_repo.rs @@ -71,7 +71,7 @@ pub struct NodeData { #[instrument(skip_all)] fn convert_repo_to_update( repo: Vec, - did: &String, + did: &str, retrieval_time: surrealdb::sql::Datetime, ) -> anyhow::Result { // Deserialize CAR file @@ -112,7 +112,7 @@ fn convert_repo_to_update( let collection = parts.next()?.to_string(); let rkey = RecordKey::new(parts.next()?.to_string()).ok()?; let update = create_big_update( - Did::new(did.clone()).unwrap(), + Did::new(did.to_string()).unwrap(), did_key.clone(), collection, 
rkey, diff --git a/src/database/repo_indexer/pipeline.rs b/src/database/repo_indexer/pipeline.rs index 6dc5f35..bbc0d49 100644 --- a/src/database/repo_indexer/pipeline.rs +++ b/src/database/repo_indexer/pipeline.rs @@ -52,7 +52,9 @@ impl< type Next = O; const NAME: &'static str = "First"; const FIRST: bool = true; - async fn run(self) -> anyhow::Result { Ok((self.f)(self.a)) } + async fn run(self) -> anyhow::Result { + Ok((self.f)(self.a)) + } } pub fn create_stage< @@ -75,10 +77,10 @@ pub fn create_stage< } } -pub fn next_stage( +pub fn next_stage( ) -> impl Fn(FROM) -> Pin> + Send + 'static>> where - FROM: Send + Sync + 'static + Stage, + FROM: Stage + Send + Sync + 'static, FROM::Next: Send + Sync + 'static, { static TRACKER: LazyLock> = LazyLock::new(|| { diff --git a/src/database/repo_indexer/repo_stream.rs b/src/database/repo_indexer/repo_stream.rs index a2f5ba1..c7049b9 100644 --- a/src/database/repo_indexer/repo_stream.rs +++ b/src/database/repo_indexer/repo_stream.rs @@ -4,20 +4,17 @@ use serde::Deserialize; use std::{ collections::{HashSet, VecDeque}, future::{Future, IntoFuture}, + pin::Pin, task::Poll, }; -use surrealdb::{engine::any::Any, Surreal}; +use surrealdb::{engine::any::Any, Response, Surreal}; use tracing::{error, trace}; pub struct RepoStream { buffer: VecDeque, processed_dids: HashSet, db: Surreal, - db_future: Option< - std::pin::Pin< - Box> + Send>, - >, - >, + db_future: Option> + Send>>>, } #[allow(dead_code)] diff --git a/src/database/utils.rs b/src/database/utils.rs index 861da59..f75f2ed 100644 --- a/src/database/utils.rs +++ b/src/database/utils.rs @@ -72,10 +72,10 @@ pub fn did_to_key(did: &str) -> Result { /// Converts a DID to a (full) key pub fn did_to_key_impl(did: &str, full: bool) -> Result { // did:plc covers 99.99% of all DIDs - let val = if did.starts_with("did:plc:") { - format!("plc_{}", &did[8..]) - } else if did.starts_with("did:web:") { - format!("web_{}", &did[8..].replace('.', "_").replace('-', "__")) + let val = if let Some(id) = did.strip_prefix("did:plc:") { + format!("plc_{}", id) + } else if let Some(id) = did.strip_prefix("did:web:") { + format!("web_{}", &id.replace('.', "_").replace('-', "__")) } else { anyhow::bail!("Invalid DID {}", did); }; @@ -85,10 +85,10 @@ pub fn did_to_key_impl(did: &str, full: bool) -> Result { } if full { - Ok(format!("did:{}", val)) - } else { - Ok(val) + return Ok(format!("did:{}", val)); } + + Ok(val) } pub fn unsafe_user_key_to_did(key: &str) -> String { diff --git a/src/main.rs b/src/main.rs index b3c8709..eb5a972 100644 --- a/src/main.rs +++ b/src/main.rs @@ -6,8 +6,6 @@ use jetstream_consumer::attach_jetstream; use metrics_reporter::export_system_metrics; use observability::init_observability; use std::{ - future::Future, - pin::Pin, process::exit, sync::atomic::{AtomicUsize, Ordering}, time::Duration, @@ -78,15 +76,14 @@ async fn application_main() -> anyhow::Result<()> { let indexer_task = start_full_repo_indexer(db.to_owned()).boxed_local(); // Add all tasks to a list - let mut tasks: FuturesUnordered>>>> = - FuturesUnordered::new(); - tasks.push(metrics_task); - if ARGS.jetstream.unwrap_or(true) { - tasks.push(jetstream_task); - } + let mut tasks: FuturesUnordered<_> = FuturesUnordered::new(); if ARGS.backfill.unwrap_or(true) { tasks.push(indexer_task); } + if ARGS.jetstream.unwrap_or(true) { + tasks.push(jetstream_task); + } + tasks.push(metrics_task); // Wait for the first task to exit let first_exited_task = tasks.next().await; diff --git a/src/observability/otel_providers.rs 
b/src/observability/otel_providers.rs index 79c8470..a9928ae 100644 --- a/src/observability/otel_providers.rs +++ b/src/observability/otel_providers.rs @@ -26,13 +26,10 @@ use std::sync::{ use tracing::Subscriber; use tracing_subscriber::{registry::LookupSpan, EnvFilter, Layer}; -const RESOURCE: LazyLock = LazyLock::new(|| { - // let instance_id = Uuid::new_v4(); - +static RESOURCE: LazyLock = LazyLock::new(|| { let mut attributes = vec![ KeyValue::new(SERVICE_NAME, env!("CARGO_PKG_NAME")), KeyValue::new(SERVICE_VERSION, env!("CARGO_PKG_VERSION")), - // KeyValue::new(SERVICE_INSTANCE_ID, instance_id.to_string()), KeyValue::new(DEPLOYMENT_ENVIRONMENT_NAME, "develop"), ]; diff --git a/src/websocket/events.rs b/src/websocket/events.rs index 6482f0c..d112da2 100644 --- a/src/websocket/events.rs +++ b/src/websocket/events.rs @@ -48,19 +48,19 @@ pub struct Account { #[serde(tag = "kind")] pub enum Kind { #[serde(rename = "commit")] - CommitEvent { + Commit { did: Did, time_us: u64, commit: Commit, }, #[serde(rename = "identity")] - IdentityEvent { + Identity { did: Did, time_us: u64, identity: Identity, }, #[serde(rename = "account")] - KeyEvent { + Key { did: Did, time_us: u64, account: Account, diff --git a/src/websocket/handler.rs b/src/websocket/handler.rs index 046875f..2f33697 100644 --- a/src/websocket/handler.rs +++ b/src/websocket/handler.rs @@ -15,9 +15,9 @@ pub async fn handle_message( // update cursor let time = match &event { - events::Kind::CommitEvent { time_us, .. } => *time_us, - events::Kind::IdentityEvent { time_us, .. } => *time_us, - events::Kind::KeyEvent { time_us, .. } => *time_us, + events::Kind::Commit { time_us, .. } => *time_us, + events::Kind::Identity { time_us, .. } => *time_us, + events::Kind::Key { time_us, .. } => *time_us, }; state.update_cursor(time); if update_cursor { From 6161033a52a1afccb66a8dc3527669f4e96c8523 Mon Sep 17 00:00:00 2001 From: Zebreus Date: Wed, 5 Mar 2025 21:56:12 +0100 Subject: [PATCH 61/75] Delete some commented out code --- src/config.rs | 39 --------------------------------------- 1 file changed, 39 deletions(-) diff --git a/src/config.rs b/src/config.rs index 43e916a..cc087e6 100644 --- a/src/config.rs +++ b/src/config.rs @@ -76,42 +76,3 @@ pub struct Args { } pub static ARGS: LazyLock = LazyLock::new(Args::parse); - -// impl Args { -// /// Dump configuration to log -// pub fn dump(self: &Self) { -// // dump configuration -// info!("{}", "Configuration:".bold().underline().blue()); -// info!("{}: {}", "Certificate".cyan(), self.certificate.green()); -// info!( -// "{}: {}", -// "Threads".cyan(), -// self.threads.map_or_else( -// || "Not set, using CPU count".yellow(), -// |v| v.to_string().green() -// ) -// ); -// info!( -// "{}: {}", -// "Max tasks".cyan(), -// self.max_tasks.map_or_else( -// || "Not set, using CPU count times 32".yellow(), -// |v| v.to_string().green() -// ) -// ); -// info!( -// "{}: {}", -// "Verbosity Level".cyan(), -// self.log_level().to_string().green() -// ); -// } - -// /// Verbosity to log level -// pub fn log_level(self: &Self) -> LevelFilter { -// match self.verbosity { -// 0 => LevelFilter::INFO, -// 1 => LevelFilter::DEBUG, -// _ => LevelFilter::TRACE, -// } -// } -// } From e9d9e99f8bc94e5e6b93fd26da2539e4ea3f66d8 Mon Sep 17 00:00:00 2001 From: Zebreus Date: Wed, 5 Mar 2025 22:16:32 +0100 Subject: [PATCH 62/75] Improve cli options --- src/config.rs | 47 +++++++++++-------------- src/database/repo_indexer/index_repo.rs | 2 +- src/jetstream_consumer.rs | 18 +++++++--- src/main.rs | 5 ++- 
src/observability.rs | 2 +- src/observability/otel_providers.rs | 6 ++-- 6 files changed, 41 insertions(+), 39 deletions(-) diff --git a/src/config.rs b/src/config.rs index cc087e6..9700e1b 100644 --- a/src/config.rs +++ b/src/config.rs @@ -1,4 +1,4 @@ -use clap::{ArgAction, Parser}; +use clap::Parser; use std::sync::LazyLock; /// Command line arguments @@ -11,10 +11,8 @@ pub struct Args { /// Set the tokio threadpool size. The default value is the number of cores available to the system. #[arg(long)] pub threads: Option, - /// Override parallel task count for full repo index operations - #[arg(long)] - pub max_tasks: Option, /// Endpoint of the database server (including port and protocol) + /// You can specify multiple surrealdbs by repeating this argument, but they should all point to the same underlying datastore #[arg(short = 'D', long, num_args=1..=16)] pub db: Vec, /// Username for the database server @@ -23,33 +21,30 @@ pub struct Args { /// Password for the database server #[arg(short, long, default_value = "root")] pub password: String, - /// Debug verbosity level - #[arg(short, action = ArgAction::Count)] - pub verbosity: u8, /// Enable backfilling of old repos - #[arg(long, default_value = "true", default_missing_value = "true", num_args=0..=1)] - pub backfill: Option, + #[arg(long, default_value = "false", default_missing_value = "true", num_args=0..=1)] + pub no_backfill: bool, /// Enable attaching to the jetstream for realtime updates - #[arg(long, default_value = "true", default_missing_value = "true", num_args=0..=1)] - pub jetstream: Option, + #[arg(long, default_value = "false", default_missing_value = "true", num_args=0..=1)] + pub no_jetstream: bool, /// Capacity of the surrealdb connection. 0 means unbounded #[arg(long, default_value = "0")] pub surrealdb_capacity: usize, /// Enable tokio console support #[arg(long, default_value = "false", default_missing_value = "true", num_args=0..=1)] - pub console: Option, + pub tokio_console: bool, /// Enable opentelemetry tracing support - #[arg(long, default_value = "true", default_missing_value = "true", num_args=0..=1)] - pub otel_tracing: Option, - /// Enable opentelemetry metrics support - #[arg(long, default_value = "true", default_missing_value = "true", num_args=0..=1)] - pub otel_metrics: Option, - /// Enable opentelemetry - #[arg(long, default_value = "true", default_missing_value = "true", num_args=0..=1)] - pub otel_logs: Option, + #[arg(long, default_value = "false", default_missing_value = "true", num_args=0..=1)] + pub otel_tracing: bool, + /// Disable opentelemetry metrics support + #[arg(long, default_value = "false", default_missing_value = "true", num_args=0..=1)] + pub no_otel_metrics: bool, + /// Disable opentelemetry logging support + #[arg(long, default_value = "false", default_missing_value = "true", num_args=0..=1)] + pub no_otel_logs: bool, /// Dont write to the database when backfilling #[arg(long, default_value = "false", default_missing_value = "true", num_args=0..=1)] - pub dont_write_when_backfilling: Option, + pub no_write_when_backfilling: bool, /// Size of the buffer between each pipeline stage in elements #[arg(long, default_value = "200")] pub pipeline_buffer_size: usize, @@ -57,21 +52,21 @@ pub struct Args { #[arg(long, default_value = "50")] pub pipeline_concurrent_elements: usize, /// Multiply the number of concurrent download repo tasks by this factor - #[arg(long, default_value = "4")] + #[arg(long, default_value = "8")] pub pipeline_download_concurrency_multiplier: usize, /// Timeout 
for a pipeline stage in seconds. No pipeline stage should take longer than this - #[arg(long, default_value = "350")] + #[arg(long, default_value = "1100")] pub pipeline_stage_timeout: u64, /// Timeout for the repo downloading pipeline stage in seconds. /// If this is longer than the pipeline_stage_timeout, the pipeline_stage_timeout will be used - #[arg(long, default_value = "300")] + #[arg(long, default_value = "1000")] pub repo_download_timeout: u64, /// Timeout for downloading information from the directory in seconds. /// If this is longer than the pipeline_stage_timeout, the pipeline_stage_timeout will be used - #[arg(long, default_value = "60")] + #[arg(long, default_value = "200")] pub directory_download_timeout: u64, /// Number of DIDs the RepoStream should prefetch - #[arg(long, default_value = "10000")] + #[arg(long, default_value = "5000")] pub repo_stream_buffer_size: usize, } diff --git a/src/database/repo_indexer/index_repo.rs b/src/database/repo_indexer/index_repo.rs index 84874a6..c1b1de8 100644 --- a/src/database/repo_indexer/index_repo.rs +++ b/src/database/repo_indexer/index_repo.rs @@ -265,7 +265,7 @@ impl Stage for ApplyUpdates { #[instrument(skip(self), fields(did = self.common.did), parent = self.common.span.clone())] async fn run(self) -> anyhow::Result { - if !ARGS.dont_write_when_backfilling.unwrap_or(false) { + if !ARGS.no_write_when_backfilling { self.update.apply(&self.common.db, "backfill").await?; } else { warn!("Skipping writing to the database and sleeping instead"); diff --git a/src/jetstream_consumer.rs b/src/jetstream_consumer.rs index d430a9d..eb451cd 100644 --- a/src/jetstream_consumer.rs +++ b/src/jetstream_consumer.rs @@ -1,8 +1,8 @@ +use crate::{database, websocket}; use anyhow::Context; use futures::{stream::FuturesUnordered, StreamExt}; use surrealdb::{engine::any::Any, Surreal}; - -use crate::{database, websocket}; +use tracing::error; const JETSTREAM_HOSTS: [&str; 5] = [ "jetstream1.us-west.bsky.network", @@ -15,17 +15,25 @@ const JETSTREAM_HOSTS: [&str; 5] = [ pub async fn attach_jetstream(db: Surreal, certificate: String) -> anyhow::Result<()> { let mut jetstream_tasks = JETSTREAM_HOSTS .iter() - .map(|host| start_jetstream_consumer(db.clone(), host.to_string(), certificate.clone())) + .map(|host| { + tokio::task::spawn(start_jetstream_consumer( + db.clone(), + host.to_string(), + certificate.clone(), + )) + }) .collect::>(); loop { let result = jetstream_tasks.next().await; - let Some(result) = result else { + let Some(Ok(Ok(_))) = result else { + error!("Jetstream consumer task failed"); break; }; - result?; } + error!("All jetstream consumer task failed"); + Ok(()) } diff --git a/src/main.rs b/src/main.rs index eb5a972..d5c8619 100644 --- a/src/main.rs +++ b/src/main.rs @@ -31,7 +31,6 @@ fn main() { let mut rt_builder = Builder::new_multi_thread(); rt_builder .enable_all() - .worker_threads(32) .max_blocking_threads(512 * 512) .enable_time() .enable_io() @@ -77,10 +76,10 @@ async fn application_main() -> anyhow::Result<()> { // Add all tasks to a list let mut tasks: FuturesUnordered<_> = FuturesUnordered::new(); - if ARGS.backfill.unwrap_or(true) { + if !ARGS.no_backfill { tasks.push(indexer_task); } - if ARGS.jetstream.unwrap_or(true) { + if !ARGS.no_jetstream { tasks.push(jetstream_task); } tasks.push(metrics_task); diff --git a/src/observability.rs b/src/observability.rs index c0a5e9a..5b2c8b0 100644 --- a/src/observability.rs +++ b/src/observability.rs @@ -15,7 +15,7 @@ pub fn tokio_console_layer() -> Option> where S: Subscriber + 
for<'span> LookupSpan<'span>, { - if !ARGS.console.unwrap_or(false) { + if !ARGS.tokio_console { return None; } Some(ConsoleLayer::builder().with_default_env().spawn()) diff --git a/src/observability/otel_providers.rs b/src/observability/otel_providers.rs index a9928ae..0c4a92d 100644 --- a/src/observability/otel_providers.rs +++ b/src/observability/otel_providers.rs @@ -80,7 +80,7 @@ static RESOURCE: LazyLock = LazyLock::new(|| { }); fn init_logger() -> Option { - if !ARGS.otel_logs.unwrap_or(true) { + if ARGS.no_otel_logs { return None; } let otlp_log_exporter = LogExporter::builder().with_tonic().build().unwrap(); @@ -93,7 +93,7 @@ fn init_logger() -> Option { } fn init_meter() -> Option { - if !ARGS.otel_metrics.unwrap_or(true) { + if ARGS.no_otel_metrics { return None; } let otlp_metric_exporter = MetricExporter::builder() @@ -115,7 +115,7 @@ fn init_meter() -> Option { } fn init_tracer() -> Option { - if !ARGS.otel_tracing.unwrap_or(true) { + if !ARGS.otel_tracing { return None; } global::set_text_map_propagator(TraceContextPropagator::new()); From 9c0f7a3b167a58033347a1281258a133dc9a5a5a Mon Sep 17 00:00:00 2001 From: Zebreus Date: Thu, 6 Mar 2025 13:17:00 +0100 Subject: [PATCH 63/75] Add retry mechanism for repo download --- src/config.rs | 5 +- src/database/repo_indexer/index_repo.rs | 76 +++++++++++++++++++++---- 2 files changed, 68 insertions(+), 13 deletions(-) diff --git a/src/config.rs b/src/config.rs index 9700e1b..12716a1 100644 --- a/src/config.rs +++ b/src/config.rs @@ -60,7 +60,10 @@ pub struct Args { /// Timeout for the repo downloading pipeline stage in seconds. /// If this is longer than the pipeline_stage_timeout, the pipeline_stage_timeout will be used #[arg(long, default_value = "1000")] - pub repo_download_timeout: u64, + pub download_repo_timeout: u64, + /// The maximum number of times to attempt to download a repo before giving up + #[arg(long, default_value = "5")] + pub download_repo_attempts: u64, /// Timeout for downloading information from the directory in seconds. 
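    // Editorial note, not part of the original commit: per these doc comments, each
    // stage-specific timeout is effectively clamped to the stage timeout, roughly
    //   effective = u64::min(ARGS.directory_download_timeout, ARGS.pipeline_stage_timeout)
    // so raising only the specific value has no effect once it exceeds the stage timeout.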
/// If this is longer than the pipeline_stage_timeout, the pipeline_stage_timeout will be used #[arg(long, default_value = "200")] diff --git a/src/database/repo_indexer/index_repo.rs b/src/database/repo_indexer/index_repo.rs index c1b1de8..9a268cd 100644 --- a/src/database/repo_indexer/index_repo.rs +++ b/src/database/repo_indexer/index_repo.rs @@ -11,10 +11,11 @@ use atrium_api::{ types::string::{Did, RecordKey}, }; use ipld_core::cid::Cid; +use opentelemetry::{global, metrics::Counter}; use reqwest::Client; use serde::Deserialize; use serde_ipld_dagcbor::from_reader; -use std::{collections::HashMap, time::Duration}; +use std::{collections::HashMap, sync::LazyLock, time::Duration}; use surrealdb::{engine::any::Any, Surreal}; use tokio::task::spawn_blocking; use tracing::{instrument, span, trace, warn, Level, Span}; @@ -210,6 +211,30 @@ impl Stage for DownloadService { } } +static DOWNLOAD_REPO_RETRIES: LazyLock> = LazyLock::new(|| { + global::meter("indexer") + .u64_counter("indexer.pipeline.download_repo_retries") + .with_unit("{retry}") + .with_description("Number of retries for downloading a repo") + .build() +}); + +async fn attempt_download( + client: &Client, + url: &str, + timeout: Duration, +) -> anyhow::Result> { + let get_repo_response = client.get(url).timeout(timeout).send().await?; + if !get_repo_response.status().is_success() { + return Err(anyhow::anyhow!("Statuscode {}", get_repo_response.status())); + } + let repo: Vec = get_repo_response.bytes().await?.into(); + if repo.is_empty() { + return Err(anyhow::anyhow!("Downloaded repo is empty")); + } + Ok(repo) +} + impl Stage for DownloadRepo { type Next = ProcessRepo; const NAME: &str = "download_repo"; @@ -217,17 +242,44 @@ impl Stage for DownloadRepo { #[instrument(skip(self), fields(did = self.common.did), parent = self.common.span.clone())] async fn run(self) -> anyhow::Result { let retrival_time = surrealdb::sql::Datetime::from(chrono::Utc::now()); - let get_repo_response = self - .common - .http_client - .get(format!( - "{}/xrpc/com.atproto.sync.getRepo?did={}", - self.service.service_endpoint, self.common.did, - )) - .timeout(tokio::time::Duration::from_secs(ARGS.repo_download_timeout)) - .send() - .await?; - let repo: Vec = get_repo_response.bytes().await?.into(); + + // Download the repo + let mut attempts_left = ARGS.download_repo_attempts; + let repo = loop { + let get_repo_response = attempt_download( + &self.common.http_client, + &format!( + "{}/xrpc/com.atproto.sync.getRepo?did={}", + self.service.service_endpoint, self.common.did, + ), + Duration::from_secs(ARGS.download_repo_timeout), + ) + .await; + + let error = match get_repo_response { + Ok(resp) => { + break Ok(resp); + } + Err(error) => error, + }; + + attempts_left -= 1; + trace!( + "Failed to download repo {} with error: {}, Retrying {} more times", + self.common.did, + error, + attempts_left + ); + DOWNLOAD_REPO_RETRIES.add(1, &[]); + if attempts_left == 0 { + break Err(anyhow::anyhow!( + "Failed to download repo {} after {} attempts", + self.common.did, + ARGS.download_repo_attempts + )); + } + }?; + trace!( "Downloaded repo {} with size {:.2} MB", self.common.did, From 3b3489611acf709f7b1613b8a8dafd7f808dd801 Mon Sep 17 00:00:00 2001 From: Zebreus Date: Thu, 6 Mar 2025 13:17:13 +0100 Subject: [PATCH 64/75] Enable debug info in release builds --- Cargo.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/Cargo.toml b/Cargo.toml index e927459..d14b47b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -84,5 +84,6 @@ pumps = "0.0.4" [profile.release] lto 
= "thin" strip = false +debug = "full" opt-level = 3 incremental = true From b6d44c6e7a85df32882ddf644d075f8604a759bc Mon Sep 17 00:00:00 2001 From: Zebreus Date: Thu, 6 Mar 2025 18:01:02 +0100 Subject: [PATCH 65/75] Debug database performance --- src/config.rs | 9 + src/database/big_update.rs | 562 ++++++++++++++++++++++++++++++++---- src/database/definitions.rs | 18 +- 3 files changed, 520 insertions(+), 69 deletions(-) diff --git a/src/config.rs b/src/config.rs index 12716a1..d75f7a1 100644 --- a/src/config.rs +++ b/src/config.rs @@ -71,6 +71,15 @@ pub struct Args { /// Number of DIDs the RepoStream should prefetch #[arg(long, default_value = "5000")] pub repo_stream_buffer_size: usize, + /// Maximum number of concurrent database transactions + #[arg(long, default_value = "1")] + pub max_concurrent_transactions: u32, + /// Minimum number of concurrent database transactions + #[arg(long, default_value = "1")] + pub min_concurrent_transactions: u32, + /// Minimum number of rows per database transaction + #[arg(long, default_value = "1000")] + pub min_rows_per_transaction: usize, } pub static ARGS: LazyLock = LazyLock::new(Args::parse); diff --git a/src/database/big_update.rs b/src/database/big_update.rs index 061c972..3d802c1 100644 --- a/src/database/big_update.rs +++ b/src/database/big_update.rs @@ -1,3 +1,5 @@ +use crate::config::ARGS; + use super::{ definitions::{ BskyPost, BskyPostImage, BskyPostMediaAspectRatio, BskyPostVideo, BskyPostVideoBlob, @@ -16,18 +18,23 @@ use atrium_api::{ }, }; use chrono::Utc; -use opentelemetry::metrics::{Counter, Histogram}; +use futures::lock::Mutex; +use opentelemetry::metrics::{Counter, Gauge, Histogram}; use opentelemetry::{global, KeyValue}; -use serde::Serialize; +use serde::{de::IgnoredAny, Serialize}; use serde_with::skip_serializing_none; -use std::future::IntoFuture; -use std::sync::LazyLock; use std::time::Instant; +use std::{ + error, + sync::{atomic::Ordering, LazyLock}, +}; +use std::{future::IntoFuture, sync::atomic::AtomicU32}; use surrealdb::Datetime; use surrealdb::{engine::any::Any, RecordId, Surreal}; +use tokio::sync::{Semaphore, SemaphorePermit}; use tracing::{debug, instrument, span, trace, warn, Instrument, Level}; -#[derive(Serialize)] +#[derive(Debug, Serialize, Clone)] struct UpdateFollow { #[serde(rename = "in")] pub from: surrealdb::RecordId, @@ -38,7 +45,7 @@ struct UpdateFollow { pub created_at: surrealdb::Datetime, } -#[derive(Serialize)] +#[derive(Debug, Serialize, Clone)] struct UpdateLike { #[serde(rename = "in")] pub from: surrealdb::RecordId, @@ -49,7 +56,7 @@ struct UpdateLike { pub created_at: surrealdb::Datetime, } -#[derive(Serialize)] +#[derive(Debug, Serialize, Clone)] struct UpdateRepost { #[serde(rename = "in")] pub from: surrealdb::RecordId, @@ -60,7 +67,7 @@ struct UpdateRepost { pub created_at: surrealdb::Datetime, } -#[derive(Serialize)] +#[derive(Debug, Serialize, Clone)] struct UpdateBlock { #[serde(rename = "in")] pub from: surrealdb::RecordId, @@ -71,7 +78,7 @@ struct UpdateBlock { pub created_at: surrealdb::Datetime, } -#[derive(Serialize)] +#[derive(Debug, Serialize, Clone)] struct UpdateListBlock { #[serde(rename = "in")] pub from: surrealdb::RecordId, @@ -82,7 +89,7 @@ struct UpdateListBlock { pub created_at: surrealdb::Datetime, } -#[derive(Serialize)] +#[derive(Debug, Serialize, Clone)] struct UpdateListItem { #[serde(rename = "in")] pub from: surrealdb::RecordId, @@ -94,7 +101,7 @@ struct UpdateListItem { } #[skip_serializing_none] -#[derive(Serialize)] +#[derive(Debug, Serialize, Clone)] 
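// Editorial note, not part of the original commit: Clone (plus Debug) is derived on
// all of these Update* structs because `attempt_apply` below binds a `.clone()` of
// every Vec, which lets the same BigUpdate be re-submitted when SurrealDB answers
// "This transaction can be retried", and lets small updates be merged into
// SMALL_UPDATE_ACCUMULATOR before being flushed as one larger transaction.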
struct UpdateLatestBackfill { of: surrealdb::RecordId, id: String, @@ -102,7 +109,7 @@ struct UpdateLatestBackfill { } /// Database struct for a bluesky profile -#[derive(Debug, Serialize)] +#[derive(Debug, Serialize, Clone)] #[allow(dead_code)] pub struct UpdateDid { pub id: String, @@ -124,7 +131,7 @@ pub struct UpdateDid { pub extra_data: Option, } -#[derive(Serialize)] +#[derive(Debug, Serialize, Clone)] pub struct UpdateFeed { pub id: String, pub uri: String, @@ -141,7 +148,7 @@ pub struct UpdateFeed { pub extra_data: Option, } -#[derive(Debug, Serialize)] +#[derive(Debug, Serialize, Clone)] pub struct UpdateList { pub id: String, pub name: String, @@ -155,7 +162,7 @@ pub struct UpdateList { pub extra_data: Option, } -#[derive(Serialize)] +#[derive(Debug, Serialize, Clone)] struct UpdateQuote { #[serde(rename = "in")] pub from: surrealdb::RecordId, @@ -164,7 +171,7 @@ struct UpdateQuote { pub id: String, } -#[derive(Serialize)] +#[derive(Debug, Serialize, Clone)] struct UpdateRepliesRelation { #[serde(rename = "in")] pub from: surrealdb::RecordId, @@ -173,7 +180,7 @@ struct UpdateRepliesRelation { pub id: String, } -#[derive(Serialize)] +#[derive(Debug, Serialize, Clone)] struct UpdateReplyToRelation { #[serde(rename = "in")] pub from: surrealdb::RecordId, @@ -182,7 +189,7 @@ struct UpdateReplyToRelation { pub id: String, } -#[derive(Serialize)] +#[derive(Debug, Serialize, Clone)] struct UpdatePostsRelation { #[serde(rename = "in")] pub from: surrealdb::RecordId, @@ -191,7 +198,7 @@ struct UpdatePostsRelation { pub id: String, } -#[derive(Serialize)] +#[derive(Debug, Serialize, Clone)] struct WithId { id: String, #[serde(flatten)] @@ -213,7 +220,7 @@ static QUERY_DURATION_METRIC: LazyLock> = LazyLock::new(|| { static INSERTED_ROWS_METRIC: LazyLock> = LazyLock::new(|| { global::meter("indexer") .u64_counter("indexer.database.inserted_elements") - .with_unit("rows") + .with_unit("{row}") .with_description("Inserted or updated rows") .build() }); @@ -227,10 +234,45 @@ static INSERTED_SIZE_METRIC: LazyLock> = LazyLock::new(|| { static TRANSACTIONS_METRIC: LazyLock> = LazyLock::new(|| { global::meter("indexer") .u64_counter("indexer.database.transactions") - .with_unit("By") + .with_unit("{transaction}") .with_description("Number of transactions") .build() }); +static NEWLY_DISCOVERED_DIDS_METRIC: LazyLock> = LazyLock::new(|| { + global::meter("indexer") + .u64_counter("indexer.database.newly_discovered_dids") + .with_unit("{DID}") + .with_description("Number of newly discovered DIDs") + .build() +}); +static FAILED_BIG_UPDATES_METRIC: LazyLock> = LazyLock::new(|| { + global::meter("indexer") + .u64_counter("indexer.database.failed_big_updates") + .with_unit("{update}") + .with_description("Number of failed big updates. 
Should be always 0") + .build() +}); +static TRANSACTION_TICKETS_COST_METRIC: LazyLock> = LazyLock::new(|| { + global::meter("indexer") + .u64_gauge("indexer.database.transaction_cost") + .with_unit("{cost}") + .with_description("The current cost of holding a database transaction") + .build() +}); +static TRANSACTION_TICKETS_AVAILABLE_METRIC: LazyLock> = LazyLock::new(|| { + global::meter("indexer") + .u64_gauge("indexer.database.transaction_cost") + .with_unit("{cost}") + .with_description("The current cost of holding a database transaction") + .build() +}); +static COLLECTED_UPDATE_SIZE_METRIC: LazyLock> = LazyLock::new(|| { + global::meter("indexer") + .u64_gauge("indexer.database.collected_update_size") + .with_unit("{elements}") + .with_description("The current cost of holding a database transaction") + .build() +}); struct BigUpdateInfoRow { count: u64, @@ -368,7 +410,19 @@ impl core::fmt::Debug for BigUpdateInfo { } } -#[derive(Default)] +#[derive(Debug, Clone)] +enum UpdateState { + /// Update was applied + Applied, + /// Update was not applied, retry later + Retry, +} + +// Accumulates small updates until a big update is triggered +static SMALL_UPDATE_ACCUMULATOR: LazyLock> = + LazyLock::new(|| Mutex::new((0, BigUpdate::default()))); + +#[derive(Default, Clone)] pub struct BigUpdate { /// Insert into did did: Vec, @@ -429,19 +483,137 @@ impl BigUpdate { }); } + /// Acquire individual locks for each table + async fn acquire_locks(&self) -> Vec { + static PERMITS: LazyLock = LazyLock::new(|| 1); + static DID_SEMAPHORE: LazyLock = LazyLock::new(|| Semaphore::new(*PERMITS)); + static FOLLOWS_SEMAPHORE: LazyLock = LazyLock::new(|| Semaphore::new(*PERMITS)); + static LATEST_BACKFILLS_SEMAPHORE: LazyLock = + LazyLock::new(|| Semaphore::new(*PERMITS)); + static LIKES_SEMAPHORE: LazyLock = LazyLock::new(|| Semaphore::new(*PERMITS)); + static REPOSTS_SEMAPHORE: LazyLock = LazyLock::new(|| Semaphore::new(*PERMITS)); + static BLOCKS_SEMAPHORE: LazyLock = LazyLock::new(|| Semaphore::new(*PERMITS)); + static LISTBLOCKS_SEMAPHORE: LazyLock = + LazyLock::new(|| Semaphore::new(*PERMITS)); + static LISTITEMS_SEMAPHORE: LazyLock = + LazyLock::new(|| Semaphore::new(*PERMITS)); + static FEEDS_SEMAPHORE: LazyLock = LazyLock::new(|| Semaphore::new(*PERMITS)); + static LISTS_SEMAPHORE: LazyLock = LazyLock::new(|| Semaphore::new(*PERMITS)); + static THREADGATES_SEMAPHORE: LazyLock = + LazyLock::new(|| Semaphore::new(*PERMITS)); + static STARTERPACKS_SEMAPHORE: LazyLock = + LazyLock::new(|| Semaphore::new(*PERMITS)); + static POSTGATES_SEMAPHORE: LazyLock = + LazyLock::new(|| Semaphore::new(*PERMITS)); + static ACTORDECLARATIONS_SEMAPHORE: LazyLock = + LazyLock::new(|| Semaphore::new(*PERMITS)); + static LABELERSERVICES_SEMAPHORE: LazyLock = + LazyLock::new(|| Semaphore::new(*PERMITS)); + static QUOTES_SEMAPHORE: LazyLock = LazyLock::new(|| Semaphore::new(*PERMITS)); + static POSTS_SEMAPHORE: LazyLock = LazyLock::new(|| Semaphore::new(*PERMITS)); + static REPLIES_RELATIONS_SEMAPHORE: LazyLock = + LazyLock::new(|| Semaphore::new(*PERMITS)); + static REPLY_TO_RELATIONS_SEMAPHORE: LazyLock = + LazyLock::new(|| Semaphore::new(*PERMITS)); + static POSTS_RELATIONS_SEMAPHORE: LazyLock = + LazyLock::new(|| Semaphore::new(*PERMITS)); + static OVERWRITE_LATEST_BACKFILLS_SEMAPHORE: LazyLock = + LazyLock::new(|| Semaphore::new(*PERMITS)); + + let mut permits = Vec::new(); + + if !self.did.is_empty() { + permits.push(DID_SEMAPHORE.acquire().await.unwrap()); + } + if !self.follows.is_empty() { + 
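            // Editorial note, not part of the original commit: each table gets its own
            // single-permit semaphore (PERMITS == 1), so at most one transaction writes a
            // given table at a time, while updates touching disjoint tables still proceed
            // in parallel; the permits stay held until the returned Vec<SemaphorePermit>
            // is dropped. In this patch only the commented-out per-table variant of
            // `attempt_apply` calls acquire_locks; the live path throttles with the
            // global cost-based SEMAPHORE instead.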
permits.push(FOLLOWS_SEMAPHORE.acquire().await.unwrap()); + } + if !self.latest_backfills.is_empty() { + permits.push(LATEST_BACKFILLS_SEMAPHORE.acquire().await.unwrap()); + } + if !self.likes.is_empty() { + permits.push(LIKES_SEMAPHORE.acquire().await.unwrap()); + } + if !self.reposts.is_empty() { + permits.push(REPOSTS_SEMAPHORE.acquire().await.unwrap()); + } + if !self.blocks.is_empty() { + permits.push(BLOCKS_SEMAPHORE.acquire().await.unwrap()); + } + if !self.listblocks.is_empty() { + permits.push(LISTBLOCKS_SEMAPHORE.acquire().await.unwrap()); + } + if !self.listitems.is_empty() { + permits.push(LISTITEMS_SEMAPHORE.acquire().await.unwrap()); + } + if !self.feeds.is_empty() { + permits.push(FEEDS_SEMAPHORE.acquire().await.unwrap()); + } + if !self.lists.is_empty() { + permits.push(LISTS_SEMAPHORE.acquire().await.unwrap()); + } + if !self.threadgates.is_empty() { + permits.push(THREADGATES_SEMAPHORE.acquire().await.unwrap()); + } + if !self.starterpacks.is_empty() { + permits.push(STARTERPACKS_SEMAPHORE.acquire().await.unwrap()); + } + if !self.postgates.is_empty() { + permits.push(POSTGATES_SEMAPHORE.acquire().await.unwrap()); + } + if !self.actordeclarations.is_empty() { + permits.push(ACTORDECLARATIONS_SEMAPHORE.acquire().await.unwrap()); + } + if !self.labelerservices.is_empty() { + permits.push(LABELERSERVICES_SEMAPHORE.acquire().await.unwrap()); + } + if !self.quotes.is_empty() { + permits.push(QUOTES_SEMAPHORE.acquire().await.unwrap()); + } + if !self.posts.is_empty() { + permits.push(POSTS_SEMAPHORE.acquire().await.unwrap()); + } + if !self.replies_relations.is_empty() { + permits.push(REPLIES_RELATIONS_SEMAPHORE.acquire().await.unwrap()); + } + if !self.reply_to_relations.is_empty() { + permits.push(REPLY_TO_RELATIONS_SEMAPHORE.acquire().await.unwrap()); + } + if !self.posts_relations.is_empty() { + permits.push(POSTS_RELATIONS_SEMAPHORE.acquire().await.unwrap()); + } + if !self.overwrite_latest_backfills.is_empty() { + permits.push( + OVERWRITE_LATEST_BACKFILLS_SEMAPHORE + .acquire() + .await + .unwrap(), + ); + } + + permits + } + /// Apply this update to the database /// /// `source` is a string describing the source of the update, used for metrics - pub async fn apply(self, db: &Surreal, source: &str) -> Result<()> { + /// + /// Apply attempt with a convoluted mechanism to avoid congestion + async fn attempt_apply( + &mut self, + db: &Surreal, + source: &str, + info: &BigUpdateInfo, + ) -> Result { let start = Instant::now(); // Convert the update to a string for logging later - let info = tokio::task::block_in_place(|| self.create_info()); // Create the query string + // `RETURN VALUE none` is used to get empty return values for counting the number of inserted rows let query_string = r#" BEGIN; + INSERT IGNORE INTO latest_backfill $latest_backfills RETURN VALUE none; INSERT IGNORE INTO did $dids RETURN NONE; - INSERT IGNORE INTO latest_backfill $latest_backfills RETURN NONE; INSERT IGNORE INTO feed $feeds RETURN NONE; INSERT IGNORE INTO list $lists RETURN NONE; INSERT IGNORE INTO lex_app_bsky_feed_threadgate $threadgates RETURN NONE; @@ -449,7 +621,8 @@ impl BigUpdate { INSERT IGNORE INTO lex_app_bsky_feed_postgate $postgates RETURN NONE; INSERT IGNORE INTO lex_chat_bsky_actor_declaration $actordeclarations RETURN NONE; INSERT IGNORE INTO lex_app_bsky_labeler_service $labelerservices RETURN NONE; - INSERT IGNORE INTO posts $posts RETURN NONE; + INSERT IGNORE INTO post $posts RETURN NONE; + INSERT RELATION INTO posts $posts_relations RETURN NONE; INSERT RELATION INTO 
quotes $quotes RETURN NONE; INSERT RELATION INTO like $likes RETURN NONE; INSERT RELATION INTO repost $reposts RETURN NONE; @@ -460,45 +633,129 @@ impl BigUpdate { INSERT RELATION INTO quotes $quotes RETURN NONE; INSERT RELATION INTO replies $replies_relations RETURN NONE; INSERT RELATION INTO follow $follows RETURN NONE; - INSERT INTO latest_backfill $overwrite_latest_backfill RETURN NONE; + FOR $backfill in $overwrite_latest_backfill { + UPSERT type::thing("latest_backfill", $backfill.id) MERGE $backfill; + }; COMMIT; "#; // Create the update query. Does not take that long; ~50ms for 30000 rows let update = tokio::task::block_in_place(|| { db.query(query_string) - .bind(("dids", self.did)) - .bind(("follows", self.follows)) - .bind(("latest_backfills", self.latest_backfills)) - .bind(("likes", self.likes)) - .bind(("reposts", self.reposts)) - .bind(("blocks", self.blocks)) - .bind(("listblocks", self.listblocks)) - .bind(("listitems", self.listitems)) - .bind(("feeds", self.feeds)) - .bind(("lists", self.lists)) - .bind(("threadgates", self.threadgates)) - .bind(("starterpacks", self.starterpacks)) - .bind(("postgates", self.postgates)) - .bind(("actordeclarations", self.actordeclarations)) - .bind(("labelerservices", self.labelerservices)) - .bind(("quotes", self.quotes)) - .bind(("posts", self.posts)) - .bind(("replies_relations", self.replies_relations)) - .bind(("reply_to_relations", self.reply_to_relations)) - .bind(("posts_relations", self.posts_relations)) - .bind(("overwrite_latest_backfill", self.overwrite_latest_backfills)) + .bind(("dids", self.did.clone())) + .bind(("follows", self.follows.clone())) + .bind(("latest_backfills", self.latest_backfills.clone())) + .bind(("likes", self.likes.clone())) + .bind(("reposts", self.reposts.clone())) + .bind(("blocks", self.blocks.clone())) + .bind(("listblocks", self.listblocks.clone())) + .bind(("listitems", self.listitems.clone())) + .bind(("feeds", self.feeds.clone())) + .bind(("lists", self.lists.clone())) + .bind(("threadgates", self.threadgates.clone())) + .bind(("starterpacks", self.starterpacks.clone())) + .bind(("postgates", self.postgates.clone())) + .bind(("actordeclarations", self.actordeclarations.clone())) + .bind(("labelerservices", self.labelerservices.clone())) + .bind(("quotes", self.quotes.clone())) + .bind(("posts", self.posts.clone())) + .bind(("replies_relations", self.replies_relations.clone())) + .bind(("reply_to_relations", self.reply_to_relations.clone())) + .bind(("posts_relations", self.posts_relations.clone())) + .bind(( + "overwrite_latest_backfill", + self.overwrite_latest_backfills.clone(), + )) .into_future() .instrument(span!(Level::INFO, "query")) }); let preparation_duration = start.elapsed(); let after_update = Instant::now(); - update.await?; + + // Minimum cost for a transaction in permits + static MIN_COST: u32 = 20; + // Maximum cost for a transaction in permits + static MAX_COST: LazyLock = + LazyLock::new(|| MIN_COST * ARGS.max_concurrent_transactions); + // Semaphore for limiting the number of concurrent transactions by permits + static SEMAPHORE: LazyLock = LazyLock::new(|| { + Semaphore::new(*MAX_COST as usize * ARGS.min_concurrent_transactions as usize) + }); + // The current cost of a transaction in permits + static TRANSACTION_COST: AtomicU32 = AtomicU32::new(MIN_COST); + + let base_cost = TRANSACTION_COST.load(Ordering::Relaxed); + TRANSACTION_TICKETS_COST_METRIC.record(base_cost as u64, &[]); + TRANSACTION_TICKETS_AVAILABLE_METRIC.record(SEMAPHORE.available_permits() as u64, &[]); + // 
A multiplier for transactions that may cause congestion + let transaction_cost_multiplier = f64::log10(10.0 + info.all().count as f64).floor() as u32; + let transaction_cost = std::cmp::min(*MAX_COST, base_cost * transaction_cost_multiplier); + let mut result = { + let _permit = SEMAPHORE.acquire_many(transaction_cost).await.unwrap(); + update.await + }?; + let errors = result.take_errors(); + + // Return retry if the transaction can be retried + if errors.len() > 0 { + let can_be_retried = errors.iter().any(|(_, e)| { + if let surrealdb::Error::Api(surrealdb::error::Api::Query(message)) = e { + message.contains("This transaction can be retried") + } else { + false + } + }); + + if can_be_retried { + // Raise the cost for each retry + TRANSACTION_COST + .fetch_update(Ordering::SeqCst, Ordering::SeqCst, |x| { + Some(std::cmp::min(*MAX_COST, x * 2)) + }) + .unwrap(); + + warn!("Failed but can be retried"); + return Ok(UpdateState::Retry); + } + } + + // Lower the cost for each successful transaction + TRANSACTION_COST + .fetch_update(Ordering::SeqCst, Ordering::SeqCst, |x| { + Some(std::cmp::max(MIN_COST, x - 1)) + }) + .unwrap(); + warn!("Cost: {}", TRANSACTION_COST.load(Ordering::Relaxed)); + let update_duration = after_update.elapsed(); QUERY_DURATION_METRIC.record(update_duration.as_millis() as u64, &[]); + + // Return error if there are any errors + if errors.len() > 0 { + FAILED_BIG_UPDATES_METRIC.add(1, &[]); + + let mut sorted_errors = errors.into_iter().collect::>(); + sorted_errors.sort_by(|(a, _), (b, _)| a.cmp(b)); + for error in &sorted_errors { + warn!("Database error: {:?}", error); + } + let first_error = &sorted_errors.first().unwrap().1; + return Err(anyhow::anyhow!("Database error: {:?}", first_error)); + } + + // At this point, we know that the update was successful + + // Record metrics info.record_metrics(source); + // Record stats about newly discovered DIDs + let newly_discovered_dids = result.take::>(0).unwrap().len(); + // warn!("Newly discovered DIDs: {}", newly_discovered_dids); + if newly_discovered_dids > 0 { + NEWLY_DISCOVERED_DIDS_METRIC.add(newly_discovered_dids as u64, &[]); + } + trace!( "Applied updated: {} elements, {}MB, {:03}ms preparation, {:03}ms applying", info.all().count, @@ -508,6 +765,199 @@ impl BigUpdate { ); debug!("Detailed infos: {:?}", info); + Ok(UpdateState::Applied) + } + + // /// apply update with individual locks for each table + // async fn attempt_apply( + // &mut self, + // db: &Surreal, + // source: &str, + // info: &BigUpdateInfo, + // ) -> Result { + // let start = Instant::now(); + // // Convert the update to a string for logging later + + // // Create the query string + // // `RETURN VALUE none` is used to get empty return values for counting the number of inserted rows + // let query_string = r#" + // BEGIN; + // INSERT IGNORE INTO latest_backfill $latest_backfills RETURN VALUE none; + // INSERT IGNORE INTO did $dids RETURN NONE; + // INSERT IGNORE INTO feed $feeds RETURN NONE; + // INSERT IGNORE INTO list $lists RETURN NONE; + // INSERT IGNORE INTO lex_app_bsky_feed_threadgate $threadgates RETURN NONE; + // INSERT IGNORE INTO lex_app_bsky_graph_starterpack $starterpacks RETURN NONE; + // INSERT IGNORE INTO lex_app_bsky_feed_postgate $postgates RETURN NONE; + // INSERT IGNORE INTO lex_chat_bsky_actor_declaration $actordeclarations RETURN NONE; + // INSERT IGNORE INTO lex_app_bsky_labeler_service $labelerservices RETURN NONE; + // INSERT IGNORE INTO post $posts RETURN NONE; + // INSERT RELATION INTO posts $posts_relations 
RETURN NONE; + // INSERT RELATION INTO quotes $quotes RETURN NONE; + // INSERT RELATION INTO like $likes RETURN NONE; + // INSERT RELATION INTO repost $reposts RETURN NONE; + // INSERT RELATION INTO block $blocks RETURN NONE; + // INSERT RELATION INTO listblock $listblocks RETURN NONE; + // INSERT RELATION INTO listitem $listitems RETURN NONE; + // INSERT RELATION INTO replyto $reply_to_relations RETURN NONE; + // INSERT RELATION INTO quotes $quotes RETURN NONE; + // INSERT RELATION INTO replies $replies_relations RETURN NONE; + // INSERT RELATION INTO follow $follows RETURN NONE; + // FOR $backfill in $overwrite_latest_backfill { + // UPSERT type::thing("latest_backfill", $backfill.id) MERGE $backfill; + // }; + // COMMIT; + // "#; + + // // Create the update query. Does not take that long; ~50ms for 30000 rows + // let update = tokio::task::block_in_place(|| { + // db.query(query_string) + // .bind(("dids", self.did.clone())) + // .bind(("follows", self.follows.clone())) + // .bind(("latest_backfills", self.latest_backfills.clone())) + // .bind(("likes", self.likes.clone())) + // .bind(("reposts", self.reposts.clone())) + // .bind(("blocks", self.blocks.clone())) + // .bind(("listblocks", self.listblocks.clone())) + // .bind(("listitems", self.listitems.clone())) + // .bind(("feeds", self.feeds.clone())) + // .bind(("lists", self.lists.clone())) + // .bind(("threadgates", self.threadgates.clone())) + // .bind(("starterpacks", self.starterpacks.clone())) + // .bind(("postgates", self.postgates.clone())) + // .bind(("actordeclarations", self.actordeclarations.clone())) + // .bind(("labelerservices", self.labelerservices.clone())) + // .bind(("quotes", self.quotes.clone())) + // .bind(("posts", self.posts.clone())) + // .bind(("replies_relations", self.replies_relations.clone())) + // .bind(("reply_to_relations", self.reply_to_relations.clone())) + // .bind(("posts_relations", self.posts_relations.clone())) + // .bind(( + // "overwrite_latest_backfill", + // self.overwrite_latest_backfills.clone(), + // )) + // .into_future() + // .instrument(span!(Level::INFO, "query")) + // }); + + // let preparation_duration = start.elapsed(); + // let after_update = Instant::now(); + + // let mut result = { + // let _permit = self.acquire_locks().await; + // update.await + // }?; + // let errors = result.take_errors(); + + // // Return retry if the transaction can be retried + // if errors.len() > 0 { + // let can_be_retried = errors.iter().any(|(_, e)| { + // if let surrealdb::Error::Api(surrealdb::error::Api::Query(message)) = e { + // message.contains("This transaction can be retried") + // } else { + // false + // } + // }); + + // if can_be_retried { + // // Raise the cost for each retry + // panic!("Retry not implemented"); + + // warn!("Failed but can be retried"); + // return Ok(UpdateState::Retry); + // } + // } + + // let update_duration = after_update.elapsed(); + // QUERY_DURATION_METRIC.record(update_duration.as_millis() as u64, &[]); + + // // Return error if there are any errors + // if errors.len() > 0 { + // FAILED_BIG_UPDATES_METRIC.add(1, &[]); + + // let mut sorted_errors = errors.into_iter().collect::>(); + // sorted_errors.sort_by(|(a, _), (b, _)| a.cmp(b)); + // for error in &sorted_errors { + // warn!("Database error: {:?}", error); + // } + // let first_error = &sorted_errors.first().unwrap().1; + // return Err(anyhow::anyhow!("Database error: {:?}", first_error)); + // } + + // // At this point, we know that the update was successful + + // // Record metrics + // 
info.record_metrics(source); + + // // Record stats about newly discovered DIDs + // let newly_discovered_dids = result.take::>(0).unwrap().len(); + // // warn!("Newly discovered DIDs: {}", newly_discovered_dids); + // if newly_discovered_dids > 0 { + // NEWLY_DISCOVERED_DIDS_METRIC.add(newly_discovered_dids as u64, &[]); + // } + + // trace!( + // "Applied updated: {} elements, {}MB, {:03}ms preparation, {:03}ms applying", + // info.all().count, + // info.all().size as f64 / 1024.0 / 1024.0, + // preparation_duration.as_millis(), + // update_duration.as_millis(), + // ); + // debug!("Detailed infos: {:?}", info); + + // Ok(UpdateState::Applied) + // } + + /// Apply this update to the database + /// + /// `source` is a string describing the source of the update, used for metrics + pub async fn apply(self, db: &Surreal, source: &str) -> Result<()> { + // Bundle small updates + let (mut update, info) = { + let info = tokio::task::block_in_place(|| self.create_info()); + + let all = info.all(); + if all.count < ARGS.min_rows_per_transaction as u64 { + // Small update + let mut lock = SMALL_UPDATE_ACCUMULATOR.lock().await; + let (count, update) = &mut *lock; + *count += all.count as usize; + COLLECTED_UPDATE_SIZE_METRIC.record(*count as u64, &[]); + update.merge(self); + if *count < ARGS.min_rows_per_transaction as usize { + return Ok(()); + } + let update = std::mem::replace(update, BigUpdate::default()); + *count = 0; + drop(lock); + let info = tokio::task::block_in_place(|| update.create_info()); + + (update, info) + } else { + (self, info) + } + }; + + let mut attempts_left = 100; + loop { + let state = update.attempt_apply(db, source, &info).await?; + match state { + UpdateState::Applied => { + break; + } + UpdateState::Retry => { + warn!("Retrying update {} attempts left", attempts_left); + attempts_left -= 1; + if attempts_left == 0 { + return Err(anyhow::anyhow!("Too many retries")); + } + } + } + } + if attempts_left < 100 { + warn!("Update successful after {} retries", 100 - attempts_left); + } + Ok(()) } @@ -861,10 +1311,7 @@ pub fn create_big_update( avatar: None, // TODO implement created_at: utils::extract_dt(&d.created_at)?, description: d.description.clone(), - labels: d - .labels - .as_ref() - .and_then(utils::extract_self_labels_list), + labels: d.labels.as_ref().and_then(utils::extract_self_labels_list), purpose: d.purpose.clone(), extra_data: process_extra_data(&d.extra_data)?, }; @@ -888,9 +1335,7 @@ pub fn create_big_update( KnownRecord::ChatBskyActorDeclaration(d) => { let did_key = utils::did_to_key(did.as_str())?; let id = format!("{}_{}", rkey.as_str(), did_key); - big_update - .actordeclarations - .push(WithId { id, data: d }); + big_update.actordeclarations.push(WithId { id, data: d }); } KnownRecord::AppBskyLabelerService(d) => { let did_key = utils::did_to_key(did.as_str())?; @@ -1008,10 +1453,7 @@ pub fn create_big_update( bridgy_original_url: None, via: None, created_at: utils::extract_dt(&d.created_at)?, - labels: d - .labels - .as_ref() - .and_then(utils::extract_self_labels_post), + labels: d.labels.as_ref().and_then(utils::extract_self_labels_post), text: d.text.clone(), langs: d .langs diff --git a/src/database/definitions.rs b/src/database/definitions.rs index 16547f8..d56dbd9 100644 --- a/src/database/definitions.rs +++ b/src/database/definitions.rs @@ -4,7 +4,7 @@ use surrealdb::{engine::any::Any, Datetime, RecordId, Surreal}; use tracing::debug; /// Database struct for a bluesky profile -#[derive(Debug, Serialize)] +#[derive(Debug, Serialize, Clone)] 
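// Editorial note, not part of the original commit: the Clone derives in this file
// mirror the ones added in big_update.rs; BigUpdate itself is now Clone, so every
// record type it can carry (BskyPost, BskyFeed, BskyList, ...) must be Clone too.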
#[allow(dead_code)] pub struct BskyProfile { #[serde(rename = "displayName")] @@ -58,7 +58,7 @@ pub struct JetstreamIdentityEvent { } /// Database struct for a bluesky post -#[derive(Debug, Serialize)] +#[derive(Debug, Serialize, Clone)] pub struct BskyPost { pub author: RecordId, #[serde(rename = "bridgyOriginalUrl")] @@ -82,7 +82,7 @@ pub struct BskyPost { } /// Database struct for a bluesky post image -#[derive(Debug, Serialize)] +#[derive(Debug, Serialize, Clone)] pub struct BskyPostImage { pub alt: String, pub blob: RecordId, @@ -91,7 +91,7 @@ pub struct BskyPostImage { } /// Database struct for a bluesky post video -#[derive(Debug, Serialize)] +#[derive(Debug, Serialize, Clone)] pub struct BskyPostVideo { pub alt: Option, #[serde(rename = "aspectRatio")] @@ -100,10 +100,10 @@ pub struct BskyPostVideo { pub captions: Option>, } -#[derive(Debug, Serialize)] +#[derive(Debug, Serialize, Clone)] pub struct BskyPostVideoCaption {} -#[derive(Debug, Serialize)] +#[derive(Debug, Serialize, Clone)] pub struct BskyPostVideoBlob { pub cid: String, #[serde(rename = "mediaType")] @@ -112,13 +112,13 @@ pub struct BskyPostVideoBlob { } /// Database struct for a bluesky post video aspect ratio -#[derive(Debug, Serialize)] +#[derive(Debug, Serialize, Clone)] pub struct BskyPostMediaAspectRatio { pub width: u64, pub height: u64, } -#[derive(Debug, Serialize)] +#[derive(Debug, Serialize, Clone)] pub struct BskyFeed { pub uri: String, pub author: RecordId, @@ -134,7 +134,7 @@ pub struct BskyFeed { pub extra_data: Option, } -#[derive(Debug, Serialize)] +#[derive(Debug, Serialize, Clone)] pub struct BskyList { pub name: String, pub purpose: String, From a56cf78316c06a158553f454abe01a93305b4d65 Mon Sep 17 00:00:00 2001 From: Zebreus Date: Sun, 9 Mar 2025 13:55:46 +0100 Subject: [PATCH 66/75] Include the cargo lockfile --- .gitignore | 1 - Cargo.lock | 6127 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 6127 insertions(+), 1 deletion(-) create mode 100644 Cargo.lock diff --git a/.gitignore b/.gitignore index 382d005..6cdda3d 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,4 @@ # cargo files -Cargo.lock /target # ide files diff --git a/Cargo.lock b/Cargo.lock new file mode 100644 index 0000000..f9bc24e --- /dev/null +++ b/Cargo.lock @@ -0,0 +1,6127 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. 
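# Editorial note, not part of the generated file: committing Cargo.lock pins every
# transitive dependency to an exact version and checksum, the usual practice for
# binary crates so that builds stay reproducible.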
+version = 4 + +[[package]] +name = "Inflector" +version = "0.11.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fe438c63458706e03479442743baae6c88256498e6431708f6dfc520a26515d3" +dependencies = [ + "lazy_static", + "regex", +] + +[[package]] +name = "addr" +version = "0.15.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a93b8a41dbe230ad5087cc721f8d41611de654542180586b315d9f4cf6b72bef" +dependencies = [ + "psl-types", +] + +[[package]] +name = "addr2line" +version = "0.24.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dfbe277e56a376000877090da837660b4427aad530e3028d44e0bffe4f89a1c1" +dependencies = [ + "gimli", +] + +[[package]] +name = "adler2" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "512761e0bb2578dd7380c6baaa0f4ce03e84f95e960231d1dec8bf4d7d6e2627" + +[[package]] +name = "ahash" +version = "0.7.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "891477e0c6a8957309ee5c45a6368af3ae14bb510732d2684ffa19af310920f9" +dependencies = [ + "getrandom 0.2.15", + "once_cell", + "version_check", +] + +[[package]] +name = "ahash" +version = "0.8.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e89da841a80418a9b391ebaea17f5c112ffaaa96f621d2c285b5174da76b9011" +dependencies = [ + "cfg-if", + "getrandom 0.2.15", + "once_cell", + "version_check", + "zerocopy 0.7.35", +] + +[[package]] +name = "aho-corasick" +version = "1.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916" +dependencies = [ + "memchr", +] + +[[package]] +name = "allocator-api2" +version = "0.2.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "683d7910e743518b0e34f1186f92494becacb047c7b6bf616c96772180fef923" + +[[package]] +name = "ammonia" +version = "4.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ab99eae5ee58501ab236beb6f20f6ca39be615267b014899c89b2f0bc18a459" +dependencies = [ + "html5ever", + "maplit", + "once_cell", + "tendril", + "url", +] + +[[package]] +name = "android-tzdata" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e999941b234f3131b00bc13c22d06e8c5ff726d1b6318ac7eb276997bbb4fef0" + +[[package]] +name = "android_system_properties" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "819e7219dbd41043ac279b19830f2efc897156490d7fd6ea916720117ee66311" +dependencies = [ + "libc", +] + +[[package]] +name = "anstream" +version = "0.6.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8acc5369981196006228e28809f761875c0327210a891e941f4c683b3a99529b" +dependencies = [ + "anstyle", + "anstyle-parse", + "anstyle-query", + "anstyle-wincon", + "colorchoice", + "is_terminal_polyfill", + "utf8parse", +] + +[[package]] +name = "anstyle" +version = "1.0.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "55cc3b69f167a1ef2e161439aa98aed94e6028e5f9a59be9a6ffb47aef1651f9" + +[[package]] +name = "anstyle-parse" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3b2d16507662817a6a20a9ea92df6652ee4f94f914589377d69f3b21bc5798a9" +dependencies = [ + "utf8parse", +] + +[[package]] +name = "anstyle-query" +version = "1.1.2" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "79947af37f4177cfead1110013d678905c37501914fba0efea834c3fe9a8d60c" +dependencies = [ + "windows-sys 0.59.0", +] + +[[package]] +name = "anstyle-wincon" +version = "3.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ca3534e77181a9cc07539ad51f2141fe32f6c3ffd4df76db8ad92346b003ae4e" +dependencies = [ + "anstyle", + "once_cell", + "windows-sys 0.59.0", +] + +[[package]] +name = "any_ascii" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ea50b14b7a4b9343f8c627a7a53c52076482bd4bdad0a24fd3ec533ed616cc2c" + +[[package]] +name = "anyhow" +version = "1.0.97" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dcfed56ad506cb2c684a14971b8861fdc3baaaae314b9e5f9bb532cbe3ba7a4f" + +[[package]] +name = "approx" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f2a05fd1bd10b2527e20a2cd32d8873d115b8b39fe219ee25f42a8aca6ba278" +dependencies = [ + "num-traits", +] + +[[package]] +name = "approx" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cab112f0a86d568ea0e627cc1d6be74a1e9cd55214684db5561995f6dad897c6" +dependencies = [ + "num-traits", +] + +[[package]] +name = "arbitrary" +version = "1.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dde20b3d026af13f561bdd0f15edf01fc734f0dafcedbaf42bba506a9517f223" + +[[package]] +name = "argon2" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3c3610892ee6e0cbce8ae2700349fcf8f98adb0dbfbee85aec3c9179d29cc072" +dependencies = [ + "base64ct", + "blake2", + "cpufeatures", + "password-hash", +] + +[[package]] +name = "arrayref" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76a2e8124351fda1ef8aaaa3bbd7ebbcb486bbcd4225aca0aa0d84bb2db8fecb" + +[[package]] +name = "arrayvec" +version = "0.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" + +[[package]] +name = "ascii-canvas" +version = "3.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8824ecca2e851cec16968d54a01dd372ef8f95b244fb84b84e70128be347c3c6" +dependencies = [ + "term", +] + +[[package]] +name = "async-channel" +version = "2.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "89b47800b0be77592da0afd425cc03468052844aff33b84e33cc696f64e77b6a" +dependencies = [ + "concurrent-queue", + "event-listener-strategy", + "futures-core", + "pin-project-lite", +] + +[[package]] +name = "async-compression" +version = "0.4.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "310c9bcae737a48ef5cdee3174184e6d548b292739ede61a1f955ef76a738861" +dependencies = [ + "flate2", + "futures-core", + "memchr", + "pin-project-lite", + "tokio", +] + +[[package]] +name = "async-executor" +version = "1.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "30ca9a001c1e8ba5149f91a74362376cc6bc5b919d92d988668657bd570bdcec" +dependencies = [ + "async-task", + "concurrent-queue", + "fastrand", + "futures-lite", + "slab", +] + +[[package]] +name = "async-graphql" +version = "7.0.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bfff2b17d272a5e3e201feda444e2c24b011fa722951268d1bd8b9b5bc6dc449" 
+dependencies = [ + "async-graphql-derive", + "async-graphql-parser", + "async-graphql-value", + "async-stream", + "async-trait", + "base64 0.22.1", + "bytes", + "fnv", + "futures-timer", + "futures-util", + "http", + "indexmap 2.7.1", + "mime", + "multer", + "num-traits", + "pin-project-lite", + "regex", + "serde", + "serde_json", + "serde_urlencoded", + "static_assertions_next", + "thiserror 1.0.69", +] + +[[package]] +name = "async-graphql-derive" +version = "7.0.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d8e5d0c6697def2f79ccbd972fb106b633173a6066e430b480e1ff9376a7561a" +dependencies = [ + "Inflector", + "async-graphql-parser", + "darling", + "proc-macro-crate 3.3.0", + "proc-macro2", + "quote", + "strum", + "syn 2.0.99", + "thiserror 1.0.69", +] + +[[package]] +name = "async-graphql-parser" +version = "7.0.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8531ee6d292c26df31c18c565ff22371e7bdfffe7f5e62b69537db0b8fd554dc" +dependencies = [ + "async-graphql-value", + "pest", + "serde", + "serde_json", +] + +[[package]] +name = "async-graphql-value" +version = "7.0.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "741110dda927420a28fbc1c310543d3416f789a6ba96859c2c265843a0a96887" +dependencies = [ + "bytes", + "indexmap 2.7.1", + "serde", + "serde_json", +] + +[[package]] +name = "async-lock" +version = "3.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff6e472cdea888a4bd64f342f09b3f50e1886d32afe8df3d663c01140b811b18" +dependencies = [ + "event-listener", + "event-listener-strategy", + "pin-project-lite", +] + +[[package]] +name = "async-stream" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b5a71a6f37880a80d1d7f19efd781e4b5de42c88f0722cc13bcb6cc2cfe8476" +dependencies = [ + "async-stream-impl", + "futures-core", + "pin-project-lite", +] + +[[package]] +name = "async-stream-impl" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c7c24de15d275a1ecfd47a380fb4d5ec9bfe0933f309ed5e705b775596a3574d" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.99", +] + +[[package]] +name = "async-task" +version = "4.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b75356056920673b02621b35afd0f7dda9306d03c79a30f5c56c44cf256e3de" + +[[package]] +name = "async-trait" +version = "0.1.87" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d556ec1359574147ec0c4fc5eb525f3f23263a592b1a9c07e0a75b427de55c97" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.99", +] + +[[package]] +name = "async_io_stream" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6d7b9decdf35d8908a7e3ef02f64c5e9b1695e230154c0e8de3969142d9b94c" +dependencies = [ + "futures", + "pharos", + "rustc_version", +] + +[[package]] +name = "atomic-waker" +version = "1.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1505bd5d3d116872e7271a6d4e16d81d0c8570876c8de68093a09ac269d8aac0" + +[[package]] +name = "atrium-api" +version = "0.25.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ea3ea578c768ec91082e424a8d139517b2cb5c75149bf3cec04371a1e74f00f2" +dependencies = [ + "atrium-common", + "atrium-xrpc", + "chrono", + "http", + "ipld-core", + "langtag", + "regex", + "serde", + "serde_bytes", + "serde_json", + "thiserror 1.0.69", 
+ "trait-variant", +] + +[[package]] +name = "atrium-common" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "168e558408847bfed69df1033a32fd051f7a037ebc90ea46e588ccb2bfbd7233" +dependencies = [ + "dashmap 6.1.0", + "lru", + "moka", + "thiserror 1.0.69", + "tokio", + "trait-variant", + "web-time", +] + +[[package]] +name = "atrium-xrpc" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6b4956d94147cfbb669c68f654eb4fd6a1d00648c810cec79d04ec5425b8f378" +dependencies = [ + "http", + "serde", + "serde_html_form", + "serde_json", + "thiserror 1.0.69", + "trait-variant", +] + +[[package]] +name = "atrium-xrpc-client" +version = "0.5.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9bab4287ccef501b3892e1325280e61ae79a96eb9ee63dceabc0ed3bea35f2eb" +dependencies = [ + "atrium-xrpc", + "reqwest", +] + +[[package]] +name = "autocfg" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26" + +[[package]] +name = "aws-lc-rs" +version = "1.12.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e4e8200b9a4a5801a769d50eeabc05670fec7e959a8cb7a63a93e4e519942ae" +dependencies = [ + "aws-lc-sys", + "paste", + "zeroize", +] + +[[package]] +name = "aws-lc-sys" +version = "0.26.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0f9dd2e03ee80ca2822dd6ea431163d2ef259f2066a4d6ccaca6d9dcb386aa43" +dependencies = [ + "bindgen", + "cc", + "cmake", + "dunce", + "fs_extra", + "paste", +] + +[[package]] +name = "axum" +version = "0.7.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "edca88bc138befd0323b20752846e6587272d3b03b0343c8ea28a6f819e6e71f" +dependencies = [ + "async-trait", + "axum-core", + "bytes", + "futures-util", + "http", + "http-body", + "http-body-util", + "itoa", + "matchit", + "memchr", + "mime", + "percent-encoding", + "pin-project-lite", + "rustversion", + "serde", + "sync_wrapper", + "tower 0.5.2", + "tower-layer", + "tower-service", +] + +[[package]] +name = "axum-core" +version = "0.4.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09f2bd6146b97ae3359fa0cc6d6b376d9539582c7b4220f041a33ec24c226199" +dependencies = [ + "async-trait", + "bytes", + "futures-util", + "http", + "http-body", + "http-body-util", + "mime", + "pin-project-lite", + "rustversion", + "sync_wrapper", + "tower-layer", + "tower-service", +] + +[[package]] +name = "backtrace" +version = "0.3.74" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8d82cb332cdfaed17ae235a638438ac4d4839913cc2af585c3c6746e8f8bee1a" +dependencies = [ + "addr2line", + "cfg-if", + "libc", + "miniz_oxide", + "object", + "rustc-demangle", + "windows-targets", +] + +[[package]] +name = "base-x" +version = "0.2.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4cbbc9d0964165b47557570cce6c952866c2678457aca742aafc9fb771d30270" + +[[package]] +name = "base64" +version = "0.21.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9d297deb1925b89f2ccc13d7635fa0714f12c87adce1c75356b39ca9b7178567" + +[[package]] +name = "base64" +version = "0.22.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" + +[[package]] +name = 
"base64ct" +version = "1.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8c3c1a368f70d6cf7302d78f8f7093da241fb8e8807c05cc9e51a125895a6d5b" + +[[package]] +name = "bcrypt" +version = "0.15.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e65938ed058ef47d92cf8b346cc76ef48984572ade631927e9937b5ffc7662c7" +dependencies = [ + "base64 0.22.1", + "blowfish", + "getrandom 0.2.15", + "subtle", + "zeroize", +] + +[[package]] +name = "bincode" +version = "1.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b1f45e9417d87227c7a56d22e471c6206462cba514c7590c09aff4cf6d1ddcad" +dependencies = [ + "serde", +] + +[[package]] +name = "bindgen" +version = "0.69.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "271383c67ccabffb7381723dea0672a673f292304fcb45c01cc648c7a8d58088" +dependencies = [ + "bitflags", + "cexpr", + "clang-sys", + "itertools 0.12.1", + "lazy_static", + "lazycell", + "log", + "prettyplease", + "proc-macro2", + "quote", + "regex", + "rustc-hash 1.1.0", + "shlex", + "syn 2.0.99", + "which", +] + +[[package]] +name = "bit-set" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0700ddab506f33b20a03b13996eccd309a48e5ff77d0d95926aa0210fb4e95f1" +dependencies = [ + "bit-vec", +] + +[[package]] +name = "bit-vec" +version = "0.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "349f9b6a179ed607305526ca489b34ad0a41aed5f7980fa90eb03160b69598fb" + +[[package]] +name = "bitflags" +version = "2.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c8214115b7bf84099f1309324e63141d4c5d7cc26862f97a0a857dbefe165bd" + +[[package]] +name = "bitvec" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1bc2832c24239b0141d5674bb9174f9d68a8b5b3f2753311927c172ca46f7e9c" +dependencies = [ + "funty", + "radium", + "tap", + "wyz", +] + +[[package]] +name = "blake2" +version = "0.10.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "46502ad458c9a52b69d4d4d32775c788b7a1b85e8bc9d482d92250fc0e3f8efe" +dependencies = [ + "digest", +] + +[[package]] +name = "blake2b_simd" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06e903a20b159e944f91ec8499fe1e55651480c541ea0a584f5d967c49ad9d99" +dependencies = [ + "arrayref", + "arrayvec", + "constant_time_eq", +] + +[[package]] +name = "blake3" +version = "1.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "675f87afced0413c9bb02843499dbbd3882a237645883f71a2b59644a6d2f753" +dependencies = [ + "arrayref", + "arrayvec", + "cc", + "cfg-if", + "constant_time_eq", +] + +[[package]] +name = "block-buffer" +version = "0.10.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71" +dependencies = [ + "generic-array", +] + +[[package]] +name = "blowfish" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e412e2cd0f2b2d93e02543ceae7917b3c70331573df19ee046bcbc35e45e87d7" +dependencies = [ + "byteorder", + "cipher", +] + +[[package]] +name = "borsh" +version = "1.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5430e3be710b68d984d1391c854eb431a9d548640711faa54eecb1df93db91cc" +dependencies = [ + "borsh-derive", + "cfg_aliases", +] 
+ +[[package]] +name = "borsh-derive" +version = "1.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8b668d39970baad5356d7c83a86fee3a539e6f93bf6764c97368243e17a0487" +dependencies = [ + "once_cell", + "proc-macro-crate 3.3.0", + "proc-macro2", + "quote", + "syn 2.0.99", +] + +[[package]] +name = "bumpalo" +version = "3.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1628fb46dfa0b37568d12e5edd512553eccf6a22a78e8bde00bb4aed84d5bdbf" + +[[package]] +name = "bytecheck" +version = "0.6.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "23cdc57ce23ac53c931e88a43d06d070a6fd142f2617be5855eb75efc9beb1c2" +dependencies = [ + "bytecheck_derive", + "ptr_meta", + "simdutf8", +] + +[[package]] +name = "bytecheck_derive" +version = "0.6.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3db406d29fbcd95542e92559bed4d8ad92636d1ca8b3b72ede10b4bcc010e659" +dependencies = [ + "proc-macro2", + "quote", + "syn 1.0.109", +] + +[[package]] +name = "bytemuck" +version = "1.22.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6b1fc10dbac614ebc03540c9dbd60e83887fda27794998c6528f1782047d540" + +[[package]] +name = "byteorder" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" + +[[package]] +name = "bytes" +version = "1.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d71b6127be86fdcfddb610f7182ac57211d4b18a3e9c82eb2d17662f2227ad6a" +dependencies = [ + "serde", +] + +[[package]] +name = "castaway" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0abae9be0aaf9ea96a3b1b8b1b55c602ca751eba1b1500220cea4ecbafe7c0d5" +dependencies = [ + "rustversion", +] + +[[package]] +name = "cbor4ii" +version = "0.2.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b544cf8c89359205f4f990d0e6f3828db42df85b5dac95d09157a250eb0749c4" +dependencies = [ + "serde", +] + +[[package]] +name = "cc" +version = "1.2.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be714c154be609ec7f5dad223a33bf1482fff90472de28f7362806e6d4832b8c" +dependencies = [ + "jobserver", + "libc", + "shlex", +] + +[[package]] +name = "cedar-policy" +version = "2.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d91e3b10a0f7f2911774d5e49713c4d25753466f9e11d1cd2ec627f8a2dc857" +dependencies = [ + "cedar-policy-core", + "cedar-policy-validator", + "itertools 0.10.5", + "lalrpop-util", + "ref-cast", + "serde", + "serde_json", + "smol_str", + "thiserror 1.0.69", +] + +[[package]] +name = "cedar-policy-core" +version = "2.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cd2315591c6b7e18f8038f0a0529f254235fd902b6c217aabc04f2459b0d9995" +dependencies = [ + "either", + "ipnet", + "itertools 0.10.5", + "lalrpop", + "lalrpop-util", + "lazy_static", + "miette", + "regex", + "rustc_lexer", + "serde", + "serde_json", + "serde_with", + "smol_str", + "stacker", + "thiserror 1.0.69", +] + +[[package]] +name = "cedar-policy-validator" +version = "2.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e756e1b2a5da742ed97e65199ad6d0893e9aa4bd6b34be1de9e70bd1e6adc7df" +dependencies = [ + "cedar-policy-core", + "itertools 0.10.5", + "serde", + "serde_json", + 
"serde_with", + "smol_str", + "stacker", + "thiserror 1.0.69", + "unicode-security", +] + +[[package]] +name = "cexpr" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6fac387a98bb7c37292057cffc56d62ecb629900026402633ae9160df93a8766" +dependencies = [ + "nom", +] + +[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + +[[package]] +name = "cfg_aliases" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724" + +[[package]] +name = "chrono" +version = "0.4.40" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a7964611d71df112cb1730f2ee67324fcf4d0fc6606acbbe9bfe06df124637c" +dependencies = [ + "android-tzdata", + "iana-time-zone", + "js-sys", + "num-traits", + "serde", + "wasm-bindgen", + "windows-link", +] + +[[package]] +name = "ciborium" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42e69ffd6f0917f5c029256a24d0161db17cea3997d185db0d35926308770f0e" +dependencies = [ + "ciborium-io", + "ciborium-ll", + "serde", +] + +[[package]] +name = "ciborium-io" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "05afea1e0a06c9be33d539b876f1ce3692f4afea2cb41f740e7743225ed1c757" + +[[package]] +name = "ciborium-ll" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57663b653d948a338bfb3eeba9bb2fd5fcfaecb9e199e87e1eda4d9e8b240fd9" +dependencies = [ + "ciborium-io", + "half", +] + +[[package]] +name = "cid" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fd94671561e36e4e7de75f753f577edafb0e7c05d6e4547229fdf7938fbcd2c3" +dependencies = [ + "core2", + "multibase", + "multihash 0.18.1", + "serde", + "unsigned-varint 0.7.2", +] + +[[package]] +name = "cid" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3147d8272e8fa0ccd29ce51194dd98f79ddfb8191ba9e3409884e751798acf3a" +dependencies = [ + "core2", + "multibase", + "multihash 0.19.3", + "serde", + "serde_bytes", + "unsigned-varint 0.8.0", +] + +[[package]] +name = "cipher" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "773f3b9af64447d2ce9850330c473515014aa235e6a783b02db81ff39e4a3dad" +dependencies = [ + "crypto-common", + "inout", +] + +[[package]] +name = "clang-sys" +version = "1.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b023947811758c97c59bf9d1c188fd619ad4718dcaa767947df1cadb14f39f4" +dependencies = [ + "glob", + "libc", + "libloading", +] + +[[package]] +name = "clap" +version = "4.5.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "027bb0d98429ae334a8698531da7077bdf906419543a35a55c2cb1b66437d767" +dependencies = [ + "clap_builder", + "clap_derive", +] + +[[package]] +name = "clap_builder" +version = "4.5.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5589e0cba072e0f3d23791efac0fd8627b49c829c196a492e88168e6a669d863" +dependencies = [ + "anstream", + "anstyle", + "clap_lex", + "strsim", +] + +[[package]] +name = "clap_derive" +version = "4.5.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"bf4ced95c6f4a675af3da73304b9ac4ed991640c36374e4b46795c49e17cf1ed" +dependencies = [ + "heck 0.5.0", + "proc-macro2", + "quote", + "syn 2.0.99", +] + +[[package]] +name = "clap_lex" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f46ad14479a25103f283c0f10005961cf086d8dc42205bb44c46ac563475dca6" + +[[package]] +name = "cmake" +version = "0.1.54" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e7caa3f9de89ddbe2c607f4101924c5abec803763ae9534e4f4d7d8f84aa81f0" +dependencies = [ + "cc", +] + +[[package]] +name = "colog" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2c426b7af8d5e0ad79de6713996632ce31f0d68ba84068fb0d654b396e519df0" +dependencies = [ + "colored 2.2.0", + "env_logger", + "log", +] + +[[package]] +name = "colorchoice" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b63caa9aa9397e2d9480a9b13673856c78d8ac123288526c37d7839f2a86990" + +[[package]] +name = "colored" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "117725a109d387c937a1533ce01b450cbde6b88abceea8473c4d7a85853cda3c" +dependencies = [ + "lazy_static", + "windows-sys 0.59.0", +] + +[[package]] +name = "colored" +version = "3.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fde0e0ec90c9dfb3b4b1a0891a7dcd0e2bffde2f7efed5fe7c9bb00e5bfb915e" +dependencies = [ + "windows-sys 0.59.0", +] + +[[package]] +name = "concurrent-queue" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4ca0197aee26d1ae37445ee532fefce43251d24cc7c166799f4d46817f1d3973" +dependencies = [ + "crossbeam-utils", +] + +[[package]] +name = "console-api" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8030735ecb0d128428b64cd379809817e620a40e5001c54465b99ec5feec2857" +dependencies = [ + "futures-core", + "prost", + "prost-types", + "tonic", + "tracing-core", +] + +[[package]] +name = "console-subscriber" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6539aa9c6a4cd31f4b1c040f860a1eac9aa80e7df6b05d506a6e7179936d6a01" +dependencies = [ + "console-api", + "crossbeam-channel", + "crossbeam-utils", + "futures-task", + "hdrhistogram", + "humantime", + "hyper-util", + "prost", + "prost-types", + "serde", + "serde_json", + "thread_local", + "tokio", + "tokio-stream", + "tonic", + "tracing", + "tracing-core", + "tracing-subscriber", +] + +[[package]] +name = "constant_time_eq" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7c74b8349d32d297c9134b8c88677813a227df8f779daa29bfc29c183fe3dca6" + +[[package]] +name = "core-foundation" +version = "0.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91e195e091a93c46f7102ec7818a2aa394e1e1771c3ab4825963fa03e45afb8f" +dependencies = [ + "core-foundation-sys", + "libc", +] + +[[package]] +name = "core-foundation" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b55271e5c8c478ad3f38ad24ef34923091e0548492a266d19b3c0b4d82574c63" +dependencies = [ + "core-foundation-sys", + "libc", +] + +[[package]] +name = "core-foundation-sys" +version = "0.8.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b" + 
+[[package]] +name = "core2" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b49ba7ef1ad6107f8824dbe97de947cbaac53c44e7f9756a1fba0d37c1eec505" +dependencies = [ + "memchr", +] + +[[package]] +name = "cpufeatures" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "59ed5838eebb26a2bb2e58f6d5b5316989ae9d08bab10e0e6d103e656d1b0280" +dependencies = [ + "libc", +] + +[[package]] +name = "crc32fast" +version = "1.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a97769d94ddab943e4510d138150169a2758b5ef3eb191a9ee688de3e23ef7b3" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "crossbeam-channel" +version = "0.5.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06ba6d68e24814cb8de6bb986db8222d3a027d15872cabc0d18817bc3c0e4471" +dependencies = [ + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-deque" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51" +dependencies = [ + "crossbeam-epoch", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.9.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" +dependencies = [ + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-utils" +version = "0.8.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" + +[[package]] +name = "crunchy" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "43da5946c66ffcc7745f48db692ffbb10a83bfe0afd96235c5c2a4fb23994929" + +[[package]] +name = "crypto-common" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1bfb12502f3fc46cca1bb51ac28df9d618d813cdc3d2f25b9fe775a34af26bb3" +dependencies = [ + "generic-array", + "typenum", +] + +[[package]] +name = "darling" +version = "0.20.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6f63b86c8a8826a49b8c21f08a2d07338eec8d900540f8630dc76284be802989" +dependencies = [ + "darling_core", + "darling_macro", +] + +[[package]] +name = "darling_core" +version = "0.20.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "95133861a8032aaea082871032f5815eb9e98cef03fa916ab4500513994df9e5" +dependencies = [ + "fnv", + "ident_case", + "proc-macro2", + "quote", + "strsim", + "syn 2.0.99", +] + +[[package]] +name = "darling_macro" +version = "0.20.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d336a2a514f6ccccaa3e09b02d41d35330c07ddf03a62165fcec10bb561c7806" +dependencies = [ + "darling_core", + "quote", + "syn 2.0.99", +] + +[[package]] +name = "dashmap" +version = "5.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "978747c1d849a7d2ee5e8adc0159961c48fb7e5db2f06af6723b80123bb53856" +dependencies = [ + "cfg-if", + "hashbrown 0.14.5", + "lock_api", + "once_cell", + "parking_lot_core", +] + +[[package]] +name = "dashmap" +version = "6.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5041cc499144891f3790297212f32a74fb938e5136a14943f338ef9e0ae276cf" +dependencies = [ + "cfg-if", + "crossbeam-utils", + "hashbrown 0.14.5", + "lock_api", + "once_cell", + 
"parking_lot_core", +] + +[[package]] +name = "data-encoding" +version = "2.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "575f75dfd25738df5b91b8e43e14d44bda14637a58fae779fd2b064f8bf3e010" + +[[package]] +name = "data-encoding-macro" +version = "0.1.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9f9724adfcf41f45bf652b3995837669d73c4d49a1b5ac1ff82905ac7d9b5558" +dependencies = [ + "data-encoding", + "data-encoding-macro-internal", +] + +[[package]] +name = "data-encoding-macro-internal" +version = "0.1.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "18e4fdb82bd54a12e42fb58a800dcae6b9e13982238ce2296dc3570b92148e1f" +dependencies = [ + "data-encoding", + "syn 2.0.99", +] + +[[package]] +name = "deranged" +version = "0.3.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b42b6fa04a440b495c8b04d0e71b707c585f83cb9cb28cf8cd0d976c315e31b4" +dependencies = [ + "powerfmt", + "serde", +] + +[[package]] +name = "deunicode" +version = "1.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "339544cc9e2c4dc3fc7149fd630c5f22263a4fdf18a98afd0075784968b5cf00" + +[[package]] +name = "digest" +version = "0.10.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" +dependencies = [ + "block-buffer", + "crypto-common", + "subtle", +] + +[[package]] +name = "dirs-next" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b98cf8ebf19c3d1b223e151f99a4f9f0690dca41414773390fc824184ac833e1" +dependencies = [ + "cfg-if", + "dirs-sys-next", +] + +[[package]] +name = "dirs-sys-next" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4ebda144c4fe02d1f7ea1a7d9641b6fc6b580adcfa024ae48797ecdeb6825b4d" +dependencies = [ + "libc", + "redox_users", + "winapi", +] + +[[package]] +name = "displaydoc" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.99", +] + +[[package]] +name = "dmp" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bfaa1135a34d26e5cc5b4927a8935af887d4f30a5653a797c33b9a4222beb6d9" +dependencies = [ + "urlencoding", +] + +[[package]] +name = "doc-comment" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fea41bba32d969b513997752735605054bc0dfa92b4c56bf1189f2e174be7a10" + +[[package]] +name = "dunce" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "92773504d58c093f6de2459af4af33faa518c13451eb8f2b5698ed3d36e7c813" + +[[package]] +name = "earcutr" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "79127ed59a85d7687c409e9978547cffb7dc79675355ed22da6b66fd5f6ead01" +dependencies = [ + "itertools 0.11.0", + "num-traits", +] + +[[package]] +name = "either" +version = "1.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" + +[[package]] +name = "ena" +version = "0.14.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d248bdd43ce613d87415282f69b9bb99d947d290b10962dd6c56233312c2ad5" 
+dependencies = [ + "log", +] + +[[package]] +name = "encoding_rs" +version = "0.8.35" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "75030f3c4f45dafd7586dd6780965a8c7e8e285a5ecb86713e63a79c5b2766f3" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "endian-type" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c34f04666d835ff5d62e058c3995147c06f42fe86ff053337632bca83e42702d" + +[[package]] +name = "env_filter" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "186e05a59d4c50738528153b83b0b0194d3a29507dfec16eccd4b342903397d0" +dependencies = [ + "log", + "regex", +] + +[[package]] +name = "env_logger" +version = "0.11.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dcaee3d8e3cfc3fd92428d477bc97fc29ec8716d180c0d74c643bb26166660e0" +dependencies = [ + "anstream", + "anstyle", + "env_filter", + "humantime", + "log", +] + +[[package]] +name = "equivalent" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f" + +[[package]] +name = "errno" +version = "0.3.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "33d852cb9b869c2a9b3df2f71a3074817f01e1844f839a144f5fcef059a4eb5d" +dependencies = [ + "libc", + "windows-sys 0.59.0", +] + +[[package]] +name = "event-listener" +version = "5.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3492acde4c3fc54c845eaab3eed8bd00c7a7d881f78bfc801e43a93dec1331ae" +dependencies = [ + "concurrent-queue", + "parking", + "pin-project-lite", +] + +[[package]] +name = "event-listener-strategy" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3c3e4e0dd3673c1139bf041f3008816d9cf2946bbfac2945c09e523b8d7b05b2" +dependencies = [ + "event-listener", + "pin-project-lite", +] + +[[package]] +name = "fastrand" +version = "2.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be" + +[[package]] +name = "fastwebsockets" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "305d3ba574508e27190906d11707dad683e0494e6b85eae9b044cb2734a5e422" +dependencies = [ + "base64 0.21.7", + "bytes", + "http-body-util", + "hyper", + "hyper-util", + "pin-project", + "rand 0.8.5", + "sha1", + "simdutf8", + "thiserror 1.0.69", + "tokio", + "utf-8", +] + +[[package]] +name = "fixedbitset" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ce7134b9999ecaf8bcd65542e436736ef32ddca1b3e06094cb6ec5755203b80" + +[[package]] +name = "flate2" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "11faaf5a5236997af9848be0bef4db95824b1d534ebc64d0f0c6cf3e67bd38dc" +dependencies = [ + "crc32fast", + "miniz_oxide", +] + +[[package]] +name = "float-cmp" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b09cf3155332e944990140d967ff5eceb70df778b34f77d8075db46e4704e6d8" +dependencies = [ + "num-traits", +] + +[[package]] +name = "float_next_after" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8bf7cc16383c4b8d58b9905a8509f02926ce3058053c056376248d958c9df1e8" + +[[package]] +name = "fnv" +version = "1.0.7" 
+source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" + +[[package]] +name = "foldhash" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a0d2fde1f7b3d48b8395d5f2de76c18a528bd6a9cdde438df747bfcba3e05d6f" + +[[package]] +name = "form_urlencoded" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e13624c2627564efccf4934284bdd98cbaa14e79b0b5a141218e507b3a823456" +dependencies = [ + "percent-encoding", +] + +[[package]] +name = "fs_extra" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c" + +[[package]] +name = "fst" +version = "0.4.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7ab85b9b05e3978cc9a9cf8fea7f01b494e1a09ed3037e16ba39edc7a29eb61a" + +[[package]] +name = "funty" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6d5a32815ae3f33302d95fdcb2ce17862f8c65363dcfd29360480ba1001fc9c" + +[[package]] +name = "futf" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df420e2e84819663797d1ec6544b13c5be84629e7bb00dc960d6917db2987843" +dependencies = [ + "mac", + "new_debug_unreachable", +] + +[[package]] +name = "futures" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "65bc07b1a8bc7c85c5f2e110c476c7389b4554ba72af57d8445ea63a576b0876" +dependencies = [ + "futures-channel", + "futures-core", + "futures-executor", + "futures-io", + "futures-sink", + "futures-task", + "futures-util", +] + +[[package]] +name = "futures-channel" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2dff15bf788c671c1934e366d07e30c1814a8ef514e1af724a602e8a2fbe1b10" +dependencies = [ + "futures-core", + "futures-sink", +] + +[[package]] +name = "futures-core" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "05f29059c0c2090612e8d742178b0580d2dc940c837851ad723096f87af6663e" + +[[package]] +name = "futures-executor" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e28d1d997f585e54aebc3f97d39e72338912123a67330d723fdbb564d646c9f" +dependencies = [ + "futures-core", + "futures-task", + "futures-util", +] + +[[package]] +name = "futures-io" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9e5c1b78ca4aae1ac06c48a526a655760685149f0d465d21f37abfe57ce075c6" + +[[package]] +name = "futures-lite" +version = "2.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f5edaec856126859abb19ed65f39e90fea3a9574b9707f13539acf4abf7eb532" +dependencies = [ + "fastrand", + "futures-core", + "futures-io", + "parking", + "pin-project-lite", +] + +[[package]] +name = "futures-macro" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.99", +] + +[[package]] +name = "futures-sink" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e575fab7d1e0dcb8d0c7bcf9a63ee213816ab51902e6d244a95819acacf1d4f7" + +[[package]] +name = "futures-task" +version = 
"0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f90f7dce0722e95104fcb095585910c0977252f286e354b5e3bd38902cd99988" + +[[package]] +name = "futures-timer" +version = "3.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f288b0a4f20f9a56b5d1da57e2227c661b7b16168e2f72365f57b63326e29b24" + +[[package]] +name = "futures-util" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9fa08315bb612088cc391249efdc3bc77536f16c91f6cf495e6fbe85b20a4a81" +dependencies = [ + "futures-channel", + "futures-core", + "futures-io", + "futures-macro", + "futures-sink", + "futures-task", + "memchr", + "pin-project-lite", + "pin-utils", + "slab", +] + +[[package]] +name = "fuzzy-matcher" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "54614a3312934d066701a80f20f15fa3b56d67ac7722b39eea5b4c9dd1d66c94" +dependencies = [ + "thread_local", +] + +[[package]] +name = "generator" +version = "0.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cc6bd114ceda131d3b1d665eba35788690ad37f5916457286b32ab6fd3c438dd" +dependencies = [ + "cfg-if", + "libc", + "log", + "rustversion", + "windows 0.58.0", +] + +[[package]] +name = "generic-array" +version = "0.14.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a" +dependencies = [ + "typenum", + "version_check", +] + +[[package]] +name = "geo" +version = "0.28.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f811f663912a69249fa620dcd2a005db7254529da2d8a0b23942e81f47084501" +dependencies = [ + "earcutr", + "float_next_after", + "geo-types", + "geographiclib-rs", + "log", + "num-traits", + "robust", + "rstar", + "serde", + "spade", +] + +[[package]] +name = "geo-types" +version = "0.7.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3bd1157f0f936bf0cd68dec91e8f7c311afe60295574d62b70d4861a1bfdf2d9" +dependencies = [ + "approx 0.5.1", + "arbitrary", + "num-traits", + "rstar", + "serde", +] + +[[package]] +name = "geographiclib-rs" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6e5ed84f8089c70234b0a8e0aedb6dc733671612ddc0d37c6066052f9781960" +dependencies = [ + "libm", +] + +[[package]] +name = "getrandom" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c4567c8db10ae91089c99af84c68c38da3ec2f087c3f82960bcdbf3656b6f4d7" +dependencies = [ + "cfg-if", + "js-sys", + "libc", + "wasi 0.11.0+wasi-snapshot-preview1", + "wasm-bindgen", +] + +[[package]] +name = "getrandom" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "43a49c392881ce6d5c3b8cb70f98717b7c07aabbdff06687b9030dbfbe2725f8" +dependencies = [ + "cfg-if", + "libc", + "wasi 0.13.3+wasi-0.2.2", + "windows-targets", +] + +[[package]] +name = "gimli" +version = "0.31.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "07e28edb80900c19c28f1072f2e8aeca7fa06b23cd4169cefe1af5aa3260783f" + +[[package]] +name = "glob" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a8d1add55171497b4705a648c6b583acafb01d58050a51727785f0b2c8e0a2b2" + +[[package]] +name = "h2" +version = "0.4.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"5017294ff4bb30944501348f6f8e42e6ad28f42c8bbef7a74029aff064a4e3c2" +dependencies = [ + "atomic-waker", + "bytes", + "fnv", + "futures-core", + "futures-sink", + "http", + "indexmap 2.7.1", + "slab", + "tokio", + "tokio-util", + "tracing", +] + +[[package]] +name = "half" +version = "2.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6dd08c532ae367adf81c312a4580bc67f1d0fe8bc9c460520283f4c0ff277888" +dependencies = [ + "cfg-if", + "crunchy", +] + +[[package]] +name = "halfbrown" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8588661a8607108a5ca69cab034063441a0413a0b041c13618a7dd348021ef6f" +dependencies = [ + "hashbrown 0.14.5", + "serde", +] + +[[package]] +name = "hash32" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "47d60b12902ba28e2730cd37e95b8c9223af2808df9e902d4df49588d1470606" +dependencies = [ + "byteorder", +] + +[[package]] +name = "hashbrown" +version = "0.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888" +dependencies = [ + "ahash 0.7.8", +] + +[[package]] +name = "hashbrown" +version = "0.14.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1" +dependencies = [ + "ahash 0.8.11", + "allocator-api2", +] + +[[package]] +name = "hashbrown" +version = "0.15.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bf151400ff0baff5465007dd2f3e717f3fe502074ca563069ce3a6629d07b289" +dependencies = [ + "allocator-api2", + "equivalent", + "foldhash", +] + +[[package]] +name = "hdrhistogram" +version = "7.5.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "765c9198f173dd59ce26ff9f95ef0aafd0a0fe01fb9d72841bc5066a4c06511d" +dependencies = [ + "base64 0.21.7", + "byteorder", + "flate2", + "nom", + "num-traits", +] + +[[package]] +name = "heapless" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0bfb9eb618601c89945a70e254898da93b13be0388091d42117462b265bb3fad" +dependencies = [ + "hash32", + "stable_deref_trait", +] + +[[package]] +name = "heck" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" + +[[package]] +name = "heck" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" + +[[package]] +name = "hermit-abi" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d231dfb89cfffdbc30e7fc41579ed6066ad03abda9e567ccafae602b97ec5024" + +[[package]] +name = "hex" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" + +[[package]] +name = "hmac" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c49c37c09c17a53d937dfbb742eb3a961d65a994e6bcdcf37e7399d0cc8ab5e" +dependencies = [ + "digest", +] + +[[package]] +name = "home" +version = "0.5.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "589533453244b0995c858700322199b2becb13b627df2851f64a2775d024abcf" +dependencies = [ + "windows-sys 0.59.0", +] + +[[package]] +name 
= "html5ever" +version = "0.27.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c13771afe0e6e846f1e67d038d4cb29998a6779f93c809212e4e9c32efd244d4" +dependencies = [ + "log", + "mac", + "markup5ever", + "proc-macro2", + "quote", + "syn 2.0.99", +] + +[[package]] +name = "http" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f16ca2af56261c99fba8bac40a10251ce8188205a4c448fbb745a2e4daa76fea" +dependencies = [ + "bytes", + "fnv", + "itoa", +] + +[[package]] +name = "http-body" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1efedce1fb8e6913f23e0c92de8e62cd5b772a67e7b3946df930a62566c93184" +dependencies = [ + "bytes", + "http", +] + +[[package]] +name = "http-body-util" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "793429d76616a256bcb62c2a2ec2bed781c8307e797e2598c50010f2bee2544f" +dependencies = [ + "bytes", + "futures-util", + "http", + "http-body", + "pin-project-lite", +] + +[[package]] +name = "httparse" +version = "1.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6dbf3de79e51f3d586ab4cb9d5c3e2c14aa28ed23d180cf89b4df0454a69cc87" + +[[package]] +name = "httpdate" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df3b46402a9d5adb4c86a0cf463f42e19994e3ee891101b1841f30a545cb49a9" + +[[package]] +name = "humantime" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4" + +[[package]] +name = "hyper" +version = "1.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cc2b571658e38e0c01b1fdca3bbbe93c00d3d71693ff2770043f8c29bc7d6f80" +dependencies = [ + "bytes", + "futures-channel", + "futures-util", + "h2", + "http", + "http-body", + "httparse", + "httpdate", + "itoa", + "pin-project-lite", + "smallvec", + "tokio", + "want", +] + +[[package]] +name = "hyper-rustls" +version = "0.27.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2d191583f3da1305256f22463b9bb0471acad48a4e534a5218b9963e9c1f59b2" +dependencies = [ + "futures-util", + "http", + "hyper", + "hyper-util", + "rustls", + "rustls-native-certs", + "rustls-pki-types", + "tokio", + "tokio-rustls", + "tower-service", + "webpki-roots", +] + +[[package]] +name = "hyper-timeout" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b90d566bffbce6a75bd8b09a05aa8c2cb1fabb6cb348f8840c9e4c90a0d83b0" +dependencies = [ + "hyper", + "hyper-util", + "pin-project-lite", + "tokio", + "tower-service", +] + +[[package]] +name = "hyper-util" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df2dcfbe0677734ab2f3ffa7fa7bfd4706bfdc1ef393f2ee30184aed67e631b4" +dependencies = [ + "bytes", + "futures-channel", + "futures-util", + "http", + "http-body", + "hyper", + "pin-project-lite", + "socket2", + "tokio", + "tower-service", + "tracing", +] + +[[package]] +name = "iana-time-zone" +version = "0.1.61" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "235e081f3925a06703c2d0117ea8b91f042756fd6e7a6e5d901e8ca1a996b220" +dependencies = [ + "android_system_properties", + "core-foundation-sys", + "iana-time-zone-haiku", + "js-sys", + "wasm-bindgen", + "windows-core 0.52.0", +] + +[[package]] +name = 
"iana-time-zone-haiku" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f31827a206f56af32e590ba56d5d2d085f558508192593743f16b2306495269f" +dependencies = [ + "cc", +] + +[[package]] +name = "icu_collections" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "db2fa452206ebee18c4b5c2274dbf1de17008e874b4dc4f0aea9d01ca79e4526" +dependencies = [ + "displaydoc", + "yoke", + "zerofrom", + "zerovec", +] + +[[package]] +name = "icu_locid" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13acbb8371917fc971be86fc8057c41a64b521c184808a698c02acc242dbf637" +dependencies = [ + "displaydoc", + "litemap", + "tinystr", + "writeable", + "zerovec", +] + +[[package]] +name = "icu_locid_transform" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "01d11ac35de8e40fdeda00d9e1e9d92525f3f9d887cdd7aa81d727596788b54e" +dependencies = [ + "displaydoc", + "icu_locid", + "icu_locid_transform_data", + "icu_provider", + "tinystr", + "zerovec", +] + +[[package]] +name = "icu_locid_transform_data" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fdc8ff3388f852bede6b579ad4e978ab004f139284d7b28715f773507b946f6e" + +[[package]] +name = "icu_normalizer" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "19ce3e0da2ec68599d193c93d088142efd7f9c5d6fc9b803774855747dc6a84f" +dependencies = [ + "displaydoc", + "icu_collections", + "icu_normalizer_data", + "icu_properties", + "icu_provider", + "smallvec", + "utf16_iter", + "utf8_iter", + "write16", + "zerovec", +] + +[[package]] +name = "icu_normalizer_data" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8cafbf7aa791e9b22bec55a167906f9e1215fd475cd22adfcf660e03e989516" + +[[package]] +name = "icu_properties" +version = "1.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93d6020766cfc6302c15dbbc9c8778c37e62c14427cb7f6e601d849e092aeef5" +dependencies = [ + "displaydoc", + "icu_collections", + "icu_locid_transform", + "icu_properties_data", + "icu_provider", + "tinystr", + "zerovec", +] + +[[package]] +name = "icu_properties_data" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "67a8effbc3dd3e4ba1afa8ad918d5684b8868b3b26500753effea8d2eed19569" + +[[package]] +name = "icu_provider" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ed421c8a8ef78d3e2dbc98a973be2f3770cb42b606e3ab18d6237c4dfde68d9" +dependencies = [ + "displaydoc", + "icu_locid", + "icu_provider_macros", + "stable_deref_trait", + "tinystr", + "writeable", + "yoke", + "zerofrom", + "zerovec", +] + +[[package]] +name = "icu_provider_macros" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ec89e9337638ecdc08744df490b221a7399bf8d164eb52a665454e60e075ad6" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.99", +] + +[[package]] +name = "ident_case" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39" + +[[package]] +name = "idna" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "686f825264d630750a544639377bae737628043f20d38bbc029e8f29ea968a7e" +dependencies 
= [ + "idna_adapter", + "smallvec", + "utf8_iter", +] + +[[package]] +name = "idna_adapter" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "daca1df1c957320b2cf139ac61e7bd64fed304c5040df000a745aa1de3b4ef71" +dependencies = [ + "icu_normalizer", + "icu_properties", +] + +[[package]] +name = "indexer" +version = "2.0.0" +dependencies = [ + "anyhow", + "async-channel", + "atrium-api", + "atrium-xrpc-client", + "chrono", + "clap", + "colog", + "colored 3.0.0", + "console-subscriber", + "fastwebsockets", + "futures", + "hyper", + "hyper-util", + "ipld-core", + "iroh-car", + "lazy_static", + "mimalloc", + "num_cpus", + "opentelemetry", + "opentelemetry-appender-tracing", + "opentelemetry-otlp", + "opentelemetry-resource-detectors", + "opentelemetry-semantic-conventions", + "opentelemetry-stdout", + "opentelemetry_sdk", + "pin-project-lite", + "pumps", + "regex", + "reqwest", + "rs-car-sync", + "serde", + "serde_bytes", + "serde_ipld_dagcbor", + "serde_with", + "simd-json", + "surrealdb", + "sys-info", + "sysinfo", + "tokio", + "tokio-rustls", + "tokio-util", + "tonic", + "tracing", + "tracing-opentelemetry", + "tracing-subscriber", +] + +[[package]] +name = "indexmap" +version = "1.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bd070e393353796e801d209ad339e89596eb4c8d430d18ede6a1cced8fafbd99" +dependencies = [ + "autocfg", + "hashbrown 0.12.3", + "serde", +] + +[[package]] +name = "indexmap" +version = "2.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8c9c992b02b5b4c94ea26e32fe5bccb7aa7d9f390ab5c1221ff895bc7ea8b652" +dependencies = [ + "equivalent", + "hashbrown 0.15.2", + "serde", +] + +[[package]] +name = "inout" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "879f10e63c20629ecabbb64a8010319738c66a5cd0c29b02d63d272b03751d01" +dependencies = [ + "generic-array", +] + +[[package]] +name = "ipld-core" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "104718b1cc124d92a6d01ca9c9258a7df311405debb3408c445a36452f9bf8db" +dependencies = [ + "cid 0.11.1", + "serde", + "serde_bytes", +] + +[[package]] +name = "ipnet" +version = "2.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "469fb0b9cefa57e3ef31275ee7cacb78f2fdca44e4765491884a2b119d4eb130" + +[[package]] +name = "iroh-car" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cb7f8cd4cb9aa083fba8b52e921764252d0b4dcb1cd6d120b809dbfe1106e81a" +dependencies = [ + "anyhow", + "cid 0.11.1", + "futures", + "serde", + "serde_ipld_dagcbor", + "thiserror 1.0.69", + "tokio", + "unsigned-varint 0.7.2", +] + +[[package]] +name = "is_terminal_polyfill" +version = "1.70.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf" + +[[package]] +name = "itertools" +version = "0.10.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b0fd2260e829bddf4cb6ea802289de2f86d6a7a690192fbe91b3f46e0f2c8473" +dependencies = [ + "either", +] + +[[package]] +name = "itertools" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b1c173a5686ce8bfa551b3563d0c2170bf24ca44da99c7ca4bfdab5418c3fe57" +dependencies = [ + "either", +] + +[[package]] +name = "itertools" +version = "0.12.1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569" +dependencies = [ + "either", +] + +[[package]] +name = "itertools" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "413ee7dfc52ee1a4949ceeb7dbc8a33f2d6c088194d9f922fb8318faf1f01186" +dependencies = [ + "either", +] + +[[package]] +name = "itertools" +version = "0.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b192c782037fadd9cfa75548310488aabdbf3d2da73885b31bd0abd03351285" +dependencies = [ + "either", +] + +[[package]] +name = "itoa" +version = "1.0.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c" + +[[package]] +name = "jobserver" +version = "0.1.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48d1dbcbbeb6a7fec7e059840aa538bd62aaccf972c7346c4d9d2059312853d0" +dependencies = [ + "libc", +] + +[[package]] +name = "js-sys" +version = "0.3.77" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1cfaf33c695fc6e08064efbc1f72ec937429614f25eef83af942d0e227c3a28f" +dependencies = [ + "once_cell", + "wasm-bindgen", +] + +[[package]] +name = "jsonwebtoken" +version = "9.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a87cc7a48537badeae96744432de36f4be2b4a34a05a5ef32e9dd8a1c169dde" +dependencies = [ + "base64 0.22.1", + "js-sys", + "pem", + "ring", + "serde", + "serde_json", + "simple_asn1", +] + +[[package]] +name = "lalrpop" +version = "0.20.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "55cb077ad656299f160924eb2912aa147d7339ea7d69e1b5517326fdcec3c1ca" +dependencies = [ + "ascii-canvas", + "bit-set", + "ena", + "itertools 0.11.0", + "lalrpop-util", + "petgraph", + "pico-args", + "regex", + "regex-syntax 0.8.5", + "string_cache", + "term", + "tiny-keccak", + "unicode-xid", + "walkdir", +] + +[[package]] +name = "lalrpop-util" +version = "0.20.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "507460a910eb7b32ee961886ff48539633b788a36b65692b95f225b844c82553" +dependencies = [ + "regex-automata 0.4.9", +] + +[[package]] +name = "langtag" +version = "0.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed60c85f254d6ae8450cec15eedd921efbc4d1bdf6fcf6202b9a58b403f6f805" +dependencies = [ + "serde", +] + +[[package]] +name = "lazy_static" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" + +[[package]] +name = "lazycell" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" + +[[package]] +name = "lexicmp" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7378d131ddf24063b32cbd7e91668d183140c4b3906270635a4d633d1068ea5d" +dependencies = [ + "any_ascii", +] + +[[package]] +name = "libc" +version = "0.2.170" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "875b3680cb2f8f71bdcf9a30f38d48282f5d3c95cbf9b3fa57269bb5d5c06828" + +[[package]] +name = "libipld" +version = "0.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"f1ccd6b8ffb3afee7081fcaec00e1b099fd1c7ccf35ba5729d88538fcc3b4599" +dependencies = [ + "fnv", + "libipld-cbor", + "libipld-core", + "libipld-macro", + "log", + "multihash 0.18.1", + "thiserror 1.0.69", +] + +[[package]] +name = "libipld-cbor" +version = "0.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77d98c9d1747aa5eef1cf099cd648c3fd2d235249f5fed07522aaebc348e423b" +dependencies = [ + "byteorder", + "libipld-core", + "thiserror 1.0.69", +] + +[[package]] +name = "libipld-core" +version = "0.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5acd707e8d8b092e967b2af978ed84709eaded82b75effe6cb6f6cc797ef8158" +dependencies = [ + "anyhow", + "cid 0.10.1", + "core2", + "multibase", + "multihash 0.18.1", + "thiserror 1.0.69", +] + +[[package]] +name = "libipld-macro" +version = "0.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "71171c54214f866ae6722f3027f81dff0931e600e5a61e6b1b6a49ca0b5ed4ae" +dependencies = [ + "libipld-core", +] + +[[package]] +name = "libloading" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc2f4eb4bc735547cfed7c0a4922cbd04a4655978c09b54f1f7b228750664c34" +dependencies = [ + "cfg-if", + "windows-targets", +] + +[[package]] +name = "libm" +version = "0.2.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8355be11b20d696c8f18f6cc018c4e372165b1fa8126cef092399c9951984ffa" + +[[package]] +name = "libmimalloc-sys" +version = "0.1.39" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "23aa6811d3bd4deb8a84dde645f943476d13b248d818edcf8ce0b2f37f036b44" +dependencies = [ + "cc", + "libc", +] + +[[package]] +name = "libredox" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c0ff37bd590ca25063e35af745c343cb7a0271906fb7b37e4813e8f79f00268d" +dependencies = [ + "bitflags", + "libc", +] + +[[package]] +name = "linfa-linalg" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "56e7562b41c8876d3367897067013bb2884cc78e6893f092ecd26b305176ac82" +dependencies = [ + "ndarray", + "num-traits", + "rand 0.8.5", + "thiserror 1.0.69", +] + +[[package]] +name = "linux-raw-sys" +version = "0.4.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d26c52dbd32dccf2d10cac7725f8eae5296885fb5703b261f7d0a0739ec807ab" + +[[package]] +name = "litemap" +version = "0.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "23fb14cb19457329c82206317a5663005a4d404783dc74f4252769b0d5f42856" + +[[package]] +name = "lock_api" +version = "0.4.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "07af8b9cdd281b7915f413fa73f29ebd5d55d0d3f0155584dade1ff18cea1b17" +dependencies = [ + "autocfg", + "scopeguard", +] + +[[package]] +name = "log" +version = "0.4.26" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "30bde2b3dc3671ae49d8e2e9f044c7c005836e7a023ee57cffa25ab82764bb9e" + +[[package]] +name = "loom" +version = "0.7.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "419e0dc8046cb947daa77eb95ae174acfbddb7673b4151f56d1eed8e93fbfaca" +dependencies = [ + "cfg-if", + "generator", + "scoped-tls", + "tracing", + "tracing-subscriber", +] + +[[package]] +name = "lru" +version = "0.12.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"234cf4f4a04dc1f57e24b96cc0cd600cf2af460d4161ac5ecdd0af8e1f3b2a38" +dependencies = [ + "hashbrown 0.15.2", +] + +[[package]] +name = "mac" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4" + +[[package]] +name = "maplit" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3e2e65a1a2e43cfcb47a895c4c8b10d1f4a61097f9f254f183aee60cad9c651d" + +[[package]] +name = "markup5ever" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "16ce3abbeba692c8b8441d036ef91aea6df8da2c6b6e21c7e14d3c18e526be45" +dependencies = [ + "log", + "phf", + "phf_codegen", + "string_cache", + "string_cache_codegen", + "tendril", +] + +[[package]] +name = "matchers" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8263075bb86c5a1b1427b5ae862e8889656f126e9f77c484496e8b47cf5c5558" +dependencies = [ + "regex-automata 0.1.10", +] + +[[package]] +name = "matchit" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0e7465ac9959cc2b1404e8e2367b43684a6d13790fe23056cc8c6c5a6b7bcb94" + +[[package]] +name = "matrixmultiply" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9380b911e3e96d10c1f415da0876389aaf1b56759054eeb0de7df940c456ba1a" +dependencies = [ + "autocfg", + "rawpointer", +] + +[[package]] +name = "md-5" +version = "0.10.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d89e7ee0cfbedfc4da3340218492196241d89eefb6dab27de5df917a6d2e78cf" +dependencies = [ + "cfg-if", + "digest", +] + +[[package]] +name = "memchr" +version = "2.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" + +[[package]] +name = "miette" +version = "5.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "59bb584eaeeab6bd0226ccf3509a69d7936d148cf3d036ad350abe35e8c6856e" +dependencies = [ + "miette-derive", + "once_cell", + "thiserror 1.0.69", + "unicode-width", +] + +[[package]] +name = "miette-derive" +version = "5.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49e7bc1560b95a3c4a25d03de42fe76ca718ab92d1a22a55b9b4cf67b3ae635c" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.99", +] + +[[package]] +name = "mimalloc" +version = "0.1.43" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68914350ae34959d83f732418d51e2427a794055d0b9529f48259ac07af65633" +dependencies = [ + "libmimalloc-sys", +] + +[[package]] +name = "mime" +version = "0.3.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a" + +[[package]] +name = "mime_guess" +version = "2.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f7c44f8e672c00fe5308fa235f821cb4198414e1c77935c1ab6948d3fd78550e" +dependencies = [ + "mime", + "unicase", +] + +[[package]] +name = "minimal-lexical" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" + +[[package]] +name = "miniz_oxide" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"8e3e04debbb59698c15bacbb6d93584a8c0ca9cc3213cb423d31f760d8843ce5" +dependencies = [ + "adler2", +] + +[[package]] +name = "mio" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2886843bf800fba2e3377cff24abf6379b4c4d5c6681eaf9ea5b0d15090450bd" +dependencies = [ + "libc", + "wasi 0.11.0+wasi-snapshot-preview1", + "windows-sys 0.52.0", +] + +[[package]] +name = "moka" +version = "0.12.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a9321642ca94a4282428e6ea4af8cc2ca4eac48ac7a6a4ea8f33f76d0ce70926" +dependencies = [ + "async-lock", + "crossbeam-channel", + "crossbeam-epoch", + "crossbeam-utils", + "event-listener", + "futures-util", + "loom", + "parking_lot", + "portable-atomic", + "rustc_version", + "smallvec", + "tagptr", + "thiserror 1.0.69", + "uuid", +] + +[[package]] +name = "multer" +version = "3.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "83e87776546dc87511aa5ee218730c92b666d7264ab6ed41f9d215af9cd5224b" +dependencies = [ + "bytes", + "encoding_rs", + "futures-util", + "http", + "httparse", + "memchr", + "mime", + "spin", + "version_check", +] + +[[package]] +name = "multibase" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b3539ec3c1f04ac9748a260728e855f261b4977f5c3406612c884564f329404" +dependencies = [ + "base-x", + "data-encoding", + "data-encoding-macro", +] + +[[package]] +name = "multihash" +version = "0.18.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cfd8a792c1694c6da4f68db0a9d707c72bd260994da179e6030a5dcee00bb815" +dependencies = [ + "core2", + "multihash-derive", + "unsigned-varint 0.7.2", +] + +[[package]] +name = "multihash" +version = "0.19.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6b430e7953c29dd6a09afc29ff0bb69c6e306329ee6794700aee27b76a1aea8d" +dependencies = [ + "core2", + "serde", + "unsigned-varint 0.8.0", +] + +[[package]] +name = "multihash-derive" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d6d4752e6230d8ef7adf7bd5d8c4b1f6561c1014c5ba9a37445ccefe18aa1db" +dependencies = [ + "proc-macro-crate 1.1.3", + "proc-macro-error", + "proc-macro2", + "quote", + "syn 1.0.109", + "synstructure 0.12.6", +] + +[[package]] +name = "nanoid" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3ffa00dec017b5b1a8b7cf5e2c008bfda1aa7e0697ac1508b491fdf2622fb4d8" +dependencies = [ + "rand 0.8.5", +] + +[[package]] +name = "ndarray" +version = "0.15.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "adb12d4e967ec485a5f71c6311fe28158e9d6f4bc4a447b474184d0f91a8fa32" +dependencies = [ + "approx 0.4.0", + "matrixmultiply", + "num-complex", + "num-integer", + "num-traits", + "rawpointer", +] + +[[package]] +name = "ndarray-stats" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "af5a8477ac96877b5bd1fd67e0c28736c12943aba24eda92b127e036b0c8f400" +dependencies = [ + "indexmap 1.9.3", + "itertools 0.10.5", + "ndarray", + "noisy_float", + "num-integer", + "num-traits", + "rand 0.8.5", +] + +[[package]] +name = "new_debug_unreachable" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "650eef8c711430f1a879fdd01d4745a7deea475becfb90269c06775983bbf086" + +[[package]] +name = "nibble_vec" +version = "0.1.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "77a5d83df9f36fe23f0c3648c6bbb8b0298bb5f1939c8f2704431371f4b84d43" +dependencies = [ + "smallvec", +] + +[[package]] +name = "noisy_float" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "978fe6e6ebc0bf53de533cd456ca2d9de13de13856eda1518a285d7705a213af" +dependencies = [ + "num-traits", +] + +[[package]] +name = "nom" +version = "7.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d273983c5a657a70a3e8f2a01329822f3b8c8172b73826411a55751e404a0a4a" +dependencies = [ + "memchr", + "minimal-lexical", +] + +[[package]] +name = "ntapi" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e8a3895c6391c39d7fe7ebc444a87eb2991b2a0bc718fdabd071eec617fc68e4" +dependencies = [ + "winapi", +] + +[[package]] +name = "nu-ansi-term" +version = "0.46.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77a8165726e8236064dbb45459242600304b42a5ea24ee2948e18e023bf7ba84" +dependencies = [ + "overload", + "winapi", +] + +[[package]] +name = "num-bigint" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a5e44f723f1133c9deac646763579fdb3ac745e418f2a7af9cd0c431da1f20b9" +dependencies = [ + "num-integer", + "num-traits", +] + +[[package]] +name = "num-complex" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "73f88a1307638156682bada9d7604135552957b7818057dcef22705b4d509495" +dependencies = [ + "num-traits", +] + +[[package]] +name = "num-conv" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "51d515d32fb182ee37cda2ccdcb92950d6a3c2893aa280e540671c2cd0f3b1d9" + +[[package]] +name = "num-integer" +version = "0.1.46" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7969661fd2958a5cb096e56c8e1ad0444ac2bbcd0061bd28660485a44879858f" +dependencies = [ + "num-traits", +] + +[[package]] +name = "num-traits" +version = "0.2.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" +dependencies = [ + "autocfg", + "libm", +] + +[[package]] +name = "num_cpus" +version = "1.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4161fcb6d602d4d2081af7c3a45852d875a03dd337a6bfdd6e06407b61342a43" +dependencies = [ + "hermit-abi", + "libc", +] + +[[package]] +name = "object" +version = "0.36.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "62948e14d923ea95ea2c7c86c71013138b66525b86bdc08d2dcc262bdb497b87" +dependencies = [ + "memchr", +] + +[[package]] +name = "object_store" +version = "0.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6da452820c715ce78221e8202ccc599b4a52f3e1eb3eedb487b680c81a8e3f3" +dependencies = [ + "async-trait", + "bytes", + "chrono", + "futures", + "humantime", + "itertools 0.13.0", + "parking_lot", + "percent-encoding", + "snafu", + "tokio", + "tracing", + "url", + "walkdir", +] + +[[package]] +name = "once_cell" +version = "1.20.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "945462a4b81e43c4e3ba96bd7b49d834c6f61198356aa858733bc4acf3cbe62e" + +[[package]] +name = "openssl-probe" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"d05e27ee213611ffe7d6348b942e8f942b37114c00cc03cec254295a4a17852e" + +[[package]] +name = "opentelemetry" +version = "0.28.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "236e667b670a5cdf90c258f5a55794ec5ac5027e960c224bff8367a59e1e6426" +dependencies = [ + "futures-core", + "futures-sink", + "js-sys", + "pin-project-lite", + "thiserror 2.0.12", + "tracing", +] + +[[package]] +name = "opentelemetry-appender-tracing" +version = "0.28.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c513c7af3bec30113f3d4620134ff923295f1e9c580fda2b8abe0831f925ddc0" +dependencies = [ + "opentelemetry", + "tracing", + "tracing-core", + "tracing-subscriber", +] + +[[package]] +name = "opentelemetry-http" +version = "0.28.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a8863faf2910030d139fb48715ad5ff2f35029fc5f244f6d5f689ddcf4d26253" +dependencies = [ + "async-trait", + "bytes", + "http", + "opentelemetry", + "reqwest", + "tracing", +] + +[[package]] +name = "opentelemetry-otlp" +version = "0.28.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5bef114c6d41bea83d6dc60eb41720eedd0261a67af57b66dd2b84ac46c01d91" +dependencies = [ + "async-trait", + "futures-core", + "http", + "opentelemetry", + "opentelemetry-http", + "opentelemetry-proto", + "opentelemetry_sdk", + "prost", + "reqwest", + "thiserror 2.0.12", + "tokio", + "tonic", + "tracing", +] + +[[package]] +name = "opentelemetry-proto" +version = "0.28.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "56f8870d3024727e99212eb3bb1762ec16e255e3e6f58eeb3dc8db1aa226746d" +dependencies = [ + "opentelemetry", + "opentelemetry_sdk", + "prost", + "tonic", +] + +[[package]] +name = "opentelemetry-resource-detectors" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b0cd3cf373f6f7f3a8f25a189acf1300c8b87e85f7959b45ba83c01e305f5cc3" +dependencies = [ + "opentelemetry", + "opentelemetry-semantic-conventions", + "opentelemetry_sdk", +] + +[[package]] +name = "opentelemetry-semantic-conventions" +version = "0.28.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2fb3a2f78c2d55362cd6c313b8abedfbc0142ab3c2676822068fd2ab7d51f9b7" + +[[package]] +name = "opentelemetry-stdout" +version = "0.28.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5eb0e5a5132e4b80bf037a78e3e12c8402535199f5de490d0c38f7eac71bc831" +dependencies = [ + "async-trait", + "chrono", + "futures-util", + "opentelemetry", + "opentelemetry_sdk", + "serde", + "thiserror 2.0.12", +] + +[[package]] +name = "opentelemetry_sdk" +version = "0.28.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "84dfad6042089c7fc1f6118b7040dc2eb4ab520abbf410b79dc481032af39570" +dependencies = [ + "async-trait", + "futures-channel", + "futures-executor", + "futures-util", + "glob", + "opentelemetry", + "percent-encoding", + "rand 0.8.5", + "serde_json", + "thiserror 2.0.12", + "tokio", + "tokio-stream", + "tracing", +] + +[[package]] +name = "overload" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39" + +[[package]] +name = "parking" +version = "2.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f38d5652c16fde515bb1ecef450ab0f6a219d619a7274976324d5e377f7dceba" + +[[package]] +name = 
"parking_lot" +version = "0.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1bf18183cf54e8d6059647fc3063646a1801cf30896933ec2311622cc4b9a27" +dependencies = [ + "lock_api", + "parking_lot_core", +] + +[[package]] +name = "parking_lot_core" +version = "0.9.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e401f977ab385c9e4e3ab30627d6f26d00e2c73eef317493c4ec6d468726cf8" +dependencies = [ + "cfg-if", + "libc", + "redox_syscall", + "smallvec", + "windows-targets", +] + +[[package]] +name = "password-hash" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "346f04948ba92c43e8469c1ee6736c7563d71012b17d40745260fe106aac2166" +dependencies = [ + "base64ct", + "rand_core 0.6.4", + "subtle", +] + +[[package]] +name = "paste" +version = "1.0.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a" + +[[package]] +name = "path-clean" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "17359afc20d7ab31fdb42bb844c8b3bb1dabd7dcf7e68428492da7f16966fcef" + +[[package]] +name = "pbkdf2" +version = "0.12.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8ed6a7761f76e3b9f92dfb0a60a6a6477c61024b775147ff0973a02653abaf2" +dependencies = [ + "digest", + "hmac", + "password-hash", + "sha2", +] + +[[package]] +name = "pem" +version = "3.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "38af38e8470ac9dee3ce1bae1af9c1671fffc44ddfd8bd1d0a3445bf349a8ef3" +dependencies = [ + "base64 0.22.1", + "serde", +] + +[[package]] +name = "percent-encoding" +version = "2.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e" + +[[package]] +name = "pest" +version = "2.7.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b7cafe60d6cf8e62e1b9b2ea516a089c008945bb5a275416789e7db0bc199dc" +dependencies = [ + "memchr", + "thiserror 2.0.12", + "ucd-trie", +] + +[[package]] +name = "petgraph" +version = "0.6.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b4c5cc86750666a3ed20bdaf5ca2a0344f9c67674cae0515bec2da16fbaa47db" +dependencies = [ + "fixedbitset", + "indexmap 2.7.1", +] + +[[package]] +name = "pharos" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e9567389417feee6ce15dd6527a8a1ecac205ef62c2932bcf3d9f6fc5b78b414" +dependencies = [ + "futures", + "rustc_version", +] + +[[package]] +name = "phf" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fd6780a80ae0c52cc120a26a1a42c1ae51b247a253e4e06113d23d2c2edd078" +dependencies = [ + "phf_macros", + "phf_shared", +] + +[[package]] +name = "phf_codegen" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aef8048c789fa5e851558d709946d6d79a8ff88c0440c587967f8e94bfb1216a" +dependencies = [ + "phf_generator", + "phf_shared", +] + +[[package]] +name = "phf_generator" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3c80231409c20246a13fddb31776fb942c38553c51e871f8cbd687a4cfb5843d" +dependencies = [ + "phf_shared", + "rand 0.8.5", +] + +[[package]] +name = "phf_macros" +version = "0.11.3" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "f84ac04429c13a7ff43785d75ad27569f2951ce0ffd30a3321230db2fc727216" +dependencies = [ + "phf_generator", + "phf_shared", + "proc-macro2", + "quote", + "syn 2.0.99", + "unicase", +] + +[[package]] +name = "phf_shared" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "67eabc2ef2a60eb7faa00097bd1ffdb5bd28e62bf39990626a582201b7a754e5" +dependencies = [ + "siphasher", + "unicase", +] + +[[package]] +name = "pico-args" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5be167a7af36ee22fe3115051bc51f6e6c7054c9348e28deb4f49bd6f705a315" + +[[package]] +name = "pin-project" +version = "1.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "677f1add503faace112b9f1373e43e9e054bfdd22ff1a63c1bc485eaec6a6a8a" +dependencies = [ + "pin-project-internal", +] + +[[package]] +name = "pin-project-internal" +version = "1.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e918e4ff8c4549eb882f14b3a4bc8c8bc93de829416eacf579f1207a8fbf861" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.99", +] + +[[package]] +name = "pin-project-lite" +version = "0.2.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3b3cff922bd51709b605d9ead9aa71031d81447142d828eb4a6eba76fe619f9b" + +[[package]] +name = "pin-utils" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" + +[[package]] +name = "portable-atomic" +version = "1.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "350e9b48cbc6b0e028b0473b114454c6316e57336ee184ceab6e53f72c178b3e" + +[[package]] +name = "powerfmt" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391" + +[[package]] +name = "ppv-lite86" +version = "0.2.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77957b295656769bb8ad2b6a6b09d897d94f05c41b069aede1fcdaa675eaea04" +dependencies = [ + "zerocopy 0.7.35", +] + +[[package]] +name = "precomputed-hash" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c" + +[[package]] +name = "prettyplease" +version = "0.2.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1ccf34da56fc294e7d4ccf69a85992b7dfb826b7cf57bac6a70bba3494cc08a" +dependencies = [ + "proc-macro2", + "syn 2.0.99", +] + +[[package]] +name = "proc-macro-crate" +version = "1.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e17d47ce914bf4de440332250b0edd23ce48c005f59fab39d3335866b114f11a" +dependencies = [ + "thiserror 1.0.69", + "toml", +] + +[[package]] +name = "proc-macro-crate" +version = "3.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "edce586971a4dfaa28950c6f18ed55e0406c1ab88bbce2c6f6293a7aaba73d35" +dependencies = [ + "toml_edit", +] + +[[package]] +name = "proc-macro-error" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "da25490ff9892aab3fcf7c36f08cfb902dd3e71ca0f9f9517bea02a73a5ce38c" +dependencies = [ + "proc-macro-error-attr", + "proc-macro2", + "quote", + "syn 1.0.109", + "version_check", 
+] + +[[package]] +name = "proc-macro-error-attr" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1be40180e52ecc98ad80b184934baf3d0d29f979574e439af5a55274b35f869" +dependencies = [ + "proc-macro2", + "quote", + "version_check", +] + +[[package]] +name = "proc-macro2" +version = "1.0.94" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a31971752e70b8b2686d7e46ec17fb38dad4051d94024c88df49b667caea9c84" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "prost" +version = "0.13.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2796faa41db3ec313a31f7624d9286acf277b52de526150b7e69f3debf891ee5" +dependencies = [ + "bytes", + "prost-derive", +] + +[[package]] +name = "prost-derive" +version = "0.13.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a56d757972c98b346a9b766e3f02746cde6dd1cd1d1d563472929fdd74bec4d" +dependencies = [ + "anyhow", + "itertools 0.14.0", + "proc-macro2", + "quote", + "syn 2.0.99", +] + +[[package]] +name = "prost-types" +version = "0.13.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52c2c1bf36ddb1a1c396b3601a3cec27c2462e45f07c386894ec3ccf5332bd16" +dependencies = [ + "prost", +] + +[[package]] +name = "psl-types" +version = "2.0.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "33cb294fe86a74cbcf50d4445b37da762029549ebeea341421c7c70370f86cac" + +[[package]] +name = "psm" +version = "0.1.25" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f58e5423e24c18cc840e1c98370b3993c6649cd1678b4d24318bcf0a083cbe88" +dependencies = [ + "cc", +] + +[[package]] +name = "ptr_meta" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0738ccf7ea06b608c10564b31debd4f5bc5e197fc8bfe088f68ae5ce81e7a4f1" +dependencies = [ + "ptr_meta_derive", +] + +[[package]] +name = "ptr_meta_derive" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "16b845dbfca988fa33db069c0e230574d15a3088f147a87b64c7589eb662c9ac" +dependencies = [ + "proc-macro2", + "quote", + "syn 1.0.109", +] + +[[package]] +name = "pumps" +version = "0.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f6a3b38c98c390b6b543e242b6765421d6339cee7a9a4ee2211622c56f516ed" +dependencies = [ + "futures", + "tokio", +] + +[[package]] +name = "quick_cache" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eb55a1aa7668676bb93926cd4e9cdfe60f03bb866553bcca9112554911b6d3dc" +dependencies = [ + "ahash 0.8.11", + "equivalent", + "hashbrown 0.14.5", + "parking_lot", +] + +[[package]] +name = "quinn" +version = "0.11.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "62e96808277ec6f97351a2380e6c25114bc9e67037775464979f3037c92d05ef" +dependencies = [ + "bytes", + "pin-project-lite", + "quinn-proto", + "quinn-udp", + "rustc-hash 2.1.1", + "rustls", + "socket2", + "thiserror 2.0.12", + "tokio", + "tracing", +] + +[[package]] +name = "quinn-proto" +version = "0.11.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a2fe5ef3495d7d2e377ff17b1a8ce2ee2ec2a18cde8b6ad6619d65d0701c135d" +dependencies = [ + "bytes", + "getrandom 0.2.15", + "rand 0.8.5", + "ring", + "rustc-hash 2.1.1", + "rustls", + "rustls-pki-types", + "slab", + "thiserror 2.0.12", + "tinyvec", + "tracing", + 
"web-time", +] + +[[package]] +name = "quinn-udp" +version = "0.5.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e46f3055866785f6b92bc6164b76be02ca8f2eb4b002c0354b28cf4c119e5944" +dependencies = [ + "cfg_aliases", + "libc", + "once_cell", + "socket2", + "tracing", + "windows-sys 0.59.0", +] + +[[package]] +name = "quote" +version = "1.0.39" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c1f1914ce909e1658d9907913b4b91947430c7d9be598b15a1912935b8c04801" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "radium" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc33ff2d4973d518d823d61aa239014831e521c75da58e3df4840d3f47749d09" + +[[package]] +name = "radix_trie" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c069c179fcdc6a2fe24d8d18305cf085fdbd4f922c041943e203685d6a1c58fd" +dependencies = [ + "endian-type", + "nibble_vec", + "serde", +] + +[[package]] +name = "rand" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" +dependencies = [ + "libc", + "rand_chacha 0.3.1", + "rand_core 0.6.4", +] + +[[package]] +name = "rand" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3779b94aeb87e8bd4e834cee3650289ee9e0d5677f976ecdb6d219e5f4f6cd94" +dependencies = [ + "rand_chacha 0.9.0", + "rand_core 0.9.3", + "zerocopy 0.8.23", +] + +[[package]] +name = "rand_chacha" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" +dependencies = [ + "ppv-lite86", + "rand_core 0.6.4", +] + +[[package]] +name = "rand_chacha" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb" +dependencies = [ + "ppv-lite86", + "rand_core 0.9.3", +] + +[[package]] +name = "rand_core" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" +dependencies = [ + "getrandom 0.2.15", +] + +[[package]] +name = "rand_core" +version = "0.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "99d9a13982dcf210057a8a78572b2217b667c3beacbf3a0d8b454f6f82837d38" +dependencies = [ + "getrandom 0.3.1", +] + +[[package]] +name = "rawpointer" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "60a357793950651c4ed0f3f52338f53b2f809f32d83a07f72909fa13e4c6c1e3" + +[[package]] +name = "rayon" +version = "1.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b418a60154510ca1a002a752ca9714984e21e4241e804d32555251faf8b78ffa" +dependencies = [ + "either", + "rayon-core", +] + +[[package]] +name = "rayon-core" +version = "1.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1465873a3dfdaa8ae7cb14b4383657caab0b3e8a0aa9ae8e04b044854c8dfce2" +dependencies = [ + "crossbeam-deque", + "crossbeam-utils", +] + +[[package]] +name = "reblessive" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ffead9d0a0b45f3e0bc063a244b1779fd53a09d2c2f7282c186a016b1f10a778" + +[[package]] +name = "redox_syscall" +version = "0.5.10" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b8c0c260b63a8219631167be35e6a988e9554dbd323f8bd08439c8ed1302bd1" +dependencies = [ + "bitflags", +] + +[[package]] +name = "redox_users" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba009ff324d1fc1b900bd1fdb31564febe58a8ccc8a6fdbb93b543d33b13ca43" +dependencies = [ + "getrandom 0.2.15", + "libredox", + "thiserror 1.0.69", +] + +[[package]] +name = "ref-cast" +version = "1.0.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4a0ae411dbe946a674d89546582cea4ba2bb8defac896622d6496f14c23ba5cf" +dependencies = [ + "ref-cast-impl", +] + +[[package]] +name = "ref-cast-impl" +version = "1.0.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1165225c21bff1f3bbce98f5a1f889949bc902d3575308cc7b0de30b4f6d27c7" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.99", +] + +[[package]] +name = "regex" +version = "1.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b544ef1b4eac5dc2db33ea63606ae9ffcfac26c1416a2806ae0bf5f56b201191" +dependencies = [ + "aho-corasick", + "memchr", + "regex-automata 0.4.9", + "regex-syntax 0.8.5", +] + +[[package]] +name = "regex-automata" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c230d73fb8d8c1b9c0b3135c5142a8acee3a0558fb8db5cf1cb65f8d7862132" +dependencies = [ + "regex-syntax 0.6.29", +] + +[[package]] +name = "regex-automata" +version = "0.4.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "809e8dc61f6de73b46c85f4c96486310fe304c434cfa43669d7b40f711150908" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax 0.8.5", +] + +[[package]] +name = "regex-syntax" +version = "0.6.29" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f162c6dd7b008981e4d40210aca20b4bd0f9b60ca9271061b07f78537722f2e1" + +[[package]] +name = "regex-syntax" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c" + +[[package]] +name = "rend" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "71fe3824f5629716b1589be05dacd749f6aa084c87e00e016714a8cdfccc997c" +dependencies = [ + "bytecheck", +] + +[[package]] +name = "reqwest" +version = "0.12.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "43e734407157c3c2034e0258f5e4473ddb361b1e85f95a66690d67264d7cd1da" +dependencies = [ + "async-compression", + "base64 0.22.1", + "bytes", + "encoding_rs", + "futures-channel", + "futures-core", + "futures-util", + "h2", + "http", + "http-body", + "http-body-util", + "hyper", + "hyper-rustls", + "hyper-util", + "ipnet", + "js-sys", + "log", + "mime", + "mime_guess", + "once_cell", + "percent-encoding", + "pin-project-lite", + "quinn", + "rustls", + "rustls-native-certs", + "rustls-pemfile", + "rustls-pki-types", + "serde", + "serde_json", + "serde_urlencoded", + "sync_wrapper", + "system-configuration", + "tokio", + "tokio-rustls", + "tokio-util", + "tower 0.5.2", + "tower-service", + "url", + "wasm-bindgen", + "wasm-bindgen-futures", + "wasm-streams", + "web-sys", + "webpki-roots", + "windows-registry", +] + +[[package]] +name = "revision" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"54b8ee532f15b2f0811eb1a50adf10d036e14a6cdae8d99893e7f3b921cb227d" +dependencies = [ + "chrono", + "geo", + "regex", + "revision-derive", + "roaring", + "rust_decimal", + "uuid", +] + +[[package]] +name = "revision-derive" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3415e1bc838c36f9a0a2ac60c0fa0851c72297685e66592c44870d82834dfa2" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.99", +] + +[[package]] +name = "ring" +version = "0.17.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "70ac5d832aa16abd7d1def883a8545280c20a60f523a370aa3a9617c2b8550ee" +dependencies = [ + "cc", + "cfg-if", + "getrandom 0.2.15", + "libc", + "untrusted", + "windows-sys 0.52.0", +] + +[[package]] +name = "rkyv" +version = "0.7.45" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9008cd6385b9e161d8229e1f6549dd23c3d022f132a2ea37ac3a10ac4935779b" +dependencies = [ + "bitvec", + "bytecheck", + "bytes", + "hashbrown 0.12.3", + "ptr_meta", + "rend", + "rkyv_derive", + "seahash", + "tinyvec", + "uuid", +] + +[[package]] +name = "rkyv_derive" +version = "0.7.45" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "503d1d27590a2b0a3a4ca4c94755aa2875657196ecbf401a42eff41d7de532c0" +dependencies = [ + "proc-macro2", + "quote", + "syn 1.0.109", +] + +[[package]] +name = "rmp" +version = "0.8.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "228ed7c16fa39782c3b3468e974aec2795e9089153cd08ee2e9aefb3613334c4" +dependencies = [ + "byteorder", + "num-traits", + "paste", +] + +[[package]] +name = "rmpv" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "58450723cd9ee93273ce44a20b6ec4efe17f8ed2e3631474387bfdecf18bb2a9" +dependencies = [ + "num-traits", + "rmp", +] + +[[package]] +name = "roaring" +version = "0.10.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a652edd001c53df0b3f96a36a8dc93fce6866988efc16808235653c6bcac8bf2" +dependencies = [ + "bytemuck", + "byteorder", + "serde", +] + +[[package]] +name = "robust" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cbf4a6aa5f6d6888f39e980649f3ad6b666acdce1d78e95b8a2cb076e687ae30" + +[[package]] +name = "rs-car-sync" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "11a9d56a8c5158018c0e7695013625a959e77b9fa62e5ca8a6a02506b4341862" +dependencies = [ + "blake2b_simd", + "libipld", + "sha2", +] + +[[package]] +name = "rstar" +version = "0.12.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "421400d13ccfd26dfa5858199c30a5d76f9c54e0dba7575273025b43c5175dbb" +dependencies = [ + "heapless", + "num-traits", + "smallvec", +] + +[[package]] +name = "rust-stemmers" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e46a2036019fdb888131db7a4c847a1063a7493f971ed94ea82c67eada63ca54" +dependencies = [ + "serde", + "serde_derive", +] + +[[package]] +name = "rust_decimal" +version = "1.36.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b082d80e3e3cc52b2ed634388d436fe1f4de6af5786cc2de9ba9737527bdf555" +dependencies = [ + "arrayvec", + "borsh", + "bytes", + "num-traits", + "rand 0.8.5", + "rkyv", + "serde", + "serde_json", +] + +[[package]] +name = "rustc-demangle" +version = "0.1.24" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "719b953e2095829ee67db738b3bfa9fa368c94900df327b3f07fe6e794d2fe1f" + +[[package]] +name = "rustc-hash" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" + +[[package]] +name = "rustc-hash" +version = "2.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "357703d41365b4b27c590e3ed91eabb1b663f07c4c084095e60cbed4362dff0d" + +[[package]] +name = "rustc_lexer" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c86aae0c77166108c01305ee1a36a1e77289d7dc6ca0a3cd91ff4992de2d16a5" +dependencies = [ + "unicode-xid", +] + +[[package]] +name = "rustc_version" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cfcb3a22ef46e85b45de6ee7e79d063319ebb6594faafcf1c225ea92ab6e9b92" +dependencies = [ + "semver", +] + +[[package]] +name = "rustix" +version = "0.38.44" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fdb5bc1ae2baa591800df16c9ca78619bf65c0488b41b96ccec5d11220d8c154" +dependencies = [ + "bitflags", + "errno", + "libc", + "linux-raw-sys", + "windows-sys 0.59.0", +] + +[[package]] +name = "rustls" +version = "0.23.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "47796c98c480fce5406ef69d1c76378375492c3b0a0de587be0c1d9feb12f395" +dependencies = [ + "aws-lc-rs", + "log", + "once_cell", + "ring", + "rustls-pki-types", + "rustls-webpki", + "subtle", + "zeroize", +] + +[[package]] +name = "rustls-native-certs" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7fcff2dd52b58a8d98a70243663a0d234c4e2b79235637849d15913394a247d3" +dependencies = [ + "openssl-probe", + "rustls-pki-types", + "schannel", + "security-framework", +] + +[[package]] +name = "rustls-pemfile" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dce314e5fee3f39953d46bb63bb8a46d40c2f8fb7cc5a3b6cab2bde9721d6e50" +dependencies = [ + "rustls-pki-types", +] + +[[package]] +name = "rustls-pki-types" +version = "1.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "917ce264624a4b4db1c364dcc35bfca9ded014d0a958cd47ad3e960e988ea51c" +dependencies = [ + "web-time", +] + +[[package]] +name = "rustls-webpki" +version = "0.102.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "64ca1bc8749bd4cf37b5ce386cc146580777b4e8572c7b97baf22c83f444bee9" +dependencies = [ + "aws-lc-rs", + "ring", + "rustls-pki-types", + "untrusted", +] + +[[package]] +name = "rustversion" +version = "1.0.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eded382c5f5f786b989652c49544c4877d9f015cc22e145a5ea8ea66c2921cd2" + +[[package]] +name = "ryu" +version = "1.0.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "28d3b2b1366ec20994f1fd18c3c594f05c5dd4bc44d8bb0c1c632c8d6829481f" + +[[package]] +name = "salsa20" +version = "0.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "97a22f5af31f73a954c10289c93e8a50cc23d971e80ee446f1f6f7137a088213" +dependencies = [ + "cipher", +] + +[[package]] +name = "same-file" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" 
+dependencies = [ + "winapi-util", +] + +[[package]] +name = "schannel" +version = "0.1.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1f29ebaa345f945cec9fbbc532eb307f0fdad8161f281b6369539c8d84876b3d" +dependencies = [ + "windows-sys 0.59.0", +] + +[[package]] +name = "scoped-tls" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e1cf6437eb19a8f4a6cc0f7dca544973b0b78843adbfeb3683d1a94a0024a294" + +[[package]] +name = "scopeguard" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" + +[[package]] +name = "scrypt" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0516a385866c09368f0b5bcd1caff3366aace790fcd46e2bb032697bb172fd1f" +dependencies = [ + "password-hash", + "pbkdf2", + "salsa20", + "sha2", +] + +[[package]] +name = "seahash" +version = "4.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1c107b6f4780854c8b126e228ea8869f4d7b71260f962fefb57b996b8959ba6b" + +[[package]] +name = "security-framework" +version = "3.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "271720403f46ca04f7ba6f55d438f8bd878d6b8ca0a1046e8228c4145bcbb316" +dependencies = [ + "bitflags", + "core-foundation 0.10.0", + "core-foundation-sys", + "libc", + "security-framework-sys", +] + +[[package]] +name = "security-framework-sys" +version = "2.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49db231d56a190491cb4aeda9527f1ad45345af50b0851622a7adb8c03b01c32" +dependencies = [ + "core-foundation-sys", + "libc", +] + +[[package]] +name = "semver" +version = "1.0.26" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "56e6fa9c48d24d85fb3de5ad847117517440f6beceb7798af16b4a87d616b8d0" +dependencies = [ + "serde", +] + +[[package]] +name = "send_wrapper" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cd0b0ec5f1c1ca621c432a25813d8d60c88abe6d3e08a3eb9cf37d97a0fe3d73" + +[[package]] +name = "serde" +version = "1.0.218" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e8dfc9d19bdbf6d17e22319da49161d5d0108e4188e8b680aef6299eed22df60" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde-content" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3753ca04f350fa92d00b6146a3555e63c55388c9ef2e11e09bce2ff1c0b509c6" +dependencies = [ + "serde", +] + +[[package]] +name = "serde_bytes" +version = "0.11.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "364fec0df39c49a083c9a8a18a23a6bcfd9af130fe9fe321d18520a0d113e09e" +dependencies = [ + "serde", +] + +[[package]] +name = "serde_derive" +version = "1.0.218" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f09503e191f4e797cb8aac08e9a4a4695c5edf6a2e70e376d961ddd5c969f82b" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.99", +] + +[[package]] +name = "serde_html_form" +version = "0.2.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9d2de91cf02bbc07cde38891769ccd5d4f073d22a40683aa4bc7a95781aaa2c4" +dependencies = [ + "form_urlencoded", + "indexmap 2.7.1", + "itoa", + "ryu", + "serde", +] + +[[package]] +name = "serde_ipld_dagcbor" +version = "0.6.2" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "6851dcd54a7271dd9013195fdccbdaba70c8e71014364e396d4b938d0e67f324" +dependencies = [ + "cbor4ii", + "ipld-core", + "scopeguard", + "serde", +] + +[[package]] +name = "serde_json" +version = "1.0.140" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "20068b6e96dc6c9bd23e01df8827e6c7e1f2fddd43c21810382803c136b99373" +dependencies = [ + "indexmap 2.7.1", + "itoa", + "memchr", + "ryu", + "serde", +] + +[[package]] +name = "serde_urlencoded" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3491c14715ca2294c4d6a88f15e84739788c1d030eed8c110436aafdaa2f3fd" +dependencies = [ + "form_urlencoded", + "itoa", + "ryu", + "serde", +] + +[[package]] +name = "serde_with" +version = "3.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d6b6f7f2fcb69f747921f79f3926bd1e203fce4fef62c268dd3abfb6d86029aa" +dependencies = [ + "base64 0.22.1", + "chrono", + "hex", + "indexmap 1.9.3", + "indexmap 2.7.1", + "serde", + "serde_derive", + "serde_json", + "serde_with_macros", + "time", +] + +[[package]] +name = "serde_with_macros" +version = "3.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8d00caa5193a3c8362ac2b73be6b9e768aa5a4b2f721d8f4b339600c3cb51f8e" +dependencies = [ + "darling", + "proc-macro2", + "quote", + "syn 2.0.99", +] + +[[package]] +name = "sha1" +version = "0.10.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3bf829a2d51ab4a5ddf1352d8470c140cadc8301b2ae1789db023f01cedd6ba" +dependencies = [ + "cfg-if", + "cpufeatures", + "digest", +] + +[[package]] +name = "sha2" +version = "0.10.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "793db75ad2bcafc3ffa7c68b215fee268f537982cd901d132f89c6343f3a3dc8" +dependencies = [ + "cfg-if", + "cpufeatures", + "digest", +] + +[[package]] +name = "sharded-slab" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f40ca3c46823713e0d4209592e8d6e826aa57e928f09752619fc696c499637f6" +dependencies = [ + "lazy_static", +] + +[[package]] +name = "shlex" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" + +[[package]] +name = "signal-hook-registry" +version = "1.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a9e9e0b4211b72e7b8b6e85c807d36c212bdb33ea8587f7569562a84df5465b1" +dependencies = [ + "libc", +] + +[[package]] +name = "simd-json" +version = "0.14.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aa2bcf6c6e164e81bc7a5d49fc6988b3d515d9e8c07457d7b74ffb9324b9cd40" +dependencies = [ + "getrandom 0.2.15", + "halfbrown", + "ref-cast", + "serde", + "serde_json", + "simdutf8", + "value-trait", +] + +[[package]] +name = "simdutf8" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3a9fe34e3e7a50316060351f37187a3f546bce95496156754b601a5fa71b76e" + +[[package]] +name = "simple_asn1" +version = "0.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "297f631f50729c8c99b84667867963997ec0b50f32b2a7dbcab828ef0541e8bb" +dependencies = [ + "num-bigint", + "num-traits", + "thiserror 2.0.12", + "time", +] + +[[package]] +name = "siphasher" +version = "1.0.1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "56199f7ddabf13fe5074ce809e7d3f42b42ae711800501b5b16ea82ad029c39d" + +[[package]] +name = "slab" +version = "0.4.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f92a496fb766b417c996b9c5e57daf2f7ad3b0bebe1ccfca4856390e3d3bb67" +dependencies = [ + "autocfg", +] + +[[package]] +name = "smallvec" +version = "1.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7fcf8323ef1faaee30a44a340193b1ac6814fd9b7b4e88e9d4519a3e4abe1cfd" + +[[package]] +name = "smol_str" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dd538fb6910ac1099850255cf94a94df6551fbdd602454387d0adb2d1ca6dead" +dependencies = [ + "serde", +] + +[[package]] +name = "snafu" +version = "0.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e4de37ad025c587a29e8f3f5605c00f70b98715ef90b9061a815b9e59e9042d6" +dependencies = [ + "doc-comment", + "snafu-derive", +] + +[[package]] +name = "snafu-derive" +version = "0.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "990079665f075b699031e9c08fd3ab99be5029b96f3b78dc0709e8f77e4efebf" +dependencies = [ + "heck 0.4.1", + "proc-macro2", + "quote", + "syn 1.0.109", +] + +[[package]] +name = "snap" +version = "1.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b6b67fb9a61334225b5b790716f609cd58395f895b3fe8b328786812a40bc3b" + +[[package]] +name = "socket2" +version = "0.5.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c970269d99b64e60ec3bd6ad27270092a5394c4e309314b18ae3fe575695fbe8" +dependencies = [ + "libc", + "windows-sys 0.52.0", +] + +[[package]] +name = "spade" +version = "2.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ece03ff43cd2a9b57ebf776ea5e78bd30b3b4185a619f041079f4109f385034" +dependencies = [ + "hashbrown 0.15.2", + "num-traits", + "robust", + "smallvec", +] + +[[package]] +name = "spin" +version = "0.9.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67" + +[[package]] +name = "stable_deref_trait" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" + +[[package]] +name = "stacker" +version = "0.1.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d9156ebd5870ef293bfb43f91c7a74528d363ec0d424afe24160ed5a4343d08a" +dependencies = [ + "cc", + "cfg-if", + "libc", + "psm", + "windows-sys 0.59.0", +] + +[[package]] +name = "static_assertions_next" +version = "1.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d7beae5182595e9a8b683fa98c4317f956c9a2dec3b9716990d20023cc60c766" + +[[package]] +name = "storekey" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "43c42833834a5d23b344f71d87114e0cc9994766a5c42938f4b50e7b2aef85b2" +dependencies = [ + "byteorder", + "memchr", + "serde", + "thiserror 1.0.69", +] + +[[package]] +name = "string_cache" +version = "0.8.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "938d512196766101d333398efde81bc1f37b00cb42c2f8350e5df639f040bbbe" +dependencies = [ + "new_debug_unreachable", + "parking_lot", + "phf_shared", + "precomputed-hash", + "serde", +] + 
+[[package]] +name = "string_cache_codegen" +version = "0.5.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c711928715f1fe0fe509c53b43e993a9a557babc2d0a3567d0a3006f1ac931a0" +dependencies = [ + "phf_generator", + "phf_shared", + "proc-macro2", + "quote", +] + +[[package]] +name = "strsim" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" + +[[package]] +name = "strum" +version = "0.26.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fec0f0aef304996cf250b31b5a10dee7980c85da9d759361292b8bca5a18f06" +dependencies = [ + "strum_macros", +] + +[[package]] +name = "strum_macros" +version = "0.26.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c6bee85a5a24955dc440386795aa378cd9cf82acd5f764469152d2270e581be" +dependencies = [ + "heck 0.5.0", + "proc-macro2", + "quote", + "rustversion", + "syn 2.0.99", +] + +[[package]] +name = "subtle" +version = "2.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" + +[[package]] +name = "surrealdb" +version = "2.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aa1aa87197bad9dd12c93350533a8da09bae064a411f445c97ca0e64faabc304" +dependencies = [ + "arrayvec", + "async-channel", + "bincode", + "chrono", + "dmp", + "futures", + "geo", + "indexmap 2.7.1", + "path-clean", + "pharos", + "reblessive", + "reqwest", + "revision", + "ring", + "rust_decimal", + "rustls", + "rustls-pki-types", + "semver", + "serde", + "serde-content", + "serde_json", + "surrealdb-core", + "thiserror 1.0.69", + "tokio", + "tokio-tungstenite", + "tokio-util", + "tracing", + "trice", + "url", + "uuid", + "wasm-bindgen-futures", + "wasmtimer", + "ws_stream_wasm", +] + +[[package]] +name = "surrealdb-core" +version = "2.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4ceb9f421e07af67b06c57ea7a8b08c3b4e4677de483435ca6d69bd00600a571" +dependencies = [ + "addr", + "ahash 0.8.11", + "ammonia", + "any_ascii", + "argon2", + "async-channel", + "async-executor", + "async-graphql", + "base64 0.21.7", + "bcrypt", + "bincode", + "blake3", + "bytes", + "castaway", + "cedar-policy", + "chrono", + "ciborium", + "dashmap 5.5.3", + "deunicode", + "dmp", + "fst", + "futures", + "fuzzy-matcher", + "geo", + "geo-types", + "hex", + "http", + "ipnet", + "jsonwebtoken", + "lexicmp", + "linfa-linalg", + "md-5", + "nanoid", + "ndarray", + "ndarray-stats", + "num-traits", + "num_cpus", + "object_store", + "parking_lot", + "pbkdf2", + "pharos", + "phf", + "pin-project-lite", + "quick_cache", + "radix_trie", + "rand 0.8.5", + "rayon", + "reblessive", + "regex", + "revision", + "ring", + "rmpv", + "roaring", + "rust-stemmers", + "rust_decimal", + "scrypt", + "semver", + "serde", + "serde-content", + "serde_json", + "sha1", + "sha2", + "snap", + "storekey", + "strsim", + "subtle", + "sysinfo", + "thiserror 1.0.69", + "tokio", + "tracing", + "trice", + "ulid", + "unicase", + "url", + "uuid", + "vart", + "wasm-bindgen-futures", + "wasmtimer", + "ws_stream_wasm", +] + +[[package]] +name = "syn" +version = "1.0.109" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = 
"syn" +version = "2.0.99" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e02e925281e18ffd9d640e234264753c43edc62d64b2d4cf898f1bc5e75f3fc2" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "sync_wrapper" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0bf256ce5efdfa370213c1dabab5935a12e49f2c58d15e9eac2870d3b4f27263" +dependencies = [ + "futures-core", +] + +[[package]] +name = "synstructure" +version = "0.12.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f36bdaa60a83aca3921b5259d5400cbf5e90fc51931376a9bd4a0eb79aa7210f" +dependencies = [ + "proc-macro2", + "quote", + "syn 1.0.109", + "unicode-xid", +] + +[[package]] +name = "synstructure" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c8af7666ab7b6390ab78131fb5b0fce11d6b7a6951602017c35fa82800708971" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.99", +] + +[[package]] +name = "sys-info" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b3a0d0aba8bf96a0e1ddfdc352fc53b3df7f39318c71854910c3c4b024ae52c" +dependencies = [ + "cc", + "libc", +] + +[[package]] +name = "sysinfo" +version = "0.33.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4fc858248ea01b66f19d8e8a6d55f41deaf91e9d495246fd01368d99935c6c01" +dependencies = [ + "core-foundation-sys", + "libc", + "memchr", + "ntapi", + "rayon", + "windows 0.57.0", +] + +[[package]] +name = "system-configuration" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3c879d448e9d986b661742763247d3693ed13609438cf3d006f51f5368a5ba6b" +dependencies = [ + "bitflags", + "core-foundation 0.9.4", + "system-configuration-sys", +] + +[[package]] +name = "system-configuration-sys" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e1d1b10ced5ca923a1fcb8d03e96b8d3268065d724548c0211415ff6ac6bac4" +dependencies = [ + "core-foundation-sys", + "libc", +] + +[[package]] +name = "tagptr" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7b2093cf4c8eb1e67749a6762251bc9cd836b6fc171623bd0a9d324d37af2417" + +[[package]] +name = "tap" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369" + +[[package]] +name = "tendril" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d24a120c5fc464a3458240ee02c299ebcb9d67b5249c8848b09d639dca8d7bb0" +dependencies = [ + "futf", + "mac", + "utf-8", +] + +[[package]] +name = "term" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c59df8ac95d96ff9bede18eb7300b0fda5e5d8d90960e76f8e14ae765eedbf1f" +dependencies = [ + "dirs-next", + "rustversion", + "winapi", +] + +[[package]] +name = "thiserror" +version = "1.0.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52" +dependencies = [ + "thiserror-impl 1.0.69", +] + +[[package]] +name = "thiserror" +version = "2.0.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "567b8a2dae586314f7be2a752ec7474332959c6460e02bde30d702a66d488708" +dependencies = [ + "thiserror-impl 2.0.12", +] + 
+[[package]] +name = "thiserror-impl" +version = "1.0.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.99", +] + +[[package]] +name = "thiserror-impl" +version = "2.0.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f7cf42b4507d8ea322120659672cf1b9dbb93f8f2d4ecfd6e51350ff5b17a1d" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.99", +] + +[[package]] +name = "thread_local" +version = "1.1.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b9ef9bad013ada3808854ceac7b46812a6465ba368859a37e2100283d2d719c" +dependencies = [ + "cfg-if", + "once_cell", +] + +[[package]] +name = "time" +version = "0.3.39" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dad298b01a40a23aac4580b67e3dbedb7cc8402f3592d7f49469de2ea4aecdd8" +dependencies = [ + "deranged", + "itoa", + "num-conv", + "powerfmt", + "serde", + "time-core", + "time-macros", +] + +[[package]] +name = "time-core" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "765c97a5b985b7c11d7bc27fa927dc4fe6af3a6dfb021d28deb60d3bf51e76ef" + +[[package]] +name = "time-macros" +version = "0.2.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e8093bc3e81c3bc5f7879de09619d06c9a5a5e45ca44dfeeb7225bae38005c5c" +dependencies = [ + "num-conv", + "time-core", +] + +[[package]] +name = "tiny-keccak" +version = "2.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2c9d3793400a45f954c52e73d068316d76b6f4e36977e3fcebb13a2721e80237" +dependencies = [ + "crunchy", +] + +[[package]] +name = "tinystr" +version = "0.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9117f5d4db391c1cf6927e7bea3db74b9a1c1add8f7eda9ffd5364f40f57b82f" +dependencies = [ + "displaydoc", + "zerovec", +] + +[[package]] +name = "tinyvec" +version = "1.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09b3661f17e86524eccd4371ab0429194e0d7c008abb45f7a7495b1719463c71" +dependencies = [ + "tinyvec_macros", +] + +[[package]] +name = "tinyvec_macros" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" + +[[package]] +name = "tokio" +version = "1.44.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9975ea0f48b5aa3972bf2d888c238182458437cc2a19374b81b25cdf1023fb3a" +dependencies = [ + "backtrace", + "bytes", + "libc", + "mio", + "parking_lot", + "pin-project-lite", + "signal-hook-registry", + "socket2", + "tokio-macros", + "tracing", + "windows-sys 0.52.0", +] + +[[package]] +name = "tokio-macros" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e06d43f1345a3bcd39f6a56dbb7dcab2ba47e68e8ac134855e7e2bdbaf8cab8" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.99", +] + +[[package]] +name = "tokio-rustls" +version = "0.26.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e727b36a1a0e8b74c376ac2211e40c2c8af09fb4013c60d910495810f008e9b" +dependencies = [ + "rustls", + "tokio", +] + +[[package]] +name = "tokio-stream" +version = "0.1.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"eca58d7bba4a75707817a2c44174253f9236b2d5fbd055602e9d5c07c139a047" +dependencies = [ + "futures-core", + "pin-project-lite", + "tokio", +] + +[[package]] +name = "tokio-tungstenite" +version = "0.23.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c6989540ced10490aaf14e6bad2e3d33728a2813310a0c71d1574304c49631cd" +dependencies = [ + "futures-util", + "log", + "rustls", + "rustls-pki-types", + "tokio", + "tokio-rustls", + "tungstenite", + "webpki-roots", +] + +[[package]] +name = "tokio-util" +version = "0.7.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d7fcaa8d55a2bdd6b83ace262b016eca0d79ee02818c5c1bcdf0305114081078" +dependencies = [ + "bytes", + "futures-core", + "futures-io", + "futures-sink", + "pin-project-lite", + "tokio", +] + +[[package]] +name = "toml" +version = "0.5.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f4f7f0dd8d50a853a531c426359045b1998f04219d88799810762cd4ad314234" +dependencies = [ + "serde", +] + +[[package]] +name = "toml_datetime" +version = "0.6.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0dd7358ecb8fc2f8d014bf86f6f638ce72ba252a2c3a2572f2a795f1d23efb41" + +[[package]] +name = "toml_edit" +version = "0.22.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "17b4795ff5edd201c7cd6dca065ae59972ce77d1b80fa0a84d94950ece7d1474" +dependencies = [ + "indexmap 2.7.1", + "toml_datetime", + "winnow", +] + +[[package]] +name = "tonic" +version = "0.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "877c5b330756d856ffcc4553ab34a5684481ade925ecc54bcd1bf02b1d0d4d52" +dependencies = [ + "async-stream", + "async-trait", + "axum", + "base64 0.22.1", + "bytes", + "h2", + "http", + "http-body", + "http-body-util", + "hyper", + "hyper-timeout", + "hyper-util", + "percent-encoding", + "pin-project", + "prost", + "socket2", + "tokio", + "tokio-stream", + "tower 0.4.13", + "tower-layer", + "tower-service", + "tracing", +] + +[[package]] +name = "tower" +version = "0.4.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8fa9be0de6cf49e536ce1851f987bd21a43b771b09473c3549a6c853db37c1c" +dependencies = [ + "futures-core", + "futures-util", + "indexmap 1.9.3", + "pin-project", + "pin-project-lite", + "rand 0.8.5", + "slab", + "tokio", + "tokio-util", + "tower-layer", + "tower-service", + "tracing", +] + +[[package]] +name = "tower" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d039ad9159c98b70ecfd540b2573b97f7f52c3e8d9f8ad57a24b916a536975f9" +dependencies = [ + "futures-core", + "futures-util", + "pin-project-lite", + "sync_wrapper", + "tokio", + "tower-layer", + "tower-service", +] + +[[package]] +name = "tower-layer" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "121c2a6cda46980bb0fcd1647ffaf6cd3fc79a013de288782836f6df9c48780e" + +[[package]] +name = "tower-service" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8df9b6e13f2d32c91b9bd719c00d1958837bc7dec474d94952798cc8e69eeec3" + +[[package]] +name = "tracing" +version = "0.1.41" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "784e0ac535deb450455cbfa28a6f0df145ea1bb7ae51b821cf5e7927fdcfbdd0" +dependencies = [ + "pin-project-lite", + "tracing-attributes", + "tracing-core", +] + +[[package]] +name = "tracing-attributes" 
+version = "0.1.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "395ae124c09f9e6918a2310af6038fba074bcf474ac352496d5910dd59a2226d" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.99", +] + +[[package]] +name = "tracing-core" +version = "0.1.33" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e672c95779cf947c5311f83787af4fa8fffd12fb27e4993211a84bdfd9610f9c" +dependencies = [ + "once_cell", + "valuable", +] + +[[package]] +name = "tracing-log" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee855f1f400bd0e5c02d150ae5de3840039a3f54b025156404e34c23c03f47c3" +dependencies = [ + "log", + "once_cell", + "tracing-core", +] + +[[package]] +name = "tracing-opentelemetry" +version = "0.29.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "721f2d2569dce9f3dfbbddee5906941e953bfcdf736a62da3377f5751650cc36" +dependencies = [ + "js-sys", + "once_cell", + "opentelemetry", + "opentelemetry_sdk", + "smallvec", + "tracing", + "tracing-core", + "tracing-log", + "tracing-subscriber", + "web-time", +] + +[[package]] +name = "tracing-subscriber" +version = "0.3.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e8189decb5ac0fa7bc8b96b7cb9b2701d60d48805aca84a238004d665fcc4008" +dependencies = [ + "matchers", + "nu-ansi-term", + "once_cell", + "regex", + "sharded-slab", + "smallvec", + "thread_local", + "tracing", + "tracing-core", + "tracing-log", +] + +[[package]] +name = "trait-variant" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "70977707304198400eb4835a78f6a9f928bf41bba420deb8fdb175cd965d77a7" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.99", +] + +[[package]] +name = "trice" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3aaab10ae9fac0b10f392752bf56f0fd20845f39037fec931e8537b105b515a" +dependencies = [ + "js-sys", + "wasm-bindgen", + "web-sys", +] + +[[package]] +name = "try-lock" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b" + +[[package]] +name = "tungstenite" +version = "0.23.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e2e2ce1e47ed2994fd43b04c8f618008d4cabdd5ee34027cf14f9d918edd9c8" +dependencies = [ + "byteorder", + "bytes", + "data-encoding", + "http", + "httparse", + "log", + "rand 0.8.5", + "rustls", + "rustls-pki-types", + "sha1", + "thiserror 1.0.69", + "url", + "utf-8", +] + +[[package]] +name = "typenum" +version = "1.18.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1dccffe3ce07af9386bfd29e80c0ab1a8205a2fc34e4bcd40364df902cfa8f3f" + +[[package]] +name = "ucd-trie" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2896d95c02a80c6d6a5d6e953d479f5ddf2dfdb6a244441010e373ac0fb88971" + +[[package]] +name = "ulid" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ab82fc73182c29b02e2926a6df32f2241dbadb5cfc111fd595515b3598f46bb3" +dependencies = [ + "rand 0.9.0", + "serde", + "web-time", +] + +[[package]] +name = "unicase" +version = "2.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "75b844d17643ee918803943289730bec8aac480150456169e647ed0b576ba539" + +[[package]] +name = "unicode-ident" 
+version = "1.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a5f39404a5da50712a4c1eecf25e90dd62b613502b7e925fd4e4d19b5c96512" + +[[package]] +name = "unicode-normalization" +version = "0.1.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5033c97c4262335cded6d6fc3e5c18ab755e1a3dc96376350f3d8e9f009ad956" +dependencies = [ + "tinyvec", +] + +[[package]] +name = "unicode-script" +version = "0.5.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9fb421b350c9aff471779e262955939f565ec18b86c15364e6bdf0d662ca7c1f" + +[[package]] +name = "unicode-security" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2e4ddba1535dd35ed8b61c52166b7155d7f4e4b8847cec6f48e71dc66d8b5e50" +dependencies = [ + "unicode-normalization", + "unicode-script", +] + +[[package]] +name = "unicode-width" +version = "0.1.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7dd6e30e90baa6f72411720665d41d89b9a3d039dc45b8faea1ddd07f617f6af" + +[[package]] +name = "unicode-xid" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ebc1c04c71510c7f702b52b7c350734c9ff1295c464a03335b00bb84fc54f853" + +[[package]] +name = "unsigned-varint" +version = "0.7.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6889a77d49f1f013504cec6bf97a2c730394adedaeb1deb5ea08949a50541105" + +[[package]] +name = "unsigned-varint" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eb066959b24b5196ae73cb057f45598450d2c5f71460e98c49b738086eff9c06" + +[[package]] +name = "untrusted" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1" + +[[package]] +name = "url" +version = "2.5.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32f8b686cadd1473f4bd0117a5d28d36b1ade384ea9b5069a1c40aefed7fda60" +dependencies = [ + "form_urlencoded", + "idna", + "percent-encoding", +] + +[[package]] +name = "urlencoding" +version = "2.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "daf8dba3b7eb870caf1ddeed7bc9d2a049f3cfdfae7cb521b087cc33ae4c49da" + +[[package]] +name = "utf-8" +version = "0.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9" + +[[package]] +name = "utf16_iter" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c8232dd3cdaed5356e0f716d285e4b40b932ac434100fe9b7e0e8e935b9e6246" + +[[package]] +name = "utf8_iter" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be" + +[[package]] +name = "utf8parse" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" + +[[package]] +name = "uuid" +version = "1.15.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e0f540e3240398cce6128b64ba83fdbdd86129c16a3aa1a3a252efd66eb3d587" +dependencies = [ + "getrandom 0.3.1", + "js-sys", + "serde", + "wasm-bindgen", +] + +[[package]] +name = "valuable" +version = "0.1.1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba73ea9cf16a25df0c8caa16c51acb937d5712a8429db78a3ee29d5dcacd3a65" + +[[package]] +name = "value-trait" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9170e001f458781e92711d2ad666110f153e4e50bfd5cbd02db6547625714187" +dependencies = [ + "float-cmp", + "halfbrown", + "itoa", + "ryu", +] + +[[package]] +name = "vart" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87782b74f898179396e93c0efabb38de0d58d50bbd47eae00c71b3a1144dbbae" + +[[package]] +name = "version_check" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" + +[[package]] +name = "walkdir" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "29790946404f91d9c5d06f9874efddea1dc06c5efe94541a7d6863108e3a5e4b" +dependencies = [ + "same-file", + "winapi-util", +] + +[[package]] +name = "want" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bfa7760aed19e106de2c7c0b581b509f2f25d3dacaf737cb82ac61bc6d760b0e" +dependencies = [ + "try-lock", +] + +[[package]] +name = "wasi" +version = "0.11.0+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" + +[[package]] +name = "wasi" +version = "0.13.3+wasi-0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "26816d2e1a4a36a2940b96c5296ce403917633dff8f3440e9b236ed6f6bacad2" +dependencies = [ + "wit-bindgen-rt", +] + +[[package]] +name = "wasm-bindgen" +version = "0.2.100" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1edc8929d7499fc4e8f0be2262a241556cfc54a0bea223790e71446f2aab1ef5" +dependencies = [ + "cfg-if", + "once_cell", + "rustversion", + "wasm-bindgen-macro", +] + +[[package]] +name = "wasm-bindgen-backend" +version = "0.2.100" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2f0a0651a5c2bc21487bde11ee802ccaf4c51935d0d3d42a6101f98161700bc6" +dependencies = [ + "bumpalo", + "log", + "proc-macro2", + "quote", + "syn 2.0.99", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-futures" +version = "0.4.50" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "555d470ec0bc3bb57890405e5d4322cc9ea83cebb085523ced7be4144dac1e61" +dependencies = [ + "cfg-if", + "js-sys", + "once_cell", + "wasm-bindgen", + "web-sys", +] + +[[package]] +name = "wasm-bindgen-macro" +version = "0.2.100" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7fe63fc6d09ed3792bd0897b314f53de8e16568c2b3f7982f468c0bf9bd0b407" +dependencies = [ + "quote", + "wasm-bindgen-macro-support", +] + +[[package]] +name = "wasm-bindgen-macro-support" +version = "0.2.100" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ae87ea40c9f689fc23f209965b6fb8a99ad69aeeb0231408be24920604395de" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.99", + "wasm-bindgen-backend", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-shared" +version = "0.2.100" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a05d73b933a847d6cccdda8f838a22ff101ad9bf93e33684f39c1f5f0eece3d" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = 
"wasm-streams" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "15053d8d85c7eccdbefef60f06769760a563c7f0a9d6902a13d35c7800b0ad65" +dependencies = [ + "futures-util", + "js-sys", + "wasm-bindgen", + "wasm-bindgen-futures", + "web-sys", +] + +[[package]] +name = "wasmtimer" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c7ed9d8b15c7fb594d72bfb4b5a276f3d2029333cd93a932f376f5937f6f80ee" +dependencies = [ + "futures", + "js-sys", + "parking_lot", + "pin-utils", + "wasm-bindgen", +] + +[[package]] +name = "web-sys" +version = "0.3.77" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "33b6dd2ef9186f1f2072e409e99cd22a975331a6b3591b12c764e0e55c60d5d2" +dependencies = [ + "js-sys", + "wasm-bindgen", +] + +[[package]] +name = "web-time" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a6580f308b1fad9207618087a65c04e7a10bc77e02c8e84e9b00dd4b12fa0bb" +dependencies = [ + "js-sys", + "wasm-bindgen", +] + +[[package]] +name = "webpki-roots" +version = "0.26.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2210b291f7ea53617fbafcc4939f10914214ec15aace5ba62293a668f322c5c9" +dependencies = [ + "rustls-pki-types", +] + +[[package]] +name = "which" +version = "4.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87ba24419a2078cd2b0f2ede2691b6c66d8e47836da3b6db8265ebad47afbfc7" +dependencies = [ + "either", + "home", + "once_cell", + "rustix", +] + +[[package]] +name = "winapi" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" +dependencies = [ + "winapi-i686-pc-windows-gnu", + "winapi-x86_64-pc-windows-gnu", +] + +[[package]] +name = "winapi-i686-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" + +[[package]] +name = "winapi-util" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cf221c93e13a30d793f7645a0e7762c55d169dbb0a49671918a2319d289b10bb" +dependencies = [ + "windows-sys 0.59.0", +] + +[[package]] +name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" + +[[package]] +name = "windows" +version = "0.57.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "12342cb4d8e3b046f3d80effd474a7a02447231330ef77d71daa6fbc40681143" +dependencies = [ + "windows-core 0.57.0", + "windows-targets", +] + +[[package]] +name = "windows" +version = "0.58.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dd04d41d93c4992d421894c18c8b43496aa748dd4c081bac0dc93eb0489272b6" +dependencies = [ + "windows-core 0.58.0", + "windows-targets", +] + +[[package]] +name = "windows-core" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "33ab640c8d7e35bf8ba19b884ba838ceb4fba93a4e8c65a9059d08afcfc683d9" +dependencies = [ + "windows-targets", +] + +[[package]] +name = "windows-core" +version = "0.57.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d2ed2439a290666cd67ecce2b0ffaad89c2a56b976b736e6ece670297897832d" +dependencies = [ + 
"windows-implement 0.57.0", + "windows-interface 0.57.0", + "windows-result 0.1.2", + "windows-targets", +] + +[[package]] +name = "windows-core" +version = "0.58.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ba6d44ec8c2591c134257ce647b7ea6b20335bf6379a27dac5f1641fcf59f99" +dependencies = [ + "windows-implement 0.58.0", + "windows-interface 0.58.0", + "windows-result 0.2.0", + "windows-strings", + "windows-targets", +] + +[[package]] +name = "windows-implement" +version = "0.57.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9107ddc059d5b6fbfbffdfa7a7fe3e22a226def0b2608f72e9d552763d3e1ad7" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.99", +] + +[[package]] +name = "windows-implement" +version = "0.58.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2bbd5b46c938e506ecbce286b6628a02171d56153ba733b6c741fc627ec9579b" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.99", +] + +[[package]] +name = "windows-interface" +version = "0.57.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "29bee4b38ea3cde66011baa44dba677c432a78593e202392d1e9070cf2a7fca7" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.99", +] + +[[package]] +name = "windows-interface" +version = "0.58.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "053c4c462dc91d3b1504c6fe5a726dd15e216ba718e84a0e46a88fbe5ded3515" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.99", +] + +[[package]] +name = "windows-link" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6dccfd733ce2b1753b03b6d3c65edf020262ea35e20ccdf3e288043e6dd620e3" + +[[package]] +name = "windows-registry" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e400001bb720a623c1c69032f8e3e4cf09984deec740f007dd2b03ec864804b0" +dependencies = [ + "windows-result 0.2.0", + "windows-strings", + "windows-targets", +] + +[[package]] +name = "windows-result" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e383302e8ec8515204254685643de10811af0ed97ea37210dc26fb0032647f8" +dependencies = [ + "windows-targets", +] + +[[package]] +name = "windows-result" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d1043d8214f791817bab27572aaa8af63732e11bf84aa21a45a78d6c317ae0e" +dependencies = [ + "windows-targets", +] + +[[package]] +name = "windows-strings" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4cd9b125c486025df0eabcb585e62173c6c9eddcec5d117d3b6e8c30e2ee4d10" +dependencies = [ + "windows-result 0.2.0", + "windows-targets", +] + +[[package]] +name = "windows-sys" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" +dependencies = [ + "windows-targets", +] + +[[package]] +name = "windows-sys" +version = "0.59.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" +dependencies = [ + "windows-targets", +] + +[[package]] +name = "windows-targets" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" +dependencies = [ + "windows_aarch64_gnullvm", 
+ "windows_aarch64_msvc", + "windows_i686_gnu", + "windows_i686_gnullvm", + "windows_i686_msvc", + "windows_x86_64_gnu", + "windows_x86_64_gnullvm", + "windows_x86_64_msvc", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" + +[[package]] +name = "windows_i686_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" + +[[package]] +name = "windows_i686_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" + +[[package]] +name = "windows_i686_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" + +[[package]] +name = "winnow" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0e7f4ea97f6f78012141bcdb6a216b2609f0979ada50b20ca5b52dde2eac2bb1" +dependencies = [ + "memchr", +] + +[[package]] +name = "wit-bindgen-rt" +version = "0.33.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3268f3d866458b787f390cf61f4bbb563b922d091359f9608842999eaee3943c" +dependencies = [ + "bitflags", +] + +[[package]] +name = "write16" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d1890f4022759daae28ed4fe62859b1236caebfc61ede2f63ed4e695f3f6d936" + +[[package]] +name = "writeable" +version = "0.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e9df38ee2d2c3c5948ea468a8406ff0db0b29ae1ffde1bcf20ef305bcc95c51" + +[[package]] +name = "ws_stream_wasm" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7999f5f4217fe3818726b66257a4475f71e74ffd190776ad053fa159e50737f5" +dependencies = [ + "async_io_stream", + "futures", + "js-sys", + "log", + "pharos", + "rustc_version", + "send_wrapper", + "thiserror 1.0.69", + "wasm-bindgen", + "wasm-bindgen-futures", + "web-sys", +] + +[[package]] +name = "wyz" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "05f360fc0b24296329c78fda852a1e9ae82de9cf7b27dae4b7f62f118f77b9ed" +dependencies = [ + "tap", +] + +[[package]] +name = "yoke" +version = "0.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "120e6aef9aa629e3d4f52dc8cc43a015c7724194c97dfaf45180d2daf2b77f40" 
+dependencies = [ + "serde", + "stable_deref_trait", + "yoke-derive", + "zerofrom", +] + +[[package]] +name = "yoke-derive" +version = "0.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2380878cad4ac9aac1e2435f3eb4020e8374b5f13c296cb75b4620ff8e229154" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.99", + "synstructure 0.13.1", +] + +[[package]] +name = "zerocopy" +version = "0.7.35" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b9b4fd18abc82b8136838da5d50bae7bdea537c574d8dc1a34ed098d6c166f0" +dependencies = [ + "byteorder", + "zerocopy-derive 0.7.35", +] + +[[package]] +name = "zerocopy" +version = "0.8.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fd97444d05a4328b90e75e503a34bad781f14e28a823ad3557f0750df1ebcbc6" +dependencies = [ + "zerocopy-derive 0.8.23", +] + +[[package]] +name = "zerocopy-derive" +version = "0.7.35" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.99", +] + +[[package]] +name = "zerocopy-derive" +version = "0.8.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6352c01d0edd5db859a63e2605f4ea3183ddbd15e2c4a9e7d32184df75e4f154" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.99", +] + +[[package]] +name = "zerofrom" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "50cc42e0333e05660c3587f3bf9d0478688e15d870fab3346451ce7f8c9fbea5" +dependencies = [ + "zerofrom-derive", +] + +[[package]] +name = "zerofrom-derive" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d71e5d6e06ab090c67b5e44993ec16b72dcbaabc526db883a360057678b48502" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.99", + "synstructure 0.13.1", +] + +[[package]] +name = "zeroize" +version = "1.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ced3678a2879b30306d323f4542626697a464a97c0a07c9aebf7ebca65cd4dde" + +[[package]] +name = "zerovec" +version = "0.10.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aa2b893d79df23bfb12d5461018d408ea19dfafe76c2c7ef6d4eba614f8ff079" +dependencies = [ + "yoke", + "zerofrom", + "zerovec-derive", +] + +[[package]] +name = "zerovec-derive" +version = "0.10.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6eafa6dfb17584ea3e2bd6e76e0cc15ad7af12b09abdd1ca55961bed9b1063c6" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.99", +] From 21c3ef82cd80983b5675a273be5786935d57d3cd Mon Sep 17 00:00:00 2001 From: Zebreus Date: Sun, 9 Mar 2025 13:56:38 +0100 Subject: [PATCH 67/75] Include .vscode --- .gitignore | 4 ---- 1 file changed, 4 deletions(-) diff --git a/.gitignore b/.gitignore index 6cdda3d..ea8c4bf 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1 @@ -# cargo files /target - -# ide files -.vscode From 24abf6665484d0a598dd18ea7f1dc7fc12f78bee Mon Sep 17 00:00:00 2001 From: Zebreus Date: Sun, 9 Mar 2025 14:10:26 +0100 Subject: [PATCH 68/75] Improve release profile for maximum performance --- Cargo.toml | 12 +++++++++++- README.md | 10 +++++----- 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index d14b47b..d134562 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -82,8 +82,18 @@ pin-project-lite = "0.2.16" pumps = "0.0.4" 
 [profile.release]
+# Enable lto for best performance
+lto = "fat"
+# One codegen unit for better intra-crate optimization
+codegen-units = 1
+# Include some debug info for better crash reports
+strip = false
+debug = "line-tables-only"
+
+# Profile with lto for testing performance in development
+[profile.dev-lto]
+inherits = "release"
 lto = "thin"
 strip = false
 debug = "full"
-opt-level = 3
 incremental = true
diff --git a/README.md b/README.md
index 54d647a..ce8eec8 100644
--- a/README.md
+++ b/README.md
@@ -17,18 +17,18 @@ You may need to increase the ulimit for the number of open files. You can do thi
 
 ## Debugging and profiling
 
+For benchmarking during development us the `dev-lto` profile. It should provide a reasonable compromise between build-time and runtime performance. To run the indexer with the `dev-lto` profile run `cargo run --profile dev-lto`.
+
 ### tokio
 
-You can use tokio-console to get more insights into what the tokio tasks are currently doing. Just run `tokio-console` while the indexer is running.
+You can use tokio-console to get more insights into what the tokio tasks are currently doing. To enable it, just run `tokio-console` while the indexer is running.
 
 ### opentelemetry
 
-The application uses opentelemetry for metrics, traces, and logs. It exports signal via the OTLP grpc protocol. You can configure the exporter with the usual opentelemetry environment variables.
-
-The spin up a docker container with a collector and grafana use
+The application uses opentelemetry for metrics, traces, and logs. It exports signals via the OTLP gRPC protocol. You can configure the exporter with the usual opentelemetry environment variables. To spin up a docker container with a collector and grafana, use:
 ```
 docker run -p 3000:3000 -p 4317:4317 --rm -ti grafana/otel-lgtm
 ```
-and then visit `localhost:3000` and login as `admin` with password `admin`.
+and then visit `localhost:3000`. To disable opentelemetry, use the `--no-otel-logs` and `--no-otel-metrics` flags.
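+
+As a minimal sketch (the endpoint below points at the collector container started above; adjust it to your setup), you can redirect the exporter through the standard environment variable before starting the indexer:
+
+```
+# OTEL_EXPORTER_OTLP_ENDPOINT is the standard OTLP exporter variable
+export OTEL_EXPORTER_OTLP_ENDPOINT="http://localhost:4317"
+cargo run --profile dev-lto
+```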
From 4e2f4f48e559402b5a8549c37727e5336fcdac1a Mon Sep 17 00:00:00 2001
From: Zebreus
Date: Sun, 9 Mar 2025 14:34:40 +0100
Subject: [PATCH 69/75] Fix warnings

---
 src/database/big_update.rs | 235 ++++++++++++++++++-------------------
 1 file changed, 117 insertions(+), 118 deletions(-)

diff --git a/src/database/big_update.rs b/src/database/big_update.rs
index 3d802c1..579bff0 100644
--- a/src/database/big_update.rs
+++ b/src/database/big_update.rs
@@ -23,15 +23,12 @@ use opentelemetry::metrics::{Counter, Gauge, Histogram};
 use opentelemetry::{global, KeyValue};
 use serde::{de::IgnoredAny, Serialize};
 use serde_with::skip_serializing_none;
+use std::sync::{atomic::Ordering, LazyLock};
 use std::time::Instant;
-use std::{
-    error,
-    sync::{atomic::Ordering, LazyLock},
-};
 use std::{future::IntoFuture, sync::atomic::AtomicU32};
 use surrealdb::Datetime;
 use surrealdb::{engine::any::Any, RecordId, Surreal};
-use tokio::sync::{Semaphore, SemaphorePermit};
+use tokio::sync::Semaphore;
 use tracing::{debug, instrument, span, trace, warn, Instrument, Level};
 
 #[derive(Debug, Serialize, Clone)]
@@ -483,116 +480,118 @@ impl BigUpdate {
         });
     }
 
-    /// Acquire individual locks for each table
-    async fn acquire_locks(&self) -> Vec<SemaphorePermit<'static>> {
-        static PERMITS: LazyLock<usize> = LazyLock::new(|| 1);
-        static DID_SEMAPHORE: LazyLock<Semaphore> = LazyLock::new(|| Semaphore::new(*PERMITS));
-        static FOLLOWS_SEMAPHORE: LazyLock<Semaphore> = LazyLock::new(|| Semaphore::new(*PERMITS));
-        static LATEST_BACKFILLS_SEMAPHORE: LazyLock<Semaphore> =
-            LazyLock::new(|| Semaphore::new(*PERMITS));
-        static LIKES_SEMAPHORE: LazyLock<Semaphore> = LazyLock::new(|| Semaphore::new(*PERMITS));
-        static REPOSTS_SEMAPHORE: LazyLock<Semaphore> = LazyLock::new(|| Semaphore::new(*PERMITS));
-        static BLOCKS_SEMAPHORE: LazyLock<Semaphore> = LazyLock::new(|| Semaphore::new(*PERMITS));
-        static LISTBLOCKS_SEMAPHORE: LazyLock<Semaphore> =
-            LazyLock::new(|| Semaphore::new(*PERMITS));
-        static LISTITEMS_SEMAPHORE: LazyLock<Semaphore> =
-            LazyLock::new(|| Semaphore::new(*PERMITS));
-        static FEEDS_SEMAPHORE: LazyLock<Semaphore> = LazyLock::new(|| Semaphore::new(*PERMITS));
-        static LISTS_SEMAPHORE: LazyLock<Semaphore> = LazyLock::new(|| Semaphore::new(*PERMITS));
-        static THREADGATES_SEMAPHORE: LazyLock<Semaphore> =
-            LazyLock::new(|| Semaphore::new(*PERMITS));
-        static STARTERPACKS_SEMAPHORE: LazyLock<Semaphore> =
-            LazyLock::new(|| Semaphore::new(*PERMITS));
-        static POSTGATES_SEMAPHORE: LazyLock<Semaphore> =
-            LazyLock::new(|| Semaphore::new(*PERMITS));
-        static ACTORDECLARATIONS_SEMAPHORE: LazyLock<Semaphore> =
-            LazyLock::new(|| Semaphore::new(*PERMITS));
-        static LABELERSERVICES_SEMAPHORE: LazyLock<Semaphore> =
-            LazyLock::new(|| Semaphore::new(*PERMITS));
-        static QUOTES_SEMAPHORE: LazyLock<Semaphore> = LazyLock::new(|| Semaphore::new(*PERMITS));
-        static POSTS_SEMAPHORE: LazyLock<Semaphore> = LazyLock::new(|| Semaphore::new(*PERMITS));
-        static REPLIES_RELATIONS_SEMAPHORE: LazyLock<Semaphore> =
-            LazyLock::new(|| Semaphore::new(*PERMITS));
-        static REPLY_TO_RELATIONS_SEMAPHORE: LazyLock<Semaphore> =
-            LazyLock::new(|| Semaphore::new(*PERMITS));
-        static POSTS_RELATIONS_SEMAPHORE: LazyLock<Semaphore> =
-            LazyLock::new(|| Semaphore::new(*PERMITS));
-        static OVERWRITE_LATEST_BACKFILLS_SEMAPHORE: LazyLock<Semaphore> =
-            LazyLock::new(|| Semaphore::new(*PERMITS));
-
-        let mut permits = Vec::new();
-
-        if !self.did.is_empty() {
-            permits.push(DID_SEMAPHORE.acquire().await.unwrap());
-        }
-        if !self.follows.is_empty() {
-            permits.push(FOLLOWS_SEMAPHORE.acquire().await.unwrap());
-        }
-        if !self.latest_backfills.is_empty() {
-            permits.push(LATEST_BACKFILLS_SEMAPHORE.acquire().await.unwrap());
-        }
-        if !self.likes.is_empty() {
-            permits.push(LIKES_SEMAPHORE.acquire().await.unwrap());
-        }
-        if !self.reposts.is_empty() {
-            permits.push(REPOSTS_SEMAPHORE.acquire().await.unwrap());
-        }
-        if !self.blocks.is_empty() {
-            permits.push(BLOCKS_SEMAPHORE.acquire().await.unwrap());
-        }
-        if !self.listblocks.is_empty() {
-            permits.push(LISTBLOCKS_SEMAPHORE.acquire().await.unwrap());
-        }
-        if !self.listitems.is_empty() {
-            permits.push(LISTITEMS_SEMAPHORE.acquire().await.unwrap());
-        }
-        if !self.feeds.is_empty() {
-            permits.push(FEEDS_SEMAPHORE.acquire().await.unwrap());
-        }
-        if !self.lists.is_empty() {
-            permits.push(LISTS_SEMAPHORE.acquire().await.unwrap());
-        }
-        if !self.threadgates.is_empty() {
-            permits.push(THREADGATES_SEMAPHORE.acquire().await.unwrap());
-        }
-        if !self.starterpacks.is_empty() {
-            permits.push(STARTERPACKS_SEMAPHORE.acquire().await.unwrap());
-        }
-        if !self.postgates.is_empty() {
-            permits.push(POSTGATES_SEMAPHORE.acquire().await.unwrap());
-        }
-        if !self.actordeclarations.is_empty() {
-            permits.push(ACTORDECLARATIONS_SEMAPHORE.acquire().await.unwrap());
-        }
-        if !self.labelerservices.is_empty() {
-            permits.push(LABELERSERVICES_SEMAPHORE.acquire().await.unwrap());
-        }
-        if !self.quotes.is_empty() {
-            permits.push(QUOTES_SEMAPHORE.acquire().await.unwrap());
-        }
-        if !self.posts.is_empty() {
-            permits.push(POSTS_SEMAPHORE.acquire().await.unwrap());
-        }
-        if !self.replies_relations.is_empty() {
-            permits.push(REPLIES_RELATIONS_SEMAPHORE.acquire().await.unwrap());
-        }
-        if !self.reply_to_relations.is_empty() {
-            permits.push(REPLY_TO_RELATIONS_SEMAPHORE.acquire().await.unwrap());
-        }
-        if !self.posts_relations.is_empty() {
-            permits.push(POSTS_RELATIONS_SEMAPHORE.acquire().await.unwrap());
-        }
-        if !self.overwrite_latest_backfills.is_empty() {
-            permits.push(
-                OVERWRITE_LATEST_BACKFILLS_SEMAPHORE
-                    .acquire()
-                    .await
-                    .unwrap(),
-            );
-        }
+    // /// Acquire individual locks for each table
+    // ///
+    // /// Currently unused
+    // async fn acquire_locks(&self) -> Vec<SemaphorePermit<'static>> {
+    //     static PERMITS: LazyLock<usize> = LazyLock::new(|| 1);
+    //     static DID_SEMAPHORE: LazyLock<Semaphore> = LazyLock::new(|| Semaphore::new(*PERMITS));
+    //     static FOLLOWS_SEMAPHORE: LazyLock<Semaphore> = LazyLock::new(|| Semaphore::new(*PERMITS));
+    //     static LATEST_BACKFILLS_SEMAPHORE: LazyLock<Semaphore> =
+    //         LazyLock::new(|| Semaphore::new(*PERMITS));
+    //     static LIKES_SEMAPHORE: LazyLock<Semaphore> = LazyLock::new(|| Semaphore::new(*PERMITS));
+    //     static REPOSTS_SEMAPHORE: LazyLock<Semaphore> = LazyLock::new(|| Semaphore::new(*PERMITS));
+    //     static BLOCKS_SEMAPHORE: LazyLock<Semaphore> = LazyLock::new(|| Semaphore::new(*PERMITS));
+    //     static LISTBLOCKS_SEMAPHORE: LazyLock<Semaphore> =
+    //         LazyLock::new(|| Semaphore::new(*PERMITS));
+    //     static LISTITEMS_SEMAPHORE: LazyLock<Semaphore> =
+    //         LazyLock::new(|| Semaphore::new(*PERMITS));
+    //     static FEEDS_SEMAPHORE: LazyLock<Semaphore> = LazyLock::new(|| Semaphore::new(*PERMITS));
+    //     static LISTS_SEMAPHORE: LazyLock<Semaphore> = LazyLock::new(|| Semaphore::new(*PERMITS));
+    //     static THREADGATES_SEMAPHORE: LazyLock<Semaphore> =
+    //         LazyLock::new(|| Semaphore::new(*PERMITS));
+    //     static STARTERPACKS_SEMAPHORE: LazyLock<Semaphore> =
+    //         LazyLock::new(|| Semaphore::new(*PERMITS));
+    //     static POSTGATES_SEMAPHORE: LazyLock<Semaphore> =
+    //         LazyLock::new(|| Semaphore::new(*PERMITS));
+    //     static ACTORDECLARATIONS_SEMAPHORE: LazyLock<Semaphore> =
+    //         LazyLock::new(|| Semaphore::new(*PERMITS));
+    //     static LABELERSERVICES_SEMAPHORE: LazyLock<Semaphore> =
+    //         LazyLock::new(|| Semaphore::new(*PERMITS));
+    //     static QUOTES_SEMAPHORE: LazyLock<Semaphore> = LazyLock::new(|| Semaphore::new(*PERMITS));
+    //     static POSTS_SEMAPHORE: LazyLock<Semaphore> = LazyLock::new(|| Semaphore::new(*PERMITS));
+    //     static REPLIES_RELATIONS_SEMAPHORE: LazyLock<Semaphore> =
+    //         LazyLock::new(|| Semaphore::new(*PERMITS));
+    //     static REPLY_TO_RELATIONS_SEMAPHORE: LazyLock<Semaphore> =
+    //         LazyLock::new(|| Semaphore::new(*PERMITS));
+    //     static POSTS_RELATIONS_SEMAPHORE: LazyLock<Semaphore> =
+    //         LazyLock::new(|| Semaphore::new(*PERMITS));
+    //     static OVERWRITE_LATEST_BACKFILLS_SEMAPHORE: LazyLock<Semaphore> =
+    //         LazyLock::new(|| Semaphore::new(*PERMITS));
+
+    //     let mut permits = Vec::new();
+
+    //     if !self.did.is_empty() {
+    //         permits.push(DID_SEMAPHORE.acquire().await.unwrap());
+    //     }
+    //     if !self.follows.is_empty() {
+    //         permits.push(FOLLOWS_SEMAPHORE.acquire().await.unwrap());
+    //     }
+    //     if !self.latest_backfills.is_empty() {
+    //         permits.push(LATEST_BACKFILLS_SEMAPHORE.acquire().await.unwrap());
+    //     }
+    //     if !self.likes.is_empty() {
+    //         permits.push(LIKES_SEMAPHORE.acquire().await.unwrap());
+    //     }
+    //     if !self.reposts.is_empty() {
+    //         permits.push(REPOSTS_SEMAPHORE.acquire().await.unwrap());
+    //     }
+    //     if !self.blocks.is_empty() {
+    //         permits.push(BLOCKS_SEMAPHORE.acquire().await.unwrap());
+    //     }
+    //     if !self.listblocks.is_empty() {
+    //         permits.push(LISTBLOCKS_SEMAPHORE.acquire().await.unwrap());
+    //     }
+    //     if !self.listitems.is_empty() {
+    //         permits.push(LISTITEMS_SEMAPHORE.acquire().await.unwrap());
+    //     }
+    //     if !self.feeds.is_empty() {
+    //         permits.push(FEEDS_SEMAPHORE.acquire().await.unwrap());
+    //     }
+    //     if !self.lists.is_empty() {
+    //         permits.push(LISTS_SEMAPHORE.acquire().await.unwrap());
+    //     }
+    //     if !self.threadgates.is_empty() {
+    //         permits.push(THREADGATES_SEMAPHORE.acquire().await.unwrap());
+    //     }
+    //     if !self.starterpacks.is_empty() {
+    //         permits.push(STARTERPACKS_SEMAPHORE.acquire().await.unwrap());
+    //     }
+    //     if !self.postgates.is_empty() {
+    //         permits.push(POSTGATES_SEMAPHORE.acquire().await.unwrap());
+    //     }
+    //     if !self.actordeclarations.is_empty() {
+    //         permits.push(ACTORDECLARATIONS_SEMAPHORE.acquire().await.unwrap());
+    //     }
+    //     if !self.labelerservices.is_empty() {
+    //         permits.push(LABELERSERVICES_SEMAPHORE.acquire().await.unwrap());
+    //     }
+    //     if !self.quotes.is_empty() {
+    //         permits.push(QUOTES_SEMAPHORE.acquire().await.unwrap());
+    //     }
+    //     if !self.posts.is_empty() {
+    //         permits.push(POSTS_SEMAPHORE.acquire().await.unwrap());
+    //     }
+    //     if !self.replies_relations.is_empty() {
+    //         permits.push(REPLIES_RELATIONS_SEMAPHORE.acquire().await.unwrap());
+    //     }
+    //     if !self.reply_to_relations.is_empty() {
+    //         permits.push(REPLY_TO_RELATIONS_SEMAPHORE.acquire().await.unwrap());
+    //     }
+    //     if !self.posts_relations.is_empty() {
+    //         permits.push(POSTS_RELATIONS_SEMAPHORE.acquire().await.unwrap());
+    //     }
+    //     if !self.overwrite_latest_backfills.is_empty() {
+    //         permits.push(
+    //             OVERWRITE_LATEST_BACKFILLS_SEMAPHORE
+    //                 .acquire()
+    //                 .await
+    //                 .unwrap(),
+    //         );
+    //     }
-
-        permits
-    }
+
+    //     permits
+    // }
 
     /// Apply this update to the database
     ///
@@ -698,7 +697,7 @@ impl BigUpdate {
         let errors = result.take_errors();
 
         // Return retry if the transaction can be retried
-        if errors.len() > 0 {
+        if !errors.is_empty() {
             let can_be_retried = errors.iter().any(|(_, e)| {
                 if let surrealdb::Error::Api(surrealdb::error::Api::Query(message)) = e {
                     message.contains("This transaction can be retried")
@@ -732,7 +731,7 @@ impl BigUpdate {
         QUERY_DURATION_METRIC.record(update_duration.as_millis() as u64, &[]);
 
         // Return error if there are any errors
-        if errors.len() > 0 {
+        if !errors.is_empty() {
             FAILED_BIG_UPDATES_METRIC.add(1, &[]);
 
             let mut sorted_errors = errors.into_iter().collect::<Vec<_>>();
@@ -924,10 +923,10 @@ impl BigUpdate
{ *count += all.count as usize; COLLECTED_UPDATE_SIZE_METRIC.record(*count as u64, &[]); update.merge(self); - if *count < ARGS.min_rows_per_transaction as usize { + if *count < ARGS.min_rows_per_transaction { return Ok(()); } - let update = std::mem::replace(update, BigUpdate::default()); + let update = std::mem::take(update); *count = 0; drop(lock); let info = tokio::task::block_in_place(|| update.create_info()); From 2b35f8b8014d1d60cc3c27cd88b87c69fa138fe6 Mon Sep 17 00:00:00 2001 From: Zebreus Date: Sun, 9 Mar 2025 14:44:34 +0100 Subject: [PATCH 70/75] Add option to use a bundled root certificate --- ISRG_Root_X1.pem | 31 +++++++++++++++++++++++++++++++ src/config.rs | 6 +++--- src/jetstream_consumer.rs | 18 ++++-------------- src/main.rs | 2 +- src/websocket/mod.rs | 35 +++++++++++++++-------------------- 5 files changed, 54 insertions(+), 38 deletions(-) create mode 100644 ISRG_Root_X1.pem diff --git a/ISRG_Root_X1.pem b/ISRG_Root_X1.pem new file mode 100644 index 0000000..b85c803 --- /dev/null +++ b/ISRG_Root_X1.pem @@ -0,0 +1,31 @@ +-----BEGIN CERTIFICATE----- +MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw +TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh +cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4 +WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu +ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY +MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc +h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+ +0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U +A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW +T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH +B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC +B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv +KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn +OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn +jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw +qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI +rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV +HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq +hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL +ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ +3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK +NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5 +ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur +TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC +jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc +oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq +4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA +mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d +emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc= +-----END CERTIFICATE----- diff --git a/src/config.rs b/src/config.rs index d75f7a1..44b46c9 100644 --- a/src/config.rs +++ b/src/config.rs @@ -5,9 +5,9 @@ use std::sync::LazyLock; #[derive(Parser, Debug)] #[command(about)] pub struct Args { - /// Certificate to check jetstream server against - #[arg(short = 'c', long, default_value = "/etc/ssl/certs/ISRG_Root_X1.pem")] - pub certificate: String, + /// Path to a certificate to check jetstream server against. By default the bundled ISRG Root X1 certificate is used. 
+    #[arg(short = 'c', long)]
+    pub certificate: Option<String>,
     /// Set the tokio threadpool size. The default value is the number of cores available to the system.
     #[arg(long)]
     pub threads: Option<usize>,
diff --git a/src/jetstream_consumer.rs b/src/jetstream_consumer.rs
index eb451cd..79e0a22 100644
--- a/src/jetstream_consumer.rs
+++ b/src/jetstream_consumer.rs
@@ -12,16 +12,10 @@ const JETSTREAM_HOSTS: [&str; 5] = [
     "jetstream1.us-east.bsky.network",
 ];
 
-pub async fn attach_jetstream(db: Surreal<Any>, certificate: String) -> anyhow::Result<()> {
+pub async fn attach_jetstream(db: Surreal<Any>) -> anyhow::Result<()> {
     let mut jetstream_tasks = JETSTREAM_HOSTS
         .iter()
-        .map(|host| {
-            tokio::task::spawn(start_jetstream_consumer(
-                db.clone(),
-                host.to_string(),
-                certificate.clone(),
-            ))
-        })
+        .map(|host| tokio::task::spawn(start_jetstream_consumer(db.clone(), host.to_string())))
         .collect::<Vec<_>>();
 
     loop {
@@ -37,11 +31,7 @@ pub async fn attach_jetstream(db: Surreal<Any>, certificate: String) -> anyhow:
     Ok(())
 }
 
-async fn start_jetstream_consumer(
-    db: Surreal<Any>,
-    host: String,
-    certificate: String,
-) -> anyhow::Result<()> {
+async fn start_jetstream_consumer(db: Surreal<Any>, host: String) -> anyhow::Result<()> {
     // fetch initial cursor
     let cursor = database::fetch_cursor(&db, &host)
         .await
         .map_or(0, |e| e.time_us);
 
     // enter websocket event loop
-    websocket::start(host, certificate, cursor, db)
+    websocket::start(host, cursor, db)
         .await
         .context("WebSocket event loop failed")?;
 
diff --git a/src/main.rs b/src/main.rs
index d5c8619..19d9cfc 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -71,7 +71,7 @@ async fn application_main() -> anyhow::Result<()> {
 
     // Create tasks
     let metrics_task = export_system_metrics().boxed();
-    let jetstream_task = attach_jetstream(db.to_owned(), ARGS.certificate.clone()).boxed();
+    let jetstream_task = attach_jetstream(db.to_owned()).boxed();
     let indexer_task = start_full_repo_indexer(db.to_owned()).boxed_local();
 
     // Add all tasks to a list
diff --git a/src/websocket/mod.rs b/src/websocket/mod.rs
index 7427e6e..ebc3d7b 100644
--- a/src/websocket/mod.rs
+++ b/src/websocket/mod.rs
@@ -20,6 +20,8 @@ use tokio_rustls::{
 };
 use tracing::{debug, info, trace, warn};
 
+use crate::config::ARGS;
+
 mod conn;
 pub mod events;
 mod handler;
@@ -40,28 +42,21 @@ impl SharedState {
 }
 
 /// Subscribe to a websocket server
-pub async fn start(
-    host: String,
-    certificate: String,
-    cursor: u64,
-    db: Surreal<Any>,
-) -> anyhow::Result<()> {
+pub async fn start(host: String, cursor: u64, db: Surreal<Any>) -> anyhow::Result<()> {
     // prepare tls store
-    let cloned_certificate_path = certificate.clone();
-    debug!(target: "indexer", "Creating tls store for certificate: {}", cloned_certificate_path);
     let mut tls_store = RootCertStore::empty();
-    let tls_cert = CertificateDer::from_pem_file(certificate).with_context(|| {
-        format!(
-            "Unable to parse certificate from: {}",
-            cloned_certificate_path
-        )
-    })?;
-    tls_store.add(tls_cert).with_context(|| {
-        format!(
-            "Unable to add certificate to tls store: {}",
-            cloned_certificate_path
-        )
-    })?;
+    let tls_cert = if let Some(certificate) = &ARGS.certificate {
+        debug!(target: "indexer", "Using the root certificate from {}", &certificate);
+        CertificateDer::from_pem_file(certificate)
+            .with_context(|| format!("Unable to parse certificate from: {}", certificate))?
+    } else {
+        debug!(target: "indexer", "Using the bundled ISRG Root X1 certificate");
+        CertificateDer::from_pem_slice(include_bytes!("../../ISRG_Root_X1.pem"))
+            .with_context(|| "Unable to parse the bundled certificate")?
+    };
+    tls_store
+        .add(tls_cert)
+        .with_context(|| "Unable to add certificate to tls store.")?;
     let tls_config = Arc::new(
         ClientConfig::builder()
             .with_root_certificates(Arc::new(tls_store))

From b80f917ffcbdd9ed3fde5f0a5a7a52e8ebca24c9 Mon Sep 17 00:00:00 2001
From: Zebreus
Date: Sun, 9 Mar 2025 14:45:14 +0100
Subject: [PATCH 71/75] Add a default database server url

---
 src/config.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/config.rs b/src/config.rs
index 44b46c9..dee641a 100644
--- a/src/config.rs
+++ b/src/config.rs
@@ -13,7 +13,7 @@ pub struct Args {
     pub threads: Option<usize>,
     /// Endpoint of the database server (including port and protocol)
     /// You can specify multiple surrealdbs by repeating this argument, but they should all point to the same underlying datastore
-    #[arg(short = 'D', long, num_args=1..=16)]
+    #[arg(short = 'D', long, num_args=1..=16, default_value = "ws://127.0.0.1:8000")]
     pub db: Vec<String>,
     /// Username for the database server
     #[arg(short, long, default_value = "root")]

From 79d8d18fe0666e719b1ee599d46ea039d9d3b718 Mon Sep 17 00:00:00 2001
From: Zebreus
Date: Sun, 9 Mar 2025 14:57:27 +0100
Subject: [PATCH 72/75] Document the congestion control mechanism

---
 src/database/big_update.rs | 28 ++++++++++++++++++++--------
 1 file changed, 20 insertions(+), 8 deletions(-)

diff --git a/src/database/big_update.rs b/src/database/big_update.rs
index 579bff0..ebdfb8a 100644
--- a/src/database/big_update.rs
+++ b/src/database/big_update.rs
@@ -605,8 +605,6 @@ impl BigUpdate {
         info: &BigUpdateInfo,
     ) -> Result<UpdateState> {
         let start = Instant::now();
-        // Convert the update to a string for logging later
-
         // Create the query string
         // `RETURN VALUE none` is used to get empty return values for counting the number of inserted rows
         let query_string = r#"
@@ -672,6 +670,13 @@ impl BigUpdate {
         let preparation_duration = start.elapsed();
         let after_update = Instant::now();
 
+        // What follows is a complex mechanism to limit the number of concurrent transactions. We need to do this ourselves because surrealdb just drops conflicting transactions.
+        // The mechanism is as follows:
+
+        // We have a given budget of permits that can be used for transactions. Each transaction costs a certain number of permits; the bigger the transaction, the more permits it costs.
+        //
+        // The base cost of a transaction is increased when transactions are dropped due to congestion, and decreased when transactions are successful.
+
         // Minimum cost for a transaction in permits
         static MIN_COST: u32 = 20;
         // Maximum cost for a transaction in permits
@@ -714,7 +719,7 @@ impl BigUpdate {
                 })
                 .unwrap();
 
-                warn!("Failed but can be retried");
+                trace!("Transaction can be retried");
                 return Ok(UpdateState::Retry);
             }
         }
@@ -725,7 +730,10 @@ impl BigUpdate {
                 Some(std::cmp::max(MIN_COST, x - 1))
             })
             .unwrap();
-        warn!("Cost: {}", TRANSACTION_COST.load(Ordering::Relaxed));
+        trace!(
+            "Current cost for a transaction is {}",
+            TRANSACTION_COST.load(Ordering::Relaxed)
+        );
 
         let update_duration = after_update.elapsed();
         QUERY_DURATION_METRIC.record(update_duration.as_millis() as u64, &[]);
@@ -911,7 +919,8 @@ impl BigUpdate {
     ///
     /// `source` is a string describing the source of the update, used for metrics
     pub async fn apply(self, db: &Surreal<Any>, source: &str) -> Result<()> {
-        // Bundle small updates
+        // If updates are too small, we add them into an accumulator and return here.
+        // The accumulated updates will be flushed once the accumulator is big enough.
         let (mut update, info) = {
             let info = tokio::task::block_in_place(|| self.create_info());
@@ -937,6 +946,7 @@ impl BigUpdate {
             }
         };
 
+        // This number is really big because updates should always succeed after a few retries
        let mut attempts_left = 100;
        loop {
            let state = update.attempt_apply(db, source, &info).await?;
@@ -945,16 +955,18 @@
                    break;
                }
                UpdateState::Retry => {
-                    warn!("Retrying update {} attempts left", attempts_left);
+                    trace!("Retrying update {} attempts left", attempts_left);
                    attempts_left -= 1;
                    if attempts_left == 0 {
-                        return Err(anyhow::anyhow!("Too many retries"));
+                        return Err(anyhow::anyhow!(
+                            "Failed to apply an update after 100 retries. This needs investigation."
+                        ));
                    }
                }
            }
        }
        if attempts_left < 100 {
-            warn!("Update successful after {} retries", 100 - attempts_left);
+            trace!("Update successful after {} retries", 100 - attempts_left);
        }
 
        Ok(())

From 18337989f284827ea92882c6c86d181fcb114a05 Mon Sep 17 00:00:00 2001
From: Zebreus
Date: Sun, 9 Mar 2025 16:18:14 +0100
Subject: [PATCH 73/75] Add dockerized deployment

---
 .cargo/config.toml            |  9 +++-
 .dockerignore                 |  2 +
 .gitignore                    |  1 +
 Cargo.toml                    |  5 +++
 Dockerfile                    | 26 +++++++++++
 README.md                     | 18 +++++---
 docker-compose-deployment.yml | 85 +++++++++++++++++++++++++++++++++++
 docker-compose.yml            | 31 +++++++++++++
 grafana/Dockerfile            |  3 ++
 9 files changed, 173 insertions(+), 7 deletions(-)
 create mode 100644 .dockerignore
 create mode 100644 Dockerfile
 create mode 100644 docker-compose-deployment.yml
 create mode 100644 docker-compose.yml
 create mode 100644 grafana/Dockerfile

diff --git a/.cargo/config.toml b/.cargo/config.toml
index 8df42a5..8c150d5 100644
--- a/.cargo/config.toml
+++ b/.cargo/config.toml
@@ -1,2 +1,9 @@
 [build]
-rustflags = ["--cfg", "tokio_unstable", "-C", "target-cpu=native"]
+rustflags = [
+    "--cfg",
+    "tokio_unstable",
+    "-C",
+    "target-cpu=native",
+    "--cfg",
+    "tokio_tracing",
+]
diff --git a/.dockerignore b/.dockerignore
new file mode 100644
index 0000000..a727c0a
--- /dev/null
+++ b/.dockerignore
@@ -0,0 +1,2 @@
+/target
+/data
diff --git a/.gitignore b/.gitignore
index ea8c4bf..a727c0a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1,2 @@
 /target
+/data
diff --git a/Cargo.toml b/Cargo.toml
index d134562..28fad30 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -5,6 +5,10 @@ edition = "2021"
 authors = ["redsolver", "PancakeTAS"]
 description = "ATProto/Bluesky Indexer powered by SurrealDB and Jetstream"
 
+[[bin]]
+name = "indexer"
+path = "src/main.rs"
+
 [dependencies]
 anyhow = "1.0.96"
 hyper =
"1.6.0" @@ -93,6 +97,7 @@ debug = "line-tables-only" # Profile with lto for testing performance in development [profile.dev-lto] inherits = "release" +codegen-units = 16 lto = "thin" strip = false debug = "full" diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..08a81c0 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,26 @@ +FROM rustlang/rust:nightly-slim AS builder + +WORKDIR /app + +# Download dependencies +COPY Cargo.lock . +COPY Cargo.toml . +COPY .cargo ./.cargo +RUN cargo fetch --locked + +# Build a dummy project to cache dependencies +RUN mkdir src && echo "fn main() {}" >src/main.rs +RUN cargo build --locked --offline --release +RUN rm -rf src + +# Copy the source code +COPY ISRG_Root_X1.pem . +COPY src ./src + +# Build the project +RUN touch src/main.rs && cargo build --locked --offline --release + +FROM rustlang/rust:nightly-slim AS indexer + +COPY --from=builder /app/target/release/indexer /bin/indexer +ENTRYPOINT ["/bin/indexer"] diff --git a/README.md b/README.md index ce8eec8..2aa64c0 100644 --- a/README.md +++ b/README.md @@ -6,18 +6,24 @@ The indexer attaches a websocket to a Jetstream endpoint and converts all receiv The database can then be used to run powerful queries on the network data or build advanced custom feeds. All skyfeed.xyz feeds are powered by this service. -## Installation +## Development 1. Install the latest stable rust compiler from [rustup.rs](https://rustup.rs/). -2. Install either onto your system or into a docker container a [SurrealDB](https://surrealdb.com/docs/surrealdb/installation/running). -3. Clone the repository and run `cargo build --release`. -4. Launch the indexer with `./target/release/skyfeed-indexer [--help]`. +2. Make sure you have `docker` and `docker-compose` installed. +3. Start a surrealdb and a grafana instance with `docker-compose up`. (Use -d to run in the background) +4. Launch the indexer with `cargo run --profile dev-lto --`. -You may need to increase the ulimit for the number of open files. You can do this by running `ulimit -n 1000000`. +## Deployment + +1. Make sure you have `docker` and `docker-compose` installed. +2. Clone this repository. +3. Adjust the commented lines in `docker-compose-deployment.yml`. +4. Build and start the indexer, database, and monitoring with `docker-compose -f docker-compose-deployment.yml up`. +5. Access the monitoring dashboard at `https://your-domain`. ## Debugging and profiling -For benchmarking during development us the `dev-lto` profile. It should provide a reasonable compromise between build-time and runtime performance. To run the indexer with the `dev-lto` profile run `cargo run --profile dev-lto`. +For benchmarking during development use the `dev-lto` profile. It should provide a reasonable compromise between build-time and runtime performance. To run the indexer with the `dev-lto` profile run `cargo run --profile dev-lto`. 
### tokio diff --git a/docker-compose-deployment.yml b/docker-compose-deployment.yml new file mode 100644 index 0000000..dcb8fb5 --- /dev/null +++ b/docker-compose-deployment.yml @@ -0,0 +1,85 @@ +services: + traefik: + image: "traefik:v3.3" + container_name: "traefik" + command: + - "--api.insecure=true" + - "--providers.docker=true" + - "--providers.docker.exposedbydefault=false" + - "--entryPoints.websecure.address=:443" + - "--certificatesresolvers.myresolver.acme.tlschallenge=true" + # Insert your email here + - "--certificatesresolvers.myresolver.acme.email=example@example.com" + - "--certificatesresolvers.myresolver.acme.storage=/letsencrypt/acme.json" + ports: + - "443:443" + volumes: + - "/data/letsencrypt:/letsencrypt" + - "/var/run/docker.sock:/var/run/docker.sock:ro" + grafana: + build: + context: grafana + dockerfile: Dockerfile + container_name: grafana + volumes: + - /data/grafana/grafana:/data/grafana + - /data/grafana/prometheus:/data/prometheus + - /data/grafana/loki:/loki + environment: + GF_PATHS_DATA: /data/grafana + restart: always + ports: + - "127.0.0.1:4317:4317" + - "127.0.0.1:3000:3000" + healthcheck: + test: bash -c "exec 6<> /dev/tcp/localhost/4317" + interval: 30s + timeout: 15s + retries: 3 + labels: + - "traefik.enable=true" + # Insert your domain here + - "traefik.http.routers.grafana.rule=Host(`monitoring.indexer.skyfeedlol.lol`)" + - "traefik.http.routers.grafana.entrypoints=websecure" + - "traefik.http.routers.grafana.tls.certresolver=myresolver" + - "traefik.http.routers.grafana.middlewares=auth" + # Generate a new password with `htpasswd -nB -C4 grafana` + - "traefik.http.middlewares.auth.basicauth.users=grafana:$$2y$$04$$qxpOMWJdp4vMc2Z1u4afaeykArMJw7Y2cz.JeNKTVx/TfPu31TvQK" + - "traefik.http.services.grafana.loadbalancer.server.port=3000" + surrealdb: + image: surrealdb/surrealdb:latest + container_name: surrealdb + ports: + - "127.0.0.1:8000:8000" + user: root + command: + - start + - -A + - --user=root + - --pass=root + - rocksdb://rocksdb + healthcheck: + test: bash -c "exec 6<> /dev/tcp/localhost/8000" + interval: 10s + timeout: 5s + retries: 10 + volumes: + - /data/surreal/rocksdb:/data/rocksdb + restart: always + indexer: + depends_on: + surrealdb: + condition: service_started + + grafana: + condition: service_healthy + build: + context: . 
+ dockerfile: Dockerfile + container_name: indexer + command: + - --db=ws://surrealdb:8000 + # Insert additional flags here + environment: + OTEL_EXPORTER_OTLP_ENDPOINT: "http://grafana:4317" + restart: always diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000..8aada74 --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,31 @@ +services: + grafana: + build: + context: grafana + dockerfile: Dockerfile + container_name: grafana + volumes: + - ./data/grafana/grafana:/data/grafana + - ./data/grafana/prometheus:/data/prometheus + - ./data/grafana/loki:/loki + environment: + GF_PATHS_DATA: /data/grafana + restart: always + ports: + - "127.0.0.1:3000:3000" + - "127.0.0.1:4317:4317" + surrealdb: + image: surrealdb/surrealdb:latest + container_name: surrealdb + ports: + - "127.0.0.1:8000:8000" + user: root + command: + - start + - -A + - --user=root + - --pass=root + - rocksdb://rocksdb + volumes: + - ./data/surreal/rocksdb:/data/rocksdb + restart: always diff --git a/grafana/Dockerfile b/grafana/Dockerfile new file mode 100644 index 0000000..a73225c --- /dev/null +++ b/grafana/Dockerfile @@ -0,0 +1,3 @@ +FROM grafana/otel-lgtm + +RUN sed -i 's/timeInterval: 60s/timeInterval: 1s/' /otel-lgtm/grafana/conf/provisioning/datasources/grafana-datasources.yaml From 88f4af93a084f3a303366b5c2710790ff4d6a5a1 Mon Sep 17 00:00:00 2001 From: Zebreus Date: Sun, 9 Mar 2025 21:33:54 +0100 Subject: [PATCH 74/75] Adjust deployment docker-compose --- docker-compose-deployment.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/docker-compose-deployment.yml b/docker-compose-deployment.yml index dcb8fb5..0179fb6 100644 --- a/docker-compose-deployment.yml +++ b/docker-compose-deployment.yml @@ -35,7 +35,7 @@ services: test: bash -c "exec 6<> /dev/tcp/localhost/4317" interval: 30s timeout: 15s - retries: 3 + retries: 5 labels: - "traefik.enable=true" # Insert your domain here @@ -57,12 +57,12 @@ services: - -A - --user=root - --pass=root - - rocksdb://rocksdb + - rocksdb:///data/rocksdb healthcheck: test: bash -c "exec 6<> /dev/tcp/localhost/8000" - interval: 10s - timeout: 5s - retries: 10 + interval: 30s + timeout: 15s + retries: 5 volumes: - /data/surreal/rocksdb:/data/rocksdb restart: always From 1325640967f97c30c633aefa83e936bd23ad62f6 Mon Sep 17 00:00:00 2001 From: Zebreus Date: Sun, 9 Mar 2025 22:29:24 +0100 Subject: [PATCH 75/75] Fix grafana collector interval --- grafana/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/grafana/Dockerfile b/grafana/Dockerfile index a73225c..ecf3202 100644 --- a/grafana/Dockerfile +++ b/grafana/Dockerfile @@ -1,3 +1,3 @@ FROM grafana/otel-lgtm -RUN sed -i 's/timeInterval: 60s/timeInterval: 1s/' /otel-lgtm/grafana/conf/provisioning/datasources/grafana-datasources.yaml +RUN sed -i 's/timeInterval: 60s/timeInterval: 5s/' /otel-lgtm/grafana/conf/provisioning/datasources/grafana-datasources.yaml
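
To roll the last two patches out to a running deployment, rebuilding and recreating the affected services should be enough. A sketch, assuming the compose file from the dockerized deployment patch above:

```
# rebuild the custom grafana image so the new scrape interval is baked in
docker-compose -f docker-compose-deployment.yml build grafana
# recreate grafana and surrealdb with the updated configuration
docker-compose -f docker-compose-deployment.yml up -d grafana surrealdb
```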