From 57be17c5457ec1848beaa497d29bf8d850b73db3 Mon Sep 17 00:00:00 2001 From: Kurtis Wright Date: Fri, 12 Jun 2026 04:14:00 +0000 Subject: [PATCH] feat(catalog): add typed CatalogType selector for built-in catalogs Add a `CatalogType` enum to the catalog loader that lets callers select a built-in catalog by a declared variant (`CatalogType::Rest`) instead of a bare type string, with an inherent `async fn load` that dispatches via an exhaustive match. Including `Memory` closes a gap where the in-memory catalog was absent from the string registry. This is purely additive: the existing free `load(&str)`, `supported_types()`, `CatalogLoader`, and `BoxedCatalogBuilder` are left unchanged and continue to work alongside the new selector. This approach will be repeated for how file format's registry will be implemented. Applying the approach here is aimed at improving the Catalog selection to be more idiomatic and to get alignment from the community on how we want to structure registries. --- crates/catalog/loader/public-api.txt | 19 ++ crates/catalog/loader/src/catalog_type.rs | 203 ++++++++++++++++++++++ crates/catalog/loader/src/lib.rs | 4 + 3 files changed, 226 insertions(+) create mode 100644 crates/catalog/loader/src/catalog_type.rs diff --git a/crates/catalog/loader/public-api.txt b/crates/catalog/loader/public-api.txt index c9e50207f4..2022da4762 100644 --- a/crates/catalog/loader/public-api.txt +++ b/crates/catalog/loader/public-api.txt @@ -1,4 +1,23 @@ pub mod iceberg_catalog_loader +pub enum iceberg_catalog_loader::CatalogType +pub iceberg_catalog_loader::CatalogType::Glue +pub iceberg_catalog_loader::CatalogType::Hms +pub iceberg_catalog_loader::CatalogType::Memory +pub iceberg_catalog_loader::CatalogType::Rest +pub iceberg_catalog_loader::CatalogType::S3Tables +pub iceberg_catalog_loader::CatalogType::Sql +impl iceberg_catalog_loader::CatalogType +pub async fn iceberg_catalog_loader::CatalogType::load(self, name: impl core::convert::Into, props: std::collections::hash::map::HashMap, storage_factory: core::option::Option>) -> iceberg::error::Result> +impl core::clone::Clone for iceberg_catalog_loader::CatalogType +pub fn iceberg_catalog_loader::CatalogType::clone(&self) -> iceberg_catalog_loader::CatalogType +impl core::cmp::Eq for iceberg_catalog_loader::CatalogType +impl core::cmp::PartialEq for iceberg_catalog_loader::CatalogType +pub fn iceberg_catalog_loader::CatalogType::eq(&self, other: &iceberg_catalog_loader::CatalogType) -> bool +impl core::fmt::Debug for iceberg_catalog_loader::CatalogType +pub fn iceberg_catalog_loader::CatalogType::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result +impl core::hash::Hash for iceberg_catalog_loader::CatalogType +pub fn iceberg_catalog_loader::CatalogType::hash<__H: core::hash::Hasher>(&self, state: &mut __H) +impl core::marker::StructuralPartialEq for iceberg_catalog_loader::CatalogType pub struct iceberg_catalog_loader::CatalogLoader<'a> impl iceberg_catalog_loader::CatalogLoader<'_> pub async fn iceberg_catalog_loader::CatalogLoader<'_>::load(self, name: alloc::string::String, props: std::collections::hash::map::HashMap) -> iceberg::error::Result> diff --git a/crates/catalog/loader/src/catalog_type.rs b/crates/catalog/loader/src/catalog_type.rs new file mode 100644 index 0000000000..79aa026e5c --- /dev/null +++ b/crates/catalog/loader/src/catalog_type.rs @@ -0,0 +1,203 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Typed selection of the built-in catalog implementations. + +use std::collections::HashMap; +use std::sync::Arc; + +use iceberg::io::StorageFactory; +use iceberg::memory::MemoryCatalogBuilder; +use iceberg::{Catalog, Result}; +use iceberg_catalog_glue::GlueCatalogBuilder; +use iceberg_catalog_hms::HmsCatalogBuilder; +use iceberg_catalog_rest::RestCatalogBuilder; +use iceberg_catalog_s3tables::S3TablesCatalogBuilder; +use iceberg_catalog_sql::SqlCatalogBuilder; + +use crate::BoxedCatalogBuilder; + +/// A declared selector for one of the built-in catalog implementations. +/// +/// Use this over a bare type string when the catalog is known ahead of +/// time: the compiler then checks the selection exhaustively, and a typo +/// cannot silently fall through to an "unsupported type" error at runtime. +// +// Derive `Clone` but not `Copy`: a future revision may add a string-carrying +// escape-hatch variant for out-of-tree catalogs, which would not be `Copy`. +// Omitting `Copy` now keeps that addition from removing a derive that downstream +// code may have come to rely on. `load` takes `self` by value, so it works +// regardless. +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub enum CatalogType { + /// REST catalog. + Rest, + /// AWS Glue catalog. + Glue, + /// Amazon S3 Tables catalog. + S3Tables, + /// Hive Metastore catalog. + Hms, + /// SQL catalog. + Sql, + /// In-memory catalog. + Memory, +} + +impl CatalogType { + /// Construct the selected catalog and load it from `name` and `props`. + /// + /// `storage_factory` controls how the catalog builds + /// [`FileIO`](iceberg::io::FileIO) for data and metadata files: + /// + /// - `Some(factory)` is forwarded to whichever catalog is selected, so the + /// same call site can pin one storage backend across every variant. + /// - `None` defers to the catalog's own backend. [`Glue`](CatalogType::Glue) + /// and [`S3Tables`](CatalogType::S3Tables) are AWS-native and default to + /// S3; [`Memory`](CatalogType::Memory) defaults to in-memory storage. + /// Each of those has one unambiguous backend, so it loads. The + /// storage-agnostic catalogs ([`Rest`](CatalogType::Rest), + /// [`Sql`](CatalogType::Sql), and [`Hms`](CatalogType::Hms)) have no + /// inherent backend; rather than silently writing to some fabricated + /// default location, they require a factory and return an error when one + /// is not supplied. + /// + /// This method never substitutes a stand-in storage location: a load either + /// uses a backend the caller can name, or it fails. + pub async fn load( + self, + name: impl Into, + props: HashMap, + storage_factory: Option>, + ) -> Result> { + // Exhaustive match: adding a variant without an arm is a compile error, + // so the set of loadable built-ins cannot drift out of sync with the enum. + let builder: Box = match self { + CatalogType::Rest => Box::new(RestCatalogBuilder::default()), + CatalogType::Glue => Box::new(GlueCatalogBuilder::default()), + CatalogType::S3Tables => Box::new(S3TablesCatalogBuilder::default()), + CatalogType::Hms => Box::new(HmsCatalogBuilder::default()), + CatalogType::Sql => Box::new(SqlCatalogBuilder::default()), + CatalogType::Memory => Box::new(MemoryCatalogBuilder::default()), + }; + + // Forward the caller's factory when present and otherwise leave the + // builder untouched. No fallback factory is injected here: catalogs with + // one unambiguous backend (Glue/S3Tables on S3, Memory in-memory) apply + // it themselves, while the storage-agnostic catalogs (Sql/Hms) surface + // an error rather than papering over a missing backend with a silent + // default location. + let builder = match storage_factory { + Some(factory) => builder.with_storage_factory(factory), + None => builder, + }; + builder.load(name.into(), props).await + } +} + +#[cfg(test)] +mod tests { + use std::collections::HashMap; + use std::sync::Arc; + + use iceberg::ErrorKind; + use iceberg::io::MemoryStorageFactory; + use iceberg::memory::MEMORY_CATALOG_WAREHOUSE; + use iceberg_catalog_rest::REST_CATALOG_PROP_URI; + use tempfile::TempDir; + + use super::CatalogType; + + // Building a REST catalog is lazy: the builder validates props and stores + // them without contacting the server or constructing FileIO, so this needs + // no running service and no storage factory. (REST resolves its backend on + // first file access, which is its own upstream design, not a default this + // selector injects.) + #[tokio::test] + async fn test_rest_loads_without_service() { + CatalogType::Rest + .load( + "rest", + HashMap::from([( + REST_CATALOG_PROP_URI.to_string(), + "http://localhost:8080".to_string(), + )]), + None, + ) + .await + .expect("REST should build without a storage factory"); + } + + // The memory catalog was previously absent from the string registry; this + // exercises that it is now reachable through the typed selector. + #[tokio::test] + async fn test_memory_loads_via_typed_selector() { + let warehouse = TempDir::new().expect("temp dir should be creatable"); + CatalogType::Memory + .load( + "memory", + HashMap::from([( + MEMORY_CATALOG_WAREHOUSE.to_string(), + warehouse + .path() + .to_str() + .expect("temp dir path should be valid UTF-8") + .to_string(), + )]), + None, + ) + .await + .expect("memory catalog should load via the typed selector"); + } + + // SQL is storage-agnostic: it has no inherent backend, so loading it without + // a factory must fail rather than silently fall back to a fabricated + // location. The missing-factory check in `SqlCatalog::new` fires before any + // database connection, so no sqlite setup is needed to observe it. + #[tokio::test] + async fn test_sql_errors_without_storage_factory() { + let err = CatalogType::Sql + .load("sql", HashMap::new(), None) + .await + .expect_err("SQL must refuse to load without a storage factory"); + + assert_eq!(err.kind(), ErrorKind::Unexpected); + } + + // A caller-supplied factory must be threaded through to a catalog that has + // its own default (here Memory, whose default is in-memory storage). The + // override is accepted rather than rejected or ignored. + #[tokio::test] + async fn test_explicit_factory_overrides_default() { + let warehouse = TempDir::new().expect("temp dir should be creatable"); + CatalogType::Memory + .load( + "memory", + HashMap::from([( + MEMORY_CATALOG_WAREHOUSE.to_string(), + warehouse + .path() + .to_str() + .expect("temp dir path should be valid UTF-8") + .to_string(), + )]), + Some(Arc::new(MemoryStorageFactory)), + ) + .await + .expect("memory catalog should accept an explicit storage factory"); + } +} diff --git a/crates/catalog/loader/src/lib.rs b/crates/catalog/loader/src/lib.rs index f681337736..86985b0508 100644 --- a/crates/catalog/loader/src/lib.rs +++ b/crates/catalog/loader/src/lib.rs @@ -27,6 +27,10 @@ use iceberg_catalog_rest::RestCatalogBuilder; use iceberg_catalog_s3tables::S3TablesCatalogBuilder; use iceberg_catalog_sql::SqlCatalogBuilder; +mod catalog_type; + +pub use catalog_type::CatalogType; + /// A CatalogBuilderFactory creating a new catalog builder. type CatalogBuilderFactory = fn() -> Box;