From 18bd3202e0b621ef5740ee66612ed767b1615518 Mon Sep 17 00:00:00 2001 From: liuxiaoyu Date: Thu, 29 Jan 2026 19:26:57 +0800 Subject: [PATCH 1/2] feat: integration s3 with arrow filesystem --- .github/workflows/test.yml | 36 +++ ci/scripts/start_minio.sh | 127 ++++++++++ .../IcebergThirdpartyToolchain.cmake | 1 + src/iceberg/CMakeLists.txt | 3 + src/iceberg/arrow/arrow_file_io.h | 17 ++ src/iceberg/arrow/arrow_s3_file_io.cc | 236 ++++++++++++++++++ src/iceberg/catalog/rest/rest_catalog.cc | 35 +++ src/iceberg/catalog/rest/rest_catalog.h | 24 ++ src/iceberg/test/CMakeLists.txt | 2 + src/iceberg/test/arrow_s3_file_io_test.cc | 226 +++++++++++++++++ .../test/rest_catalog_integration_test.cc | 75 ++++++ 11 files changed, 782 insertions(+) create mode 100644 ci/scripts/start_minio.sh create mode 100644 src/iceberg/arrow/arrow_s3_file_io.cc create mode 100644 src/iceberg/test/arrow_s3_file_io_test.cc diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 5eb26e06b..c60f42b1d 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -41,12 +41,24 @@ jobs: name: AMD64 Ubuntu 24.04 runs-on: ubuntu-24.04 timeout-minutes: 30 + strategy: + fail-fast: false + env: + ICEBERG_TEST_S3_URI: s3://iceberg-test + AWS_ACCESS_KEY_ID: minio + AWS_SECRET_ACCESS_KEY: minio123 + AWS_DEFAULT_REGION: us-east-1 + AWS_ENDPOINT_URL: http://127.0.0.1:9000 + AWS_EC2_METADATA_DISABLED: "TRUE" steps: - name: Checkout iceberg-cpp uses: actions/checkout@v6 - name: Install dependencies shell: bash run: sudo apt-get update && sudo apt-get install -y libcurl4-openssl-dev + - name: Start MinIO + shell: bash + run: bash ci/scripts/start_minio.sh - name: Build Iceberg shell: bash env: @@ -63,9 +75,21 @@ jobs: name: AArch64 macOS 26 runs-on: macos-26 timeout-minutes: 30 + strategy: + fail-fast: false + env: + ICEBERG_TEST_S3_URI: s3://iceberg-test + AWS_ACCESS_KEY_ID: minio + AWS_SECRET_ACCESS_KEY: minio123 + AWS_DEFAULT_REGION: us-east-1 + AWS_ENDPOINT_URL: http://127.0.0.1:9000 + AWS_EC2_METADATA_DISABLED: "TRUE" steps: - name: Checkout iceberg-cpp uses: actions/checkout@v6 + - name: Start MinIO + shell: bash + run: bash ci/scripts/start_minio.sh - name: Build Iceberg shell: bash run: ci/scripts/build_iceberg.sh $(pwd) @@ -76,6 +100,15 @@ jobs: name: AMD64 Windows 2025 runs-on: windows-2025 timeout-minutes: 60 + strategy: + fail-fast: false + env: + ICEBERG_TEST_S3_URI: s3://iceberg-test + AWS_ACCESS_KEY_ID: minio + AWS_SECRET_ACCESS_KEY: minio123 + AWS_DEFAULT_REGION: us-east-1 + AWS_ENDPOINT_URL: http://127.0.0.1:9000 + AWS_EC2_METADATA_DISABLED: "TRUE" steps: - name: Checkout iceberg-cpp uses: actions/checkout@v6 @@ -85,6 +118,9 @@ jobs: vcpkg install zlib:x64-windows nlohmann-json:x64-windows nanoarrow:x64-windows roaring:x64-windows cpr:x64-windows - name: Setup sccache uses: mozilla-actions/sccache-action@7d986dd989559c6ecdb630a3fd2557667be217ad # v0.0.9 + - name: Start MinIO + shell: bash + run: bash ci/scripts/start_minio.sh - name: Build Iceberg shell: cmd env: diff --git a/ci/scripts/start_minio.sh b/ci/scripts/start_minio.sh new file mode 100644 index 000000000..e3f416509 --- /dev/null +++ b/ci/scripts/start_minio.sh @@ -0,0 +1,127 @@ +#!/usr/bin/env bash +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +set -eux + +MINIO_ROOT_USER="${MINIO_ROOT_USER:-minio}" +MINIO_ROOT_PASSWORD="${MINIO_ROOT_PASSWORD:-minio123}" +MINIO_IMAGE="${MINIO_IMAGE:-minio/minio:RELEASE.2024-12-18T00-00-00Z}" +MINIO_CONTAINER_NAME="${MINIO_CONTAINER_NAME:-iceberg-minio}" +MINIO_PORT="${MINIO_PORT:-9000}" +MINIO_CONSOLE_PORT="${MINIO_CONSOLE_PORT:-9001}" +MINIO_BUCKET="${MINIO_BUCKET:-iceberg-test}" +MINIO_ENDPOINT="${MINIO_ENDPOINT:-http://127.0.0.1:${MINIO_PORT}}" + +wait_for_minio() { + for i in {1..30}; do + if curl -fsS "${MINIO_ENDPOINT}/minio/health/ready" >/dev/null; then + return 0 + fi + sleep 1 + done + return 1 +} + +start_minio_docker() { + if ! command -v docker >/dev/null 2>&1; then + return 1 + fi + + if docker ps -a --format '{{.Names}}' | grep -q "^${MINIO_CONTAINER_NAME}\$"; then + docker rm -f "${MINIO_CONTAINER_NAME}" + fi + + docker run -d --name "${MINIO_CONTAINER_NAME}" \ + -p "${MINIO_PORT}:9000" -p "${MINIO_CONSOLE_PORT}:9001" \ + -e "MINIO_ROOT_USER=${MINIO_ROOT_USER}" \ + -e "MINIO_ROOT_PASSWORD=${MINIO_ROOT_PASSWORD}" \ + "${MINIO_IMAGE}" \ + server /data --console-address ":${MINIO_CONSOLE_PORT}" + + wait_for_minio +} + +start_minio_macos() { + if ! command -v brew >/dev/null 2>&1; then + echo "brew is required to start MinIO on macOS without Docker" >&2 + return 1 + fi + + brew install minio + minio server /tmp/minio --console-address ":${MINIO_CONSOLE_PORT}" & + wait_for_minio +} + +download_mc() { + local uname_out + uname_out="$(uname -s)" + + local mc_dir + mc_dir="${RUNNER_TEMP:-/tmp}" + mkdir -p "${mc_dir}" + + case "${uname_out}" in + Linux*) + MC_BIN="${mc_dir}/mc" + curl -sSL "https://dl.min.io/client/mc/release/linux-amd64/mc" -o "${MC_BIN}" + chmod +x "${MC_BIN}" + ;; + Darwin*) + MC_BIN="${mc_dir}/mc" + curl -sSL "https://dl.min.io/client/mc/release/darwin-amd64/mc" -o "${MC_BIN}" + chmod +x "${MC_BIN}" + ;; + MINGW*|MSYS*|CYGWIN*) + MC_BIN="${mc_dir}/mc.exe" + curl -sSL "https://dl.min.io/client/mc/release/windows-amd64/mc.exe" -o "${MC_BIN}" + ;; + *) + echo "Unsupported OS for mc: ${uname_out}" >&2 + return 1 + ;; + esac +} + +create_bucket() { + download_mc + for i in {1..30}; do + if "${MC_BIN}" alias set local "${MINIO_ENDPOINT}" "${MINIO_ROOT_USER}" "${MINIO_ROOT_PASSWORD}"; then + break + fi + sleep 1 + done + "${MC_BIN}" mb --ignore-existing "local/${MINIO_BUCKET}" +} + +case "$(uname -s)" in + Darwin*) + if ! start_minio_docker; then + start_minio_macos + fi + ;; + Linux*|MINGW*|MSYS*|CYGWIN*) + start_minio_docker + ;; + *) + echo "Unsupported OS: $(uname -s)" >&2 + exit 1 + ;; +esac + +create_bucket diff --git a/cmake_modules/IcebergThirdpartyToolchain.cmake b/cmake_modules/IcebergThirdpartyToolchain.cmake index 8b32eb749..6a6d5ea85 100644 --- a/cmake_modules/IcebergThirdpartyToolchain.cmake +++ b/cmake_modules/IcebergThirdpartyToolchain.cmake @@ -102,6 +102,7 @@ function(resolve_arrow_dependency) # Work around undefined symbol: arrow::ipc::ReadSchema(arrow::io::InputStream*, arrow::ipc::DictionaryMemo*) set(ARROW_IPC ON) set(ARROW_FILESYSTEM ON) + set(ARROW_S3 ON) set(ARROW_JSON ON) set(ARROW_PARQUET ON) set(ARROW_SIMD_LEVEL "NONE") diff --git a/src/iceberg/CMakeLists.txt b/src/iceberg/CMakeLists.txt index b503a41ea..e74b11586 100644 --- a/src/iceberg/CMakeLists.txt +++ b/src/iceberg/CMakeLists.txt @@ -40,6 +40,7 @@ set(ICEBERG_SOURCES expression/rewrite_not.cc expression/strict_metrics_evaluator.cc expression/term.cc + file_io_registry.cc file_reader.cc file_writer.cc inheritable_metadata.cc @@ -176,6 +177,8 @@ add_subdirectory(util) if(ICEBERG_BUILD_BUNDLE) set(ICEBERG_BUNDLE_SOURCES arrow/arrow_fs_file_io.cc + arrow/arrow_s3_file_io.cc + arrow/file_io_register.cc arrow/metadata_column_util.cc avro/avro_data_util.cc avro/avro_direct_decoder.cc diff --git a/src/iceberg/arrow/arrow_file_io.h b/src/iceberg/arrow/arrow_file_io.h index 12a9b2303..514881b11 100644 --- a/src/iceberg/arrow/arrow_file_io.h +++ b/src/iceberg/arrow/arrow_file_io.h @@ -20,9 +20,12 @@ #pragma once #include +#include +#include #include "iceberg/file_io.h" #include "iceberg/iceberg_bundle_export.h" +#include "iceberg/result.h" namespace iceberg::arrow { @@ -30,4 +33,18 @@ ICEBERG_BUNDLE_EXPORT std::unique_ptr MakeMockFileIO(); ICEBERG_BUNDLE_EXPORT std::unique_ptr MakeLocalFileIO(); +/// \brief Create an S3 FileIO backed by Arrow's S3FileSystem. +/// +/// This function initializes the S3 subsystem if not already initialized (thread-safe). +/// The S3 initialization is done once per process using std::call_once. +/// +/// \param uri An S3 URI (must start with "s3://") used to validate the scheme. +/// \param properties Optional configuration properties for S3 access. See S3Properties +/// for available keys (credentials, region, endpoint, timeouts, etc.). +/// \return A FileIO instance for S3 operations, or an error if S3 is not supported +/// or the URI is invalid. +ICEBERG_BUNDLE_EXPORT Result> MakeS3FileIO( + const std::string& uri, + const std::unordered_map& properties = {}); + } // namespace iceberg::arrow diff --git a/src/iceberg/arrow/arrow_s3_file_io.cc b/src/iceberg/arrow/arrow_s3_file_io.cc new file mode 100644 index 000000000..9090d7b35 --- /dev/null +++ b/src/iceberg/arrow/arrow_s3_file_io.cc @@ -0,0 +1,236 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include +#include +#include + +#include +#include +#if __has_include() +#include +#define ICEBERG_ARROW_HAS_S3 1 +#else +#define ICEBERG_ARROW_HAS_S3 0 +#endif + +#include "iceberg/arrow/arrow_file_io.h" +#include "iceberg/arrow/arrow_fs_file_io_internal.h" +#include "iceberg/arrow/arrow_status_internal.h" +#include "iceberg/arrow/s3_properties.h" +#include "iceberg/util/macros.h" + +namespace iceberg::arrow { + +namespace { + +bool IsS3Uri(std::string_view uri) { return uri.rfind("s3://", 0) == 0; } + +Status EnsureS3Initialized() { +#if ICEBERG_ARROW_HAS_S3 + static std::once_flag init_flag; + static ::arrow::Status init_status = ::arrow::Status::OK(); + std::call_once(init_flag, []() { + ::arrow::fs::S3GlobalOptions options; + init_status = ::arrow::fs::InitializeS3(options); + if (init_status.ok()) { + std::atexit([]() { (void)::arrow::fs::FinalizeS3(); }); + } + }); + if (!init_status.ok()) { + return std::unexpected{ + {.kind = ::iceberg::arrow::ToErrorKind(init_status), + .message = init_status.ToString()}}; + } + return {}; +#else + return NotImplemented("Arrow S3 support is not enabled"); +#endif +} + +#if ICEBERG_ARROW_HAS_S3 +/// \brief Configure S3Options from a properties map. +/// +/// \param properties The configuration properties map. +/// \return Configured S3Options. +::arrow::fs::S3Options ConfigureS3Options( + const std::unordered_map& properties) { + ::arrow::fs::S3Options options; + + // Configure credentials + auto access_key_it = properties.find(S3Properties::kAccessKeyId); + auto secret_key_it = properties.find(S3Properties::kSecretAccessKey); + auto session_token_it = properties.find(S3Properties::kSessionToken); + + if (access_key_it != properties.end() && secret_key_it != properties.end()) { + if (session_token_it != properties.end()) { + options.ConfigureAccessKey(access_key_it->second, secret_key_it->second, + session_token_it->second); + } else { + options.ConfigureAccessKey(access_key_it->second, secret_key_it->second); + } + } else { + // Use default credential chain (environment, instance profile, etc.) + options.ConfigureDefaultCredentials(); + } + + // Configure region + auto region_it = properties.find(S3Properties::kRegion); + if (region_it != properties.end()) { + options.region = region_it->second; + } + + // Configure endpoint (for MinIO, LocalStack, etc.) + auto endpoint_it = properties.find(S3Properties::kEndpoint); + if (endpoint_it != properties.end()) { + options.endpoint_override = endpoint_it->second; + } + + // Configure path-style access (needed for MinIO) + auto path_style_it = properties.find(S3Properties::kPathStyleAccess); + if (path_style_it != properties.end()) { + // Arrow's S3 path-style is controlled via endpoint scheme + // For path-style access, we need to ensure the endpoint is properly configured + } + + // Configure SSL + auto ssl_it = properties.find(S3Properties::kSslEnabled); + if (ssl_it != properties.end() && ssl_it->second == "false") { + options.scheme = "http"; + } + + // Configure timeouts + auto connect_timeout_it = properties.find(S3Properties::kConnectTimeoutMs); + if (connect_timeout_it != properties.end()) { + options.connect_timeout = std::stod(connect_timeout_it->second) / 1000.0; + } + + auto socket_timeout_it = properties.find(S3Properties::kSocketTimeoutMs); + if (socket_timeout_it != properties.end()) { + options.request_timeout = std::stod(socket_timeout_it->second) / 1000.0; + } + + return options; +} + +/// \brief Create an S3 FileSystem with the given options. +/// +/// \param options The S3Options to use. +/// \return A shared_ptr to the S3FileSystem, or an error. +Result> MakeS3FileSystem( + const ::arrow::fs::S3Options& options) { + ICEBERG_RETURN_UNEXPECTED(EnsureS3Initialized()); + ICEBERG_ARROW_ASSIGN_OR_RETURN(auto fs, ::arrow::fs::S3FileSystem::Make(options)); + return fs; +} +#endif + +Result> ResolveFileSystemFromUri( + const std::string& uri, std::string* out_path) { + if (IsS3Uri(uri)) { + ICEBERG_RETURN_UNEXPECTED(EnsureS3Initialized()); + } + ICEBERG_ARROW_ASSIGN_OR_RETURN(auto fs, ::arrow::fs::FileSystemFromUri(uri, out_path)); + return fs; +} + +/// \brief ArrowUriFileIO resolves FileSystem from URI for each operation. +/// +/// This implementation is thread-safe as it creates a new FileSystem instance +/// for each operation. However, it may be less efficient than caching the +/// FileSystem. S3 initialization is done once per process. +class ArrowUriFileIO : public FileIO { + public: + Result ReadFile(const std::string& file_location, + std::optional length) override { + std::string path; + ICEBERG_ASSIGN_OR_RAISE(auto fs, ResolveFileSystemFromUri(file_location, &path)); + ::arrow::fs::FileInfo file_info(path); + if (length.has_value()) { + file_info.set_size(length.value()); + } + std::string content; + ICEBERG_ARROW_ASSIGN_OR_RETURN(auto file, fs->OpenInputFile(file_info)); + ICEBERG_ARROW_ASSIGN_OR_RETURN(auto file_size, file->GetSize()); + + content.resize(file_size); + size_t remain = file_size; + size_t offset = 0; + while (remain > 0) { + size_t read_length = std::min(remain, static_cast(1024 * 1024)); + ICEBERG_ARROW_ASSIGN_OR_RETURN( + auto read_bytes, + file->Read(read_length, reinterpret_cast(&content[offset]))); + remain -= read_bytes; + offset += read_bytes; + } + + return content; + } + + Status WriteFile(const std::string& file_location, + std::string_view content) override { + std::string path; + ICEBERG_ASSIGN_OR_RAISE(auto fs, ResolveFileSystemFromUri(file_location, &path)); + ICEBERG_ARROW_ASSIGN_OR_RETURN(auto file, fs->OpenOutputStream(path)); + ICEBERG_ARROW_RETURN_NOT_OK(file->Write(content.data(), content.size())); + ICEBERG_ARROW_RETURN_NOT_OK(file->Flush()); + ICEBERG_ARROW_RETURN_NOT_OK(file->Close()); + return {}; + } + + Status DeleteFile(const std::string& file_location) override { + std::string path; + ICEBERG_ASSIGN_OR_RAISE(auto fs, ResolveFileSystemFromUri(file_location, &path)); + ICEBERG_ARROW_RETURN_NOT_OK(fs->DeleteFile(path)); + return {}; + } +}; + +} // namespace + +Result> MakeS3FileIO( + const std::string& uri, + const std::unordered_map& properties) { + if (!IsS3Uri(uri)) { + return InvalidArgument("S3 URI must start with s3://"); + } +#if !ICEBERG_ARROW_HAS_S3 + return NotImplemented("Arrow S3 support is not enabled"); +#else + // If properties are empty, use the simple URI-based resolution + if (properties.empty()) { + // Validate that S3 can be initialized and the URI is valid + std::string path; + ICEBERG_ASSIGN_OR_RAISE(auto fs, ResolveFileSystemFromUri(uri, &path)); + (void)path; + (void)fs; + return std::make_unique(); + } + + // Create S3FileSystem with explicit configuration + auto options = ConfigureS3Options(properties); + ICEBERG_ASSIGN_OR_RAISE(auto fs, MakeS3FileSystem(options)); + + // Return ArrowFileSystemFileIO with the configured S3 filesystem + return std::make_unique(std::move(fs)); +#endif +} + +} // namespace iceberg::arrow diff --git a/src/iceberg/catalog/rest/rest_catalog.cc b/src/iceberg/catalog/rest/rest_catalog.cc index 40e112db7..f5785a122 100644 --- a/src/iceberg/catalog/rest/rest_catalog.cc +++ b/src/iceberg/catalog/rest/rest_catalog.cc @@ -28,6 +28,7 @@ #include "iceberg/catalog/rest/auth/auth_managers.h" #include "iceberg/catalog/rest/catalog_properties.h" +#include "iceberg/file_io_registry.h" #include "iceberg/catalog/rest/constant.h" #include "iceberg/catalog/rest/endpoint.h" #include "iceberg/catalog/rest/error_handlers.h" @@ -174,6 +175,40 @@ Result> RestCatalog::Make( std::move(catalog_session), snapshot_mode)); } +Result> RestCatalog::Make( + const RestCatalogProperties& config) { + // Get warehouse location to determine the appropriate FileIO type + auto warehouse = config.Get(RestCatalogProperties::kWarehouse); + if (warehouse.empty()) { + return InvalidArgument( + "Warehouse location is required when FileIO is not explicitly provided. " + "Set the 'warehouse' property to an S3 URI (s3://...) or local path."); + } + + // Check for user-specified io-impl property + auto io_impl = config.configs().find(FileIOProperties::kImpl); + std::string impl_name; + + if (io_impl != config.configs().end() && !io_impl->second.empty()) { + // User specified a custom io-impl + impl_name = io_impl->second; + } else { + // Use default based on warehouse URI scheme + if (warehouse.rfind("s3://", 0) == 0) { + impl_name = FileIORegistry::kArrowS3FileIO; + } else { + impl_name = FileIORegistry::kArrowLocalFileIO; + } + } + + // Load FileIO from registry + ICEBERG_ASSIGN_OR_RAISE(auto file_io, + FileIORegistry::Load(impl_name, warehouse, config.configs())); + + // Call the main Make method with the created FileIO + return Make(config, std::move(file_io)); +} + RestCatalog::RestCatalog(RestCatalogProperties config, std::shared_ptr file_io, std::unique_ptr client, std::unique_ptr paths, diff --git a/src/iceberg/catalog/rest/rest_catalog.h b/src/iceberg/catalog/rest/rest_catalog.h index 38230a5e2..63112e8fc 100644 --- a/src/iceberg/catalog/rest/rest_catalog.h +++ b/src/iceberg/catalog/rest/rest_catalog.h @@ -54,6 +54,30 @@ class ICEBERG_REST_EXPORT RestCatalog : public Catalog, static Result> Make(const RestCatalogProperties& config, std::shared_ptr file_io); + /// \brief Create a RestCatalog instance with auto-detected FileIO. + /// + /// This overload automatically creates an appropriate FileIO based on the "io-impl" + /// property or the warehouse location URI scheme. + /// + /// FileIO selection logic: + /// 1. If "io-impl" property is set, use the specified implementation from FileIORegistry. + /// 2. Otherwise, auto-detect based on warehouse URI: + /// - "s3://" -> ArrowS3FileIO + /// - Local path -> ArrowLocalFileIO + /// + /// Users can register custom FileIO implementations via FileIORegistry::Register(): + /// \code + /// FileIORegistry::Register("com.mycompany.MyFileIO", + /// [](const std::string& warehouse, const auto& props) { + /// return std::make_shared(warehouse, props); + /// }); + /// \endcode + /// + /// \param config the configuration for the RestCatalog, including warehouse location + /// and optional "io-impl" property + /// \return a shared_ptr to RestCatalog instance, or an error if FileIO creation fails + static Result> Make(const RestCatalogProperties& config); + std::string_view name() const override; Result> ListNamespaces(const Namespace& ns) const override; diff --git a/src/iceberg/test/CMakeLists.txt b/src/iceberg/test/CMakeLists.txt index 768e0507e..9e6188438 100644 --- a/src/iceberg/test/CMakeLists.txt +++ b/src/iceberg/test/CMakeLists.txt @@ -113,6 +113,7 @@ add_iceberg_test(util_test data_file_set_test.cc decimal_test.cc endian_test.cc + file_io_registry_test.cc formatter_test.cc location_util_test.cc string_util_test.cc @@ -137,6 +138,7 @@ if(ICEBERG_BUILD_BUNDLE) USE_BUNDLE SOURCES arrow_fs_file_io_test.cc + arrow_s3_file_io_test.cc arrow_test.cc gzip_decompress_test.cc metadata_io_test.cc diff --git a/src/iceberg/test/arrow_s3_file_io_test.cc b/src/iceberg/test/arrow_s3_file_io_test.cc new file mode 100644 index 000000000..735e12067 --- /dev/null +++ b/src/iceberg/test/arrow_s3_file_io_test.cc @@ -0,0 +1,226 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include +#include +#include + +#if __has_include() +#include +#endif +#include + +#include "iceberg/arrow/arrow_file_io.h" +#include "iceberg/arrow/s3_properties.h" +#include "iceberg/test/matchers.h" + +namespace iceberg::arrow { + +#if __has_include() +namespace { +class ArrowS3Environment final : public ::testing::Environment { + public: + void TearDown() override { (void)::arrow::fs::FinalizeS3(); } +}; +} // namespace +#endif + +TEST(ArrowS3FileIOTest, RejectsNonS3Uri) { + auto result = MakeS3FileIO("file:///tmp/not-s3"); + EXPECT_THAT(result, IsError(ErrorKind::kInvalidArgument)); + EXPECT_THAT(result, HasErrorMessage("s3://")); +} + +#if __has_include() +TEST(ArrowS3FileIOTest, RequiresS3SupportAtBuildTime) { + auto result = MakeS3FileIO("s3://bucket/path"); + if (!result.has_value()) { + EXPECT_NE(result.error().kind, ErrorKind::kNotImplemented); + } +} +#else +TEST(ArrowS3FileIOTest, RequiresS3SupportAtBuildTime) { + auto result = MakeS3FileIO("s3://warehouse/iceberg_example"); + EXPECT_THAT(result, IsError(ErrorKind::kNotImplemented)); +} +#endif + +TEST(ArrowS3FileIOTest, ReadWriteFile) { + const char* base_uri = std::getenv("ICEBERG_TEST_S3_URI"); + if (base_uri == nullptr || std::string(base_uri).empty()) { + GTEST_SKIP() << "Set ICEBERG_TEST_S3_URI to enable S3 IO test"; + } + + auto io_res = MakeS3FileIO(base_uri); + if (!io_res.has_value()) { + if (io_res.error().kind == ErrorKind::kNotImplemented) { + GTEST_SKIP() << "Arrow S3 support is not enabled"; + } + FAIL() << "MakeS3FileIO failed: " << io_res.error().message; + } + + auto io = std::move(io_res.value()); + std::string object_uri = base_uri; + if (!object_uri.ends_with('/')) { + object_uri += '/'; + } + object_uri += "iceberg_s3_io_test.txt"; + auto write_res = io->WriteFile(object_uri, "hello s3"); + ASSERT_THAT(write_res, IsOk()); + + auto read_res = io->ReadFile(object_uri, std::nullopt); + ASSERT_THAT(read_res, IsOk()); + EXPECT_THAT(read_res, HasValue(::testing::Eq("hello s3"))); + + auto del_res = io->DeleteFile(object_uri); + EXPECT_THAT(del_res, IsOk()); +} + +// ============================================================================ +// Tests for MakeS3FileIO with properties +// ============================================================================ + +TEST(ArrowS3FileIOTest, MakeS3FileIOWithPropertiesRejectsNonS3Uri) { + std::unordered_map properties; + auto result = MakeS3FileIO("file:///tmp/not-s3", properties); + EXPECT_THAT(result, IsError(ErrorKind::kInvalidArgument)); + EXPECT_THAT(result, HasErrorMessage("s3://")); +} + +TEST(ArrowS3FileIOTest, MakeS3FileIOWithEmptyPropertiesFallsBack) { + const char* base_uri = std::getenv("ICEBERG_TEST_S3_URI"); + if (base_uri == nullptr || std::string(base_uri).empty()) { + GTEST_SKIP() << "Set ICEBERG_TEST_S3_URI to enable S3 IO test"; + } + + // Empty properties should fall back to URI-based resolution + std::unordered_map properties; + auto io_res = MakeS3FileIO(base_uri, properties); + if (!io_res.has_value()) { + if (io_res.error().kind == ErrorKind::kNotImplemented) { + GTEST_SKIP() << "Arrow S3 support is not enabled"; + } + FAIL() << "MakeS3FileIO failed: " << io_res.error().message; + } + + EXPECT_NE(io_res.value(), nullptr); +} + +TEST(ArrowS3FileIOTest, MakeS3FileIOWithProperties) { + const char* base_uri = std::getenv("ICEBERG_TEST_S3_URI"); + const char* access_key = std::getenv("AWS_ACCESS_KEY_ID"); + const char* secret_key = std::getenv("AWS_SECRET_ACCESS_KEY"); + const char* endpoint = std::getenv("ICEBERG_TEST_S3_ENDPOINT"); + const char* region = std::getenv("AWS_REGION"); + + if (base_uri == nullptr || std::string(base_uri).empty()) { + GTEST_SKIP() << "Set ICEBERG_TEST_S3_URI to enable S3 IO test"; + } + + std::unordered_map properties; + + // Configure credentials if available + if (access_key != nullptr && secret_key != nullptr) { + properties[S3Properties::kAccessKeyId] = access_key; + properties[S3Properties::kSecretAccessKey] = secret_key; + } + + // Configure endpoint if available (for MinIO, LocalStack, etc.) + if (endpoint != nullptr && std::string(endpoint).length() > 0) { + properties[S3Properties::kEndpoint] = endpoint; + } + + // Configure region if available + if (region != nullptr && std::string(region).length() > 0) { + properties[S3Properties::kRegion] = region; + } + + auto io_res = MakeS3FileIO(base_uri, properties); + if (!io_res.has_value()) { + if (io_res.error().kind == ErrorKind::kNotImplemented) { + GTEST_SKIP() << "Arrow S3 support is not enabled"; + } + FAIL() << "MakeS3FileIO failed: " << io_res.error().message; + } + + auto io = std::move(io_res.value()); + std::string object_uri = base_uri; + if (!object_uri.ends_with('/')) { + object_uri += '/'; + } + object_uri += "iceberg_s3_io_props_test.txt"; + + auto write_res = io->WriteFile(object_uri, "hello s3 with properties"); + ASSERT_THAT(write_res, IsOk()); + + auto read_res = io->ReadFile(object_uri, std::nullopt); + ASSERT_THAT(read_res, IsOk()); + EXPECT_THAT(read_res, HasValue(::testing::Eq("hello s3 with properties"))); + + auto del_res = io->DeleteFile(object_uri); + EXPECT_THAT(del_res, IsOk()); +} + +TEST(ArrowS3FileIOTest, MakeS3FileIOWithSslDisabled) { + const char* base_uri = std::getenv("ICEBERG_TEST_S3_URI"); + if (base_uri == nullptr || std::string(base_uri).empty()) { + GTEST_SKIP() << "Set ICEBERG_TEST_S3_URI to enable S3 IO test"; + } + + std::unordered_map properties; + properties[S3Properties::kSslEnabled] = "false"; + + // Just test that the configuration is accepted + auto io_res = MakeS3FileIO(base_uri, properties); + if (!io_res.has_value()) { + if (io_res.error().kind == ErrorKind::kNotImplemented) { + GTEST_SKIP() << "Arrow S3 support is not enabled"; + } + // Other errors are acceptable - just checking config parsing works + } +} + +TEST(ArrowS3FileIOTest, MakeS3FileIOWithTimeouts) { + const char* base_uri = std::getenv("ICEBERG_TEST_S3_URI"); + if (base_uri == nullptr || std::string(base_uri).empty()) { + GTEST_SKIP() << "Set ICEBERG_TEST_S3_URI to enable S3 IO test"; + } + + std::unordered_map properties; + properties[S3Properties::kConnectTimeoutMs] = "5000"; + properties[S3Properties::kSocketTimeoutMs] = "10000"; + + auto io_res = MakeS3FileIO(base_uri, properties); + if (!io_res.has_value()) { + if (io_res.error().kind == ErrorKind::kNotImplemented) { + GTEST_SKIP() << "Arrow S3 support is not enabled"; + } + // Other errors are acceptable - just checking config parsing works + } +} + +} // namespace iceberg::arrow + +#if __has_include() +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + ::testing::AddGlobalTestEnvironment(new iceberg::arrow::ArrowS3Environment()); + return RUN_ALL_TESTS(); +} +#endif diff --git a/src/iceberg/test/rest_catalog_integration_test.cc b/src/iceberg/test/rest_catalog_integration_test.cc index b364ffd36..133d5b86f 100644 --- a/src/iceberg/test/rest_catalog_integration_test.cc +++ b/src/iceberg/test/rest_catalog_integration_test.cc @@ -39,6 +39,7 @@ #include "iceberg/catalog/rest/http_client.h" #include "iceberg/catalog/rest/json_serde_internal.h" #include "iceberg/catalog/rest/rest_catalog.h" +#include "iceberg/file_io_registry.h" #include "iceberg/partition_spec.h" #include "iceberg/result.h" #include "iceberg/schema.h" @@ -476,4 +477,78 @@ TEST_F(RestCatalogIntegrationTest, LoadTableWithSnapshotModeRefs) { EXPECT_FALSE(loaded->metadata()->schemas.empty()); } +// ============================================================================ +// Tests for RestCatalog::Make(config) with auto-detected FileIO +// ============================================================================ + +TEST_F(RestCatalogIntegrationTest, MakeWithoutWarehouseReturnsError) { + auto config = RestCatalogProperties::default_properties(); + config.Set(RestCatalogProperties::kUri, CatalogUri()) + .Set(RestCatalogProperties::kName, std::string(kCatalogName)); + // Note: warehouse is NOT set + + auto result = RestCatalog::Make(config); + + EXPECT_THAT(result, IsError(ErrorKind::kInvalidArgument)); + EXPECT_THAT(result, HasErrorMessage("Warehouse location is required")); +} + +TEST_F(RestCatalogIntegrationTest, MakeWithUnregisteredIoImplReturnsError) { + auto config = RestCatalogProperties::default_properties(); + config.Set(RestCatalogProperties::kUri, CatalogUri()) + .Set(RestCatalogProperties::kName, std::string(kCatalogName)) + .Set(RestCatalogProperties::kWarehouse, "/local/warehouse"); + config.mutable_configs()[FileIOProperties::kImpl] = "com.nonexistent.FileIO"; + + auto result = RestCatalog::Make(config); + + // Should fail because the io-impl is not registered + EXPECT_THAT(result, IsError(ErrorKind::kNotFound)); + EXPECT_THAT(result, HasErrorMessage("FileIO implementation not found")); +} + +TEST_F(RestCatalogIntegrationTest, MakeWithAutoDetectedLocalFileIO) { + FileIORegistry::Register( + FileIORegistry::kArrowLocalFileIO, + [](const std::string& /*warehouse*/, + const std::unordered_map& /*properties*/) + -> Result> { + return std::make_shared(); + }); + + auto config = RestCatalogProperties::default_properties(); + config.Set(RestCatalogProperties::kUri, CatalogUri()) + .Set(RestCatalogProperties::kName, std::string(kCatalogName)) + .Set(RestCatalogProperties::kWarehouse, "/local/warehouse"); + + auto catalog_result = RestCatalog::Make(config); + ASSERT_THAT(catalog_result, IsOk()); + + auto& catalog = catalog_result.value(); + EXPECT_EQ(catalog->name(), kCatalogName); +} + +TEST_F(RestCatalogIntegrationTest, MakeWithCustomIoImpl) { + const std::string custom_impl = "com.mycompany.CustomFileIO"; + FileIORegistry::Register( + custom_impl, + [](const std::string& /*warehouse*/, + const std::unordered_map& /*properties*/) + -> Result> { + return std::make_shared(); + }); + + auto config = RestCatalogProperties::default_properties(); + config.Set(RestCatalogProperties::kUri, CatalogUri()) + .Set(RestCatalogProperties::kName, std::string(kCatalogName)) + .Set(RestCatalogProperties::kWarehouse, "/any/warehouse"); + config.mutable_configs()[FileIOProperties::kImpl] = custom_impl; + + auto catalog_result = RestCatalog::Make(config); + ASSERT_THAT(catalog_result, IsOk()); + + auto& catalog = catalog_result.value(); + EXPECT_EQ(catalog->name(), kCatalogName); +} + } // namespace iceberg::rest From 3cdf39a4ffe681fac5956b406bc386ca3910e201 Mon Sep 17 00:00:00 2001 From: liuxiaoyu Date: Wed, 25 Mar 2026 20:08:42 +0800 Subject: [PATCH 2/2] fix --- CMakeLists.txt | 1 + ci/scripts/start_minio.sh | 29 +++- .../IcebergThirdpartyToolchain.cmake | 9 +- src/iceberg/CMakeLists.txt | 9 ++ src/iceberg/arrow/arrow_fs_file_io.cc | 22 ++- src/iceberg/arrow/arrow_fs_file_io_internal.h | 3 + src/iceberg/arrow/arrow_s3_file_io.cc | 146 +++++------------- src/iceberg/arrow/file_io_register.cc | 53 +++++++ src/iceberg/arrow/file_io_register.h | 40 +++++ src/iceberg/arrow/s3_properties.h | 52 +++++++ src/iceberg/catalog/rest/rest_catalog.cc | 4 +- src/iceberg/catalog/rest/rest_catalog.h | 3 +- src/iceberg/file_io_registry.cc | 21 +++ src/iceberg/file_io_registry.h | 102 ++++++++++++ src/iceberg/meson.build | 2 + src/iceberg/test/arrow_s3_file_io_test.cc | 46 ++++-- src/iceberg/test/file_io_registry_test.cc | 119 ++++++++++++++ .../test/rest_catalog_integration_test.cc | 6 +- 18 files changed, 528 insertions(+), 139 deletions(-) create mode 100644 src/iceberg/arrow/file_io_register.cc create mode 100644 src/iceberg/arrow/file_io_register.h create mode 100644 src/iceberg/arrow/s3_properties.h create mode 100644 src/iceberg/file_io_registry.cc create mode 100644 src/iceberg/file_io_registry.h create mode 100644 src/iceberg/test/file_io_registry_test.cc diff --git a/CMakeLists.txt b/CMakeLists.txt index e7281fb11..8647a3c64 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -45,6 +45,7 @@ option(ICEBERG_BUILD_TESTS "Build tests" ON) option(ICEBERG_BUILD_BUNDLE "Build the battery included library" ON) option(ICEBERG_BUILD_REST "Build rest catalog client" ON) option(ICEBERG_BUILD_REST_INTEGRATION_TESTS "Build rest catalog integration tests" OFF) +option(ICEBERG_S3 "Build with S3 support" ON) option(ICEBERG_ENABLE_ASAN "Enable Address Sanitizer" OFF) option(ICEBERG_ENABLE_UBSAN "Enable Undefined Behavior Sanitizer" OFF) diff --git a/ci/scripts/start_minio.sh b/ci/scripts/start_minio.sh index e3f416509..219990d3f 100644 --- a/ci/scripts/start_minio.sh +++ b/ci/scripts/start_minio.sh @@ -21,7 +21,7 @@ set -eux MINIO_ROOT_USER="${MINIO_ROOT_USER:-minio}" MINIO_ROOT_PASSWORD="${MINIO_ROOT_PASSWORD:-minio123}" -MINIO_IMAGE="${MINIO_IMAGE:-minio/minio:RELEASE.2024-12-18T00-00-00Z}" +MINIO_IMAGE="${MINIO_IMAGE:-minio/minio:latest}" MINIO_CONTAINER_NAME="${MINIO_CONTAINER_NAME:-iceberg-minio}" MINIO_PORT="${MINIO_PORT:-9000}" MINIO_CONSOLE_PORT="${MINIO_CONSOLE_PORT:-9001}" @@ -64,7 +64,8 @@ start_minio_macos() { fi brew install minio - minio server /tmp/minio --console-address ":${MINIO_CONSOLE_PORT}" & + MINIO_ROOT_USER="${MINIO_ROOT_USER}" MINIO_ROOT_PASSWORD="${MINIO_ROOT_PASSWORD}" \ + minio server /tmp/minio --console-address ":${MINIO_CONSOLE_PORT}" & wait_for_minio } @@ -84,7 +85,13 @@ download_mc() { ;; Darwin*) MC_BIN="${mc_dir}/mc" - curl -sSL "https://dl.min.io/client/mc/release/darwin-amd64/mc" -o "${MC_BIN}" + local arch + arch="$(uname -m)" + if [ "${arch}" = "arm64" ]; then + curl -sSL "https://dl.min.io/client/mc/release/darwin-arm64/mc" -o "${MC_BIN}" + else + curl -sSL "https://dl.min.io/client/mc/release/darwin-amd64/mc" -o "${MC_BIN}" + fi chmod +x "${MC_BIN}" ;; MINGW*|MSYS*|CYGWIN*) @@ -109,13 +116,27 @@ create_bucket() { "${MC_BIN}" mb --ignore-existing "local/${MINIO_BUCKET}" } +start_minio_windows() { + local minio_dir="${RUNNER_TEMP:-/tmp}" + local minio_bin="${minio_dir}/minio.exe" + curl -sSL "https://dl.min.io/server/minio/release/windows-amd64/minio.exe" -o "${minio_bin}" + MINIO_ROOT_USER="${MINIO_ROOT_USER}" MINIO_ROOT_PASSWORD="${MINIO_ROOT_PASSWORD}" \ + "${minio_bin}" server "${minio_dir}/minio-data" --console-address ":${MINIO_CONSOLE_PORT}" & + wait_for_minio +} + case "$(uname -s)" in Darwin*) if ! start_minio_docker; then start_minio_macos fi ;; - Linux*|MINGW*|MSYS*|CYGWIN*) + MINGW*|MSYS*|CYGWIN*) + if ! start_minio_docker; then + start_minio_windows + fi + ;; + Linux*) start_minio_docker ;; *) diff --git a/cmake_modules/IcebergThirdpartyToolchain.cmake b/cmake_modules/IcebergThirdpartyToolchain.cmake index 6a6d5ea85..d4f837d67 100644 --- a/cmake_modules/IcebergThirdpartyToolchain.cmake +++ b/cmake_modules/IcebergThirdpartyToolchain.cmake @@ -102,7 +102,7 @@ function(resolve_arrow_dependency) # Work around undefined symbol: arrow::ipc::ReadSchema(arrow::io::InputStream*, arrow::ipc::DictionaryMemo*) set(ARROW_IPC ON) set(ARROW_FILESYSTEM ON) - set(ARROW_S3 ON) + set(ARROW_S3 ${ICEBERG_S3}) set(ARROW_JSON ON) set(ARROW_PARQUET ON) set(ARROW_SIMD_LEVEL "NONE") @@ -165,6 +165,13 @@ function(resolve_arrow_dependency) install(FILES ${arrow_bundled_dependencies_location} DESTINATION ${ICEBERG_INSTALL_LIBDIR}) endif() + + # Arrow's exported static target interface may reference system libraries + # (e.g. OpenSSL, CURL, ZLIB) that consumers need to find. + list(APPEND ICEBERG_SYSTEM_DEPENDENCIES ZLIB) + if(ARROW_S3) + list(APPEND ICEBERG_SYSTEM_DEPENDENCIES OpenSSL CURL) + endif() else() set(ARROW_VENDORED FALSE) find_package(Arrow CONFIG REQUIRED) diff --git a/src/iceberg/CMakeLists.txt b/src/iceberg/CMakeLists.txt index e74b11586..0af6bbce3 100644 --- a/src/iceberg/CMakeLists.txt +++ b/src/iceberg/CMakeLists.txt @@ -244,6 +244,15 @@ if(ICEBERG_BUILD_BUNDLE) OUTPUTS ICEBERG_BUNDLE_LIBRARIES) + if(ICEBERG_S3) + foreach(target iceberg_bundle_static iceberg_bundle_shared) + if(TARGET ${target}) + target_compile_definitions(${target} + PUBLIC "$") + endif() + endforeach() + endif() + add_subdirectory(arrow) add_subdirectory(avro) add_subdirectory(parquet) diff --git a/src/iceberg/arrow/arrow_fs_file_io.cc b/src/iceberg/arrow/arrow_fs_file_io.cc index be62b79af..769fcfb13 100644 --- a/src/iceberg/arrow/arrow_fs_file_io.cc +++ b/src/iceberg/arrow/arrow_fs_file_io.cc @@ -25,13 +25,23 @@ #include "iceberg/arrow/arrow_file_io.h" #include "iceberg/arrow/arrow_fs_file_io_internal.h" #include "iceberg/arrow/arrow_status_internal.h" +#include "iceberg/util/macros.h" namespace iceberg::arrow { +Result ArrowFileSystemFileIO::ResolvePath(const std::string& file_location) { + if (file_location.find("://") != std::string::npos) { + ICEBERG_ARROW_ASSIGN_OR_RETURN(auto path, arrow_fs_->PathFromUri(file_location)); + return path; + } + return file_location; +} + /// \brief Read the content of the file at the given location. Result ArrowFileSystemFileIO::ReadFile(const std::string& file_location, std::optional length) { - ::arrow::fs::FileInfo file_info(file_location); + ICEBERG_ASSIGN_OR_RAISE(auto path, ResolvePath(file_location)); + ::arrow::fs::FileInfo file_info(path); if (length.has_value()) { file_info.set_size(length.value()); } @@ -47,6 +57,10 @@ Result ArrowFileSystemFileIO::ReadFile(const std::string& file_loca ICEBERG_ARROW_ASSIGN_OR_RETURN( auto read_bytes, file->Read(read_length, reinterpret_cast(&content[offset]))); + if (read_bytes == 0) { + return IOError("Unexpected EOF reading {}: got {} of {} bytes", file_location, + offset, file_size); + } remain -= read_bytes; offset += read_bytes; } @@ -57,7 +71,8 @@ Result ArrowFileSystemFileIO::ReadFile(const std::string& file_loca /// \brief Write the given content to the file at the given location. Status ArrowFileSystemFileIO::WriteFile(const std::string& file_location, std::string_view content) { - ICEBERG_ARROW_ASSIGN_OR_RETURN(auto file, arrow_fs_->OpenOutputStream(file_location)); + ICEBERG_ASSIGN_OR_RAISE(auto path, ResolvePath(file_location)); + ICEBERG_ARROW_ASSIGN_OR_RETURN(auto file, arrow_fs_->OpenOutputStream(path)); ICEBERG_ARROW_RETURN_NOT_OK(file->Write(content.data(), content.size())); ICEBERG_ARROW_RETURN_NOT_OK(file->Flush()); ICEBERG_ARROW_RETURN_NOT_OK(file->Close()); @@ -66,7 +81,8 @@ Status ArrowFileSystemFileIO::WriteFile(const std::string& file_location, /// \brief Delete a file at the given location. Status ArrowFileSystemFileIO::DeleteFile(const std::string& file_location) { - ICEBERG_ARROW_RETURN_NOT_OK(arrow_fs_->DeleteFile(file_location)); + ICEBERG_ASSIGN_OR_RAISE(auto path, ResolvePath(file_location)); + ICEBERG_ARROW_RETURN_NOT_OK(arrow_fs_->DeleteFile(path)); return {}; } diff --git a/src/iceberg/arrow/arrow_fs_file_io_internal.h b/src/iceberg/arrow/arrow_fs_file_io_internal.h index f151c7a5b..92a991501 100644 --- a/src/iceberg/arrow/arrow_fs_file_io_internal.h +++ b/src/iceberg/arrow/arrow_fs_file_io_internal.h @@ -56,6 +56,9 @@ class ICEBERG_BUNDLE_EXPORT ArrowFileSystemFileIO : public FileIO { const std::shared_ptr<::arrow::fs::FileSystem>& fs() const { return arrow_fs_; } private: + /// \brief Resolve a file location to a filesystem path. + Result ResolvePath(const std::string& file_location); + std::shared_ptr<::arrow::fs::FileSystem> arrow_fs_; }; diff --git a/src/iceberg/arrow/arrow_s3_file_io.cc b/src/iceberg/arrow/arrow_s3_file_io.cc index 9090d7b35..e610d15eb 100644 --- a/src/iceberg/arrow/arrow_s3_file_io.cc +++ b/src/iceberg/arrow/arrow_s3_file_io.cc @@ -19,15 +19,14 @@ #include #include -#include +#include #include -#include -#if __has_include() -#include -#define ICEBERG_ARROW_HAS_S3 1 +#ifdef ICEBERG_S3_ENABLED +# include +# define ICEBERG_ARROW_HAS_S3 1 #else -#define ICEBERG_ARROW_HAS_S3 0 +# define ICEBERG_ARROW_HAS_S3 0 #endif #include "iceberg/arrow/arrow_file_io.h" @@ -40,8 +39,6 @@ namespace iceberg::arrow { namespace { -bool IsS3Uri(std::string_view uri) { return uri.rfind("s3://", 0) == 0; } - Status EnsureS3Initialized() { #if ICEBERG_ARROW_HAS_S3 static std::once_flag init_flag; @@ -49,14 +46,10 @@ Status EnsureS3Initialized() { std::call_once(init_flag, []() { ::arrow::fs::S3GlobalOptions options; init_status = ::arrow::fs::InitializeS3(options); - if (init_status.ok()) { - std::atexit([]() { (void)::arrow::fs::FinalizeS3(); }); - } }); if (!init_status.ok()) { - return std::unexpected{ - {.kind = ::iceberg::arrow::ToErrorKind(init_status), - .message = init_status.ToString()}}; + return std::unexpected(Error{.kind = ::iceberg::arrow::ToErrorKind(init_status), + .message = init_status.ToString()}); } return {}; #else @@ -69,7 +62,7 @@ Status EnsureS3Initialized() { /// /// \param properties The configuration properties map. /// \return Configured S3Options. -::arrow::fs::S3Options ConfigureS3Options( +Result<::arrow::fs::S3Options> ConfigureS3Options( const std::unordered_map& properties) { ::arrow::fs::S3Options options; @@ -100,13 +93,22 @@ ::arrow::fs::S3Options ConfigureS3Options( auto endpoint_it = properties.find(S3Properties::kEndpoint); if (endpoint_it != properties.end()) { options.endpoint_override = endpoint_it->second; + } else { + // Fall back to AWS standard environment variables for endpoint override + const char* s3_endpoint_env = std::getenv("AWS_ENDPOINT_URL_S3"); + if (s3_endpoint_env != nullptr) { + options.endpoint_override = s3_endpoint_env; + } else { + const char* endpoint_env = std::getenv("AWS_ENDPOINT_URL"); + if (endpoint_env != nullptr) { + options.endpoint_override = endpoint_env; + } + } } - // Configure path-style access (needed for MinIO) auto path_style_it = properties.find(S3Properties::kPathStyleAccess); - if (path_style_it != properties.end()) { - // Arrow's S3 path-style is controlled via endpoint scheme - // For path-style access, we need to ensure the endpoint is properly configured + if (path_style_it != properties.end() && path_style_it->second == "true") { + options.force_virtual_addressing = false; } // Configure SSL @@ -118,117 +120,45 @@ ::arrow::fs::S3Options ConfigureS3Options( // Configure timeouts auto connect_timeout_it = properties.find(S3Properties::kConnectTimeoutMs); if (connect_timeout_it != properties.end()) { - options.connect_timeout = std::stod(connect_timeout_it->second) / 1000.0; + try { + options.connect_timeout = std::stod(connect_timeout_it->second) / 1000.0; + } catch (const std::exception& e) { + return InvalidArgument("Invalid {}: '{}' ({})", S3Properties::kConnectTimeoutMs, + connect_timeout_it->second, e.what()); + } } auto socket_timeout_it = properties.find(S3Properties::kSocketTimeoutMs); if (socket_timeout_it != properties.end()) { - options.request_timeout = std::stod(socket_timeout_it->second) / 1000.0; + try { + options.request_timeout = std::stod(socket_timeout_it->second) / 1000.0; + } catch (const std::exception& e) { + return InvalidArgument("Invalid {}: '{}' ({})", S3Properties::kSocketTimeoutMs, + socket_timeout_it->second, e.what()); + } } return options; } - -/// \brief Create an S3 FileSystem with the given options. -/// -/// \param options The S3Options to use. -/// \return A shared_ptr to the S3FileSystem, or an error. -Result> MakeS3FileSystem( - const ::arrow::fs::S3Options& options) { - ICEBERG_RETURN_UNEXPECTED(EnsureS3Initialized()); - ICEBERG_ARROW_ASSIGN_OR_RETURN(auto fs, ::arrow::fs::S3FileSystem::Make(options)); - return fs; -} #endif -Result> ResolveFileSystemFromUri( - const std::string& uri, std::string* out_path) { - if (IsS3Uri(uri)) { - ICEBERG_RETURN_UNEXPECTED(EnsureS3Initialized()); - } - ICEBERG_ARROW_ASSIGN_OR_RETURN(auto fs, ::arrow::fs::FileSystemFromUri(uri, out_path)); - return fs; -} - -/// \brief ArrowUriFileIO resolves FileSystem from URI for each operation. -/// -/// This implementation is thread-safe as it creates a new FileSystem instance -/// for each operation. However, it may be less efficient than caching the -/// FileSystem. S3 initialization is done once per process. -class ArrowUriFileIO : public FileIO { - public: - Result ReadFile(const std::string& file_location, - std::optional length) override { - std::string path; - ICEBERG_ASSIGN_OR_RAISE(auto fs, ResolveFileSystemFromUri(file_location, &path)); - ::arrow::fs::FileInfo file_info(path); - if (length.has_value()) { - file_info.set_size(length.value()); - } - std::string content; - ICEBERG_ARROW_ASSIGN_OR_RETURN(auto file, fs->OpenInputFile(file_info)); - ICEBERG_ARROW_ASSIGN_OR_RETURN(auto file_size, file->GetSize()); - - content.resize(file_size); - size_t remain = file_size; - size_t offset = 0; - while (remain > 0) { - size_t read_length = std::min(remain, static_cast(1024 * 1024)); - ICEBERG_ARROW_ASSIGN_OR_RETURN( - auto read_bytes, - file->Read(read_length, reinterpret_cast(&content[offset]))); - remain -= read_bytes; - offset += read_bytes; - } - - return content; - } - - Status WriteFile(const std::string& file_location, - std::string_view content) override { - std::string path; - ICEBERG_ASSIGN_OR_RAISE(auto fs, ResolveFileSystemFromUri(file_location, &path)); - ICEBERG_ARROW_ASSIGN_OR_RETURN(auto file, fs->OpenOutputStream(path)); - ICEBERG_ARROW_RETURN_NOT_OK(file->Write(content.data(), content.size())); - ICEBERG_ARROW_RETURN_NOT_OK(file->Flush()); - ICEBERG_ARROW_RETURN_NOT_OK(file->Close()); - return {}; - } - - Status DeleteFile(const std::string& file_location) override { - std::string path; - ICEBERG_ASSIGN_OR_RAISE(auto fs, ResolveFileSystemFromUri(file_location, &path)); - ICEBERG_ARROW_RETURN_NOT_OK(fs->DeleteFile(path)); - return {}; - } -}; - } // namespace Result> MakeS3FileIO( const std::string& uri, const std::unordered_map& properties) { - if (!IsS3Uri(uri)) { + if (!uri.starts_with("s3://")) { return InvalidArgument("S3 URI must start with s3://"); } #if !ICEBERG_ARROW_HAS_S3 return NotImplemented("Arrow S3 support is not enabled"); #else - // If properties are empty, use the simple URI-based resolution - if (properties.empty()) { - // Validate that S3 can be initialized and the URI is valid - std::string path; - ICEBERG_ASSIGN_OR_RAISE(auto fs, ResolveFileSystemFromUri(uri, &path)); - (void)path; - (void)fs; - return std::make_unique(); - } + ICEBERG_RETURN_UNEXPECTED(EnsureS3Initialized()); - // Create S3FileSystem with explicit configuration - auto options = ConfigureS3Options(properties); - ICEBERG_ASSIGN_OR_RAISE(auto fs, MakeS3FileSystem(options)); + // Configure S3 options from properties (uses default credentials if empty) + ICEBERG_ASSIGN_OR_RAISE(auto options, ConfigureS3Options(properties)); + ICEBERG_ARROW_ASSIGN_OR_RETURN(auto fs, ::arrow::fs::S3FileSystem::Make(options)); - // Return ArrowFileSystemFileIO with the configured S3 filesystem return std::make_unique(std::move(fs)); #endif } diff --git a/src/iceberg/arrow/file_io_register.cc b/src/iceberg/arrow/file_io_register.cc new file mode 100644 index 000000000..8438a128c --- /dev/null +++ b/src/iceberg/arrow/file_io_register.cc @@ -0,0 +1,53 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "iceberg/arrow/file_io_register.h" + +#include + +#include "iceberg/arrow/arrow_file_io.h" +#include "iceberg/arrow/arrow_fs_file_io_internal.h" +#include "iceberg/file_io_registry.h" +#include "iceberg/util/macros.h" + +namespace iceberg::arrow { + +void RegisterFileIO() { + static std::once_flag flag; + std::call_once(flag, []() { + // Register Arrow local filesystem FileIO + FileIORegistry::Register( + FileIORegistry::kArrowLocalFileIO, + [](const std::string& /*warehouse*/, + const std::unordered_map& /*properties*/) + -> Result> { + return std::shared_ptr(MakeLocalFileIO()); + }); + + // Register Arrow S3 FileIO + FileIORegistry::Register( + FileIORegistry::kArrowS3FileIO, + [](const std::string& warehouse, + const std::unordered_map& properties) + -> Result> { + ICEBERG_ASSIGN_OR_RAISE(auto file_io, MakeS3FileIO(warehouse, properties)); + return std::shared_ptr(std::move(file_io)); + }); + }); +} + +} // namespace iceberg::arrow diff --git a/src/iceberg/arrow/file_io_register.h b/src/iceberg/arrow/file_io_register.h new file mode 100644 index 000000000..6f52c2f81 --- /dev/null +++ b/src/iceberg/arrow/file_io_register.h @@ -0,0 +1,40 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +/// \file iceberg/arrow/file_io_register.h +/// \brief Provide functions to register Arrow FileIO implementations. + +#include "iceberg/iceberg_bundle_export.h" + +namespace iceberg::arrow { + +/// \brief Register Arrow FileIO implementations (local and S3) into the +/// FileIORegistry. +/// +/// This function is idempotent and thread-safe. It registers: +/// - ArrowFileIO (local filesystem) +/// - ArrowS3FileIO (S3 filesystem) +/// +/// Must be called before using FileIORegistry::Load() with the built-in +/// implementation names (e.g., from RestCatalog::Make(config)). +ICEBERG_BUNDLE_EXPORT void RegisterFileIO(); + +} // namespace iceberg::arrow diff --git a/src/iceberg/arrow/s3_properties.h b/src/iceberg/arrow/s3_properties.h new file mode 100644 index 000000000..210a1ab3e --- /dev/null +++ b/src/iceberg/arrow/s3_properties.h @@ -0,0 +1,52 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +#include + +namespace iceberg::arrow { + +/// \brief S3 configuration property keys for ArrowS3FileIO. +/// +/// These constants define the property keys used to configure S3 access +/// via the Arrow filesystem integration, following the Iceberg spec for +/// S3 configuration properties. +struct S3Properties { + /// AWS access key ID + static constexpr const char* kAccessKeyId = "s3.access-key-id"; + /// AWS secret access key + static constexpr const char* kSecretAccessKey = "s3.secret-access-key"; + /// AWS session token (for temporary credentials) + static constexpr const char* kSessionToken = "s3.session-token"; + /// AWS region + static constexpr const char* kRegion = "s3.region"; + /// Custom endpoint override (for MinIO, LocalStack, etc.) + static constexpr const char* kEndpoint = "s3.endpoint"; + /// Whether to use path-style access (needed for MinIO) + static constexpr const char* kPathStyleAccess = "s3.path-style-access"; + /// Whether SSL is enabled + static constexpr const char* kSslEnabled = "s3.ssl.enabled"; + /// Connection timeout in milliseconds + static constexpr const char* kConnectTimeoutMs = "s3.connect-timeout-ms"; + /// Socket timeout in milliseconds + static constexpr const char* kSocketTimeoutMs = "s3.socket-timeout-ms"; +}; + +} // namespace iceberg::arrow diff --git a/src/iceberg/catalog/rest/rest_catalog.cc b/src/iceberg/catalog/rest/rest_catalog.cc index f5785a122..6907b3763 100644 --- a/src/iceberg/catalog/rest/rest_catalog.cc +++ b/src/iceberg/catalog/rest/rest_catalog.cc @@ -28,7 +28,6 @@ #include "iceberg/catalog/rest/auth/auth_managers.h" #include "iceberg/catalog/rest/catalog_properties.h" -#include "iceberg/file_io_registry.h" #include "iceberg/catalog/rest/constant.h" #include "iceberg/catalog/rest/endpoint.h" #include "iceberg/catalog/rest/error_handlers.h" @@ -37,6 +36,7 @@ #include "iceberg/catalog/rest/resource_paths.h" #include "iceberg/catalog/rest/rest_util.h" #include "iceberg/catalog/rest/types.h" +#include "iceberg/file_io_registry.h" #include "iceberg/json_serde_internal.h" #include "iceberg/partition_spec.h" #include "iceberg/result.h" @@ -194,7 +194,7 @@ Result> RestCatalog::Make( impl_name = io_impl->second; } else { // Use default based on warehouse URI scheme - if (warehouse.rfind("s3://", 0) == 0) { + if (warehouse.starts_with("s3://")) { impl_name = FileIORegistry::kArrowS3FileIO; } else { impl_name = FileIORegistry::kArrowLocalFileIO; diff --git a/src/iceberg/catalog/rest/rest_catalog.h b/src/iceberg/catalog/rest/rest_catalog.h index 63112e8fc..ce122b3b8 100644 --- a/src/iceberg/catalog/rest/rest_catalog.h +++ b/src/iceberg/catalog/rest/rest_catalog.h @@ -60,7 +60,8 @@ class ICEBERG_REST_EXPORT RestCatalog : public Catalog, /// property or the warehouse location URI scheme. /// /// FileIO selection logic: - /// 1. If "io-impl" property is set, use the specified implementation from FileIORegistry. + /// 1. If "io-impl" property is set, use the specified implementation from + /// FileIORegistry. /// 2. Otherwise, auto-detect based on warehouse URI: /// - "s3://" -> ArrowS3FileIO /// - Local path -> ArrowLocalFileIO diff --git a/src/iceberg/file_io_registry.cc b/src/iceberg/file_io_registry.cc new file mode 100644 index 000000000..ebc84820f --- /dev/null +++ b/src/iceberg/file_io_registry.cc @@ -0,0 +1,21 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "iceberg/file_io_registry.h" + +// FileIORegistry is header-only (all methods are inline/static). +// This translation unit ensures the header compiles cleanly. diff --git a/src/iceberg/file_io_registry.h b/src/iceberg/file_io_registry.h new file mode 100644 index 000000000..6500b0e93 --- /dev/null +++ b/src/iceberg/file_io_registry.h @@ -0,0 +1,102 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include + +#include "iceberg/file_io.h" +#include "iceberg/iceberg_export.h" +#include "iceberg/result.h" + +namespace iceberg { + +/// \brief Registry for FileIO implementations. +/// +/// Provides a mechanism to register and load FileIO implementations by name. +/// This allows the REST catalog (and others) to resolve FileIO implementations +/// at runtime based on configuration properties like "io-impl". +class ICEBERG_EXPORT FileIORegistry { + public: + /// Well-known implementation names + static constexpr const char* kArrowLocalFileIO = "org.apache.iceberg.arrow.ArrowFileIO"; + static constexpr const char* kArrowS3FileIO = "org.apache.iceberg.arrow.ArrowS3FileIO"; + + /// Factory function type for creating FileIO instances. + using Factory = std::function>( + const std::string& warehouse, + const std::unordered_map& properties)>; + + /// \brief Register a FileIO factory under the given name. + /// + /// \param name The implementation name (e.g., "org.apache.iceberg.arrow.ArrowFileIO") + /// \param factory The factory function that creates the FileIO instance. + static void Register(const std::string& name, Factory factory) { + std::lock_guard lock(Mutex()); + Registry()[name] = std::move(factory); + } + + /// \brief Load a FileIO implementation by name. + /// + /// \param name The implementation name to look up. + /// \param warehouse The warehouse location URI. + /// \param properties Configuration properties to pass to the factory. + /// \return A shared_ptr to the FileIO instance, or an error if not found. + static Result> Load( + const std::string& name, const std::string& warehouse, + const std::unordered_map& properties) { + Factory factory; + { + std::lock_guard lock(Mutex()); + auto it = Registry().find(name); + if (it == Registry().end()) { + return std::unexpected( + {.kind = ErrorKind::kNotFound, + .message = "FileIO implementation not found: " + name}); + } + factory = it->second; + } + // Invoke factory outside the lock to avoid blocking other Register/Load + // calls and to prevent deadlocks if the factory calls back into the registry. + return factory(warehouse, properties); + } + + private: + static std::unordered_map& Registry() { + static std::unordered_map registry; + return registry; + } + + static std::mutex& Mutex() { + static std::mutex mutex; + return mutex; + } +}; + +/// \brief Property keys for FileIO configuration. +struct FileIOProperties { + /// The FileIO implementation class name (e.g., "org.apache.iceberg.arrow.ArrowFileIO") + static constexpr const char* kImpl = "io-impl"; +}; + +} // namespace iceberg diff --git a/src/iceberg/meson.build b/src/iceberg/meson.build index 2cf1065b0..b13f6c4c9 100644 --- a/src/iceberg/meson.build +++ b/src/iceberg/meson.build @@ -58,6 +58,7 @@ iceberg_sources = files( 'expression/rewrite_not.cc', 'expression/strict_metrics_evaluator.cc', 'expression/term.cc', + 'file_io_registry.cc', 'file_reader.cc', 'file_writer.cc', 'inheritable_metadata.cc', @@ -185,6 +186,7 @@ install_headers( 'exception.h', 'file_format.h', 'file_io.h', + 'file_io_registry.h', 'file_reader.h', 'file_writer.h', 'iceberg_export.h', diff --git a/src/iceberg/test/arrow_s3_file_io_test.cc b/src/iceberg/test/arrow_s3_file_io_test.cc index 735e12067..f44730f04 100644 --- a/src/iceberg/test/arrow_s3_file_io_test.cc +++ b/src/iceberg/test/arrow_s3_file_io_test.cc @@ -18,36 +18,56 @@ */ #include +#include #include #include -#if __has_include() -#include -#endif #include +#ifdef ICEBERG_S3_ENABLED +# include +#endif + #include "iceberg/arrow/arrow_file_io.h" #include "iceberg/arrow/s3_properties.h" #include "iceberg/test/matchers.h" -namespace iceberg::arrow { - -#if __has_include() +#ifdef ICEBERG_S3_ENABLED namespace { -class ArrowS3Environment final : public ::testing::Environment { + +/// \brief GTest environment that finalizes Arrow S3 after all tests complete. +/// +/// Arrow's S3 initialization creates global state that must be cleaned up via +/// FinalizeS3() before the process exits. Without this, Arrow's static destructor +/// detects the missing finalization and causes a non-zero exit (which fails under +/// sanitizers). GTest Environment::TearDown() runs after all tests but before +/// static destructors, making it the safe place to finalize. +class ArrowS3TestEnvironment : public ::testing::Environment { public: - void TearDown() override { (void)::arrow::fs::FinalizeS3(); } + void TearDown() override { + auto status = ::arrow::fs::FinalizeS3(); + if (!status.ok()) { + std::cerr << "Warning: FinalizeS3 failed: " << status.ToString() << std::endl; + } + } }; + +// Register before main() runs. GTest takes ownership of the pointer. +[[maybe_unused]] auto* const kS3Env = + ::testing::AddGlobalTestEnvironment(new ArrowS3TestEnvironment); + } // namespace #endif +namespace iceberg::arrow { + TEST(ArrowS3FileIOTest, RejectsNonS3Uri) { auto result = MakeS3FileIO("file:///tmp/not-s3"); EXPECT_THAT(result, IsError(ErrorKind::kInvalidArgument)); EXPECT_THAT(result, HasErrorMessage("s3://")); } -#if __has_include() +#ifdef ICEBERG_S3_ENABLED TEST(ArrowS3FileIOTest, RequiresS3SupportAtBuildTime) { auto result = MakeS3FileIO("s3://bucket/path"); if (!result.has_value()) { @@ -216,11 +236,3 @@ TEST(ArrowS3FileIOTest, MakeS3FileIOWithTimeouts) { } } // namespace iceberg::arrow - -#if __has_include() -int main(int argc, char** argv) { - ::testing::InitGoogleTest(&argc, argv); - ::testing::AddGlobalTestEnvironment(new iceberg::arrow::ArrowS3Environment()); - return RUN_ALL_TESTS(); -} -#endif diff --git a/src/iceberg/test/file_io_registry_test.cc b/src/iceberg/test/file_io_registry_test.cc new file mode 100644 index 000000000..d927488a6 --- /dev/null +++ b/src/iceberg/test/file_io_registry_test.cc @@ -0,0 +1,119 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "iceberg/file_io_registry.h" + +#include +#include + +#include "iceberg/test/matchers.h" + +namespace iceberg { + +namespace { + +/// A minimal FileIO implementation for testing. +class MockFileIO : public FileIO { + public: + Result ReadFile(const std::string& /*file_location*/, + std::optional /*length*/) override { + return std::string("mock"); + } + + Status WriteFile(const std::string& /*file_location*/, + std::string_view /*content*/) override { + return {}; + } + + Status DeleteFile(const std::string& /*file_location*/) override { return {}; } +}; + +} // namespace + +TEST(FileIoRegistryTest, RegisterAndLoad) { + const std::string impl_name = "com.test.MockFileIO"; + FileIORegistry::Register( + impl_name, + [](const std::string& /*warehouse*/, + const std::unordered_map& /*properties*/) + -> Result> { return std::make_shared(); }); + + auto result = FileIORegistry::Load(impl_name, "/test/warehouse", {}); + ASSERT_THAT(result, IsOk()); + EXPECT_NE(result.value(), nullptr); + + // Verify the loaded FileIO works + auto read_result = result.value()->ReadFile("any_file", std::nullopt); + ASSERT_THAT(read_result, IsOk()); + EXPECT_EQ(read_result.value(), "mock"); +} + +TEST(FileIoRegistryTest, LoadNonExistentReturnsError) { + auto result = FileIORegistry::Load("com.nonexistent.FileIO", "/test/warehouse", {}); + EXPECT_THAT(result, IsError(ErrorKind::kNotFound)); + EXPECT_THAT(result, HasErrorMessage("FileIO implementation not found")); +} + +TEST(FileIoRegistryTest, OverrideExistingRegistration) { + const std::string impl_name = "com.test.OverrideFileIO"; + + // Register first implementation + FileIORegistry::Register( + impl_name, + [](const std::string& /*warehouse*/, + const std::unordered_map& /*properties*/) + -> Result> { return std::make_shared(); }); + + // Override with a different factory + FileIORegistry::Register( + impl_name, + [](const std::string& /*warehouse*/, + const std::unordered_map& /*properties*/) + -> Result> { return std::make_shared(); }); + + // Should still work (the override replaces the original) + auto result = FileIORegistry::Load(impl_name, "/test/warehouse", {}); + ASSERT_THAT(result, IsOk()); + EXPECT_NE(result.value(), nullptr); +} + +TEST(FileIoRegistryTest, FactoryReceivesWarehouseAndProperties) { + const std::string impl_name = "com.test.PropCheckFileIO"; + std::string captured_warehouse; + std::unordered_map captured_properties; + + FileIORegistry::Register( + impl_name, + [&captured_warehouse, &captured_properties]( + const std::string& warehouse, + const std::unordered_map& properties) + -> Result> { + captured_warehouse = warehouse; + captured_properties = properties; + return std::make_shared(); + }); + + std::unordered_map props = {{"key1", "val1"}, + {"key2", "val2"}}; + auto result = FileIORegistry::Load(impl_name, "s3://my-bucket/warehouse", props); + ASSERT_THAT(result, IsOk()); + EXPECT_EQ(captured_warehouse, "s3://my-bucket/warehouse"); + EXPECT_EQ(captured_properties.at("key1"), "val1"); + EXPECT_EQ(captured_properties.at("key2"), "val2"); +} + +} // namespace iceberg diff --git a/src/iceberg/test/rest_catalog_integration_test.cc b/src/iceberg/test/rest_catalog_integration_test.cc index 133d5b86f..efde1dc15 100644 --- a/src/iceberg/test/rest_catalog_integration_test.cc +++ b/src/iceberg/test/rest_catalog_integration_test.cc @@ -497,7 +497,7 @@ TEST_F(RestCatalogIntegrationTest, MakeWithUnregisteredIoImplReturnsError) { auto config = RestCatalogProperties::default_properties(); config.Set(RestCatalogProperties::kUri, CatalogUri()) .Set(RestCatalogProperties::kName, std::string(kCatalogName)) - .Set(RestCatalogProperties::kWarehouse, "/local/warehouse"); + .Set(RestCatalogProperties::kWarehouse, std::string("/local/warehouse")); config.mutable_configs()[FileIOProperties::kImpl] = "com.nonexistent.FileIO"; auto result = RestCatalog::Make(config); @@ -519,7 +519,7 @@ TEST_F(RestCatalogIntegrationTest, MakeWithAutoDetectedLocalFileIO) { auto config = RestCatalogProperties::default_properties(); config.Set(RestCatalogProperties::kUri, CatalogUri()) .Set(RestCatalogProperties::kName, std::string(kCatalogName)) - .Set(RestCatalogProperties::kWarehouse, "/local/warehouse"); + .Set(RestCatalogProperties::kWarehouse, std::string("/local/warehouse")); auto catalog_result = RestCatalog::Make(config); ASSERT_THAT(catalog_result, IsOk()); @@ -541,7 +541,7 @@ TEST_F(RestCatalogIntegrationTest, MakeWithCustomIoImpl) { auto config = RestCatalogProperties::default_properties(); config.Set(RestCatalogProperties::kUri, CatalogUri()) .Set(RestCatalogProperties::kName, std::string(kCatalogName)) - .Set(RestCatalogProperties::kWarehouse, "/any/warehouse"); + .Set(RestCatalogProperties::kWarehouse, std::string("/any/warehouse")); config.mutable_configs()[FileIOProperties::kImpl] = custom_impl; auto catalog_result = RestCatalog::Make(config);