IMPALA-13586: Initial support for Iceberg REST Catalogs

This patch adds initial support for Iceberg REST Catalogs. This means
now it's possible to run an Impala cluster without the Hive Metastore,
and without the Impala CatalogD. Impala Coordinators can directly
connect to an Iceberg REST server and fetch metadata for databases and
tables from there. The support is read-only, i.e. DDL and DML statements
are not supported yet.

This was initially developed in the context of a company Hackathon
program, i.e. it was a team effort that I squashed into a single commit
and polished the code a bit.

The Hackathon team members were:
* Daniel Becker
* Gabor Kaszab
* Kurt Deschler
* Peter Rozsa
* Zoltan Borok-Nagy

The Iceberg REST Catalog support can be configured via a Java properties
file, the location of it can be specified via:
 --catalog_config_dir: Directory of configuration files

Currently only one configuration file can be in the directory as we only
support a single Catalog at a time. The following properties are mandatory
in the config file:
* connector.name=iceberg
* iceberg.catalog.type=rest
* iceberg.rest-catalog.uri

The first two properties can only be 'iceberg' and 'rest' for now, they
are needed for extensibility in the future.

Moreover, Impala Daemons need to specify the following flags to connect
to an Iceberg REST Catalog:
 --use_local_catalog=true
 --catalogd_deployed=false

Testing
* e2e test added to verify basic functionality against a custom-built
  Iceberg REST server that delegates to HadoopCatalog under the hood
* Further testing, e.g. Ranger tests are expected in subsequent
  commits

TODO:
* manual testing against Polaris / Lakekeeper, we could add automated
  tests in a later patch

Change-Id: I1722b898b568d2f5689002f2b9bef59320cb088c
Reviewed-on: http://gerrit.cloudera.org:8080/22353
Reviewed-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
This commit is contained in:
Zoltan Borok-Nagy
2024-12-20 14:48:11 +01:00
committed by Impala Public Jenkins
parent 99fc96adea
commit bd3486c051
34 changed files with 1508 additions and 28 deletions

View File

@@ -0,0 +1,107 @@
<?xml version="1.0"?>
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
<parent>
<groupId>org.apache.impala</groupId>
<artifactId>impala-parent</artifactId>
<version>5.0.0-SNAPSHOT</version>
</parent>
<modelVersion>4.0.0</modelVersion>
<artifactId>impala-iceberg-rest-catalog-test</artifactId>
<packaging>jar</packaging>
<name>Iceberg REST Catalog Test</name>
<dependencies>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
<version>${hadoop.version}</version>
<exclusions>
<!-- IMPALA-9468: Avoid pulling in netty for security reasons -->
<exclusion>
<groupId>io.netty</groupId>
<artifactId>*</artifactId>
</exclusion>
<exclusion>
<groupId>com.sun.jersey</groupId>
<artifactId>jersey-server</artifactId>
</exclusion>
<exclusion>
<groupId>com.sun.jersey</groupId>
<artifactId>jersey-servlet</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-hdfs-client</artifactId>
<version>${hadoop.version}</version>
</dependency>
<dependency>
<groupId>org.apache.iceberg</groupId>
<artifactId>iceberg-api</artifactId>
<version>${iceberg.version}</version>
</dependency>
<dependency>
<groupId>org.apache.iceberg</groupId>
<artifactId>iceberg-core</artifactId>
<version>${iceberg.version}</version>
</dependency>
<dependency>
<groupId>org.apache.iceberg</groupId>
<artifactId>iceberg-core</artifactId>
<version>${iceberg.version}</version>
<classifier>tests</classifier>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.11.0</version>
<configuration>
<source>1.8</source>
<target>1.8</target>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-plugin</artifactId>
<version>3.0.0</version>
<configuration>
<redirectTestOutputToFile>true</redirectTestOutputToFile>
</configuration>
</plugin>
</plugins>
</build>
</project>

View File

@@ -0,0 +1,136 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
// We use the org.apache.iceberg.rest package because some classes
// are package-private. This means this code is more likely to
// break on Iceberg version updates. On the long-term we might
// switch to an open-source Iceberg REST Catalog.
package org.apache.iceberg.rest;
import java.io.IOException;
import java.util.Map;
import java.util.function.Consumer;
import com.fasterxml.jackson.databind.ObjectMapper;
import org.apache.hadoop.hdfs.HdfsConfiguration;
import org.apache.iceberg.hadoop.HadoopCatalog;
import org.apache.iceberg.catalog.Catalog;
import org.apache.iceberg.rest.responses.ErrorResponse;
import org.eclipse.jetty.server.Server;
import org.eclipse.jetty.server.handler.gzip.GzipHandler;
import org.eclipse.jetty.servlet.ServletContextHandler;
import org.eclipse.jetty.servlet.ServletHolder;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Minimal embedded Iceberg REST Catalog server used for Impala end-to-end tests.
 * Starts a Jetty HTTP server exposing the Iceberg REST API and delegates all
 * catalog operations to a {@link HadoopCatalog} pointed at the test warehouse.
 *
 * <p>Can be run standalone via {@link #main(String[])}, or embedded in a test
 * harness via {@link #start(boolean)} / {@link #stop()}.
 */
public class IcebergRestCatalogTest {
  private static final Logger LOG = LoggerFactory.getLogger(IcebergRestCatalogTest.class);
  private static final ObjectMapper MAPPER = RESTObjectMapper.mapper();

  /** Fixed port the REST server listens on; Impala tests connect here. */
  static final int REST_PORT = 9084;

  private Server httpServer;

  public IcebergRestCatalogTest() {}

  /**
   * Returns the warehouse root of the backing HadoopCatalog. Prefers the
   * FILESYSTEM_PREFIX environment variable (set for non-default filesystems),
   * falling back to DEFAULT_FS.
   */
  private static String getWarehouseLocation() {
    // Locals use lowerCamelCase; these are not class-level constants.
    final String hadoopCatalogLocation = "/test-warehouse/iceberg_test/hadoop_catalog";
    String filesystemPrefix = System.getenv("FILESYSTEM_PREFIX");
    if (filesystemPrefix != null && !filesystemPrefix.isEmpty()) {
      return filesystemPrefix + hadoopCatalogLocation;
    }
    // NOTE(review): DEFAULT_FS is assumed to be set in the test environment;
    // if unset, this silently yields a "null/..." path — consider failing fast.
    String defaultFs = System.getenv("DEFAULT_FS");
    return defaultFs + hadoopCatalogLocation;
  }

  /** Creates the backend catalog that actually serves the REST requests. */
  private Catalog initializeBackendCatalog() throws IOException {
    HdfsConfiguration conf = new HdfsConfiguration();
    return new HadoopCatalog(conf, getWarehouseLocation());
  }

  /**
   * Starts the embedded HTTP server on {@link #REST_PORT}.
   *
   * @param join if true, blocks the calling thread until the server terminates
   * @throws Exception if the backend catalog or Jetty server fails to start
   */
  public void start(boolean join) throws Exception {
    Catalog catalog = initializeBackendCatalog();
    // Wrap the adapter so every request and response is round-tripped through
    // JSON once more; this exercises the REST (de)serialization code paths
    // during tests.
    RESTCatalogAdapter adapter = new RESTCatalogAdapter(catalog) {
      @Override
      public <T extends RESTResponse> T execute(
          RESTCatalogAdapter.HTTPMethod method,
          String path,
          Map<String, String> queryParams,
          Object body,
          Class<T> responseType,
          Map<String, String> headers,
          Consumer<ErrorResponse> errorHandler) {
        Object request = roundTripSerialize(body, "request");
        T response = super.execute(
            method, path, queryParams, request, responseType, headers, errorHandler);
        return roundTripSerialize(response, "response");
      }
    };
    RESTCatalogServlet servlet = new RESTCatalogServlet(adapter);
    ServletContextHandler context =
        new ServletContextHandler(ServletContextHandler.NO_SESSIONS);
    context.addServlet(new ServletHolder(servlet), "/*");
    context.insertHandler(new GzipHandler());
    this.httpServer = new Server(REST_PORT);
    httpServer.setHandler(context);
    httpServer.start();
    if (join) {
      httpServer.join();
    }
  }

  /** Stops the embedded HTTP server if it was started. */
  public void stop() throws Exception {
    if (httpServer != null) {
      httpServer.stop();
    }
  }

  /** Standalone entry point: starts the server and blocks forever. */
  public static void main(String[] args) throws Exception {
    new IcebergRestCatalogTest().start(true);
  }

  /**
   * Serializes {@code payload} to JSON and deserializes it back, to verify the
   * payload survives a REST round trip.
   *
   * @param payload object to round-trip; may be null
   * @param description label ("request"/"response") used in error messages
   * @return the round-tripped payload, or null if {@code payload} was null
   * @throws RuntimeException if serialization or deserialization fails
   */
  // The casts are safe: a RESTMessage deserializes back to its own class, and
  // non-RESTMessage payloads are only ever consumed as generic JSON objects.
  @SuppressWarnings("unchecked")
  public static <T> T roundTripSerialize(T payload, String description) {
    if (payload == null) {
      return null;
    }
    LOG.trace(payload.toString());
    try {
      if (payload instanceof RESTMessage) {
        return (T) MAPPER.readValue(
            MAPPER.writeValueAsString(payload), payload.getClass());
      }
      // use Map so that Jackson doesn't try to instantiate ImmutableMap
      // from payload.getClass()
      return (T) MAPPER.readValue(
          MAPPER.writeValueAsString(payload), Map.class);
    } catch (Exception e) {
      LOG.warn(e.toString());
      throw new RuntimeException(
          String.format("Failed to serialize and deserialize %s: %s",
              description, payload), e);
    }
  }
}

View File

@@ -408,6 +408,7 @@ under the License.
<modules>
<module>datagenerator</module>
<module>puffin-data-generator</module>
<module>iceberg-rest-catalog-test</module>
<module>executor-deps</module>
<module>ext-data-source</module>
<module>../fe</module>