
Fix build: Revert "chore: clean out unused "bases" and utils (#53234)" (#53621)

Edward Gao
2025-02-10 13:36:30 -08:00
committed by GitHub
parent b51d9a4043
commit c8e3ec0210
679 changed files with 41745 additions and 0 deletions

.github/labeler.yml
View File

@@ -14,3 +14,7 @@ area/documentation:
CDK:
- airbyte-cdk/*
- airbyte-cdk/**/*
normalization:
- airbyte-integrations/bases/base-normalization/*
- airbyte-integrations/bases/base-normalization/**/*

View File

@@ -6,6 +6,8 @@ exclude: |
^.*?/node_modules/.*$|
^.*?/charts/.*$|
^airbyte-integrations/bases/base-normalization/.*$|
^.*?/normalization_test_output/.*$|
^.*?/pnpm-lock\.yaml$|
^.*?/source-amplitude/unit_tests/api_data/zipped\.json$|

View File

@@ -0,0 +1,5 @@
*
!Dockerfile
!build
!javabase.sh
!run_with_normalization.sh

View File

@@ -0,0 +1,34 @@
### WARNING ###
# The Java connector Dockerfiles will soon be deprecated.
# This Dockerfile is not used to build the connector image we publish to DockerHub.
# The new logic to build the connector image is declared with Dagger here:
# https://github.com/airbytehq/airbyte/blob/master/tools/ci_connector_ops/ci_connector_ops/pipelines/actions/environments.py#L649
# If you need to add custom logic to build your connector image, you can do so by adding a finalize_build.sh or finalize_build.py script in the connector folder.
# Please reach out to the Connectors Operations team if you have any questions.
ARG JDK_VERSION=17.0.8
FROM amazoncorretto:${JDK_VERSION}
COPY --from=airbyte/integration-base:dev /airbyte /airbyte
RUN yum update -y && yum install -y tar openssl && yum clean all
WORKDIR /airbyte
# Add the Datadog Java APM agent
ADD https://dtdg.co/latest-java-tracer dd-java-agent.jar
COPY javabase.sh .
COPY run_with_normalization.sh .
# airbyte base commands
ENV AIRBYTE_SPEC_CMD "/airbyte/javabase.sh --spec"
ENV AIRBYTE_CHECK_CMD "/airbyte/javabase.sh --check"
ENV AIRBYTE_DISCOVER_CMD "/airbyte/javabase.sh --discover"
ENV AIRBYTE_READ_CMD "/airbyte/javabase.sh --read"
ENV AIRBYTE_WRITE_CMD "/airbyte/javabase.sh --write"
ENV AIRBYTE_ENTRYPOINT "/airbyte/base.sh"
ENTRYPOINT ["/airbyte/base.sh"]
LABEL io.airbyte.version=0.1.2
LABEL io.airbyte.name=airbyte/integration-base-java

View File

@@ -0,0 +1,3 @@
plugins {
id 'airbyte-docker-legacy'
}

View File

@@ -0,0 +1,33 @@
#!/usr/bin/env bash
set -e
# if IS_CAPTURE_HEAP_DUMP_ON_ERROR is set to true, a heap dump will be captured on OutOfMemoryError
if [[ $IS_CAPTURE_HEAP_DUMP_ON_ERROR = true ]]; then
arrayOfSupportedConnectors=("source-postgres" "source-mssql" "source-mysql" )
# The heap dump is only captured when a Java-based connector fails with an OutOfMemoryError
if [[ " ${arrayOfSupportedConnectors[*]} " =~ " $APPLICATION " ]]; then
JAVA_OPTS=$JAVA_OPTS" -XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath=/data/dump.hprof"
export JAVA_OPTS
echo "Added JAVA_OPTS=$JAVA_OPTS"
echo "APPLICATION=$APPLICATION"
fi
fi
#30781 - Allocate 32KB for log4j appender buffer to ensure that each line is logged in a single println
JAVA_OPTS=$JAVA_OPTS" -Dlog4j.encoder.byteBufferSize=32768 -Dlog4j2.configurationFile=log4j2.xml"
#needed because we make ThreadLocal.get(Thread) accessible in IntegrationRunner.stopOrphanedThreads
JAVA_OPTS=$JAVA_OPTS" --add-opens=java.base/java.lang=ALL-UNNAMED"
# tell jooq to be quiet (https://stackoverflow.com/questions/28272284/how-to-disable-jooqs-self-ad-message-in-3-4)
JAVA_OPTS=$JAVA_OPTS" -Dorg.jooq.no-logo=true -Dorg.jooq.no-tips=true"
export JAVA_OPTS
# Wrap run script in a script so that we can lazy evaluate the value of APPLICATION. APPLICATION is
# set by the dockerfile that inherits base-java, so it cannot be evaluated when base-java is built.
# We also need to make sure that stdin of the script is piped to the stdin of the java application.
if [[ $A = --write ]]; then
cat <&0 | /airbyte/bin/"$APPLICATION" "$@"
else
/airbyte/bin/"$APPLICATION" "$@"
fi
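# A minimal usage sketch, assuming the standard Airbyte conventions: images that
# inherit integration-base-java set APPLICATION and keep /airbyte/base.sh as their
# entrypoint, so the AIRBYTE_*_CMD values in the Dockerfile above end up invoking
# this script. The image tag and mounted paths below are illustrative only, not
# taken from this change.
#
#   docker run --rm airbyte/destination-postgres:dev spec
#   docker run --rm -v "$(pwd)/secrets:/secrets" \
#     airbyte/destination-postgres:dev check --config /secrets/config.json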

View File

@@ -0,0 +1,61 @@
#!/bin/bash
# Intentionally no set -e, because we want to run normalization even if the destination fails
set -o pipefail
/airbyte/base.sh $@
destination_exit_code=$?
echo '{"type": "LOG","log":{"level":"INFO","message":"Destination process done (exit code '"$destination_exit_code"')"}}'
# store original args
args=$@
while [ $# -ne 0 ]; do
case "$1" in
--config)
CONFIG_FILE="$2"
shift 2
;;
*)
# move on
shift
;;
esac
done
# restore original args after shifts
set -- $args
USE_1S1T_FORMAT="false"
if [[ -s "$CONFIG_FILE" ]]; then
USE_1S1T_FORMAT=$(jq -r '.use_1s1t_format' "$CONFIG_FILE")
fi
if test "$1" != 'write'
then
normalization_exit_code=0
elif test "$NORMALIZATION_TECHNIQUE" = 'LEGACY' && test "$USE_1S1T_FORMAT" != "true"
then
echo '{"type": "LOG","log":{"level":"INFO","message":"Starting in-connector normalization"}}'
# Normalization tries to create this file from the connector config and crashes if it already exists
# so just nuke it and let normalization recreate it.
# Use -f to avoid error if it doesn't exist, since it's only created for certain SSL modes.
rm -f ca.crt
# the args in a write command are `write --catalog foo.json --config bar.json`
# so if we remove the `write`, we can just pass the rest directly into normalization
/airbyte/entrypoint.sh run ${@:2} --integration-type $AIRBYTE_NORMALIZATION_INTEGRATION | java -cp "/airbyte/lib/*" io.airbyte.cdk.integrations.destination.normalization.NormalizationLogParser
normalization_exit_code=$?
echo '{"type": "LOG","log":{"level":"INFO","message":"In-connector normalization done (exit code '"$normalization_exit_code"')"}}'
else
echo '{"type": "LOG","log":{"level":"INFO","message":"Skipping in-connector normalization"}}'
normalization_exit_code=0
fi
if test $destination_exit_code -ne 0
then
exit $destination_exit_code
elif test $normalization_exit_code -ne 0
then
exit $normalization_exit_code
else
exit 0
fi

View File

@@ -0,0 +1,13 @@
*
!Dockerfile
!entrypoint.sh
!build/sshtunneling.sh
!setup.py
!normalization
!dbt-project-template
!dbt-project-template-mssql
!dbt-project-template-mysql
!dbt-project-template-oracle
!dbt-project-template-clickhouse
!dbt-project-template-snowflake
!dbt-project-template-redshift

View File

@@ -0,0 +1,51 @@
build/
logs/
dbt-project-template/models/generated/
dbt-project-template/test_output.log
dbt_modules/
secrets/
dist/
integration_tests/normalization_test_output/*/*/macros
integration_tests/normalization_test_output/*/*/tests
integration_tests/normalization_test_output/**/*.json
integration_tests/normalization_test_output/**/*.log
integration_tests/normalization_test_output/**/*.md
integration_tests/normalization_test_output/**/*.sql
integration_tests/normalization_test_output/**/*.yml
!integration_tests/normalization_test_output/**/*dbt_project.yml
!integration_tests/normalization_test_output/**/generated/sources.yml
# We keep a minimal/restricted subset of sql files for all destinations to avoid noise in diff
# Simple Streams
!integration_tests/normalization_test_output/**/dedup_exchange_rate*.sql
!integration_tests/normalization_test_output/**/DEDUP_EXCHANGE_RATE*.sql
!integration_tests/normalization_test_output/**/exchange_rate.sql
!integration_tests/normalization_test_output/**/EXCHANGE_RATE.sql
!integration_tests/normalization_test_output/**/test_simple_streams/first_output/airbyte_views/**/multiple_column_names_conflicts_stg.sql
# Nested Streams
# Parent table
!integration_tests/normalization_test_output/**/nested_stream_with*_names_ab*.sql
!integration_tests/normalization_test_output/**/nested_stream_with*_names_scd.sql
!integration_tests/normalization_test_output/**/nested_stream_with*_names.sql
!integration_tests/normalization_test_output/**/NESTED_STREAM_WITH*_NAMES_AB*.sql
!integration_tests/normalization_test_output/**/NESTED_STREAM_WITH*_NAMES_SCD.sql
!integration_tests/normalization_test_output/**/NESTED_STREAM_WITH*_NAMES.sql
# Nested table
!integration_tests/normalization_test_output/**/nested_stream_with_*_partition_ab1.sql
!integration_tests/normalization_test_output/**/nested_stream_with_*_data_ab1.sql
!integration_tests/normalization_test_output/**/nested_stream_with*_partition_scd.sql
!integration_tests/normalization_test_output/**/nested_stream_with*_data_scd.sql
!integration_tests/normalization_test_output/**/nested_stream_with*_partition.sql
!integration_tests/normalization_test_output/**/nested_stream_with*_data.sql
!integration_tests/normalization_test_output/**/NESTED_STREAM_WITH_*_PARTITION_AB1.sql
!integration_tests/normalization_test_output/**/NESTED_STREAM_WITH_*_DATA_AB1.sql
!integration_tests/normalization_test_output/**/NESTED_STREAM_WITH*_PARTITION_SCD.sql
!integration_tests/normalization_test_output/**/NESTED_STREAM_WITH*_DATA_SCD.sql
!integration_tests/normalization_test_output/**/NESTED_STREAM_WITH*_PARTITION.sql
!integration_tests/normalization_test_output/**/NESTED_STREAM_WITH*_DATA.sql
# but we keep all sql files for Postgres
!integration_tests/normalization_test_output/postgres/**/*.sql
integration_tests/normalization_test_output/postgres/**/dbt_data_tests
integration_tests/normalization_test_output/postgres/**/dbt_schema_tests

View File

@@ -0,0 +1,37 @@
FROM fishtownanalytics/dbt:1.0.0
COPY --from=airbyte/base-airbyte-protocol-python:0.1.1 /airbyte /airbyte
# Install SSH Tunneling dependencies
RUN apt-get update && apt-get install -y jq sshpass
WORKDIR /airbyte
COPY entrypoint.sh .
COPY build/sshtunneling.sh .
WORKDIR /airbyte/normalization_code
COPY normalization ./normalization
COPY setup.py .
COPY dbt-project-template/ ./dbt-template/
# Install python dependencies
WORKDIR /airbyte/base_python_structs
# workaround for https://github.com/yaml/pyyaml/issues/601
# this should be fixed in the airbyte/base-airbyte-protocol-python image
RUN pip install "Cython<3.0" "pyyaml==5.4" --no-build-isolation
RUN pip install .
WORKDIR /airbyte/normalization_code
RUN pip install .
WORKDIR /airbyte/normalization_code/dbt-template/
# Download external dbt dependencies
RUN dbt deps
WORKDIR /airbyte
ENV AIRBYTE_ENTRYPOINT "/airbyte/entrypoint.sh"
ENTRYPOINT ["/airbyte/entrypoint.sh"]
LABEL io.airbyte.version=0.4.3
LABEL io.airbyte.name=airbyte/normalization

View File

@@ -0,0 +1,57 @@
plugins {
id 'airbyte-docker-legacy'
id 'airbyte-python'
}
dependencies {
testFixtures(project(':airbyte-cdk:java:airbyte-cdk:airbyte-cdk-dependencies'))
}
// we need to access the sshtunneling script from airbyte-workers for ssh support
def copySshScript = tasks.register('copySshScript', Copy) {
from "${project(':airbyte-cdk:java:airbyte-cdk:airbyte-cdk-dependencies').buildDir}/resources/testFixtures"
into "${buildDir}"
include "sshtunneling.sh"
}
copySshScript.configure {
dependsOn project(':airbyte-cdk:java:airbyte-cdk:airbyte-cdk-dependencies').tasks.named('processTestFixturesResources')
}
// make sure the copy task above worked (if it fails, it fails silently, which is annoying)
def checkSshScriptCopy = tasks.register('checkSshScriptCopy') {
doFirst {
assert file("${buildDir}/sshtunneling.sh").exists() : "Copy of sshtunneling.sh failed."
}
}
checkSshScriptCopy.configure {
dependsOn copySshScript
}
def generate = tasks.register('generate')
generate.configure {
dependsOn checkSshScriptCopy
}
tasks.named('check').configure {
dependsOn generate
}
tasks.named("jar").configure {
dependsOn copySshScript
}
[
'bigquery',
'mysql',
'postgres',
'redshift',
'snowflake',
'oracle',
'mssql',
'clickhouse',
'tidb',
].each {destinationName ->
tasks.matching { it.name == 'integrationTestPython' }.configureEach {
dependsOn project(":airbyte-integrations:connectors:destination-$destinationName").tasks.named('assemble')
}
}

View File

@@ -0,0 +1,36 @@
FROM ghcr.io/dbt-labs/dbt-core:1.3.1
COPY --from=airbyte/base-airbyte-protocol-python:0.1.1 /airbyte /airbyte
# Install SSH Tunneling dependencies
RUN apt-get update && apt-get install -y jq sshpass
WORKDIR /airbyte
COPY entrypoint.sh .
COPY build/sshtunneling.sh .
WORKDIR /airbyte/normalization_code
COPY normalization ./normalization
COPY setup.py .
COPY dbt-project-template/ ./dbt-template/
# Install python dependencies
WORKDIR /airbyte/base_python_structs
# workaround for https://github.com/yaml/pyyaml/issues/601
# this should be fixed in the airbyte/base-airbyte-protocol-python image
RUN pip install "Cython<3.0" "pyyaml==5.4" --no-build-isolation
RUN pip install .
WORKDIR /airbyte/normalization_code
RUN pip install .
WORKDIR /airbyte/normalization_code/dbt-template/
RUN pip install "dbt-clickhouse>=1.4.0"
# Download external dbt dependencies
RUN dbt deps
WORKDIR /airbyte
ENV AIRBYTE_ENTRYPOINT "/airbyte/entrypoint.sh"
ENTRYPOINT ["/airbyte/entrypoint.sh"]
LABEL io.airbyte.name=airbyte/normalization-clickhouse

View File

@@ -0,0 +1,65 @@
# This file is necessary to install dbt-utils with dbt deps
# the content will be overwritten by the transform function
# Name your package! Package names should contain only lowercase characters
# and underscores. A good package name should reflect your organization's
# name or the intended use of these models
name: "airbyte_utils"
version: "1.0"
config-version: 2
# This setting configures which "profile" dbt uses for this project. Profiles contain
# database connection information, and should be configured in the ~/.dbt/profiles.yml file
profile: "normalize"
# These configurations specify where dbt should look for different types of files.
# The `model-paths` config, for example, states that source models can be found
# in the "models/" directory. You probably won't need to change these!
model-paths: ["models"]
docs-paths: ["docs"]
analysis-paths: ["analysis"]
test-paths: ["tests"]
seed-paths: ["data"]
macro-paths: ["macros"]
target-path: "../build" # directory which will store compiled SQL files
log-path: "../logs" # directory which will store DBT logs
packages-install-path: "/dbt" # directory which will store external DBT dependencies
clean-targets: # directories to be removed by `dbt clean`
- "build"
- "dbt_modules"
quoting:
database: true
# Temporarily disabling the behavior of the ExtendedNameTransformer on table/schema names, see (issue #1785)
# all schemas should be unquoted
schema: true
identifier: true
# You can define configurations for models in the `model-paths` directory here.
# Using these configurations, you can enable or disable models, change how they
# are materialized, and more!
models:
airbyte_utils:
+materialized: table
generated:
airbyte_ctes:
+tags: airbyte_internal_cte
# ephemeral materialization isn't supported in ClickHouse yet
+materialized: view
airbyte_incremental:
+tags: incremental_tables
+materialized: incremental
# schema change test isn't supported in ClickHouse yet
+on_schema_change: "ignore"
airbyte_tables:
+tags: normalized_tables
+materialized: table
airbyte_views:
+tags: airbyte_internal_views
+materialized: view
dispatch:
- macro_namespace: dbt_utils
search_order: ["airbyte_utils", "dbt_utils"]

View File

@@ -0,0 +1,5 @@
# add dependencies. these will get pulled during the `dbt deps` process.
packages:
- git: "https://github.com/fishtown-analytics/dbt-utils.git"
revision: 0.8.2

View File

@@ -0,0 +1,63 @@
# This file is necessary to install dbt-utils with dbt deps
# the content will be overwritten by the transform function
# Name your package! Package names should contain only lowercase characters
# and underscores. A good package name should reflect your organization's
# name or the intended use of these models
name: "airbyte_utils"
version: "1.0"
config-version: 2
# This setting configures which "profile" dbt uses for this project. Profiles contain
# database connection information, and should be configured in the ~/.dbt/profiles.yml file
profile: "normalize"
# These configurations specify where dbt should look for different types of files.
# The `model-paths` config, for example, states that source models can be found
# in the "models/" directory. You probably won't need to change these!
model-paths: ["models"]
docs-paths: ["docs"]
analysis-paths: ["analysis"]
test-paths: ["tests"]
seed-paths: ["data"]
macro-paths: ["macros"]
target-path: "../build" # directory which will store compiled SQL files
log-path: "../logs" # directory which will store DBT logs
packages-install-path: "/dbt" # directory which will store external DBT dependencies
clean-targets: # directories to be removed by `dbt clean`
- "build"
- "dbt_modules"
quoting:
database: true
# Temporarily disabling the behavior of the ExtendedNameTransformer on table/schema names, see (issue #1785)
# all schemas should be unquoted
schema: false
identifier: true
# You can define configurations for models in the `model-paths` directory here.
# Using these configurations, you can enable or disable models, change how they
# are materialized, and more!
models:
airbyte_utils:
+materialized: table
generated:
airbyte_ctes:
+tags: airbyte_internal_cte
+materialized: ephemeral
airbyte_incremental:
+tags: incremental_tables
+materialized: incremental
+on_schema_change: sync_all_columns
airbyte_tables:
+tags: normalized_tables
+materialized: table
airbyte_views:
+tags: airbyte_internal_views
+materialized: view
dispatch:
- macro_namespace: dbt_utils
search_order: ["airbyte_utils", "dbt_utils"]

View File

@@ -0,0 +1,5 @@
# add dependencies. these will get pulled during the `dbt deps` process.
packages:
- git: "https://github.com/fishtown-analytics/dbt-utils.git"
revision: 0.8.2

View File

@@ -0,0 +1,61 @@
# This file is necessary to install dbt-utils with dbt deps
# the content will be overwritten by the transform function
# Name your package! Package names should contain only lowercase characters
# and underscores. A good package name should reflect your organization's
# name or the intended use of these models
name: "airbyte_utils"
version: "1.0"
config-version: 2
# This setting configures which "profile" dbt uses for this project. Profiles contain
# database connection information, and should be configured in the ~/.dbt/profiles.yml file
profile: "normalize"
# These configurations specify where dbt should look for different types of files.
# The `model-paths` config, for example, states that source models can be found
# in the "models/" directory. You probably won't need to change these!
model-paths: ["models"]
docs-paths: ["docs"]
analysis-paths: ["analysis"]
test-paths: ["tests"]
seed-paths: ["data"]
macro-paths: ["macros"]
target-path: "../build" # directory which will store compiled SQL files
log-path: "../logs" # directory which will store DBT logs
packages-install-path: "/dbt" # directory which will store external DBT dependencies
clean-targets: # directories to be removed by `dbt clean`
- "build"
- "dbt_modules"
quoting:
database: true
# Temporarily disabling the behavior of the ExtendedNameTransformer on table/schema names, see (issue #1785)
# all schemas should be unquoted
schema: false
identifier: true
# You can define configurations for models in the `model-paths` directory here.
# Using these configurations, you can enable or disable models, change how they
# are materialized, and more!
models:
airbyte_utils:
+materialized: table
generated:
airbyte_ctes:
+tags: airbyte_internal_cte
+materialized: ephemeral
airbyte_incremental:
+tags: incremental_tables
+materialized: incremental
airbyte_tables:
+tags: normalized_tables
+materialized: table
airbyte_views:
+tags: airbyte_internal_views
+materialized: view
vars:
dbt_utils_dispatch_list: ["airbyte_utils"]

View File

@@ -0,0 +1,5 @@
# add dependencies. these will get pulled during the `dbt deps` process.
packages:
- git: "https://github.com/fishtown-analytics/dbt-utils.git"
revision: 0.8.2

View File

@@ -0,0 +1,63 @@
# This file is necessary to install dbt-utils with dbt deps
# the content will be overwritten by the transform function
# Name your package! Package names should contain only lowercase characters
# and underscores. A good package name should reflect your organization's
# name or the intended use of these models
name: "airbyte_utils"
version: "1.0"
config-version: 2
# This setting configures which "profile" dbt uses for this project. Profiles contain
# database connection information, and should be configured in the ~/.dbt/profiles.yml file
profile: "normalize"
# These configurations specify where dbt should look for different types of files.
# The `model-paths` config, for example, states that source models can be found
# in the "models/" directory. You probably won"t need to change these!
model-paths: ["models"]
docs-paths: ["docs"]
analysis-paths: ["analysis"]
test-paths: ["tests"]
seed-paths: ["data"]
macro-paths: ["macros"]
target-path: "../build" # directory which will store compiled SQL files
log-path: "../logs" # directory which will store DBT logs
packages-install-path: "/dbt" # directory which will store external DBT dependencies
clean-targets: # directories to be removed by `dbt clean`
- "build"
- "dbt_modules"
quoting:
database: true
# Temporarily disabling the behavior of the ExtendedNameTransformer on table/schema names, see (issue #1785)
# all schemas should be unquoted
schema: false
identifier: true
# You can define configurations for models in the `model-paths` directory here.
# Using these configurations, you can enable or disable models, change how they
# are materialized, and more!
models:
airbyte_utils:
+materialized: table
generated:
airbyte_ctes:
+tags: airbyte_internal_cte
+materialized: ephemeral
airbyte_incremental:
+tags: incremental_tables
# incremental is not enabled for MySql yet
#+materialized: incremental
+materialized: table
airbyte_tables:
+tags: normalized_tables
+materialized: table
airbyte_views:
+tags: airbyte_internal_views
+materialized: view
vars:
dbt_utils_dispatch_list: ["airbyte_utils"]

View File

@@ -0,0 +1,5 @@
# add dependencies. these will get pulled during the `dbt deps` process.
packages:
- git: "https://github.com/fishtown-analytics/dbt-utils.git"
revision: 0.8.2

View File

@@ -0,0 +1,61 @@
# This file is necessary to install dbt-utils with dbt deps
# the content will be overwritten by the transform function
# Name your package! Package names should contain only lowercase characters
# and underscores. A good package name should reflect your organization's
# name or the intended use of these models
name: "airbyte_utils"
version: "1.0"
config-version: 2
# This setting configures which "profile" dbt uses for this project. Profiles contain
# database connection information, and should be configured in the ~/.dbt/profiles.yml file
profile: "normalize"
# These configurations specify where dbt should look for different types of files.
# The `source-paths` config, for example, states that source models can be found
# in the "models/" directory. You probably won't need to change these!
source-paths: ["models"]
docs-paths: ["docs"]
analysis-paths: ["analysis"]
test-paths: ["tests"]
data-paths: ["data"]
macro-paths: ["macros"]
target-path: "../build" # directory which will store compiled SQL files
log-path: "../logs" # directory which will store DBT logs
modules-path: "/dbt" # directory which will store external DBT dependencies
clean-targets: # directories to be removed by `dbt clean`
- "build"
- "dbt_modules"
quoting:
database: false
schema: false
identifier: false
# You can define configurations for models in the `source-paths` directory here.
# Using these configurations, you can enable or disable models, change how they
# are materialized, and more!
models:
airbyte_utils:
+materialized: table
generated:
airbyte_ctes:
+tags: airbyte_internal_cte
+materialized: ephemeral
airbyte_incremental:
+tags: incremental_tables
# incremental is not enabled for Oracle yet
#+materialized: incremental
+materialized: table
airbyte_tables:
+tags: normalized_tables
+materialized: table
airbyte_views:
+tags: airbyte_internal_views
+materialized: view
vars:
dbt_utils_dispatch_list: ["airbyte_utils"]

View File

@@ -0,0 +1,5 @@
# add dependencies. these will get pulled during the `dbt deps` process.
packages:
- git: "https://github.com/fishtown-analytics/dbt-utils.git"
revision: 0.6.4

View File

@@ -0,0 +1,66 @@
# This file is necessary to install dbt-utils with dbt deps
# the content will be overwritten by the transform function
# Name your package! Package names should contain only lowercase characters
# and underscores. A good package name should reflect your organization's
# name or the intended use of these models
name: "airbyte_utils"
version: "1.0"
config-version: 2
# This setting configures which "profile" dbt uses for this project. Profiles contain
# database connection information, and should be configured in the ~/.dbt/profiles.yml file
profile: "normalize"
# These configurations specify where dbt should look for different types of files.
# The `model-paths` config, for example, states that source models can be found
# in the "models/" directory. You probably won't need to change these!
model-paths: ["models"]
docs-paths: ["docs"]
analysis-paths: ["analysis"]
test-paths: ["tests"]
seed-paths: ["data"]
macro-paths: ["macros"]
target-path: "../build" # directory which will store compiled SQL files
log-path: "../logs" # directory which will store DBT logs
packages-install-path: "/dbt" # directory which will store external DBT dependencies
clean-targets: # directories to be removed by `dbt clean`
- "build"
- "dbt_modules"
quoting:
database: true
# Temporarily disabling the behavior of the ExtendedNameTransformer on table/schema names, see (issue #1785)
# all schemas should be unquoted
schema: false
identifier: true
# You can define configurations for models in the `model-paths` directory here.
# Using these configurations, you can enable or disable models, change how they
# are materialized, and more!
models:
+transient: false
# https://docs.aws.amazon.com/redshift/latest/dg/super-configurations.html
+pre-hook: "SET enable_case_sensitive_identifier to TRUE"
airbyte_utils:
+materialized: table
generated:
airbyte_ctes:
+tags: airbyte_internal_cte
+materialized: ephemeral
airbyte_incremental:
+tags: incremental_tables
+materialized: incremental
+on_schema_change: sync_all_columns
airbyte_tables:
+tags: normalized_tables
+materialized: table
airbyte_views:
+tags: airbyte_internal_views
+materialized: view
dispatch:
- macro_namespace: dbt_utils
search_order: ["airbyte_utils", "dbt_utils"]

View File

@@ -0,0 +1,64 @@
# This file is necessary to install dbt-utils with dbt deps
# the content will be overwritten by the transform function
# Name your package! Package names should contain only lowercase characters
# and underscores. A good package name should reflect your organization's
# name or the intended use of these models
name: "airbyte_utils"
version: "1.0"
config-version: 2
# This setting configures which "profile" dbt uses for this project. Profiles contain
# database connection information, and should be configured in the ~/.dbt/profiles.yml file
profile: "normalize"
# These configurations specify where dbt should look for different types of files.
# The `model-paths` config, for example, states that source models can be found
# in the "models/" directory. You probably won't need to change these!
model-paths: ["models"]
docs-paths: ["docs"]
analysis-paths: ["analysis"]
test-paths: ["tests"]
seed-paths: ["data"]
macro-paths: ["macros"]
target-path: "../build" # directory which will store compiled SQL files
log-path: "../logs" # directory which will store DBT logs
packages-install-path: "/dbt" # directory which will store external DBT dependencies
clean-targets: # directories to be removed by `dbt clean`
- "build"
- "dbt_modules"
quoting:
database: true
# Temporarily disabling the behavior of the ExtendedNameTransformer on table/schema names, see (issue #1785)
# all schemas should be unquoted
schema: false
identifier: true
# You can define configurations for models in the `model-paths` directory here.
# Using these configurations, you can enable or disable models, change how they
# are materialized, and more!
models:
+transient: false
airbyte_utils:
+materialized: table
generated:
airbyte_ctes:
+tags: airbyte_internal_cte
+materialized: ephemeral
airbyte_incremental:
+tags: incremental_tables
+materialized: incremental
+on_schema_change: sync_all_columns
airbyte_tables:
+tags: normalized_tables
+materialized: table
airbyte_views:
+tags: airbyte_internal_views
+materialized: view
dispatch:
- macro_namespace: dbt_utils
search_order: ["airbyte_utils", "dbt_utils"]

View File

@@ -0,0 +1,61 @@
# This file is necessary to install dbt-utils with dbt deps
# the content will be overwritten by the transform function
# Name your package! Package names should contain only lowercase characters
# and underscores. A good package name should reflect your organization's
# name or the intended use of these models
name: "airbyte_utils"
version: "1.0"
config-version: 2
# This setting configures which "profile" dbt uses for this project. Profiles contain
# database connection information, and should be configured in the ~/.dbt/profiles.yml file
profile: "normalize"
# These configurations specify where dbt should look for different types of files.
# The `model-paths` config, for example, states that source models can be found
# in the "models/" directory. You probably won"t need to change these!
model-paths: ["models"]
docs-paths: ["docs"]
analysis-paths: ["analysis"]
test-paths: ["tests"]
seed-paths: ["data"]
macro-paths: ["macros"]
target-path: "../build" # directory which will store compiled SQL files
log-path: "../logs" # directory which will store DBT logs
packages-install-path: "/dbt" # directory which will store external DBT dependencies
clean-targets: # directories to be removed by `dbt clean`
- "build"
- "dbt_modules"
quoting:
database: true
# Temporarily disabling the behavior of the ExtendedNameTransformer on table/schema names, see (issue #1785)
# all schemas should be unquoted
schema: false
identifier: true
# You can define configurations for models in the `model-paths` directory here.
# Using these configurations, you can enable or disable models, change how they
# are materialized, and more!
models:
airbyte_utils:
+materialized: table
generated:
airbyte_ctes:
+tags: airbyte_internal_cte
+materialized: ephemeral
airbyte_incremental:
+tags: incremental_tables
+materialized: incremental
airbyte_tables:
+tags: normalized_tables
+materialized: table
airbyte_views:
+tags: airbyte_internal_views
+materialized: view
vars:
dbt_utils_dispatch_list: ["airbyte_utils"]

View File

@@ -0,0 +1,5 @@
# add dependencies. these will get pulled during the `dbt deps` process.
packages:
- git: "https://github.com/fishtown-analytics/dbt-utils.git"
revision: 0.8.2

View File

@@ -0,0 +1,19 @@
## Installing dbt
1. Activate your venv and run `pip3 install dbt`
1. Copy `airbyte-normalization/sample_files/profiles.yml` over to `~/.dbt/profiles.yml`
1. Edit to configure your profiles accordingly
## Running dbt
1. `cd airbyte-normalization`
1. You can now run dbt commands. To check that the setup is fine: `dbt debug`
1. To build the dbt tables in your warehouse: `dbt run`
## Running dbt from Airbyte generated config
1. You can also change directory (`cd /tmp/dev_root/workspace/1/0/normalize` for example) to one of the workspaces generated by Airbyte, inside one of the `normalize` folders.
1. You should find `profiles.yml` and a bunch of other dbt files/folders created there.
1. To check everything is set up properly: `dbt debug --profiles-dir=$(pwd) --project-dir=$(pwd)`
1. You can modify the `.sql` files and run `dbt run --profiles-dir=$(pwd) --project-dir=$(pwd)` too
1. You can inspect the compiled dbt `.sql` files, before they are run against the destination engine, in the `normalize/build/compiled` or `normalize/build/run` folders
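
Putting the commands above together, a minimal sketch of the generated-config workflow (the workspace path is just the example path used above; yours will differ):

    cd /tmp/dev_root/workspace/1/0/normalize
    dbt debug --profiles-dir=$(pwd) --project-dir=$(pwd)
    dbt run --profiles-dir=$(pwd) --project-dir=$(pwd)
    # compiled SQL ends up under build/compiled and build/run before it reaches the warehouse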

View File

@@ -0,0 +1,63 @@
# This file is necessary to install dbt-utils with dbt deps
# the content will be overwritten by the transform function
# Name your package! Package names should contain only lowercase characters
# and underscores. A good package name should reflect your organization's
# name or the intended use of these models
name: "airbyte_utils"
version: "1.0"
config-version: 2
# This setting configures which "profile" dbt uses for this project. Profiles contain
# database connection information, and should be configured in the ~/.dbt/profiles.yml file
profile: "normalize"
# These configurations specify where dbt should look for different types of files.
# The `model-paths` config, for example, states that source models can be found
# in the "models/" directory. You probably won't need to change these!
model-paths: ["models"]
docs-paths: ["docs"]
analysis-paths: ["analysis"]
test-paths: ["tests"]
seed-paths: ["data"]
macro-paths: ["macros"]
target-path: "../build" # directory which will store compiled SQL files
log-path: "../logs" # directory which will store DBT logs
packages-install-path: "/dbt" # directory which will store external DBT dependencies
clean-targets: # directories to be removed by `dbt clean`
- "build"
- "dbt_modules"
quoting:
database: true
# Temporarily disabling the behavior of the ExtendedNameTransformer on table/schema names, see (issue #1785)
# all schemas should be unquoted
schema: false
identifier: true
# You can define configurations for models in the `model-paths` directory here.
# Using these configurations, you can enable or disable models, change how they
# are materialized, and more!
models:
airbyte_utils:
+materialized: table
generated:
airbyte_ctes:
+tags: airbyte_internal_cte
+materialized: ephemeral
airbyte_incremental:
+tags: incremental_tables
+materialized: incremental
+on_schema_change: sync_all_columns
airbyte_tables:
+tags: normalized_tables
+materialized: table
airbyte_views:
+tags: airbyte_internal_views
+materialized: view
dispatch:
- macro_namespace: dbt_utils
search_order: ["airbyte_utils", "dbt_utils"]

View File

@@ -0,0 +1,19 @@
{% macro clean_tmp_tables(schemas) -%}
{{ adapter.dispatch('clean_tmp_tables')(schemas) }}
{%- endmacro %}
-- default
{% macro default__clean_tmp_tables(schemas) -%}
{% do exceptions.warn("\tINFO: CLEANING TEST LEFTOVERS IS NOT IMPLEMENTED FOR THIS DESTINATION. CONSIDER REMOVING TEST TABLES MANUALLY.\n") %}
{%- endmacro %}
-- for redshift
{% macro redshift__clean_tmp_tables(schemas) %}
{%- for tmp_schema in schemas -%}
{% do log("\tDROP SCHEMA IF EXISTS " ~ tmp_schema, info=True) %}
{%- set drop_query -%}
drop schema if exists {{ tmp_schema }} cascade;
{%- endset -%}
{%- do run_query(drop_query) -%}
{%- endfor -%}
{% endmacro %}

View File

@@ -0,0 +1,173 @@
{#
Adapter Macros for the following functions:
- Bigquery: unnest() -> https://cloud.google.com/bigquery/docs/reference/standard-sql/arrays#flattening-arrays-and-repeated-fields
- Snowflake: flatten() -> https://docs.snowflake.com/en/sql-reference/functions/flatten.html
- Redshift: -> https://blog.getdbt.com/how-to-unnest-arrays-in-redshift/
- postgres: unnest() -> https://www.postgresqltutorial.com/postgresql-array/
- MSSQL: openjson() > https://docs.microsoft.com/en-us/sql/relational-databases/json/validate-query-and-change-json-data-with-built-in-functions-sql-server?view=sql-server-ver15
- ClickHouse: ARRAY JOIN > https://clickhouse.com/docs/zh/sql-reference/statements/select/array-join/
#}
{# cross_join_unnest ------------------------------------------------- #}
{% macro cross_join_unnest(stream_name, array_col) -%}
{{ adapter.dispatch('cross_join_unnest')(stream_name, array_col) }}
{%- endmacro %}
{% macro default__cross_join_unnest(stream_name, array_col) -%}
{% do exceptions.warn("Undefined macro cross_join_unnest for this destination engine") %}
{%- endmacro %}
{% macro bigquery__cross_join_unnest(stream_name, array_col) -%}
cross join unnest({{ array_col }}) as {{ array_col }}
{%- endmacro %}
{% macro clickhouse__cross_join_unnest(stream_name, array_col) -%}
ARRAY JOIN {{ array_col }}
{%- endmacro %}
{% macro oracle__cross_join_unnest(stream_name, array_col) -%}
{% do exceptions.warn("Normalization does not support unnesting for Oracle yet.") %}
{%- endmacro %}
{% macro postgres__cross_join_unnest(stream_name, array_col) -%}
cross join jsonb_array_elements(
case jsonb_typeof({{ array_col }})
when 'array' then {{ array_col }}
else '[]' end
) as _airbyte_nested_data
{%- endmacro %}
{% macro mysql__cross_join_unnest(stream_name, array_col) -%}
left join joined on _airbyte_{{ stream_name }}_hashid = joined._airbyte_hashid
{%- endmacro %}
{% macro tidb__cross_join_unnest(stream_name, array_col) -%}
left join joined on _airbyte_{{ stream_name }}_hashid = joined._airbyte_hashid
{%- endmacro %}
{% macro duckdb__cross_join_unnest(stream_name, array_col) -%}
left join joined on _airbyte_{{ stream_name }}_hashid = joined._airbyte_hashid
{%- endmacro %}
{% macro redshift__cross_join_unnest(stream_name, array_col) -%}
left join joined on _airbyte_{{ stream_name }}_hashid = joined._airbyte_hashid
{%- endmacro %}
{% macro snowflake__cross_join_unnest(stream_name, array_col) -%}
cross join table(flatten({{ array_col }})) as {{ array_col }}
{%- endmacro %}
{% macro sqlserver__cross_join_unnest(stream_name, array_col) -%}
{# https://docs.microsoft.com/en-us/sql/relational-databases/json/convert-json-data-to-rows-and-columns-with-openjson-sql-server?view=sql-server-ver15#option-1---openjson-with-the-default-output #}
CROSS APPLY (
SELECT [value] = CASE
WHEN [type] = 4 THEN (SELECT [value] FROM OPENJSON([value]))
WHEN [type] = 5 THEN [value]
END
FROM OPENJSON({{ array_col }})
) AS {{ array_col }}
{%- endmacro %}
{# unnested_column_value -- this macro is related to unnest_cte #}
{% macro unnested_column_value(column_col) -%}
{{ adapter.dispatch('unnested_column_value')(column_col) }}
{%- endmacro %}
{% macro default__unnested_column_value(column_col) -%}
{{ column_col }}
{%- endmacro %}
{% macro postgres__unnested_column_value(column_col) -%}
_airbyte_nested_data
{%- endmacro %}
{% macro snowflake__unnested_column_value(column_col) -%}
{{ column_col }}.value
{%- endmacro %}
{% macro redshift__unnested_column_value(column_col) -%}
_airbyte_nested_data
{%- endmacro %}
{% macro mysql__unnested_column_value(column_col) -%}
_airbyte_nested_data
{%- endmacro %}
{% macro tidb__unnested_column_value(column_col) -%}
_airbyte_nested_data
{%- endmacro %}
{% macro duckdb__unnested_column_value(column_col) -%}
_airbyte_nested_data
{%- endmacro %}
{% macro oracle__unnested_column_value(column_col) -%}
{{ column_col }}
{%- endmacro %}
{% macro sqlserver__unnested_column_value(column_col) -%}
{# unnested array/sub_array will be located in the `value` column afterwards, so we need to reference it #}
{{ column_col }}.value
{%- endmacro %}
{# unnest_cte ------------------------------------------------- #}
{% macro unnest_cte(from_table, stream_name, column_col) -%}
{{ adapter.dispatch('unnest_cte')(from_table, stream_name, column_col) }}
{%- endmacro %}
{% macro default__unnest_cte(from_table, stream_name, column_col) -%}{%- endmacro %}
{% macro redshift__unnest_cte(from_table, stream_name, column_col) -%}
{# -- based on https://docs.aws.amazon.com/redshift/latest/dg/query-super.html #}
with joined as (
select
table_alias._airbyte_{{ stream_name }}_hashid as _airbyte_hashid,
_airbyte_nested_data
from {{ from_table }} as table_alias, table_alias.{{ column_col }} as _airbyte_nested_data
)
{%- endmacro %}
{% macro mysql__unnest_cte(from_table, stream_name, column_col) -%}
{%- if not execute -%}
{{ return('') }}
{% endif %}
{%- call statement('max_json_array_length', fetch_result=True) -%}
with max_value as (
select max(json_length({{ column_col }})) as max_number_of_items
from {{ from_table }}
)
select
case when max_number_of_items is not null and max_number_of_items > 1
then max_number_of_items
else 1 end as max_number_of_items
from max_value
{%- endcall -%}
{%- set max_length = load_result('max_json_array_length') -%}
with numbers as (
{{ dbt_utils.generate_series(max_length["data"][0][0]) }}
),
joined as (
select
_airbyte_{{ stream_name }}_hashid as _airbyte_hashid,
{# -- json_extract(column_col, '$[i][0]') as _airbyte_nested_data #}
json_extract({{ column_col }}, concat("$[", numbers.generated_number - 1, "][0]")) as _airbyte_nested_data
from {{ from_table }}
cross join numbers
-- only generate the number of records in the cross join that corresponds
-- to the number of items in {{ from_table }}.{{ column_col }}
where numbers.generated_number <= json_length({{ column_col }})
)
{%- endmacro %}
{% macro tidb__unnest_cte(from_table, stream_name, column_col) -%}
{{ mysql__unnest_cte(from_table, stream_name, column_col) }}
{%- endmacro %}
{% macro duckdb__unnest_cte(from_table, stream_name, column_col) -%}
{{ mysql__unnest_cte(from_table, stream_name, column_col) }}
{%- endmacro %}

View File

@@ -0,0 +1,36 @@
{#
concat in dbt 0.6.4 used to work fine for bigquery but the new implementation in 0.7.3 is less scalable (cannot handle too many columns)
Therefore, we revert the implementation here and add versions for missing destinations
#}
{% macro concat(fields) -%}
{{ adapter.dispatch('concat')(fields) }}
{%- endmacro %}
{% macro bigquery__concat(fields) -%}
{#-- concat() in SQL bigquery scales better with number of columns than using the '||' operator --#}
concat({{ fields|join(', ') }})
{%- endmacro %}
{% macro mysql__concat(fields) -%}
{#-- MySQL doesn't support the '||' operator as concatenation by default --#}
concat({{ fields|join(', ') }})
{%- endmacro %}
{% macro sqlserver__concat(fields) -%}
{#-- CONCAT() in SQL SERVER accepts from 2 to 254 arguments; we batch the arguments of the main concat to stay under that limit. --#}
{% set concat_chunks = [] %}
{% for chunk in fields|batch(253) -%}
{% set _ = concat_chunks.append( "concat(" ~ chunk|join(', ') ~ ",'')" ) %}
{% endfor %}
concat({{ concat_chunks|join(', ') }}, '')
{%- endmacro %}
{% macro tidb__concat(fields) -%}
concat({{ fields|join(', ') }})
{%- endmacro %}
{% macro duckdb__concat(fields) -%}
concat({{ fields|join(', ') }})
{%- endmacro %}

View File

@@ -0,0 +1,7 @@
{% macro mysql__current_timestamp() %}
CURRENT_TIMESTAMP
{% endmacro %}
{% macro oracle__current_timestamp() %}
CURRENT_TIMESTAMP
{% endmacro %}

View File

@@ -0,0 +1,394 @@
{# json ------------------------------------------------- #}
{%- macro type_json() -%}
{{ adapter.dispatch('type_json')() }}
{%- endmacro -%}
{% macro default__type_json() %}
string
{% endmacro %}
{%- macro redshift__type_json() -%}
super
{%- endmacro -%}
{% macro postgres__type_json() %}
jsonb
{% endmacro %}
{%- macro oracle__type_json() -%}
varchar2(4000)
{%- endmacro -%}
{% macro snowflake__type_json() %}
variant
{% endmacro %}
{%- macro mysql__type_json() -%}
json
{%- endmacro -%}
{%- macro sqlserver__type_json() -%}
NVARCHAR(max)
{%- endmacro -%}
{% macro clickhouse__type_json() %}
String
{% endmacro %}
{%- macro tidb__type_json() -%}
json
{%- endmacro -%}
{%- macro duckdb__type_json() -%}
json
{%- endmacro -%}
{# string ------------------------------------------------- #}
{%- macro mysql__type_string() -%}
char
{%- endmacro -%}
{%- macro oracle__type_string() -%}
varchar2(4000)
{%- endmacro -%}
{% macro sqlserver__type_string() %}
NVARCHAR(max)
{%- endmacro -%}
{%- macro clickhouse__type_string() -%}
String
{%- endmacro -%}
{#-- TODO: Remove this macro when dbt issue regarding unlimited varchars on postgres is resolved (https://github.com/dbt-labs/dbt-core/issues/5238) and we've upgraded to the latest version of dbt --#}
{%- macro postgres__type_string() -%}
text
{%- endmacro -%}
{%- macro tidb__type_string() -%}
char(1000)
{%- endmacro -%}
{%- macro duckdb__type_string() -%}
VARCHAR
{%- endmacro -%}
{# float ------------------------------------------------- #}
{% macro mysql__type_float() %}
float
{% endmacro %}
{% macro oracle__type_float() %}
float
{% endmacro %}
{% macro clickhouse__type_float() %}
Float64
{% endmacro %}
{% macro tidb__type_float() %}
float
{% endmacro %}
{% macro duckdb__type_float() %}
DOUBLE
{% endmacro %}
{# int ------------------------------------------------- #}
{% macro default__type_int() %}
int
{% endmacro %}
{% macro mysql__type_int() %}
signed
{% endmacro %}
{% macro oracle__type_int() %}
int
{% endmacro %}
{% macro clickhouse__type_int() %}
INT
{% endmacro %}
{% macro tidb__type_int() %}
signed
{% endmacro %}
{% macro duckdb__type_int() %}
INTEGER
{% endmacro %}
{# bigint ------------------------------------------------- #}
{% macro mysql__type_bigint() %}
signed
{% endmacro %}
{% macro oracle__type_bigint() %}
numeric
{% endmacro %}
{% macro clickhouse__type_bigint() %}
BIGINT
{% endmacro %}
{% macro tidb__type_bigint() %}
signed
{% endmacro %}
{% macro duckdb__type_bigint() %}
BIGINT
{% endmacro %}
{# numeric ------------------------------------------------- --#}
{% macro mysql__type_numeric() %}
float
{% endmacro %}
{% macro clickhouse__type_numeric() %}
Float64
{% endmacro %}
{% macro tidb__type_numeric() %}
float
{% endmacro %}
{% macro duckdb__type_numeric() %}
DOUBLE
{% endmacro %}
{# very_large_integer --------------------------------------- --#}
{#
Most databases don't have a true unbounded numeric datatype, so we use a really big numeric field.
Our type terminology unfortunately collides with DB terminology (i.e. "big_integer" means different things in different contexts)
so this macro needs to be called very_large_integer.
#}
{%- macro type_very_large_integer() -%}
{{ adapter.dispatch('type_very_large_integer')() }}
{%- endmacro -%}
{% macro default__type_very_large_integer() %}
numeric
{% endmacro %}
{% macro snowflake__type_very_large_integer() %}
numeric
{% endmacro %}
{% macro mysql__type_very_large_integer() %}
decimal(38, 0)
{% endmacro %}
{% macro clickhouse__type_very_large_integer() %}
decimal128(0)
{% endmacro %}
{% macro tidb__type_very_large_integer() %}
decimal(38, 0)
{% endmacro %}
{% macro duckdb__type_very_large_integer() %}
DECIMAL(38, 0)
{% endmacro %}
{# timestamp ------------------------------------------------- --#}
{% macro mysql__type_timestamp() %}
time
{% endmacro %}
{%- macro sqlserver__type_timestamp() -%}
{#-- in TSQL timestamp is really datetime --#}
{#-- https://docs.microsoft.com/en-us/sql/t-sql/functions/date-and-time-data-types-and-functions-transact-sql?view=sql-server-ver15#DateandTimeDataTypes --#}
datetime
{%- endmacro -%}
{% macro clickhouse__type_timestamp() %}
DateTime64
{% endmacro %}
{% macro tidb__type_timestamp() %}
time
{% endmacro %}
{% macro duckdb__type_timestamp() %}
TIMESTAMP
{% endmacro %}
{# timestamp with time zone ------------------------------------------------- #}
{%- macro type_timestamp_with_timezone() -%}
{{ adapter.dispatch('type_timestamp_with_timezone')() }}
{%- endmacro -%}
{% macro default__type_timestamp_with_timezone() %}
timestamp with time zone
{% endmacro %}
{% macro bigquery__type_timestamp_with_timezone() %}
timestamp
{% endmacro %}
{#-- MySQL doesn't allow a cast with nullif to work with DATETIME and doesn't support storing time zones, so we have to use char --#}
{#-- https://bugs.mysql.com/bug.php?id=77805 --#}
{%- macro mysql__type_timestamp_with_timezone() -%}
char(1024)
{%- endmacro -%}
{% macro oracle__type_timestamp_with_timezone() %}
varchar2(4000)
{% endmacro %}
{%- macro sqlserver__type_timestamp_with_timezone() -%}
datetimeoffset
{%- endmacro -%}
{% macro redshift__type_timestamp_with_timezone() %}
TIMESTAMPTZ
{% endmacro %}
{% macro clickhouse__type_timestamp_with_timezone() %}
DateTime64
{% endmacro %}
{%- macro tidb__type_timestamp_with_timezone() -%}
char(1000)
{%- endmacro -%}
{%- macro duckdb__type_timestamp_with_timezone() -%}
TIMESTAMPTZ
{%- endmacro -%}
{# timestamp without time zone ------------------------------------------------- #}
{%- macro type_timestamp_without_timezone() -%}
{{ adapter.dispatch('type_timestamp_without_timezone')() }}
{%- endmacro -%}
{% macro default__type_timestamp_without_timezone() %}
timestamp
{% endmacro %}
{%- macro sqlserver__type_timestamp_without_timezone() -%}
{#-- in TSQL timestamp is really datetime or datetime2 --#}
{#-- https://docs.microsoft.com/en-us/sql/t-sql/functions/date-and-time-data-types-and-functions-transact-sql?view=sql-server-ver15#DateandTimeDataTypes --#}
datetime2
{%- endmacro -%}
{% macro bigquery__type_timestamp_without_timezone() %}
datetime
{% endmacro %}
{% macro oracle__type_timestamp_without_timezone() %}
varchar2(4000)
{% endmacro %}
{% macro redshift__type_timestamp_without_timezone() %}
TIMESTAMP
{% endmacro %}
{% macro tidb__type_timestamp_without_timezone() %}
datetime
{% endmacro %}
{% macro duckdb__type_timestamp_without_timezone() %}
TIMESTAMP
{% endmacro %}
{# time without time zone ------------------------------------------------- #}
{%- macro type_time_without_timezone() -%}
{{ adapter.dispatch('type_time_without_timezone')() }}
{%- endmacro -%}
{% macro default__type_time_without_timezone() %}
time
{% endmacro %}
{% macro oracle__type_time_without_timezone() %}
varchar2(4000)
{% endmacro %}
{% macro redshift__type_time_without_timezone() %}
TIME
{% endmacro %}
{% macro clickhouse__type_time_without_timezone() %}
String
{% endmacro %}
{% macro tidb__type_time_without_timezone() %}
time
{% endmacro %}
{% macro duckdb__type_time_without_timezone() %}
TIMESTAMP
{% endmacro %}
{# time with time zone ------------------------------------------------- #}
{%- macro type_time_with_timezone() -%}
{{ adapter.dispatch('type_time_with_timezone')() }}
{%- endmacro -%}
{% macro default__type_time_with_timezone() %}
time with time zone
{% endmacro %}
{%- macro mysql__type_time_with_timezone() -%}
char(1024)
{%- endmacro -%}
{%- macro sqlserver__type_time_with_timezone() -%}
NVARCHAR(max)
{%- endmacro -%}
{% macro bigquery__type_time_with_timezone() %}
STRING
{% endmacro %}
{% macro oracle__type_time_with_timezone() %}
varchar2(4000)
{% endmacro %}
{% macro snowflake__type_time_with_timezone() %}
varchar
{% endmacro %}
{% macro redshift__type_time_with_timezone() %}
TIMETZ
{% endmacro %}
{% macro clickhouse__type_time_with_timezone() %}
String
{% endmacro %}
{%- macro tidb__type_time_with_timezone() -%}
char(1000)
{%- endmacro -%}
{%- macro duckdb__type_time_with_timezone() -%}
TIMESTAMPTZ
{%- endmacro -%}
{# date ------------------------------------------------- #}
{%- macro type_date() -%}
{{ adapter.dispatch('type_date')() }}
{%- endmacro -%}
{% macro default__type_date() %}
date
{% endmacro %}
{% macro oracle__type_date() %}
varchar2(4000)
{% endmacro %}
{%- macro sqlserver__type_date() -%}
date
{%- endmacro -%}
{% macro clickhouse__type_date() %}
Date32
{% endmacro %}

View File

@@ -0,0 +1,7 @@
{% macro mysql__except() %}
{% do exceptions.warn("MySQL does not support EXCEPT operator") %}
{% endmacro %}
{% macro oracle__except() %}
minus
{% endmacro %}

View File

@@ -0,0 +1,5 @@
{# macro converting the hash value to varchar #}
{% macro sqlserver__hash(field) -%}
convert(varchar(32), HashBytes('md5', coalesce(cast({{field}} as {{dbt_utils.type_string()}}), '')), 2)
{%- endmacro %}

View File

@@ -0,0 +1,317 @@
{#
Adapter Macros for the following functions:
- Bigquery: JSON_EXTRACT(json_string_expr, json_path_format) -> https://cloud.google.com/bigquery/docs/reference/standard-sql/json_functions
- Snowflake: JSON_EXTRACT_PATH_TEXT( <column_identifier> , '<path_name>' ) -> https://docs.snowflake.com/en/sql-reference/functions/json_extract_path_text.html
- Redshift: json_extract_path_text('json_string', 'path_elem' [,'path_elem'[, ...] ] [, null_if_invalid ] ) -> https://docs.aws.amazon.com/redshift/latest/dg/JSON_EXTRACT_PATH_TEXT.html
- Postgres: json_extract_path_text(<from_json>, 'path' [, 'path' [, ...]]) -> https://www.postgresql.org/docs/12/functions-json.html
- MySQL: JSON_EXTRACT(json_doc, 'path' [, 'path'] ...) -> https://dev.mysql.com/doc/refman/8.0/en/json-search-functions.html
- ClickHouse: JSONExtractString(json_doc, 'path' [, 'path'] ...) -> https://clickhouse.com/docs/en/sql-reference/functions/json-functions/
- TiDB: JSON_EXTRACT(json_doc, 'path' [, 'path'] ...) -> https://docs.pingcap.com/tidb/stable/json-functions
- DuckDB: json_extract(json, 'path') note: If path is a LIST, the result will be a LIST of JSON -> https://duckdb.org/docs/extensions/json
#}
{# format_json_path -------------------------------------------------- #}
{% macro format_json_path(json_path_list) -%}
{{ adapter.dispatch('format_json_path')(json_path_list) }}
{%- endmacro %}
{% macro default__format_json_path(json_path_list) -%}
{{ '.' ~ json_path_list|join('.') }}
{%- endmacro %}
{% macro oracle__format_json_path(json_path_list) -%}
{{ '\'$."' ~ json_path_list|join('."') ~ '"\'' }}
{%- endmacro %}
{#
BigQuery has different JSONPath syntax depending on which function you call.
Most of our macros use the "legacy" JSON functions, so this function uses
the legacy syntax.
These paths look like: "$['foo']['bar']"
#}
{% macro bigquery__format_json_path(json_path_list) -%}
{%- set str_list = [] -%}
{%- for json_path in json_path_list -%}
{%- if str_list.append(json_path.replace('"', '\\"')) -%} {%- endif -%}
{%- endfor -%}
{{ '"$[\'' ~ str_list|join('\'][\'') ~ '\']"' }}
{%- endmacro %}
{#
For macros which use the newer JSON functions, define a new_format_json_path
macro which generates the correct path syntax.
These paths look like: '$."foo"."bar"'
#}
{% macro bigquery_new_format_json_path(json_path_list) -%}
{%- set str_list = [] -%}
{%- for json_path in json_path_list -%}
{%- if str_list.append(json_path.replace('\'', '\\\'')) -%} {%- endif -%}
{%- endfor -%}
{{ '\'$."' ~ str_list|join('"."') ~ '"\'' }}
{%- endmacro %}
{% macro postgres__format_json_path(json_path_list) -%}
{%- set str_list = [] -%}
{%- for json_path in json_path_list -%}
{%- if str_list.append(json_path.replace("'", "''")) -%} {%- endif -%}
{%- endfor -%}
{{ "'" ~ str_list|join("','") ~ "'" }}
{%- endmacro %}
{% macro mysql__format_json_path(json_path_list) -%}
{# -- '$."x"."y"."z"' #}
{{ "'$.\"" ~ json_path_list|join(".") ~ "\"'" }}
{%- endmacro %}
{% macro redshift__format_json_path(json_path_list) -%}
{%- set quote = '"' -%}
{%- set str_list = [] -%}
{%- for json_path in json_path_list -%}
{%- if str_list.append(json_path.replace(quote, quote + quote)) -%} {%- endif -%}
{%- endfor -%}
{{ quote ~ str_list|join(quote + "," + quote) ~ quote }}
{%- endmacro %}
{% macro snowflake__format_json_path(json_path_list) -%}
{%- set str_list = [] -%}
{%- for json_path in json_path_list -%}
{%- if str_list.append(json_path.replace("'", "''").replace('"', '""')) -%} {%- endif -%}
{%- endfor -%}
{{ "'\"" ~ str_list|join('"."') ~ "\"'" }}
{%- endmacro %}
{% macro sqlserver__format_json_path(json_path_list) -%}
{# -- '$."x"."y"."z"' #}
{%- set str_list = [] -%}
{%- for json_path in json_path_list -%}
{%- if str_list.append(json_path.replace("'", "''").replace('"', '\\"')) -%} {%- endif -%}
{%- endfor -%}
{{ "'$.\"" ~ str_list|join(".") ~ "\"'" }}
{%- endmacro %}
{% macro clickhouse__format_json_path(json_path_list) -%}
{%- set str_list = [] -%}
{%- for json_path in json_path_list -%}
{%- if str_list.append(json_path.replace("'", "''").replace('"', '\\"')) -%} {%- endif -%}
{%- endfor -%}
{{ "'" ~ str_list|join("','") ~ "'" }}
{%- endmacro %}
{% macro tidb__format_json_path(json_path_list) -%}
{# -- '$."x"."y"."z"' #}
{{ "'$.\"" ~ json_path_list|join(".") ~ "\"'" }}
{%- endmacro %}
{% macro duckdb__format_json_path(json_path_list) -%}
{# -- '$."x"."y"."z"' #}
{{ "'$.\"" ~ json_path_list|join(".") ~ "\"'" }}
{%- endmacro %}
{# json_extract ------------------------------------------------- #}
{% macro json_extract(from_table, json_column, json_path_list, normalized_json_path) -%}
{{ adapter.dispatch('json_extract')(from_table, json_column, json_path_list, normalized_json_path) }}
{%- endmacro %}
{% macro default__json_extract(from_table, json_column, json_path_list, normalized_json_path) -%}
json_extract({{ from_table}}.{{ json_column }}, {{ format_json_path(json_path_list) }})
{%- endmacro %}
{% macro oracle__json_extract(from_table, json_column, json_path_list, normalized_json_path) -%}
json_value({{ json_column }}, {{ format_json_path(normalized_json_path) }})
{%- endmacro %}
{% macro bigquery__json_extract(from_table, json_column, json_path_list, normalized_json_path) -%}
{%- if from_table|string() == '' %}
json_extract({{ json_column }}, {{ format_json_path(normalized_json_path) }})
{% else %}
json_extract({{ from_table}}.{{ json_column }}, {{ format_json_path(normalized_json_path) }})
{% endif -%}
{%- endmacro %}
{% macro postgres__json_extract(from_table, json_column, json_path_list, normalized_json_path) -%}
{%- if from_table|string() == '' %}
jsonb_extract_path({{ json_column }}, {{ format_json_path(json_path_list) }})
{% else %}
jsonb_extract_path({{ from_table }}.{{ json_column }}, {{ format_json_path(json_path_list) }})
{% endif -%}
{%- endmacro %}
{% macro mysql__json_extract(from_table, json_column, json_path_list, normalized_json_path) -%}
{%- if from_table|string() == '' %}
json_extract({{ json_column }}, {{ format_json_path(normalized_json_path) }})
{% else %}
json_extract({{ from_table }}.{{ json_column }}, {{ format_json_path(normalized_json_path) }})
{% endif -%}
{%- endmacro %}
{% macro redshift__json_extract(from_table, json_column, json_path_list, normalized_json_path) -%}
{%- if from_table|string() != '' -%}
{%- set json_column = from_table|string() + "." + json_column|string() -%}
{%- endif -%}
case when {{ json_column }}.{{ format_json_path(json_path_list) }} != '' then {{ json_column }}.{{ format_json_path(json_path_list) }} end
{%- endmacro %}
{% macro snowflake__json_extract(from_table, json_column, json_path_list, normalized_json_path) -%}
{%- if from_table|string() == '' %}
get_path(parse_json({{ json_column }}), {{ format_json_path(json_path_list) }})
{% else %}
get_path(parse_json({{ from_table }}.{{ json_column }}), {{ format_json_path(json_path_list) }})
{% endif -%}
{%- endmacro %}
{% macro sqlserver__json_extract(from_table, json_column, json_path_list, normalized_json_path) -%}
json_query({{ json_column }}, {{ format_json_path(json_path_list) }})
{%- endmacro %}
{% macro clickhouse__json_extract(from_table, json_column, json_path_list, normalized_json_path) -%}
{%- if from_table|string() == '' %}
JSONExtractRaw(assumeNotNull({{ json_column }}), {{ format_json_path(json_path_list) }})
{% else %}
JSONExtractRaw(assumeNotNull({{ from_table }}.{{ json_column }}), {{ format_json_path(json_path_list) }})
{% endif -%}
{%- endmacro %}
{% macro tidb__json_extract(from_table, json_column, json_path_list, normalized_json_path) -%}
{%- if from_table|string() == '' %}
json_extract({{ json_column }}, {{ format_json_path(normalized_json_path) }})
{% else %}
json_extract({{ from_table }}.{{ json_column }}, {{ format_json_path(normalized_json_path) }})
{% endif -%}
{%- endmacro %}
{% macro duckdb__json_extract(from_table, json_column, json_path_list, normalized_json_path) -%}
{%- if from_table|string() == '' %}
json_extract({{ json_column }}, {{ format_json_path(normalized_json_path) }})
{% else %}
json_extract({{ from_table }}.{{ json_column }}, {{ format_json_path(normalized_json_path) }})
{% endif -%}
{%- endmacro %}
{# json_extract_scalar ------------------------------------------------- #}
{% macro json_extract_scalar(json_column, json_path_list, normalized_json_path) -%}
{{ adapter.dispatch('json_extract_scalar')(json_column, json_path_list, normalized_json_path) }}
{%- endmacro %}
{% macro default__json_extract_scalar(json_column, json_path_list, normalized_json_path) -%}
json_extract_scalar({{ json_column }}, {{ format_json_path(json_path_list) }})
{%- endmacro %}
{% macro oracle__json_extract_scalar(json_column, json_path_list, normalized_json_path) -%}
json_value({{ json_column }}, {{ format_json_path(normalized_json_path) }})
{%- endmacro %}
{% macro bigquery__json_extract_scalar(json_column, json_path_list, normalized_json_path) -%}
json_extract_scalar({{ json_column }}, {{ format_json_path(normalized_json_path) }})
{%- endmacro %}
{% macro postgres__json_extract_scalar(json_column, json_path_list, normalized_json_path) -%}
jsonb_extract_path_text({{ json_column }}, {{ format_json_path(json_path_list) }})
{%- endmacro %}
{% macro mysql__json_extract_scalar(json_column, json_path_list, normalized_json_path) -%}
json_value({{ json_column }}, {{ format_json_path(normalized_json_path) }} RETURNING CHAR)
{%- endmacro %}
{% macro redshift__json_extract_scalar(json_column, json_path_list, normalized_json_path) -%}
case when {{ json_column }}.{{ format_json_path(json_path_list) }} != '' then {{ json_column }}.{{ format_json_path(json_path_list) }} end
{%- endmacro %}
{% macro snowflake__json_extract_scalar(json_column, json_path_list, normalized_json_path) -%}
to_varchar(get_path(parse_json({{ json_column }}), {{ format_json_path(json_path_list) }}))
{%- endmacro %}
{% macro sqlserver__json_extract_scalar(json_column, json_path_list, normalized_json_path) -%}
json_value({{ json_column }}, {{ format_json_path(json_path_list) }})
{%- endmacro %}
{% macro clickhouse__json_extract_scalar(json_column, json_path_list, normalized_json_path) -%}
JSONExtractRaw(assumeNotNull({{ json_column }}), {{ format_json_path(json_path_list) }})
{%- endmacro %}
{% macro tidb__json_extract_scalar(json_column, json_path_list, normalized_json_path) -%}
IF(
JSON_UNQUOTE(JSON_EXTRACT({{ json_column }}, {{ format_json_path(normalized_json_path) }})) = 'null',
NULL,
JSON_UNQUOTE(JSON_EXTRACT({{ json_column }}, {{ format_json_path(normalized_json_path) }}))
)
{%- endmacro %}
{% macro duckdb__json_extract_scalar(json_column, json_path_list, normalized_json_path) -%}
json_extract_string({{ json_column }}, {{ format_json_path(json_path_list) }})
{%- endmacro %}
{# json_extract_array ------------------------------------------------- #}
{% macro json_extract_array(json_column, json_path_list, normalized_json_path) -%}
{{ adapter.dispatch('json_extract_array')(json_column, json_path_list, normalized_json_path) }}
{%- endmacro %}
{% macro default__json_extract_array(json_column, json_path_list, normalized_json_path) -%}
json_extract_array({{ json_column }}, {{ format_json_path(json_path_list) }})
{%- endmacro %}
{% macro oracle__json_extract_array(json_column, json_path_list, normalized_json_path) -%}
json_value({{ json_column }}, {{ format_json_path(normalized_json_path) }})
{%- endmacro %}
{% macro bigquery__json_extract_array(json_column, json_path_list, normalized_json_path) -%}
json_extract_array({{ json_column }}, {{ format_json_path(normalized_json_path) }})
{%- endmacro %}
{% macro postgres__json_extract_array(json_column, json_path_list, normalized_json_path) -%}
jsonb_extract_path({{ json_column }}, {{ format_json_path(json_path_list) }})
{%- endmacro %}
{% macro mysql__json_extract_array(json_column, json_path_list, normalized_json_path) -%}
json_extract({{ json_column }}, {{ format_json_path(normalized_json_path) }})
{%- endmacro %}
{% macro redshift__json_extract_array(json_column, json_path_list, normalized_json_path) -%}
{{ json_column }}.{{ format_json_path(json_path_list) }}
{%- endmacro %}
{% macro snowflake__json_extract_array(json_column, json_path_list, normalized_json_path) -%}
get_path(parse_json({{ json_column }}), {{ format_json_path(json_path_list) }})
{%- endmacro %}
{% macro sqlserver__json_extract_array(json_column, json_path_list, normalized_json_path) -%}
json_query({{ json_column }}, {{ format_json_path(json_path_list) }})
{%- endmacro %}
{% macro clickhouse__json_extract_array(json_column, json_path_list, normalized_json_path) -%}
JSONExtractArrayRaw(assumeNotNull({{ json_column }}), {{ format_json_path(json_path_list) }})
{%- endmacro %}
{% macro tidb__json_extract_array(json_column, json_path_list, normalized_json_path) -%}
json_extract({{ json_column }}, {{ format_json_path(normalized_json_path) }})
{%- endmacro %}
{% macro duckdb__json_extract_array(json_column, json_path_list, normalized_json_path) -%}
json_extract({{ json_column }}, {{ format_json_path(normalized_json_path) }})
{%- endmacro %}
{# json_extract_string_array ------------------------------------------------- #}
{% macro json_extract_string_array(json_column, json_path_list, normalized_json_path) -%}
{{ adapter.dispatch('json_extract_string_array')(json_column, json_path_list, normalized_json_path) }}
{%- endmacro %}
{% macro default__json_extract_string_array(json_column, json_path_list, normalized_json_path) -%}
{{ json_extract_array(json_column, json_path_list, normalized_json_path) }}
{%- endmacro %}
{#
See https://cloud.google.com/bigquery/docs/reference/standard-sql/json_functions#json_extract_string_array
BigQuery does not allow NULL entries in REPEATED fields, so we replace those with literal "NULL" strings.
#}
{% macro bigquery__json_extract_string_array(json_column, json_path_list, normalized_json_path) -%}
array(
select ifnull(x, "NULL")
from unnest(json_value_array({{ json_column }}, {{ bigquery_new_format_json_path(normalized_json_path) }})) as x
)
{%- endmacro %}
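{#
  Illustrative sketch only (column and path names are hypothetical): for a column `_airbyte_data`
  and path ["tags"], assuming bigquery_new_format_json_path renders '$."tags"', this macro would
  emit SQL roughly like:
    array(
      select ifnull(x, "NULL")
      from unnest(json_value_array(_airbyte_data, '$."tags"')) as x
    )
  so NULL array entries survive as the literal string "NULL" inside the REPEATED field.
#}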

View File

@@ -0,0 +1,16 @@
{# quote ---------------------------------- #}
{% macro quote(column_name) -%}
{{ adapter.dispatch('quote')(column_name) }}
{%- endmacro %}
{% macro default__quote(column_name) -%}
{{ adapter.quote(column_name) }}
{%- endmacro %}
{% macro oracle__quote(column_name) -%}
{{ '\"' ~ column_name ~ '\"'}}
{%- endmacro %}
{% macro clickhouse__quote(column_name) -%}
{{ '\"' ~ column_name ~ '\"'}}
{%- endmacro %}

View File

@@ -0,0 +1,25 @@
{# surrogate_key ---------------------------------- #}
{% macro oracle__surrogate_key(field_list) -%}
ora_hash(
{%- for field in field_list %}
{% if not loop.last %}
{{ field }} || '~' ||
{% else %}
{{ field }}
{% endif %}
{%- endfor %}
)
{%- endmacro %}
{% macro clickhouse__surrogate_key(field_list) -%}
assumeNotNull(hex(MD5(
{%- for field in field_list %}
{% if not loop.last %}
toString({{ field }}) || '~' ||
{% else %}
toString({{ field }})
{% endif %}
{%- endfor %}
)))
{%- endmacro %}

View File

@@ -0,0 +1,105 @@
{# boolean_to_string ------------------------------------------------- #}
{% macro boolean_to_string(boolean_column) -%}
{{ adapter.dispatch('boolean_to_string')(boolean_column) }}
{%- endmacro %}
{% macro default__boolean_to_string(boolean_column) -%}
{{ boolean_column }}
{%- endmacro %}
{% macro redshift__boolean_to_string(boolean_column) -%}
case when {{ boolean_column }} then 'true' else 'false' end
{%- endmacro %}
{# array_to_string ------------------------------------------------- #}
{% macro array_to_string(array_column) -%}
{{ adapter.dispatch('array_to_string')(array_column) }}
{%- endmacro %}
{% macro default__array_to_string(array_column) -%}
{{ array_column }}
{%- endmacro %}
{% macro bigquery__array_to_string(array_column) -%}
array_to_string({{ array_column }}, "|", "")
{%- endmacro %}
{% macro oracle__array_to_string(array_column) -%}
cast({{ array_column }} as varchar2(4000))
{%- endmacro %}
{% macro sqlserver__array_to_string(array_column) -%}
cast({{ array_column }} as {{dbt_utils.type_string()}})
{%- endmacro %}
{% macro redshift__array_to_string(array_column) -%}
json_serialize({{array_column}})
{%- endmacro %}
{# object_to_string ------------------------------------------------- #}
{% macro object_to_string(object_column) -%}
{{ adapter.dispatch('object_to_string')(object_column) }}
{%- endmacro %}
{% macro default__object_to_string(object_column) -%}
{{ object_column }}
{%- endmacro %}
{% macro redshift__object_to_string(object_column) -%}
json_serialize({{object_column}})
{%- endmacro %}
{# cast_to_boolean ------------------------------------------------- #}
{% macro cast_to_boolean(field) -%}
{{ adapter.dispatch('cast_to_boolean')(field) }}
{%- endmacro %}
{% macro default__cast_to_boolean(field) -%}
cast({{ field }} as boolean)
{%- endmacro %}
{# -- MySQL's cast function does not support converting a string directly to boolean (an alias of tinyint(1)), see https://dev.mysql.com/doc/refman/8.0/en/cast-functions.html#function_cast #}
{% macro mysql__cast_to_boolean(field) -%}
IF(lower({{ field }}) = 'true', true, false)
{%- endmacro %}
{# TiDB does not support casting a string to boolean #}
{% macro tidb__cast_to_boolean(field) -%}
IF(lower({{ field }}) = 'true', true, false)
{%- endmacro %}
{% macro duckdb__cast_to_boolean(field) -%}
cast({{ field }} as boolean)
{%- endmacro %}
{% macro redshift__cast_to_boolean(field) -%}
cast({{ field }} as boolean)
{%- endmacro %}
{# -- MS SQL Server does not support converting a string directly to boolean; it must be cast as bit #}
{% macro sqlserver__cast_to_boolean(field) -%}
cast({{ field }} as bit)
{%- endmacro %}
{# -- ClickHouse does not support converting string directly to Int8, it must go through int first #}
{% macro clickhouse__cast_to_boolean(field) -%}
IF(lower({{ field }}) = 'true', 1, 0)
{%- endmacro %}
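{#
  Illustrative sketch only (the column name is hypothetical): cast_to_boolean(is_active) renders
    cast(is_active as boolean)                    -- default / Redshift / DuckDB
    IF(lower(is_active) = 'true', true, false)    -- MySQL / TiDB
    cast(is_active as bit)                        -- MS SQL Server
    IF(lower(is_active) = 'true', 1, 0)           -- ClickHouse
  depending on the active adapter.
#}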
{# empty_string_to_null ------------------------------------------------- #}
{% macro empty_string_to_null(field) -%}
{{ return(adapter.dispatch('empty_string_to_null')(field)) }}
{%- endmacro %}
{%- macro default__empty_string_to_null(field) -%}
nullif({{ field }}, '')
{%- endmacro %}
{%- macro duckdb__empty_string_to_null(field) -%}
nullif(nullif({{ field }}, 'null'), '')
{%- endmacro %}
{%- macro redshift__empty_string_to_null(field) -%}
nullif({{ field }}::varchar, '')
{%- endmacro %}

View File

@@ -0,0 +1,4 @@
-- see https://docs.getdbt.com/docs/building-a-dbt-project/building-models/using-custom-schemas/#an-alternative-pattern-for-generating-schema-names
{% macro generate_schema_name(custom_schema_name, node) -%}
{{ generate_schema_name_for_env(custom_schema_name, node) }}
{%- endmacro %}

View File

@@ -0,0 +1,61 @@
{#
These macros control how incremental models are updated in Airbyte's normalization step
- get_max_normalized_cursor retrieve the value of the last normalized data
- incremental_clause controls the predicate to filter on new data to process incrementally
#}
{% macro incremental_clause(col_emitted_at, tablename) -%}
{{ adapter.dispatch('incremental_clause')(col_emitted_at, tablename) }}
{%- endmacro %}
{%- macro default__incremental_clause(col_emitted_at, tablename) -%}
{% if is_incremental() %}
and coalesce(
cast({{ col_emitted_at }} as {{ type_timestamp_with_timezone() }}) > (select max(cast({{ col_emitted_at }} as {{ type_timestamp_with_timezone() }})) from {{ tablename }}),
{# -- if {{ col_emitted_at }} is NULL in either table, the previous comparison would evaluate to NULL, #}
{# -- so we coalesce and make sure the row is always returned for incremental processing instead #}
true)
{% endif %}
{%- endmacro -%}
{# -- see https://on-systems.tech/113-beware-dbt-incremental-updates-against-snowflake-external-tables/ #}
{%- macro snowflake__incremental_clause(col_emitted_at, tablename) -%}
{% if is_incremental() %}
{% if get_max_normalized_cursor(col_emitted_at, tablename) %}
and cast({{ col_emitted_at }} as {{ type_timestamp_with_timezone() }}) >
cast('{{ get_max_normalized_cursor(col_emitted_at, tablename) }}' as {{ type_timestamp_with_timezone() }})
{% endif %}
{% endif %}
{%- endmacro -%}
{# -- see https://cloud.google.com/bigquery/docs/querying-partitioned-tables#best_practices_for_partition_pruning #}
{%- macro bigquery__incremental_clause(col_emitted_at, tablename) -%}
{% if is_incremental() %}
{% if get_max_normalized_cursor(col_emitted_at, tablename) %}
and cast({{ col_emitted_at }} as {{ type_timestamp_with_timezone() }}) >
cast('{{ get_max_normalized_cursor(col_emitted_at, tablename) }}' as {{ type_timestamp_with_timezone() }})
{% endif %}
{% endif %}
{%- endmacro -%}
{%- macro sqlserver__incremental_clause(col_emitted_at, tablename) -%}
{% if is_incremental() %}
and ((select max(cast({{ col_emitted_at }} as {{ type_timestamp_with_timezone() }})) from {{ tablename }}) is null
or cast({{ col_emitted_at }} as {{ type_timestamp_with_timezone() }}) >
(select max(cast({{ col_emitted_at }} as {{ type_timestamp_with_timezone() }})) from {{ tablename }}))
{% endif %}
{%- endmacro -%}
{% macro get_max_normalized_cursor(col_emitted_at, tablename) %}
{% if execute and is_incremental() %}
{% if env_var('INCREMENTAL_CURSOR', 'UNSET') == 'UNSET' %}
{% set query %}
select max(cast({{ col_emitted_at }} as {{ type_timestamp_with_timezone() }})) from {{ tablename }}
{% endset %}
{% set max_cursor = run_query(query).columns[0][0] %}
{% do return(max_cursor) %}
{% else %}
{% do return(env_var('INCREMENTAL_CURSOR')) %}
{% endif %}
{% endif %}
{% endmacro %}
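{#
  Illustrative sketch only (the cursor value is hypothetical): exporting
    INCREMENTAL_CURSOR='2023-01-01 00:00:00+00'
  makes get_max_normalized_cursor return that value instead of querying max(_airbyte_emitted_at),
  and the Snowflake/BigQuery incremental_clause variants then inline it as a constant predicate,
  roughly:
    and cast(_airbyte_emitted_at as <timestamp with time zone type>) >
        cast('2023-01-01 00:00:00+00' as <timestamp with time zone type>)
  which helps partition pruning on those warehouses.
#}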

View File

@@ -0,0 +1,34 @@
{% macro oracle__test_equal_rowcount(model, compare_model) %}
{#-- Needs to be set at parse time, before we return '' below --#}
{{ config(fail_calc = 'coalesce(diff_count, 0)') }}
{#-- Prevent querying of db in parsing mode. This works because this macro does not create any new refs. #}
{%- if not execute -%}
{{ return('') }}
{% endif %}
with a as (
select count(*) as count_a from {{ model }}
),
b as (
select count(*) as count_b from {{ compare_model }}
),
final as (
select
count_a,
count_b,
abs(count_a - count_b) as diff_count
from a
cross join b
)
select diff_count from final
{% endmacro %}

View File

@@ -0,0 +1,107 @@
{#
-- Adapted from https://github.com/dbt-labs/dbt-utils/blob/0-19-0-updates/macros/schema_tests/equality.sql
-- dbt-utils version: 0.6.4
-- This macro needs to be updated accordingly when dbt-utils is upgraded.
-- This is needed because MySQL does not support the EXCEPT operator!
#}
{% macro mysql__test_equality(model, compare_model, compare_columns=None) %}
{%- if not execute -%}
{{ return('') }}
{% endif %}
{%- do dbt_utils._is_relation(model, 'test_equality') -%}
{%- if not compare_columns -%}
{%- do dbt_utils._is_ephemeral(model, 'test_equality') -%}
{%- set compare_columns = adapter.get_columns_in_relation(model) | map(attribute='quoted') -%}
{%- endif -%}
{% set compare_cols_csv = compare_columns | join(', ') %}
with a as (
select * from {{ model }}
),
b as (
select * from {{ compare_model }}
),
a_minus_b as (
select {{ compare_cols_csv }} from a
where ({{ compare_cols_csv }}) not in
(select {{ compare_cols_csv }} from b)
),
b_minus_a as (
select {{ compare_cols_csv }} from b
where ({{ compare_cols_csv }}) not in
(select {{ compare_cols_csv }} from a)
),
unioned as (
select * from a_minus_b
union all
select * from b_minus_a
),
final as (
select (select count(*) from unioned) +
(select abs(
(select count(*) from a_minus_b) -
(select count(*) from b_minus_a)
))
as count
)
select count from final
{% endmacro %}
{% macro oracle__test_equality(model) %}
{#-- Prevent querying of db in parsing mode. This works because this macro does not create any new refs. #}
{%- if not execute -%}
{{ return('') }}
{% endif %}
-- setup
{%- do dbt_utils._is_relation(model, 'test_equality') -%}
{#-
If the compare_cols arg is provided, we can run this test without querying the
information schema; this allows the model to be an ephemeral model
-#}
{%- set compare_columns = kwargs.get('compare_columns', None) -%}
{%- if not compare_columns -%}
{%- do dbt_utils._is_ephemeral(model, 'test_equality') -%}
{%- set compare_columns = adapter.get_columns_in_relation(model) | map(attribute='quoted') -%}
{%- endif -%}
{% set compare_model = kwargs.get('compare_model', kwargs.get('arg')) %}
{% set compare_cols_csv = compare_columns | join(', ') %}
with a as (
select * from {{ model }}
),
b as (
select * from {{ compare_model }}
),
a_minus_b as (
select {{compare_cols_csv}} from a
{{ dbt_utils.except() }}
select {{compare_cols_csv}} from b
),
b_minus_a as (
select {{compare_cols_csv}} from b
{{ dbt_utils.except() }}
select {{compare_cols_csv}} from a
),
unioned as (
select * from a_minus_b
union all
select * from b_minus_a
)
select count(*) from unioned
{% endmacro %}

View File

@@ -0,0 +1,51 @@
{#
This overrides the behavior of the macro `should_full_refresh` so full refreshes are triggered if:
- the dbt CLI is run with the --full-refresh flag or the model is explicitly configured to full_refresh
- the column _airbyte_ab_id does not exist in the normalized tables, to make sure it is well populated.
#}
{%- macro need_full_refresh(col_ab_id, target_table=this) -%}
{%- if not execute -%}
{{ return(false) }}
{%- endif -%}
{%- set found_column = [] %}
{%- set cols = adapter.get_columns_in_relation(target_table) -%}
{%- for col in cols -%}
{%- if col.column == col_ab_id -%}
{% do found_column.append(col.column) %}
{%- endif -%}
{%- endfor -%}
{%- if found_column -%}
{{ return(false) }}
{%- else -%}
{{ dbt_utils.log_info(target_table ~ "." ~ col_ab_id ~ " does not exist yet. The table will be created or rebuilt with dbt.full_refresh") }}
{{ return(true) }}
{%- endif -%}
{%- endmacro -%}
{%- macro should_full_refresh() -%}
{% set config_full_refresh = config.get('full_refresh') %}
{%- if config_full_refresh is none -%}
{% set config_full_refresh = flags.FULL_REFRESH %}
{%- endif -%}
{%- if not config_full_refresh -%}
{% set config_full_refresh = need_full_refresh(get_col_ab_id(), this) %}
{%- endif -%}
{% do return(config_full_refresh) %}
{%- endmacro -%}
{%- macro get_col_ab_id() -%}
{{ adapter.dispatch('get_col_ab_id')() }}
{%- endmacro -%}
{%- macro default__get_col_ab_id() -%}
_airbyte_ab_id
{%- endmacro -%}
{%- macro oracle__get_col_ab_id() -%}
"_AIRBYTE_AB_ID"
{%- endmacro -%}
{%- macro snowflake__get_col_ab_id() -%}
_AIRBYTE_AB_ID
{%- endmacro -%}
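{#
  Illustrative note only (the model name below is a placeholder): dbt's incremental materialization
  consults should_full_refresh(), so with this override a normalized table that is missing the
  _airbyte_ab_id column gets rebuilt as if it had been run with:
    dbt run --full-refresh --select <model_name>
  even when no --full-refresh flag was passed to the CLI.
#}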

View File

@@ -0,0 +1,46 @@
{#
Similar to the star macro here: https://github.com/dbt-labs/dbt-utils/blob/main/macros/sql/star.sql
This star_intersect macro takes an additional 'intersect' relation as argument.
Its behavior is to select columns from both 'intersect' and 'from' relations with the following rules:
- if a column exists in both the 'from' and 'intersect' relations, the column from 'intersect' is used
- if a column exists only in the 'from' relation, the column from 'from' is used
#}
{% macro star_intersect(from, intersect, from_alias=False, intersect_alias=False, except=[]) -%}
{%- do dbt_utils._is_relation(from, 'star_intersect') -%}
{%- do dbt_utils._is_ephemeral(from, 'star_intersect') -%}
{%- do dbt_utils._is_relation(intersect, 'star_intersect') -%}
{%- do dbt_utils._is_ephemeral(intersect, 'star_intersect') -%}
{#-- Prevent querying of db in parsing mode. This works because this macro does not create any new refs. #}
{%- if not execute -%}
{{ return('') }}
{% endif %}
{%- set include_cols = [] %}
{%- set cols = adapter.get_columns_in_relation(from) -%}
{%- set except = except | map("lower") | list %}
{%- for col in cols -%}
{%- if col.column|lower not in except -%}
{% do include_cols.append(col.column) %}
{%- endif %}
{%- endfor %}
{%- set include_intersect_cols = [] %}
{%- set intersect_cols = adapter.get_columns_in_relation(intersect) -%}
{%- for col in intersect_cols -%}
{%- if col.column|lower not in except -%}
{% do include_intersect_cols.append(col.column) %}
{%- endif %}
{%- endfor %}
{%- for col in include_cols %}
{%- if col in include_intersect_cols -%}
{%- if intersect_alias %}{{ intersect_alias }}.{% else %}{%- endif -%}{{ adapter.quote(col)|trim }}
{%- if not loop.last %},{{ '\n ' }}{% endif %}
{%- else %}
{%- if from_alias %}{{ from_alias }}.{% else %}{{ from }}.{%- endif -%}{{ adapter.quote(col)|trim }} as {{ adapter.quote(col)|trim }}
{%- if not loop.last %},{{ '\n ' }}{% endif %}
{%- endif %}
{%- endfor -%}
{%- endmacro %}
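{#
  Illustrative sketch only (relation and column names are hypothetical): with from=stg_users
  (columns id, name, updated_at), intersect=users_scd (columns id, name), from_alias='stg'
  and intersect_alias='scd', the macro renders roughly:
    scd."id",
    scd."name",
    stg."updated_at" as "updated_at"
  i.e. overlapping columns are read from the 'intersect' relation and the rest from 'from'.
#}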

View File

@@ -0,0 +1,5 @@
# add dependencies. these will get pulled during the `dbt deps` process.
packages:
- git: "https://github.com/fishtown-analytics/dbt-utils.git"
revision: 0.8.2

View File

@@ -0,0 +1,3 @@
# This Dockerfile only exists to pull and re-export this image, converted to the local arch of this machine.
# It is then consumed by the Dockerfile in this directory as "fishtownanalytics/dbt:1.0.0-dev"
FROM fishtownanalytics/dbt:1.0.0

View File

@@ -0,0 +1,66 @@
version: "3.7"
services:
normalization:
image: airbyte/normalization:${VERSION}
build:
dockerfile: Dockerfile
context: .
labels:
io.airbyte.git-revision: ${GIT_REVISION}
normalization-mssql:
image: airbyte/normalization-mssql:${VERSION}
build:
dockerfile: mssql.Dockerfile
context: .
labels:
io.airbyte.git-revision: ${GIT_REVISION}
normalization-mysql:
image: airbyte/normalization-mysql:${VERSION}
build:
dockerfile: mysql.Dockerfile
context: .
labels:
io.airbyte.git-revision: ${GIT_REVISION}
normalization-oracle:
image: airbyte/normalization-oracle:${VERSION}
build:
dockerfile: oracle.Dockerfile
context: .
labels:
io.airbyte.git-revision: ${GIT_REVISION}
normalization-clickhouse:
image: airbyte/normalization-clickhouse:${VERSION}
build:
dockerfile: clickhouse.Dockerfile
context: .
labels:
io.airbyte.git-revision: ${GIT_REVISION}
normalization-snowflake:
image: airbyte/normalization-snowflake:${VERSION}
build:
dockerfile: snowflake.Dockerfile
context: .
labels:
io.airbyte.git-revision: ${GIT_REVISION}
normalization-redshift:
image: airbyte/normalization-redshift:${VERSION}
build:
dockerfile: redshift.Dockerfile
context: .
labels:
io.airbyte.git-revision: ${GIT_REVISION}
normalization-tidb:
image: airbyte/normalization-tidb:${VERSION}
build:
dockerfile: tidb.Dockerfile
context: .
labels:
io.airbyte.git-revision: ${GIT_REVISION}
normalization-duckdb:
image: airbyte/normalization-duckdb:${VERSION}
build:
dockerfile: duckdb.Dockerfile
context: .
labels:
io.airbyte.git-revision: ${GIT_REVISION}

View File

@@ -0,0 +1,22 @@
version: "3.7"
# this file only exists so that we can easily check that all of these images exist in docker hub in check_images_exist.sh
services:
normalization:
image: airbyte/normalization:${VERSION}
normalization-mssql:
image: airbyte/normalization-mssql:${VERSION}
normalization-mysql:
image: airbyte/normalization-mysql:${VERSION}
normalization-oracle:
image: airbyte/normalization-oracle:${VERSION}
normalization-clickhouse:
image: airbyte/normalization-clickhouse:${VERSION}
normalization-snowflake:
image: airbyte/normalization-snowflake:${VERSION}
normalization-redshift:
image: airbyte/normalization-redshift:${VERSION}
normalization-tidb:
image: airbyte/normalization-tidb:${VERSION}
normalization-duckdb:
image: airbyte/normalization-duckdb:${VERSION}

View File

@@ -0,0 +1,40 @@
FROM fishtownanalytics/dbt:1.0.0
COPY --from=airbyte/base-airbyte-protocol-python:0.1.1 /airbyte /airbyte
# Install SSH Tunneling dependencies
RUN apt-get update && apt-get install -y jq sshpass
WORKDIR /airbyte
COPY entrypoint.sh .
COPY build/sshtunneling.sh .
WORKDIR /airbyte/normalization_code
COPY normalization ./normalization
COPY setup.py .
COPY dbt-project-template/ ./dbt-template/
# Install python dependencies
WORKDIR /airbyte/base_python_structs
# workaround for https://github.com/yaml/pyyaml/issues/601
# this should be fixed in the airbyte/base-airbyte-protocol-python image
RUN pip install "Cython<3.0" "pyyaml==5.4" --no-build-isolation
RUN pip install .
WORKDIR /airbyte/normalization_code
RUN pip install .
RUN pip install dbt-duckdb==1.0.1
# adding duckdb manually (installing it via setup.py caused lots of errors)
RUN pip install duckdb
WORKDIR /airbyte/normalization_code/dbt-template/
# Download external dbt dependencies
RUN dbt deps
WORKDIR /airbyte
ENV AIRBYTE_ENTRYPOINT "/airbyte/entrypoint.sh"
ENTRYPOINT ["/airbyte/entrypoint.sh"]
LABEL io.airbyte.name=airbyte/normalization-duckdb

View File

@@ -0,0 +1,160 @@
#!/usr/bin/env bash
set -e # tells bash, in a script, to exit whenever anything returns a non-zero return value.
function echo2() {
echo >&2 "$@"
}
function error() {
echo2 "$@"
exit 1
}
function config_cleanup() {
# Remove config file as it might still contain sensitive credentials (for example,
# injected OAuth Parameters should not be visible to custom docker images running custom transformation operations)
rm -f "${CONFIG_FILE}"
}
function check_dbt_event_buffer_size() {
ret=0
dbt --help | grep -E -- '--event-buffer-size' && return
ret=1
}
PROJECT_DIR=$(pwd)
# How many commits should be downloaded from git to view history of a branch
GIT_HISTORY_DEPTH=5
# This function produces a working DBT project folder at the $PROJECT_DIR path so that dbt commands can be run
# from it successfully with the proper credentials. This can be accomplished by providing different custom variables
# to tweak the final project structure. For example, we can either use a user-provided base folder (git repo) or
# use the standard/base template folder to generate normalization models from.
function configuredbt() {
# We first need to generate a workspace folder for a dbt project to run from:
if [[ -z "${GIT_REPO}" ]]; then
# No git repository provided, use the dbt-template folder (shipped inside normalization docker image)
# as the base folder for dbt workspace
cp -r /airbyte/normalization_code/dbt-template/* "${PROJECT_DIR}"
echo "Running: transform-config --config ${CONFIG_FILE} --integration-type ${INTEGRATION_TYPE} --out ${PROJECT_DIR}"
set +e # allow script to continue running even if next commands fail to run properly
# Generate a profiles.yml file for the selected destination/integration type
transform-config --config "${CONFIG_FILE}" --integration-type "${INTEGRATION_TYPE}" --out "${PROJECT_DIR}"
if [[ -n "${CATALOG_FILE}" ]]; then
# If catalog file is provided, generate normalization models, otherwise skip it
echo "Running: transform-catalog --integration-type ${INTEGRATION_TYPE} --profile-config-dir ${PROJECT_DIR} --catalog ${CATALOG_FILE} --out ${PROJECT_DIR}/models/generated/ --json-column _airbyte_data"
transform-catalog --integration-type "${INTEGRATION_TYPE}" --profile-config-dir "${PROJECT_DIR}" --catalog "${CATALOG_FILE}" --out "${PROJECT_DIR}/models/generated/" --json-column "_airbyte_data"
TRANSFORM_EXIT_CODE=$?
if [ ${TRANSFORM_EXIT_CODE} -ne 0 ]; then
echo -e "\nShowing destination_catalog.json to diagnose/debug errors (${TRANSFORM_EXIT_CODE}):\n"
cat "${CATALOG_FILE}" | jq
exit ${TRANSFORM_EXIT_CODE}
fi
fi
set -e # tells bash, in a script, to exit whenever anything returns a non-zero return value.
else
trap config_cleanup EXIT
# Use git repository as a base workspace folder for dbt projects
if [[ -d git_repo ]]; then
rm -rf git_repo
fi
# Make a shallow clone of the latest git repository in the workspace folder
if [[ -z "${GIT_BRANCH}" ]]; then
# No git branch specified, use the default branch of the git repository
echo "Running: git clone --depth ${GIT_HISTORY_DEPTH} --single-branch \$GIT_REPO git_repo"
git clone --depth ${GIT_HISTORY_DEPTH} --single-branch "${GIT_REPO}" git_repo
else
# Check out the requested branch from the git repository
echo "Running: git clone --depth ${GIT_HISTORY_DEPTH} -b ${GIT_BRANCH} --single-branch \$GIT_REPO git_repo"
git clone --depth ${GIT_HISTORY_DEPTH} -b "${GIT_BRANCH}" --single-branch "${GIT_REPO}" git_repo
fi
# Print a few history log entries to make it easier for users to verify that the right code version has been checked out from git
echo "Last 5 commits in git_repo:"
(cd git_repo; git log --oneline -${GIT_HISTORY_DEPTH}; cd -)
# Generate a profiles.yml file for the selected destination/integration type
echo "Running: transform-config --config ${CONFIG_FILE} --integration-type ${INTEGRATION_TYPE} --out ${PROJECT_DIR}"
transform-config --config "${CONFIG_FILE}" --integration-type "${INTEGRATION_TYPE}" --out "${PROJECT_DIR}"
config_cleanup
fi
}
## todo: make it easy to select source or destination and validate based on selection by adding an integration type env variable.
function main() {
CMD="$1"
shift 1 || error "command not specified."
while [ $# -ne 0 ]; do
case "$1" in
--config)
CONFIG_FILE="$2"
shift 2
;;
--catalog)
CATALOG_FILE="$2"
shift 2
;;
--integration-type)
INTEGRATION_TYPE="$2"
shift 2
;;
--git-repo)
GIT_REPO="$2"
shift 2
;;
--git-branch)
GIT_BRANCH="$2"
shift 2
;;
*)
error "Unknown option: $1"
;;
esac
done
case "$CMD" in
run)
configuredbt
. /airbyte/sshtunneling.sh
openssh "${PROJECT_DIR}/ssh.json"
trap 'closessh' EXIT
set +e # allow script to continue running even if next commands fail to run properly
# We don't run dbt 1.0.x on all destinations (because their plugins don't support it yet)
# So we need to only pass `--event-buffer-size` if it's supported by DBT.
# Same goes for JSON formatted logging.
check_dbt_event_buffer_size
if [ "$ret" -eq 0 ]; then
echo -e "\nDBT >=1.0.0 detected; using 10K event buffer size\n"
dbt_additional_args="--event-buffer-size=10000 --log-format json"
else
dbt_additional_args=""
fi
# Run dbt to compile and execute the generated normalization models
dbt ${dbt_additional_args} run --profiles-dir "${PROJECT_DIR}" --project-dir "${PROJECT_DIR}"
DBT_EXIT_CODE=$?
if [ ${DBT_EXIT_CODE} -ne 0 ]; then
echo -e "\nDiagnosing dbt debug to check if destination is available for dbt and well configured (${DBT_EXIT_CODE}):\n"
dbt debug --profiles-dir "${PROJECT_DIR}" --project-dir "${PROJECT_DIR}"
DBT_DEBUG_EXIT_CODE=$?
if [ ${DBT_DEBUG_EXIT_CODE} -eq 0 ]; then
# dbt debug is successful, so the error must be somewhere else...
echo -e "\nForward dbt output logs to diagnose/debug errors (${DBT_DEBUG_EXIT_CODE}):\n"
cat "${PROJECT_DIR}/../logs/dbt.log"
fi
fi
closessh
exit ${DBT_EXIT_CODE}
;;
configure-dbt)
configuredbt
;;
*)
error "Unknown command: $CMD"
;;
esac
}
main "$@"

View File

@@ -0,0 +1,740 @@
#
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
#
import json
import os
import pathlib
import random
import re
import socket
import string
import subprocess
import sys
import threading
import time
from copy import copy
from typing import Any, Callable, Dict, List, Union
import yaml
from normalization.destination_type import DestinationType
from normalization.transform_catalog.transform import read_yaml_config, write_yaml_config
from normalization.transform_config.transform import TransformConfig
NORMALIZATION_TEST_TARGET = "NORMALIZATION_TEST_TARGET"
NORMALIZATION_TEST_MSSQL_DB_PORT = "NORMALIZATION_TEST_MSSQL_DB_PORT"
NORMALIZATION_TEST_MYSQL_DB_PORT = "NORMALIZATION_TEST_MYSQL_DB_PORT"
NORMALIZATION_TEST_POSTGRES_DB_PORT = "NORMALIZATION_TEST_POSTGRES_DB_PORT"
NORMALIZATION_TEST_CLICKHOUSE_DB_PORT = "NORMALIZATION_TEST_CLICKHOUSE_DB_PORT"
NORMALIZATION_TEST_TIDB_DB_PORT = "NORMALIZATION_TEST_TIDB_DB_PORT"
NORMALIZATION_TEST_DUCKDB_DESTINATION_PATH = "NORMALIZATION_TEST_DUCKDB_DESTINATION_PATH"
class DbtIntegrationTest(object):
def __init__(self):
self.target_schema = "test_normalization"
self.container_prefix = f"test_normalization_db_{self.random_string(3)}"
self.db_names = []
@staticmethod
def generate_random_string(prefix: str) -> str:
return prefix + DbtIntegrationTest.random_string(5)
@staticmethod
def random_string(length: int) -> str:
return "".join(random.choice(string.ascii_lowercase) for i in range(length))
def set_target_schema(self, target_schema: str):
self.target_schema = target_schema
def setup_db(self, destinations_to_test: List[str]):
if DestinationType.POSTGRES.value in destinations_to_test:
self.setup_postgres_db()
if DestinationType.MYSQL.value in destinations_to_test:
self.setup_mysql_db()
if DestinationType.MSSQL.value in destinations_to_test:
self.setup_mssql_db()
if DestinationType.CLICKHOUSE.value in destinations_to_test:
self.setup_clickhouse_db()
if DestinationType.TIDB.value in destinations_to_test:
self.setup_tidb_db()
def setup_postgres_db(self):
start_db = True
if os.getenv(NORMALIZATION_TEST_POSTGRES_DB_PORT):
port = int(os.getenv(NORMALIZATION_TEST_POSTGRES_DB_PORT))
start_db = False
else:
port = self.find_free_port()
config = {
"host": "localhost",
"username": "integration-tests",
"password": "integration-tests",
"port": port,
"database": "postgres",
"schema": self.target_schema,
}
if start_db:
self.db_names.append("postgres")
print("Starting localhost postgres container for tests")
commands = [
"docker",
"run",
"--rm",
"--name",
f"{self.container_prefix}_postgres",
"-e",
f"POSTGRES_USER={config['username']}",
"-e",
f"POSTGRES_PASSWORD={config['password']}",
"-p",
f"{config['port']}:5432",
"-d",
"marcosmarxm/postgres-ssl:dev",
"-c",
"ssl=on",
"-c",
"ssl_cert_file=/var/lib/postgresql/server.crt",
"-c",
"ssl_key_file=/var/lib/postgresql/server.key",
]
print("Executing: ", " ".join(commands))
subprocess.call(commands)
print("....Waiting for Postgres DB to start...15 sec")
time.sleep(15)
if not os.path.exists("../secrets"):
os.makedirs("../secrets")
with open("../secrets/postgres.json", "w") as fh:
fh.write(json.dumps(config))
def setup_mysql_db(self):
start_db = True
if os.getenv(NORMALIZATION_TEST_MYSQL_DB_PORT):
port = int(os.getenv(NORMALIZATION_TEST_MYSQL_DB_PORT))
start_db = False
else:
port = self.find_free_port()
config = {
"host": "localhost",
"port": port,
"database": self.target_schema,
"username": "root",
"password": "",
}
if start_db:
self.db_names.append("mysql")
print("Starting localhost mysql container for tests")
commands = [
"docker",
"run",
"--rm",
"--name",
f"{self.container_prefix}_mysql",
"-e",
"MYSQL_ALLOW_EMPTY_PASSWORD=yes",
"-e",
"MYSQL_INITDB_SKIP_TZINFO=yes",
"-e",
f"MYSQL_DATABASE={config['database']}",
"-e",
"MYSQL_ROOT_HOST=%",
"-p",
f"{config['port']}:3306",
"-d",
"mysql/mysql-server",
]
print("Executing: ", " ".join(commands))
subprocess.call(commands)
print("....Waiting for MySQL DB to start...15 sec")
time.sleep(15)
if not os.path.exists("../secrets"):
os.makedirs("../secrets")
with open("../secrets/mysql.json", "w") as fh:
fh.write(json.dumps(config))
def setup_mssql_db(self):
start_db = True
if os.getenv(NORMALIZATION_TEST_MSSQL_DB_PORT):
port = int(os.getenv(NORMALIZATION_TEST_MSSQL_DB_PORT))
start_db = False
else:
port = self.find_free_port()
config = {
"host": "localhost",
"username": "SA",
"password": "MyStr0ngP@ssw0rd",
"port": port,
"database": self.target_schema,
"schema": self.target_schema,
}
if start_db:
self.db_names.append("mssql")
print("Starting localhost MS SQL Server container for tests")
command_start_container = [
"docker",
"run",
"--rm",
"--name",
f"{self.container_prefix}_mssql",
"-h",
f"{self.container_prefix}_mssql",
"-e",
"ACCEPT_EULA='Y'",
"-e",
f"SA_PASSWORD='{config['password']}'",
"-e",
"MSSQL_PID='Standard'",
"-p",
f"{config['port']}:1433",
"-d",
"mcr.microsoft.com/mssql/server:2019-GA-ubuntu-16.04",
]
# cmds & parameters
cmd_start_container = " ".join(command_start_container)
wait_sec = 30
# run the docker container
print("Executing: ", cmd_start_container)
subprocess.check_call(cmd_start_container, shell=True)
# wait for service is available
print(f"....Waiting for MS SQL Server to start...{wait_sec} sec")
time.sleep(wait_sec)
# Run additional commands to prepare the table
command_create_db = [
"docker",
"exec",
f"{self.container_prefix}_mssql",
"/opt/mssql-tools/bin/sqlcmd",
"-S",
config["host"],
"-U",
config["username"],
"-P",
config["password"],
"-Q",
f"CREATE DATABASE [{config['database']}]",
]
# create test db
print("Executing: ", " ".join(command_create_db))
subprocess.call(command_create_db)
if not os.path.exists("../secrets"):
os.makedirs("../secrets")
with open("../secrets/mssql.json", "w") as fh:
fh.write(json.dumps(config))
def setup_clickhouse_db(self):
"""
The official ClickHouse JDBC driver uses HTTP port 8123.
Ref: https://altinity.com/blog/2019/3/15/clickhouse-networking-part-1
"""
start_db = True
port = 8123
if os.getenv(NORMALIZATION_TEST_CLICKHOUSE_DB_PORT):
port = int(os.getenv(NORMALIZATION_TEST_CLICKHOUSE_DB_PORT))
start_db = False
if start_db:
port = self.find_free_port()
config = {
"host": "localhost",
"port": port,
"database": self.target_schema,
"username": "default",
"password": "",
"ssl": False,
}
if start_db:
self.db_names.append("clickhouse")
print("Starting localhost clickhouse container for tests")
commands = [
"docker",
"run",
"--rm",
"--name",
f"{self.container_prefix}_clickhouse",
"--ulimit",
"nofile=262144:262144",
"-p",
f"{config['port']}:8123", # clickhouse JDBC driver use HTTP port
"-d",
# so far, only the latest ClickHouse server image has
# window functions enabled
"clickhouse/clickhouse-server:latest",
]
print("Executing: ", " ".join(commands))
subprocess.call(commands)
print("....Waiting for ClickHouse DB to start...15 sec")
time.sleep(15)
# Run additional commands to prepare the table
command_create_db = [
"docker",
"run",
"--rm",
"--link",
f"{self.container_prefix}_clickhouse:clickhouse-server",
"clickhouse/clickhouse-client:21.8.10.19",
"--host",
"clickhouse-server",
"--query",
f"CREATE DATABASE IF NOT EXISTS {config['database']}",
]
# create test db
print("Executing: ", " ".join(command_create_db))
subprocess.call(command_create_db)
if not os.path.exists("../secrets"):
os.makedirs("../secrets")
with open("../secrets/clickhouse.json", "w") as fh:
fh.write(json.dumps(config))
def setup_tidb_db(self):
start_db = True
if os.getenv(NORMALIZATION_TEST_TIDB_DB_PORT):
port = int(os.getenv(NORMALIZATION_TEST_TIDB_DB_PORT))
start_db = False
else:
port = self.find_free_port()
config = {
"host": "127.0.0.1",
"port": port,
"database": self.target_schema,
"schema": self.target_schema,
"username": "root",
"password": "",
"ssl": False,
}
if start_db:
self.db_names.append("tidb")
print("Starting tidb container for tests")
commands = [
"docker",
"run",
"--rm",
"--name",
f"{self.container_prefix}_tidb",
"-p",
f"{config['port']}:4000",
"-d",
"pingcap/tidb:v5.4.0",
]
print("Executing: ", " ".join(commands))
subprocess.call(commands)
print("....Waiting for TiDB to start...15 sec")
time.sleep(15)
command_create_db = [
"docker",
"run",
"--rm",
"--link",
f"{self.container_prefix}_tidb:tidb",
"arey/mysql-client",
"--host=tidb",
"--user=root",
"--port=4000",
f"--execute=CREATE DATABASE IF NOT EXISTS {self.target_schema}",
]
print("Executing: ", " ".join(command_create_db))
subprocess.call(command_create_db)
if not os.path.exists("../secrets"):
os.makedirs("../secrets")
with open("../secrets/tidb.json", "w") as fh:
fh.write(json.dumps(config))
@staticmethod
def find_free_port():
"""
Find an unused localhost port on which a test database container can listen
"""
s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
s.bind(("", 0))
addr = s.getsockname()
s.close()
return addr[1]
def tear_down_db(self):
for db_name in self.db_names:
print(f"Stopping localhost {db_name} container for tests")
try:
subprocess.call(["docker", "kill", f"{self.container_prefix}_{db_name}"])
except Exception as e:
print(f"WARN: Exception while shutting down {db_name}: {e}")
@staticmethod
def change_current_test_dir(request):
# This makes the test run whether it is executed from the tests folder (with pytest/gradle)
# or from the base-normalization folder (through pycharm)
integration_tests_dir = os.path.join(request.fspath.dirname, "integration_tests")
if os.path.exists(integration_tests_dir):
os.chdir(integration_tests_dir)
else:
os.chdir(request.fspath.dirname)
def generate_profile_yaml_file(
self, destination_type: DestinationType, test_root_dir: str, random_schema: bool = False
) -> Dict[str, Any]:
"""
Each destination requires different connection settings. This step generates the appropriate profiles.yml
as described here: https://docs.getdbt.com/reference/profiles.yml
"""
config_generator = TransformConfig()
profiles_config = config_generator.read_json_config(f"../secrets/{destination_type.value.lower()}.json")
# Adapt credential file to look like destination config.json
if destination_type.value == DestinationType.BIGQUERY.value:
credentials = profiles_config["basic_bigquery_config"]
profiles_config = {
"credentials_json": json.dumps(credentials),
"dataset_id": self.target_schema,
"project_id": credentials["project_id"],
"dataset_location": "US",
}
elif destination_type.value == DestinationType.MYSQL.value:
profiles_config["database"] = self.target_schema
elif destination_type.value == DestinationType.REDSHIFT.value:
profiles_config["schema"] = self.target_schema
if random_schema:
profiles_config["schema"] = self.target_schema + "_" + "".join(random.choices(string.ascii_lowercase, k=5))
else:
profiles_config["schema"] = self.target_schema
if destination_type.value == DestinationType.CLICKHOUSE.value:
clickhouse_config = copy(profiles_config)
profiles_yaml = config_generator.transform(destination_type, clickhouse_config)
else:
profiles_yaml = config_generator.transform(destination_type, profiles_config)
config_generator.write_yaml_config(test_root_dir, profiles_yaml, "profiles.yml")
return profiles_config
@staticmethod
def run_destination_process(message_file: str, test_root_dir: str, commands: List[str]):
print("Executing: ", " ".join(commands))
with open(os.path.join(test_root_dir, "destination_output.log"), "ab") as f:
process = subprocess.Popen(commands, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
def writer():
if os.path.exists(message_file):
with open(message_file, "rb") as input_data:
while True:
line = input_data.readline()
if not line:
break
if not line.startswith(b"//"):
process.stdin.write(line)
process.stdin.close()
thread = threading.Thread(target=writer)
thread.start()
for line in iter(process.stdout.readline, b""):
f.write(line)
sys.stdout.write(line.decode("utf-8"))
thread.join()
process.wait()
return process.returncode == 0
@staticmethod
def get_normalization_image(destination_type: DestinationType) -> str:
if DestinationType.MSSQL.value == destination_type.value:
return "airbyte/normalization-mssql:dev"
elif DestinationType.MYSQL.value == destination_type.value:
return "airbyte/normalization-mysql:dev"
elif DestinationType.ORACLE.value == destination_type.value:
return "airbyte/normalization-oracle:dev"
elif DestinationType.CLICKHOUSE.value == destination_type.value:
return "airbyte/normalization-clickhouse:dev"
elif DestinationType.SNOWFLAKE.value == destination_type.value:
return "airbyte/normalization-snowflake:dev"
elif DestinationType.REDSHIFT.value == destination_type.value:
return "airbyte/normalization-redshift:dev"
elif DestinationType.TIDB.value == destination_type.value:
return "airbyte/normalization-tidb:dev"
else:
return "airbyte/normalization:dev"
def dbt_check(self, destination_type: DestinationType, test_root_dir: str):
"""
Run the dbt CLI to sanity-check the dbt project settings against the destination (dbt debug and dbt deps)
"""
normalization_image: str = self.get_normalization_image(destination_type)
# Perform sanity check on dbt project settings
assert self.run_check_dbt_command(normalization_image, "debug", test_root_dir)
assert self.run_check_dbt_command(normalization_image, "deps", test_root_dir)
def dbt_run(self, destination_type: DestinationType, test_root_dir: str, force_full_refresh: bool = False):
"""
Run the dbt CLI to perform transformations on the test raw data in the destination
"""
normalization_image: str = self.get_normalization_image(destination_type)
# Compile dbt model files into the destination SQL dialect, then run the transformation queries
assert self.run_check_dbt_command(normalization_image, "run", test_root_dir, force_full_refresh)
def dbt_run_macro(self, destination_type: DestinationType, test_root_dir: str, macro: str, macro_args: str = None):
"""
Run the dbt CLI to perform transformations on the test raw data in the destination, using an independent macro.
"""
normalization_image: str = self.get_normalization_image(destination_type)
# Run the given dbt macro independently against the destination (via `dbt run-operation`)
assert self.run_dbt_run_operation(normalization_image, test_root_dir, macro, macro_args)
def run_check_dbt_command(self, normalization_image: str, command: str, cwd: str, force_full_refresh: bool = False) -> bool:
"""
Run dbt subprocess while checking and counting for "ERROR", "FAIL" or "WARNING" printed in its outputs
"""
if any([normalization_image.startswith(x) for x in ["airbyte/normalization-oracle", "airbyte/normalization-clickhouse"]]):
dbtAdditionalArgs = []
else:
dbtAdditionalArgs = ["--event-buffer-size=10000"]
commands = (
[
"docker",
"run",
"--rm",
"--init",
"-v",
f"{cwd}:/workspace",
"-v",
f"{cwd}/build:/build",
"-v",
f"{cwd}/logs:/logs",
"-v",
f"{cwd}/build/dbt_packages:/dbt",
"--network",
"host",
"--entrypoint",
"/usr/local/bin/dbt",
"-i",
normalization_image,
]
+ dbtAdditionalArgs
+ [
command,
"--profiles-dir=/workspace",
"--project-dir=/workspace",
]
)
if force_full_refresh:
commands.append("--full-refresh")
command = f"{command} --full-refresh"
print("Executing: ", " ".join(commands))
print(f"Equivalent to: dbt {command} --profiles-dir={cwd} --project-dir={cwd}")
return self.run_check_dbt_subprocess(commands, cwd)
def run_dbt_run_operation(self, normalization_image: str, cwd: str, macro: str, macro_args: str = None) -> bool:
"""
Run dbt subprocess while checking and counting for "ERROR", "FAIL" or "WARNING" printed in its outputs
"""
args = ["--args", macro_args] if macro_args else []
commands = (
[
"docker",
"run",
"--rm",
"--init",
"-v",
f"{cwd}:/workspace",
"-v",
f"{cwd}/build:/build",
"-v",
f"{cwd}/logs:/logs",
"-v",
f"{cwd}/build/dbt_packages:/dbt",
"--network",
"host",
"--entrypoint",
"/usr/local/bin/dbt",
"-i",
normalization_image,
]
+ ["run-operation", macro]
+ args
+ ["--profiles-dir=/workspace", "--project-dir=/workspace"]
)
print("Executing: ", " ".join(commands))
print(f"Equivalent to: dbt run-operation {macro} --args {macro_args} --profiles-dir={cwd} --project-dir={cwd}")
return self.run_check_dbt_subprocess(commands, cwd)
def run_check_dbt_subprocess(self, commands: list, cwd: str):
error_count = 0
with open(os.path.join(cwd, "dbt_output.log"), "ab") as f:
process = subprocess.Popen(commands, cwd=cwd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, env=os.environ)
for line in iter(lambda: process.stdout.readline(), b""):
f.write(line)
str_line = line.decode("utf-8")
sys.stdout.write(str_line)
# keywords to match lines as signaling errors
if "ERROR" in str_line or "FAIL" in str_line or "WARNING" in str_line:
# exception keywords in lines to ignore as errors (such as summary or expected warnings)
is_exception = False
for except_clause in [
"Done.", # DBT Summary
"PASS=", # DBT Summary
"Nothing to do.", # When no schema/data tests are setup
"Configuration paths exist in your dbt_project.yml", # When no cte / view are generated
"Error loading config file: .dockercfg: $HOME is not defined", # ignore warning
"depends on a node named 'disabled_test' which was not found", # Tests throwing warning because it is disabled
"The requested image's platform (linux/amd64) does not match the detected host platform "
+ "(linux/arm64/v8) and no specific platform was requested", # temporary patch until we publish images for arm64
]:
if except_clause in str_line:
is_exception = True
break
if not is_exception:
# count lines signaling an error/failure/warning
error_count += 1
process.wait()
message = (
f"{' '.join(commands)}\n\tterminated with return code {process.returncode} "
f"with {error_count} 'Error/Warning/Fail' mention(s)."
)
print(message)
assert error_count == 0, message
assert process.returncode == 0, message
if error_count > 0:
return False
return process.returncode == 0
@staticmethod
def copy_replace(src, dst, pattern=None, replace_value=None):
"""
Copies a file from src to dst replacing pattern by replace_value
Parameters
----------
src : string
Path to the source filename to copy from
dst : string
Path to the output filename to copy to
pattern
list of Patterns to replace inside the src file
replace_value
list of Values to replace by in the dst file
"""
file1 = open(src, "r") if isinstance(src, str) else src
file2 = open(dst, "w") if isinstance(dst, str) else dst
pattern = [pattern] if isinstance(pattern, str) else pattern
replace_value = [replace_value] if isinstance(replace_value, str) else replace_value
if replace_value and pattern:
if len(replace_value) != len(pattern):
raise Exception("Invalid parameters: pattern and replace_value" " have different sizes.")
rules = [(re.compile(regex, re.IGNORECASE), value) for regex, value in zip(pattern, replace_value)]
else:
rules = []
for line in file1:
if rules:
for rule in rules:
line = re.sub(rule[0], rule[1], line)
file2.write(line)
if isinstance(src, str):
file1.close()
if isinstance(dst, str):
file2.close()
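# Illustrative usage only (paths and pattern are hypothetical):
#   DbtIntegrationTest.copy_replace(
#       "dbt_project.yml.template", "dbt_project.yml",
#       pattern=r"schema: .*", replace_value="schema: test_normalization",
#   )
# copies the template while rewriting every matching line (patterns are applied case-insensitively via re.IGNORECASE).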
@staticmethod
def get_test_targets() -> List[str]:
"""
Returns a list of destinations to run tests on.
if the environment variable NORMALIZATION_TEST_TARGET is set to a comma-separated list of destination names,
then the tests are run only on that subset of destinations.
Otherwise, tests are run against all destinations.
"""
if os.getenv(NORMALIZATION_TEST_TARGET):
target_str = os.getenv(NORMALIZATION_TEST_TARGET)
return [d.value for d in {DestinationType.from_string(s.strip()) for s in target_str.split(",")}]
else:
return [d.value for d in DestinationType]
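# Illustrative only: running with NORMALIZATION_TEST_TARGET="postgres,snowflake" restricts the suite
# to those two destinations (assuming both are valid DestinationType values); leaving it unset runs
# every destination in the enum.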
@staticmethod
def update_yaml_file(filename: str, callback: Callable):
config = read_yaml_config(filename)
updated, config = callback(config)
if updated:
write_yaml_config(config, filename)
def clean_tmp_tables(
self,
destination_type: Union[DestinationType, List[DestinationType]],
test_type: str,
tmp_folders: list = None,
git_versioned_tests: list = None,
):
"""
Cleans up all temporary schemas created during the test session.
It parses the provided tmp_folders: List[str] or uses `git_versioned_tests` to find sources.yml files generated for the tests.
It gets target schemas created by the tests and removes them using custom scenario specified in
`dbt-project-template/macros/clean_tmp_tables.sql` macro.
REQUIREMENTS:
1) Ideally, the schemas should have unique names like test_normalization_<some_random_string> to avoid conflicts.
2) The `clean_tmp_tables.sql` file must define a macro for the target destination for the clean-up to proceed.
INPUT ARGUMENTS:
:: destination_type : either single destination or list of destinations
:: test_type: either "ephemeral" or "normalization" should be supplied.
:: tmp_folders: should be supplied if test_type = "ephemeral", to get schemas from /build/normalization_test_output folders
:: git_versioned_tests: should be supplied if test_type = "normalization", to get schemas from integration_tests/normalization_test_output folders
EXAMPLE:
clean_up_args = {
"destination_type": [ DestinationType.REDSHIFT, DestinationType.POSTGRES, ... ]
"test_type": "normalization",
"git_versioned_tests": git_versioned_tests,
}
"""
path_to_sources: str = "/models/generated/sources.yml"
test_folders: dict = {}
source_files: dict = {}
schemas_to_remove: dict = {}
# collecting information about tmp_tables created for the test for each destination
for destination in destination_type:
test_folders[destination.value] = []
source_files[destination.value] = []
schemas_to_remove[destination.value] = []
# based on test_type select path to source files
if test_type == "ephemeral" or test_type == "test_reset_scd_overwrite":
if not tmp_folders:
raise TypeError("`tmp_folders` arg is not provided.")
for folder in tmp_folders:
if destination.value in folder:
test_folders[destination.value].append(folder)
source_files[destination.value].append(f"{folder}{path_to_sources}")
elif test_type == "normalization":
if not git_versioned_tests:
raise TypeError("`git_versioned_tests` arg is not provided.")
base_path = f"{pathlib.Path().absolute()}/integration_tests/normalization_test_output"
for test in git_versioned_tests:
test_root_dir: str = f"{base_path}/{destination.value}/{test}"
test_folders[destination.value].append(test_root_dir)
source_files[destination.value].append(f"{test_root_dir}{path_to_sources}")
else:
raise TypeError(f"\n`test_type`: {test_type} is not a registered, use `ephemeral` or `normalization` instead.\n")
# parse source.yml files from test folders to get schemas and table names created for the tests
for file in source_files[destination.value]:
source_yml = {}
try:
with open(file, "r") as source_file:
source_yml = yaml.safe_load(source_file)
except FileNotFoundError:
print(f"\n{destination.value}: {file} doesn't exist, consider to remove any temp_tables and schemas manually!\n")
pass
test_sources: list = source_yml.get("sources", []) if source_yml else []
for source in test_sources:
target_schema: str = source.get("name")
if target_schema not in schemas_to_remove[destination.value]:
schemas_to_remove[destination.value].append(target_schema)
# adding _airbyte_* tmp schemas to be removed
schemas_to_remove[destination.value].append(f"_airbyte_{target_schema}")
# cleaning up tmp_tables generated by the tests
for destination in destination_type:
if not schemas_to_remove[destination.value]:
print(f"\n\t{destination.value.upper()} DESTINATION: SKIP CLEANING, NOTHING TO REMOVE.\n")
else:
print(f"\n\t{destination.value.upper()} DESTINATION: CLEANING LEFTOVERS...\n")
print(f"\t{schemas_to_remove[destination.value]}\n")
test_root_folder = test_folders[destination.value][0]
args = json.dumps({"schemas": schemas_to_remove[destination.value]})
self.dbt_check(destination, test_root_folder)
self.dbt_run_macro(destination, test_root_folder, "clean_tmp_tables", args)

View File

@@ -0,0 +1,125 @@
name: airbyte_utils
version: '1.0'
config-version: 2
profile: normalize
model-paths:
- models
docs-paths:
- docs
analysis-paths:
- analysis
test-paths:
- tests
seed-paths:
- data
macro-paths:
- macros
target-path: ../build
log-path: ../logs
packages-install-path: /dbt
clean-targets:
- build
- dbt_modules
quoting:
database: true
schema: false
identifier: true
models:
airbyte_utils:
+materialized: table
generated:
airbyte_ctes:
+tags: airbyte_internal_cte
+materialized: ephemeral
airbyte_incremental:
+tags: incremental_tables
+materialized: incremental
+on_schema_change: sync_all_columns
airbyte_tables:
+tags: normalized_tables
+materialized: table
airbyte_views:
+tags: airbyte_internal_views
+materialized: view
dispatch:
- macro_namespace: dbt_utils
search_order:
- airbyte_utils
- dbt_utils
vars:
json_column: _airbyte_data
models_to_source:
nested_stream_with_complex_columns_resulting_into_long_names_ab1: test_normalization._airbyte_raw_nested_stream_with_complex_columns_resulting_into_long_names
nested_stream_with_complex_columns_resulting_into_long_names_ab2: test_normalization._airbyte_raw_nested_stream_with_complex_columns_resulting_into_long_names
nested_stream_with_complex_columns_resulting_into_long_names_stg: test_normalization._airbyte_raw_nested_stream_with_complex_columns_resulting_into_long_names
nested_stream_with_complex_columns_resulting_into_long_names_scd: test_normalization._airbyte_raw_nested_stream_with_complex_columns_resulting_into_long_names
nested_stream_with_complex_columns_resulting_into_long_names: test_normalization._airbyte_raw_nested_stream_with_complex_columns_resulting_into_long_names
non_nested_stream_without_namespace_resulting_into_long_names_ab1: test_normalization._airbyte_raw_non_nested_stream_without_namespace_resulting_into_long_names
non_nested_stream_without_namespace_resulting_into_long_names_ab2: test_normalization._airbyte_raw_non_nested_stream_without_namespace_resulting_into_long_names
non_nested_stream_without_namespace_resulting_into_long_names_ab3: test_normalization._airbyte_raw_non_nested_stream_without_namespace_resulting_into_long_names
non_nested_stream_without_namespace_resulting_into_long_names: test_normalization._airbyte_raw_non_nested_stream_without_namespace_resulting_into_long_names
some_stream_that_was_empty_ab1: test_normalization._airbyte_raw_some_stream_that_was_empty
some_stream_that_was_empty_ab2: test_normalization._airbyte_raw_some_stream_that_was_empty
some_stream_that_was_empty_stg: test_normalization._airbyte_raw_some_stream_that_was_empty
some_stream_that_was_empty_scd: test_normalization._airbyte_raw_some_stream_that_was_empty
some_stream_that_was_empty: test_normalization._airbyte_raw_some_stream_that_was_empty
simple_stream_with_namespace_resulting_into_long_names_ab1: test_normalization_namespace._airbyte_raw_simple_stream_with_namespace_resulting_into_long_names
simple_stream_with_namespace_resulting_into_long_names_ab2: test_normalization_namespace._airbyte_raw_simple_stream_with_namespace_resulting_into_long_names
simple_stream_with_namespace_resulting_into_long_names_ab3: test_normalization_namespace._airbyte_raw_simple_stream_with_namespace_resulting_into_long_names
simple_stream_with_namespace_resulting_into_long_names: test_normalization_namespace._airbyte_raw_simple_stream_with_namespace_resulting_into_long_names
conflict_stream_name_ab1: test_normalization._airbyte_raw_conflict_stream_name
conflict_stream_name_ab2: test_normalization._airbyte_raw_conflict_stream_name
conflict_stream_name_ab3: test_normalization._airbyte_raw_conflict_stream_name
conflict_stream_name: test_normalization._airbyte_raw_conflict_stream_name
conflict_stream_scalar_ab1: test_normalization._airbyte_raw_conflict_stream_scalar
conflict_stream_scalar_ab2: test_normalization._airbyte_raw_conflict_stream_scalar
conflict_stream_scalar_ab3: test_normalization._airbyte_raw_conflict_stream_scalar
conflict_stream_scalar: test_normalization._airbyte_raw_conflict_stream_scalar
conflict_stream_array_ab1: test_normalization._airbyte_raw_conflict_stream_array
conflict_stream_array_ab2: test_normalization._airbyte_raw_conflict_stream_array
conflict_stream_array_ab3: test_normalization._airbyte_raw_conflict_stream_array
conflict_stream_array: test_normalization._airbyte_raw_conflict_stream_array
unnest_alias_ab1: test_normalization._airbyte_raw_unnest_alias
unnest_alias_ab2: test_normalization._airbyte_raw_unnest_alias
unnest_alias_ab3: test_normalization._airbyte_raw_unnest_alias
unnest_alias: test_normalization._airbyte_raw_unnest_alias
arrays_ab1: test_normalization._airbyte_raw_arrays
arrays_ab2: test_normalization._airbyte_raw_arrays
arrays_ab3: test_normalization._airbyte_raw_arrays
arrays: test_normalization._airbyte_raw_arrays
nested_stream_with_complex_columns_resulting_into_long_names_partition_ab1: test_normalization._airbyte_raw_nested_stream_with_complex_columns_resulting_into_long_names
nested_stream_with_complex_columns_resulting_into_long_names_partition_ab2: test_normalization._airbyte_raw_nested_stream_with_complex_columns_resulting_into_long_names
nested_stream_with_complex_columns_resulting_into_long_names_partition_ab3: test_normalization._airbyte_raw_nested_stream_with_complex_columns_resulting_into_long_names
nested_stream_with_complex_columns_resulting_into_long_names_partition: test_normalization._airbyte_raw_nested_stream_with_complex_columns_resulting_into_long_names
conflict_stream_name_conflict_stream_name_ab1: test_normalization._airbyte_raw_conflict_stream_name
conflict_stream_name_conflict_stream_name_ab2: test_normalization._airbyte_raw_conflict_stream_name
conflict_stream_name_conflict_stream_name_ab3: test_normalization._airbyte_raw_conflict_stream_name
conflict_stream_name_conflict_stream_name: test_normalization._airbyte_raw_conflict_stream_name
unnest_alias_children_ab1: test_normalization._airbyte_raw_unnest_alias
unnest_alias_children_ab2: test_normalization._airbyte_raw_unnest_alias
unnest_alias_children_ab3: test_normalization._airbyte_raw_unnest_alias
unnest_alias_children: test_normalization._airbyte_raw_unnest_alias
arrays_nested_array_parent_ab1: test_normalization._airbyte_raw_arrays
arrays_nested_array_parent_ab2: test_normalization._airbyte_raw_arrays
arrays_nested_array_parent_ab3: test_normalization._airbyte_raw_arrays
arrays_nested_array_parent: test_normalization._airbyte_raw_arrays
nested_stream_with_complex_columns_resulting_into_long_names_partition_double_array_data_ab1: test_normalization._airbyte_raw_nested_stream_with_complex_columns_resulting_into_long_names
nested_stream_with_complex_columns_resulting_into_long_names_partition_double_array_data_ab2: test_normalization._airbyte_raw_nested_stream_with_complex_columns_resulting_into_long_names
nested_stream_with_complex_columns_resulting_into_long_names_partition_double_array_data_ab3: test_normalization._airbyte_raw_nested_stream_with_complex_columns_resulting_into_long_names
nested_stream_with_complex_columns_resulting_into_long_names_partition_double_array_data: test_normalization._airbyte_raw_nested_stream_with_complex_columns_resulting_into_long_names
nested_stream_with_complex_columns_resulting_into_long_names_partition_DATA_ab1: test_normalization._airbyte_raw_nested_stream_with_complex_columns_resulting_into_long_names
nested_stream_with_complex_columns_resulting_into_long_names_partition_DATA_ab2: test_normalization._airbyte_raw_nested_stream_with_complex_columns_resulting_into_long_names
nested_stream_with_complex_columns_resulting_into_long_names_partition_DATA_ab3: test_normalization._airbyte_raw_nested_stream_with_complex_columns_resulting_into_long_names
nested_stream_with_complex_columns_resulting_into_long_names_partition_DATA: test_normalization._airbyte_raw_nested_stream_with_complex_columns_resulting_into_long_names
conflict_stream_name_conflict_stream_name_conflict_stream_name_ab1: test_normalization._airbyte_raw_conflict_stream_name
conflict_stream_name_conflict_stream_name_conflict_stream_name_ab2: test_normalization._airbyte_raw_conflict_stream_name
conflict_stream_name_conflict_stream_name_conflict_stream_name_ab3: test_normalization._airbyte_raw_conflict_stream_name
conflict_stream_name_conflict_stream_name_conflict_stream_name: test_normalization._airbyte_raw_conflict_stream_name
unnest_alias_children_owner_ab1: test_normalization._airbyte_raw_unnest_alias
unnest_alias_children_owner_ab2: test_normalization._airbyte_raw_unnest_alias
unnest_alias_children_owner_ab3: test_normalization._airbyte_raw_unnest_alias
unnest_alias_children_owner: test_normalization._airbyte_raw_unnest_alias
unnest_alias_children_owner_column___with__quotes_ab1: test_normalization._airbyte_raw_unnest_alias
unnest_alias_children_owner_column___with__quotes_ab2: test_normalization._airbyte_raw_unnest_alias
unnest_alias_children_owner_column___with__quotes_ab3: test_normalization._airbyte_raw_unnest_alias
unnest_alias_children_owner_column___with__quotes: test_normalization._airbyte_raw_unnest_alias

View File

@@ -0,0 +1,90 @@
create or replace table `dataline-integration-testing`.test_normalization.`nested_stream_with_complex_columns_resulting_into_long_names_scd`
partition by range_bucket(
_airbyte_active_row,
generate_array(0, 1, 1)
)
cluster by _airbyte_unique_key_scd, _airbyte_emitted_at
OPTIONS()
as (
-- depends_on: ref('nested_stream_with_complex_columns_resulting_into_long_names_stg')
with
input_data as (
select *
from `dataline-integration-testing`._airbyte_test_normalization.`nested_stream_with_complex_columns_resulting_into_long_names_stg`
-- nested_stream_with_complex_columns_resulting_into_long_names from `dataline-integration-testing`.test_normalization._airbyte_raw_nested_stream_with_complex_columns_resulting_into_long_names
),
scd_data as (
-- SQL model to build a Type 2 Slowly Changing Dimension (SCD) table for each record identified by their primary key
select
to_hex(md5(cast(concat(coalesce(cast(id as
string
), '')) as
string
))) as _airbyte_unique_key,
id,
date,
`partition`,
date as _airbyte_start_at,
lag(date) over (
partition by id
order by
date is null asc,
date desc,
_airbyte_emitted_at desc
) as _airbyte_end_at,
case when row_number() over (
partition by id
order by
date is null asc,
date desc,
_airbyte_emitted_at desc
) = 1 then 1 else 0 end as _airbyte_active_row,
_airbyte_ab_id,
_airbyte_emitted_at,
_airbyte_nested_stream_with_complex_columns_resulting_into_long_names_hashid
from input_data
),
dedup_data as (
select
-- we need to ensure de-duplicated rows for merge/update queries
-- additionally, we generate a unique key for the scd table
row_number() over (
partition by
_airbyte_unique_key,
_airbyte_start_at,
_airbyte_emitted_at
order by _airbyte_active_row desc, _airbyte_ab_id
) as _airbyte_row_num,
to_hex(md5(cast(concat(coalesce(cast(_airbyte_unique_key as
string
), ''), '-', coalesce(cast(_airbyte_start_at as
string
), ''), '-', coalesce(cast(_airbyte_emitted_at as
string
), '')) as
string
))) as _airbyte_unique_key_scd,
scd_data.*
from scd_data
)
select
_airbyte_unique_key,
_airbyte_unique_key_scd,
id,
date,
`partition`,
_airbyte_start_at,
_airbyte_end_at,
_airbyte_active_row,
_airbyte_ab_id,
_airbyte_emitted_at,
CURRENT_TIMESTAMP() as _airbyte_normalized_at,
_airbyte_nested_stream_with_complex_columns_resulting_into_long_names_hashid
from dedup_data where _airbyte_row_num = 1
);
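
The surrogate keys in the statement above are plain MD5 hex digests over '-'-joined, null-coalesced string casts. A minimal Python sketch of the same rule, assuming the inputs have already been rendered exactly as BigQuery would cast them to strings (the sample values are hypothetical placeholders):

import hashlib

def airbyte_hash(*values) -> str:
    # Mirrors to_hex(md5(cast(concat(coalesce(cast(v as string), ''), '-', ...) as string))):
    # nulls become '', parts are joined with '-', and the MD5 digest is hex-encoded.
    joined = "-".join("" if v is None else str(v) for v in values)
    return hashlib.md5(joined.encode("utf-8")).hexdigest()

record_id = "1"                       # hypothetical primary-key value
start_at = "2020-01-01"               # hypothetical _airbyte_start_at
emitted_at = "2020-01-01 00:00:00"    # hypothetical _airbyte_emitted_at

_airbyte_unique_key = airbyte_hash(record_id)  # hash of the primary key only
_airbyte_unique_key_scd = airbyte_hash(_airbyte_unique_key, start_at, emitted_at)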

View File

@@ -0,0 +1,26 @@
create or replace table `dataline-integration-testing`.test_normalization.`nested_stream_with_complex_columns_resulting_into_long_names`
partition by timestamp_trunc(_airbyte_emitted_at, day)
cluster by _airbyte_unique_key, _airbyte_emitted_at
OPTIONS()
as (
-- Final base SQL model
-- depends_on: `dataline-integration-testing`.test_normalization.`nested_stream_with_complex_columns_resulting_into_long_names_scd`
select
_airbyte_unique_key,
id,
date,
`partition`,
_airbyte_ab_id,
_airbyte_emitted_at,
CURRENT_TIMESTAMP() as _airbyte_normalized_at,
_airbyte_nested_stream_with_complex_columns_resulting_into_long_names_hashid
from `dataline-integration-testing`.test_normalization.`nested_stream_with_complex_columns_resulting_into_long_names_scd`
-- nested_stream_with_complex_columns_resulting_into_long_names from `dataline-integration-testing`.test_normalization._airbyte_raw_nested_stream_with_complex_columns_resulting_into_long_names
where 1 = 1
and _airbyte_active_row = 1
);

View File

@@ -0,0 +1,74 @@
create or replace table `dataline-integration-testing`.test_normalization.`nested_stream_with_complex_columns_resulting_into_long_names_partition`
partition by timestamp_trunc(_airbyte_emitted_at, day)
cluster by _airbyte_emitted_at
OPTIONS()
as (
with __dbt__cte__nested_stream_with_complex_columns_resulting_into_long_names_partition_ab1 as (
-- SQL model to parse JSON blob stored in a single column and extract into separated field columns as described by the JSON Schema
-- depends_on: `dataline-integration-testing`.test_normalization.`nested_stream_with_complex_columns_resulting_into_long_names_scd`
select
_airbyte_nested_stream_with_complex_columns_resulting_into_long_names_hashid,
json_extract_array(`partition`, "$['double_array_data']") as double_array_data,
json_extract_array(`partition`, "$['DATA']") as DATA,
_airbyte_ab_id,
_airbyte_emitted_at,
CURRENT_TIMESTAMP() as _airbyte_normalized_at
from `dataline-integration-testing`.test_normalization.`nested_stream_with_complex_columns_resulting_into_long_names_scd` as table_alias
-- partition at nested_stream_with_complex_columns_resulting_into_long_names/partition
where 1 = 1
and `partition` is not null
), __dbt__cte__nested_stream_with_complex_columns_resulting_into_long_names_partition_ab2 as (
-- SQL model to cast each column to its adequate SQL type converted from the JSON schema type
-- depends_on: __dbt__cte__nested_stream_with_complex_columns_resulting_into_long_names_partition_ab1
select
_airbyte_nested_stream_with_complex_columns_resulting_into_long_names_hashid,
double_array_data,
DATA,
_airbyte_ab_id,
_airbyte_emitted_at,
CURRENT_TIMESTAMP() as _airbyte_normalized_at
from __dbt__cte__nested_stream_with_complex_columns_resulting_into_long_names_partition_ab1
-- partition at nested_stream_with_complex_columns_resulting_into_long_names/partition
where 1 = 1
), __dbt__cte__nested_stream_with_complex_columns_resulting_into_long_names_partition_ab3 as (
-- SQL model to build a hash column based on the values of this record
-- depends_on: __dbt__cte__nested_stream_with_complex_columns_resulting_into_long_names_partition_ab2
select
to_hex(md5(cast(concat(coalesce(cast(_airbyte_nested_stream_with_complex_columns_resulting_into_long_names_hashid as
string
), ''), '-', coalesce(cast(array_to_string(double_array_data, "|", "") as
string
), ''), '-', coalesce(cast(array_to_string(DATA, "|", "") as
string
), '')) as
string
))) as _airbyte_partition_hashid,
tmp.*
from __dbt__cte__nested_stream_with_complex_columns_resulting_into_long_names_partition_ab2 tmp
-- partition at nested_stream_with_complex_columns_resulting_into_long_names/partition
where 1 = 1
)-- Final base SQL model
-- depends_on: __dbt__cte__nested_stream_with_complex_columns_resulting_into_long_names_partition_ab3
select
_airbyte_nested_stream_with_complex_columns_resulting_into_long_names_hashid,
double_array_data,
DATA,
_airbyte_ab_id,
_airbyte_emitted_at,
CURRENT_TIMESTAMP() as _airbyte_normalized_at,
_airbyte_partition_hashid
from __dbt__cte__nested_stream_with_complex_columns_resulting_into_long_names_partition_ab3
-- partition at nested_stream_with_complex_columns_resulting_into_long_names/partition from `dataline-integration-testing`.test_normalization.`nested_stream_with_complex_columns_resulting_into_long_names_scd`
where 1 = 1
);

View File

@@ -0,0 +1,73 @@
create or replace table `dataline-integration-testing`.test_normalization.`nested_stream_with_complex_columns_resulting_into_long_names_partition_DATA`
partition by timestamp_trunc(_airbyte_emitted_at, day)
cluster by _airbyte_emitted_at
OPTIONS()
as (
with __dbt__cte__nested_stream_with_complex_columns_resulting_into_long_names_partition_DATA_ab1 as (
-- SQL model to parse JSON blob stored in a single column and extract into separated field columns as described by the JSON Schema
-- depends_on: `dataline-integration-testing`.test_normalization.`nested_stream_with_complex_columns_resulting_into_long_names_partition`
select
_airbyte_partition_hashid,
json_extract_scalar(DATA, "$['currency']") as currency,
_airbyte_ab_id,
_airbyte_emitted_at,
CURRENT_TIMESTAMP() as _airbyte_normalized_at
from `dataline-integration-testing`.test_normalization.`nested_stream_with_complex_columns_resulting_into_long_names_partition` as table_alias
-- DATA at nested_stream_with_complex_columns_resulting_into_long_names/partition/DATA
cross join unnest(DATA) as DATA
where 1 = 1
and DATA is not null
), __dbt__cte__nested_stream_with_complex_columns_resulting_into_long_names_partition_DATA_ab2 as (
-- SQL model to cast each column to its adequate SQL type converted from the JSON schema type
-- depends_on: __dbt__cte__nested_stream_with_complex_columns_resulting_into_long_names_partition_DATA_ab1
select
_airbyte_partition_hashid,
cast(currency as
string
) as currency,
_airbyte_ab_id,
_airbyte_emitted_at,
CURRENT_TIMESTAMP() as _airbyte_normalized_at
from __dbt__cte__nested_stream_with_complex_columns_resulting_into_long_names_partition_DATA_ab1
-- DATA at nested_stream_with_complex_columns_resulting_into_long_names/partition/DATA
where 1 = 1
), __dbt__cte__nested_stream_with_complex_columns_resulting_into_long_names_partition_DATA_ab3 as (
-- SQL model to build a hash column based on the values of this record
-- depends_on: __dbt__cte__nested_stream_with_complex_columns_resulting_into_long_names_partition_DATA_ab2
select
to_hex(md5(cast(concat(coalesce(cast(_airbyte_partition_hashid as
string
), ''), '-', coalesce(cast(currency as
string
), '')) as
string
))) as _airbyte_DATA_hashid,
tmp.*
from __dbt__cte__nested_stream_with_complex_columns_resulting_into_long_names_partition_DATA_ab2 tmp
-- DATA at nested_stream_with_complex_columns_resulting_into_long_names/partition/DATA
where 1 = 1
)-- Final base SQL model
-- depends_on: __dbt__cte__nested_stream_with_complex_columns_resulting_into_long_names_partition_DATA_ab3
select
_airbyte_partition_hashid,
currency,
_airbyte_ab_id,
_airbyte_emitted_at,
CURRENT_TIMESTAMP() as _airbyte_normalized_at,
_airbyte_DATA_hashid
from __dbt__cte__nested_stream_with_complex_columns_resulting_into_long_names_partition_DATA_ab3
-- DATA at nested_stream_with_complex_columns_resulting_into_long_names/partition/DATA from `dataline-integration-testing`.test_normalization.`nested_stream_with_complex_columns_resulting_into_long_names_partition`
where 1 = 1
);

View File

@@ -0,0 +1,73 @@
create or replace table `dataline-integration-testing`.test_normalization.`nested_stream_with_complex_columns_resulting_into_long_names_partition_double_array_data`
partition by timestamp_trunc(_airbyte_emitted_at, day)
cluster by _airbyte_emitted_at
OPTIONS()
as (
with __dbt__cte__nested_stream_with_complex_columns_resulting_into_long_names_partition_double_array_data_ab1 as (
-- SQL model to parse JSON blob stored in a single column and extract into separated field columns as described by the JSON Schema
-- depends_on: `dataline-integration-testing`.test_normalization.`nested_stream_with_complex_columns_resulting_into_long_names_partition`
select
_airbyte_partition_hashid,
json_extract_scalar(double_array_data, "$['id']") as id,
_airbyte_ab_id,
_airbyte_emitted_at,
CURRENT_TIMESTAMP() as _airbyte_normalized_at
from `dataline-integration-testing`.test_normalization.`nested_stream_with_complex_columns_resulting_into_long_names_partition` as table_alias
-- double_array_data at nested_stream_with_complex_columns_resulting_into_long_names/partition/double_array_data
cross join unnest(double_array_data) as double_array_data
where 1 = 1
and double_array_data is not null
), __dbt__cte__nested_stream_with_complex_columns_resulting_into_long_names_partition_double_array_data_ab2 as (
-- SQL model to cast each column to its adequate SQL type converted from the JSON schema type
-- depends_on: __dbt__cte__nested_stream_with_complex_columns_resulting_into_long_names_partition_double_array_data_ab1
select
_airbyte_partition_hashid,
cast(id as
string
) as id,
_airbyte_ab_id,
_airbyte_emitted_at,
CURRENT_TIMESTAMP() as _airbyte_normalized_at
from __dbt__cte__nested_stream_with_complex_columns_resulting_into_long_names_partition_double_array_data_ab1
-- double_array_data at nested_stream_with_complex_columns_resulting_into_long_names/partition/double_array_data
where 1 = 1
), __dbt__cte__nested_stream_with_complex_columns_resulting_into_long_names_partition_double_array_data_ab3 as (
-- SQL model to build a hash column based on the values of this record
-- depends_on: __dbt__cte__nested_stream_with_complex_columns_resulting_into_long_names_partition_double_array_data_ab2
select
to_hex(md5(cast(concat(coalesce(cast(_airbyte_partition_hashid as
string
), ''), '-', coalesce(cast(id as
string
), '')) as
string
))) as _airbyte_double_array_data_hashid,
tmp.*
from __dbt__cte__nested_stream_with_complex_columns_resulting_into_long_names_partition_double_array_data_ab2 tmp
-- double_array_data at nested_stream_with_complex_columns_resulting_into_long_names/partition/double_array_data
where 1 = 1
)-- Final base SQL model
-- depends_on: __dbt__cte__nested_stream_with_complex_columns_resulting_into_long_names_partition_double_array_data_ab3
select
_airbyte_partition_hashid,
id,
_airbyte_ab_id,
_airbyte_emitted_at,
CURRENT_TIMESTAMP() as _airbyte_normalized_at,
_airbyte_double_array_data_hashid
from __dbt__cte__nested_stream_with_complex_columns_resulting_into_long_names_partition_double_array_data_ab3
-- double_array_data at nested_stream_with_complex_columns_resulting_into_long_names/partition/double_array_data from `dataline-integration-testing`.test_normalization.`nested_stream_with_complex_columns_resulting_into_long_names_partition`
where 1 = 1
);

View File

@@ -0,0 +1,21 @@
{{ config(
cluster_by = "_airbyte_emitted_at",
partition_by = {"field": "_airbyte_emitted_at", "data_type": "timestamp", "granularity": "day"},
unique_key = '_airbyte_ab_id',
schema = "_airbyte_test_normalization",
tags = [ "top-level-intermediate" ]
) }}
-- SQL model to parse JSON blob stored in a single column and extract into separated field columns as described by the JSON Schema
-- depends_on: {{ source('test_normalization', '_airbyte_raw_nested_stream_with_complex_columns_resulting_into_long_names') }}
select
{{ json_extract_scalar('_airbyte_data', ['id'], ['id']) }} as id,
{{ json_extract_scalar('_airbyte_data', ['date'], ['date']) }} as date,
{{ json_extract('table_alias', '_airbyte_data', ['partition'], ['partition']) }} as {{ adapter.quote('partition') }},
_airbyte_ab_id,
_airbyte_emitted_at,
{{ current_timestamp() }} as _airbyte_normalized_at
from {{ source('test_normalization', '_airbyte_raw_nested_stream_with_complex_columns_resulting_into_long_names') }} as table_alias
-- nested_stream_with_complex_columns_resulting_into_long_names
where 1 = 1
{{ incremental_clause('_airbyte_emitted_at', this) }}

View File

@@ -0,0 +1,21 @@
{{ config(
cluster_by = "_airbyte_emitted_at",
partition_by = {"field": "_airbyte_emitted_at", "data_type": "timestamp", "granularity": "day"},
unique_key = '_airbyte_ab_id',
schema = "_airbyte_test_normalization",
tags = [ "top-level-intermediate" ]
) }}
-- SQL model to cast each column to its adequate SQL type converted from the JSON schema type
-- depends_on: {{ ref('nested_stream_with_complex_columns_resulting_into_long_names_ab1') }}
select
cast(id as {{ dbt_utils.type_string() }}) as id,
cast(date as {{ dbt_utils.type_string() }}) as date,
cast({{ adapter.quote('partition') }} as {{ type_json() }}) as {{ adapter.quote('partition') }},
_airbyte_ab_id,
_airbyte_emitted_at,
{{ current_timestamp() }} as _airbyte_normalized_at
from {{ ref('nested_stream_with_complex_columns_resulting_into_long_names_ab1') }}
-- nested_stream_with_complex_columns_resulting_into_long_names
where 1 = 1
{{ incremental_clause('_airbyte_emitted_at', this) }}

View File

@@ -0,0 +1,22 @@
{{ config(
cluster_by = "_airbyte_emitted_at",
partition_by = {"field": "_airbyte_emitted_at", "data_type": "timestamp", "granularity": "day"},
schema = "_airbyte_test_normalization",
tags = [ "nested-intermediate" ]
) }}
-- SQL model to parse JSON blob stored in a single column and extract into separated field columns as described by the JSON Schema
-- depends_on: {{ ref('nested_stream_with_complex_columns_resulting_into_long_names_partition') }}
{{ unnest_cte(ref('nested_stream_with_complex_columns_resulting_into_long_names_partition'), 'partition', 'DATA') }}
select
_airbyte_partition_hashid,
{{ json_extract_scalar(unnested_column_value('DATA'), ['currency'], ['currency']) }} as currency,
_airbyte_ab_id,
_airbyte_emitted_at,
{{ current_timestamp() }} as _airbyte_normalized_at
from {{ ref('nested_stream_with_complex_columns_resulting_into_long_names_partition') }} as table_alias
-- DATA at nested_stream_with_complex_columns_resulting_into_long_names/partition/DATA
{{ cross_join_unnest('partition', 'DATA') }}
where 1 = 1
and DATA is not null
{{ incremental_clause('_airbyte_emitted_at', this) }}

View File

@@ -0,0 +1,21 @@
{{ config(
cluster_by = "_airbyte_emitted_at",
partition_by = {"field": "_airbyte_emitted_at", "data_type": "timestamp", "granularity": "day"},
schema = "_airbyte_test_normalization",
tags = [ "nested-intermediate" ]
) }}
-- SQL model to parse JSON blob stored in a single column and extract into separated field columns as described by the JSON Schema
-- depends_on: {{ ref('nested_stream_with_complex_columns_resulting_into_long_names_scd') }}
select
_airbyte_nested_stream_with_complex_columns_resulting_into_long_names_hashid,
{{ json_extract_array(adapter.quote('partition'), ['double_array_data'], ['double_array_data']) }} as double_array_data,
{{ json_extract_array(adapter.quote('partition'), ['DATA'], ['DATA']) }} as DATA,
_airbyte_ab_id,
_airbyte_emitted_at,
{{ current_timestamp() }} as _airbyte_normalized_at
from {{ ref('nested_stream_with_complex_columns_resulting_into_long_names_scd') }} as table_alias
-- partition at nested_stream_with_complex_columns_resulting_into_long_names/partition
where 1 = 1
and {{ adapter.quote('partition') }} is not null
{{ incremental_clause('_airbyte_emitted_at', this) }}

View File

@@ -0,0 +1,22 @@
{{ config(
cluster_by = "_airbyte_emitted_at",
partition_by = {"field": "_airbyte_emitted_at", "data_type": "timestamp", "granularity": "day"},
schema = "_airbyte_test_normalization",
tags = [ "nested-intermediate" ]
) }}
-- SQL model to parse JSON blob stored in a single column and extract into separated field columns as described by the JSON Schema
-- depends_on: {{ ref('nested_stream_with_complex_columns_resulting_into_long_names_partition') }}
{{ unnest_cte(ref('nested_stream_with_complex_columns_resulting_into_long_names_partition'), 'partition', 'double_array_data') }}
select
_airbyte_partition_hashid,
{{ json_extract_scalar(unnested_column_value('double_array_data'), ['id'], ['id']) }} as id,
_airbyte_ab_id,
_airbyte_emitted_at,
{{ current_timestamp() }} as _airbyte_normalized_at
from {{ ref('nested_stream_with_complex_columns_resulting_into_long_names_partition') }} as table_alias
-- double_array_data at nested_stream_with_complex_columns_resulting_into_long_names/partition/double_array_data
{{ cross_join_unnest('partition', 'double_array_data') }}
where 1 = 1
and double_array_data is not null
{{ incremental_clause('_airbyte_emitted_at', this) }}

View File

@@ -0,0 +1,164 @@
{{ config(
cluster_by = ["_airbyte_unique_key_scd","_airbyte_emitted_at"],
partition_by = {"field": "_airbyte_active_row", "data_type": "int64", "range": {"start": 0, "end": 1, "interval": 1}},
unique_key = "_airbyte_unique_key_scd",
schema = "test_normalization",
post_hook = ["
{%
set final_table_relation = adapter.get_relation(
database=this.database,
schema=this.schema,
identifier='nested_stream_with_complex_columns_resulting_into_long_names'
)
%}
{#
If the final table doesn't exist, then obviously we can't delete anything from it.
Also, after a reset, the final table is created without the _airbyte_unique_key column (this column is created during the first sync).
So skip this deletion if the column doesn't exist (in this case, the table is guaranteed to be empty anyway).
#}
{%
if final_table_relation is not none and '_airbyte_unique_key' in adapter.get_columns_in_relation(final_table_relation)|map(attribute='name')
%}
-- Delete records which are no longer active:
-- The commented-out query below is equivalent, but the left join version used here is more performant:
-- delete from final_table where unique_key in (
-- select unique_key from scd_table where 1 = 1 <incremental_clause(normalized_at, final_table)>
-- ) and unique_key not in (
-- select unique_key from scd_table where active_row = 1 <incremental_clause(normalized_at, final_table)>
-- )
-- We're incremental against normalized_at rather than emitted_at because we need to fetch the SCD
-- entries that were _updated_ recently. This is because a deleted record will have an SCD record
-- which was emitted a long time ago, but recently re-normalized to have active_row = 0.
delete from {{ final_table_relation }} final_table where final_table._airbyte_unique_key in (
select recent_records.unique_key
from (
select distinct _airbyte_unique_key as unique_key
from {{ this }}
where 1=1 {{ incremental_clause('_airbyte_normalized_at', this.schema + '.' + adapter.quote('nested_stream_with_complex_columns_resulting_into_long_names')) }}
) recent_records
left join (
select _airbyte_unique_key as unique_key, count(_airbyte_unique_key) as active_count
from {{ this }}
where _airbyte_active_row = 1 {{ incremental_clause('_airbyte_normalized_at', this.schema + '.' + adapter.quote('nested_stream_with_complex_columns_resulting_into_long_names')) }}
group by _airbyte_unique_key
) active_counts
on recent_records.unique_key = active_counts.unique_key
where active_count is null or active_count = 0
)
{% else %}
-- We have to have a non-empty query, so just do a noop delete
delete from {{ this }} where 1=0
{% endif %}
","drop view _airbyte_test_normalization.nested_stream_with_complex_columns_resulting_into_long_names_stg"],
tags = [ "top-level" ]
) }}
-- depends_on: ref('nested_stream_with_complex_columns_resulting_into_long_names_stg')
with
{% if is_incremental() %}
new_data as (
-- retrieve incremental "new" data
select
*
from {{ ref('nested_stream_with_complex_columns_resulting_into_long_names_stg') }}
-- nested_stream_with_complex_columns_resulting_into_long_names from {{ source('test_normalization', '_airbyte_raw_nested_stream_with_complex_columns_resulting_into_long_names') }}
where 1 = 1
{{ incremental_clause('_airbyte_emitted_at', this) }}
),
new_data_ids as (
-- build a subset of _airbyte_unique_key from rows that are new
select distinct
{{ dbt_utils.surrogate_key([
'id',
]) }} as _airbyte_unique_key
from new_data
),
empty_new_data as (
-- build an empty table to only keep the table's column types
select * from new_data where 1 = 0
),
previous_active_scd_data as (
-- retrieve "incomplete old" data that needs to be updated with an end date because of new changes
select
{{ star_intersect(ref('nested_stream_with_complex_columns_resulting_into_long_names_stg'), this, from_alias='inc_data', intersect_alias='this_data') }}
from {{ this }} as this_data
-- make a join with new_data using primary key to filter active data that need to be updated only
join new_data_ids on this_data._airbyte_unique_key = new_data_ids._airbyte_unique_key
-- force left join to NULL values (we just need to transfer column types only for the star_intersect macro on schema changes)
left join empty_new_data as inc_data on this_data._airbyte_ab_id = inc_data._airbyte_ab_id
where _airbyte_active_row = 1
),
input_data as (
select {{ dbt_utils.star(ref('nested_stream_with_complex_columns_resulting_into_long_names_stg')) }} from new_data
union all
select {{ dbt_utils.star(ref('nested_stream_with_complex_columns_resulting_into_long_names_stg')) }} from previous_active_scd_data
),
{% else %}
input_data as (
select *
from {{ ref('nested_stream_with_complex_columns_resulting_into_long_names_stg') }}
-- nested_stream_with_complex_columns_resulting_into_long_names from {{ source('test_normalization', '_airbyte_raw_nested_stream_with_complex_columns_resulting_into_long_names') }}
),
{% endif %}
scd_data as (
-- SQL model to build a Type 2 Slowly Changing Dimension (SCD) table for each record identified by their primary key
select
{{ dbt_utils.surrogate_key([
'id',
]) }} as _airbyte_unique_key,
id,
date,
{{ adapter.quote('partition') }},
date as _airbyte_start_at,
lag(date) over (
partition by id
order by
date is null asc,
date desc,
_airbyte_emitted_at desc
) as _airbyte_end_at,
case when row_number() over (
partition by id
order by
date is null asc,
date desc,
_airbyte_emitted_at desc
) = 1 then 1 else 0 end as _airbyte_active_row,
_airbyte_ab_id,
_airbyte_emitted_at,
_airbyte_nested_stream_with_complex_columns_resulting_into_long_names_hashid
from input_data
),
dedup_data as (
select
-- we need to ensure de-duplicated rows for merge/update queries
-- additionally, we generate a unique key for the scd table
row_number() over (
partition by
_airbyte_unique_key,
_airbyte_start_at,
_airbyte_emitted_at
order by _airbyte_active_row desc, _airbyte_ab_id
) as _airbyte_row_num,
{{ dbt_utils.surrogate_key([
'_airbyte_unique_key',
'_airbyte_start_at',
'_airbyte_emitted_at'
]) }} as _airbyte_unique_key_scd,
scd_data.*
from scd_data
)
select
_airbyte_unique_key,
_airbyte_unique_key_scd,
id,
date,
{{ adapter.quote('partition') }},
_airbyte_start_at,
_airbyte_end_at,
_airbyte_active_row,
_airbyte_ab_id,
_airbyte_emitted_at,
{{ current_timestamp() }} as _airbyte_normalized_at,
_airbyte_nested_stream_with_complex_columns_resulting_into_long_names_hashid
from dedup_data where _airbyte_row_num = 1

View File

@@ -0,0 +1,24 @@
{{ config(
cluster_by = ["_airbyte_unique_key","_airbyte_emitted_at"],
partition_by = {"field": "_airbyte_emitted_at", "data_type": "timestamp", "granularity": "day"},
unique_key = "_airbyte_unique_key",
schema = "test_normalization",
tags = [ "top-level" ]
) }}
-- Final base SQL model
-- depends_on: {{ ref('nested_stream_with_complex_columns_resulting_into_long_names_scd') }}
select
_airbyte_unique_key,
id,
date,
{{ adapter.quote('partition') }},
_airbyte_ab_id,
_airbyte_emitted_at,
{{ current_timestamp() }} as _airbyte_normalized_at,
_airbyte_nested_stream_with_complex_columns_resulting_into_long_names_hashid
from {{ ref('nested_stream_with_complex_columns_resulting_into_long_names_scd') }}
-- nested_stream_with_complex_columns_resulting_into_long_names from {{ source('test_normalization', '_airbyte_raw_nested_stream_with_complex_columns_resulting_into_long_names') }}
where 1 = 1
and _airbyte_active_row = 1
{{ incremental_clause('_airbyte_emitted_at', this) }}

View File

@@ -0,0 +1,21 @@
{{ config(
cluster_by = "_airbyte_emitted_at",
partition_by = {"field": "_airbyte_emitted_at", "data_type": "timestamp", "granularity": "day"},
schema = "test_normalization",
tags = [ "nested" ]
) }}
-- Final base SQL model
-- depends_on: {{ ref('nested_stream_with_complex_columns_resulting_into_long_names_partition_ab3') }}
select
_airbyte_nested_stream_with_complex_columns_resulting_into_long_names_hashid,
double_array_data,
DATA,
_airbyte_ab_id,
_airbyte_emitted_at,
{{ current_timestamp() }} as _airbyte_normalized_at,
_airbyte_partition_hashid
from {{ ref('nested_stream_with_complex_columns_resulting_into_long_names_partition_ab3') }}
-- partition at nested_stream_with_complex_columns_resulting_into_long_names/partition from {{ ref('nested_stream_with_complex_columns_resulting_into_long_names_scd') }}
where 1 = 1
{{ incremental_clause('_airbyte_emitted_at', this) }}

View File

@@ -0,0 +1,20 @@
{{ config(
cluster_by = "_airbyte_emitted_at",
partition_by = {"field": "_airbyte_emitted_at", "data_type": "timestamp", "granularity": "day"},
schema = "test_normalization",
tags = [ "nested" ]
) }}
-- Final base SQL model
-- depends_on: {{ ref('nested_stream_with_complex_columns_resulting_into_long_names_partition_DATA_ab3') }}
select
_airbyte_partition_hashid,
currency,
_airbyte_ab_id,
_airbyte_emitted_at,
{{ current_timestamp() }} as _airbyte_normalized_at,
_airbyte_DATA_hashid
from {{ ref('nested_stream_with_complex_columns_resulting_into_long_names_partition_DATA_ab3') }}
-- DATA at nested_stream_with_complex_columns_resulting_into_long_names/partition/DATA from {{ ref('nested_stream_with_complex_columns_resulting_into_long_names_partition') }}
where 1 = 1
{{ incremental_clause('_airbyte_emitted_at', this) }}

View File

@@ -0,0 +1,20 @@
{{ config(
cluster_by = "_airbyte_emitted_at",
partition_by = {"field": "_airbyte_emitted_at", "data_type": "timestamp", "granularity": "day"},
schema = "test_normalization",
tags = [ "nested" ]
) }}
-- Final base SQL model
-- depends_on: {{ ref('nested_stream_with_complex_columns_resulting_into_long_names_partition_double_array_data_ab3') }}
select
_airbyte_partition_hashid,
id,
_airbyte_ab_id,
_airbyte_emitted_at,
{{ current_timestamp() }} as _airbyte_normalized_at,
_airbyte_double_array_data_hashid
from {{ ref('nested_stream_with_complex_columns_resulting_into_long_names_partition_double_array_data_ab3') }}
-- double_array_data at nested_stream_with_complex_columns_resulting_into_long_names/partition/double_array_data from {{ ref('nested_stream_with_complex_columns_resulting_into_long_names_partition') }}
where 1 = 1
{{ incremental_clause('_airbyte_emitted_at', this) }}

View File

@@ -0,0 +1,23 @@
version: 2
sources:
- name: test_normalization
quoting:
database: true
schema: false
identifier: false
tables:
- name: _airbyte_raw_arrays
- name: _airbyte_raw_conflict_stream_array
- name: _airbyte_raw_conflict_stream_name
- name: _airbyte_raw_conflict_stream_scalar
- name: _airbyte_raw_nested_stream_with_complex_columns_resulting_into_long_names
- name: _airbyte_raw_non_nested_stream_without_namespace_resulting_into_long_names
- name: _airbyte_raw_some_stream_that_was_empty
- name: _airbyte_raw_unnest_alias
- name: test_normalization_namespace
quoting:
database: true
schema: false
identifier: false
tables:
- name: _airbyte_raw_simple_stream_with_namespace_resulting_into_long_names
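
Fed through the cleanup helper shown earlier, the two source names in this file expand into four schemas to drop. A small sketch of that expansion, assuming PyYAML is available (the file path is a hypothetical placeholder):

import yaml

# Hypothetical path; in the test harness the file is discovered per destination.
with open("models/sources.yml") as f:
    source_yml = yaml.safe_load(f)

schemas_to_remove = []
for source in source_yml.get("sources", []):
    target_schema = source["name"]
    # keep both the target schema and its _airbyte_* staging schema
    schemas_to_remove.append(target_schema)
    schemas_to_remove.append(f"_airbyte_{target_schema}")

# For this file, schemas_to_remove ends up as:
# ['test_normalization', '_airbyte_test_normalization',
#  'test_normalization_namespace', '_airbyte_test_normalization_namespace']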

View File

@@ -0,0 +1,27 @@
merge into `dataline-integration-testing`.test_normalization.`nested_stream_with_complex_columns_resulting_into_long_names_scd` as DBT_INTERNAL_DEST
using (
select * from `dataline-integration-testing`.test_normalization.`nested_stream_with_complex_columns_resulting_into_long_names_scd__dbt_tmp`
) as DBT_INTERNAL_SOURCE
on
DBT_INTERNAL_SOURCE._airbyte_unique_key_scd = DBT_INTERNAL_DEST._airbyte_unique_key_scd
when matched then update set
`_airbyte_unique_key` = DBT_INTERNAL_SOURCE.`_airbyte_unique_key`,`_airbyte_unique_key_scd` = DBT_INTERNAL_SOURCE.`_airbyte_unique_key_scd`,`id` = DBT_INTERNAL_SOURCE.`id`,`date` = DBT_INTERNAL_SOURCE.`date`,`partition` = DBT_INTERNAL_SOURCE.`partition`,`_airbyte_start_at` = DBT_INTERNAL_SOURCE.`_airbyte_start_at`,`_airbyte_end_at` = DBT_INTERNAL_SOURCE.`_airbyte_end_at`,`_airbyte_active_row` = DBT_INTERNAL_SOURCE.`_airbyte_active_row`,`_airbyte_ab_id` = DBT_INTERNAL_SOURCE.`_airbyte_ab_id`,`_airbyte_emitted_at` = DBT_INTERNAL_SOURCE.`_airbyte_emitted_at`,`_airbyte_normalized_at` = DBT_INTERNAL_SOURCE.`_airbyte_normalized_at`,`_airbyte_nested_stream_with_complex_columns_resulting_into_long_names_hashid` = DBT_INTERNAL_SOURCE.`_airbyte_nested_stream_with_complex_columns_resulting_into_long_names_hashid`
when not matched then insert
(`_airbyte_unique_key`, `_airbyte_unique_key_scd`, `id`, `date`, `partition`, `_airbyte_start_at`, `_airbyte_end_at`, `_airbyte_active_row`, `_airbyte_ab_id`, `_airbyte_emitted_at`, `_airbyte_normalized_at`, `_airbyte_nested_stream_with_complex_columns_resulting_into_long_names_hashid`)
values
(`_airbyte_unique_key`, `_airbyte_unique_key_scd`, `id`, `date`, `partition`, `_airbyte_start_at`, `_airbyte_end_at`, `_airbyte_active_row`, `_airbyte_ab_id`, `_airbyte_emitted_at`, `_airbyte_normalized_at`, `_airbyte_nested_stream_with_complex_columns_resulting_into_long_names_hashid`)

View File

@@ -0,0 +1,27 @@
merge into `dataline-integration-testing`.test_normalization.`nested_stream_with_complex_columns_resulting_into_long_names` as DBT_INTERNAL_DEST
using (
select * from `dataline-integration-testing`.test_normalization.`nested_stream_with_complex_columns_resulting_into_long_names__dbt_tmp`
) as DBT_INTERNAL_SOURCE
on
DBT_INTERNAL_SOURCE._airbyte_unique_key = DBT_INTERNAL_DEST._airbyte_unique_key
when matched then update set
`_airbyte_unique_key` = DBT_INTERNAL_SOURCE.`_airbyte_unique_key`,`id` = DBT_INTERNAL_SOURCE.`id`,`date` = DBT_INTERNAL_SOURCE.`date`,`partition` = DBT_INTERNAL_SOURCE.`partition`,`_airbyte_ab_id` = DBT_INTERNAL_SOURCE.`_airbyte_ab_id`,`_airbyte_emitted_at` = DBT_INTERNAL_SOURCE.`_airbyte_emitted_at`,`_airbyte_normalized_at` = DBT_INTERNAL_SOURCE.`_airbyte_normalized_at`,`_airbyte_nested_stream_with_complex_columns_resulting_into_long_names_hashid` = DBT_INTERNAL_SOURCE.`_airbyte_nested_stream_with_complex_columns_resulting_into_long_names_hashid`
when not matched then insert
(`_airbyte_unique_key`, `id`, `date`, `partition`, `_airbyte_ab_id`, `_airbyte_emitted_at`, `_airbyte_normalized_at`, `_airbyte_nested_stream_with_complex_columns_resulting_into_long_names_hashid`)
values
(`_airbyte_unique_key`, `id`, `date`, `partition`, `_airbyte_ab_id`, `_airbyte_emitted_at`, `_airbyte_normalized_at`, `_airbyte_nested_stream_with_complex_columns_resulting_into_long_names_hashid`)

View File

@@ -0,0 +1,21 @@
merge into `dataline-integration-testing`.test_normalization.`nested_stream_with_complex_columns_resulting_into_long_names_partition` as DBT_INTERNAL_DEST
using (
select * from `dataline-integration-testing`.test_normalization.`nested_stream_with_complex_columns_resulting_into_long_names_partition__dbt_tmp`
) as DBT_INTERNAL_SOURCE
on FALSE
when not matched then insert
(`_airbyte_nested_stream_with_complex_columns_resulting_into_long_names_hashid`, `double_array_data`, `DATA`, `_airbyte_ab_id`, `_airbyte_emitted_at`, `_airbyte_normalized_at`, `_airbyte_partition_hashid`)
values
(`_airbyte_nested_stream_with_complex_columns_resulting_into_long_names_hashid`, `double_array_data`, `DATA`, `_airbyte_ab_id`, `_airbyte_emitted_at`, `_airbyte_normalized_at`, `_airbyte_partition_hashid`)

View File

@@ -0,0 +1,21 @@
merge into `dataline-integration-testing`.test_normalization.`nested_stream_with_complex_columns_resulting_into_long_names_partition_DATA` as DBT_INTERNAL_DEST
using (
select * from `dataline-integration-testing`.test_normalization.`nested_stream_with_complex_columns_resulting_into_long_names_partition_DATA__dbt_tmp`
) as DBT_INTERNAL_SOURCE
on FALSE
when not matched then insert
(`_airbyte_partition_hashid`, `currency`, `_airbyte_ab_id`, `_airbyte_emitted_at`, `_airbyte_normalized_at`, `_airbyte_DATA_hashid`)
values
(`_airbyte_partition_hashid`, `currency`, `_airbyte_ab_id`, `_airbyte_emitted_at`, `_airbyte_normalized_at`, `_airbyte_DATA_hashid`)

View File

@@ -0,0 +1,21 @@
merge into `dataline-integration-testing`.test_normalization.`nested_stream_with_complex_columns_resulting_into_long_names_partition_double_array_data` as DBT_INTERNAL_DEST
using (
select * from `dataline-integration-testing`.test_normalization.`nested_stream_with_complex_columns_resulting_into_long_names_partition_double_array_data__dbt_tmp`
) as DBT_INTERNAL_SOURCE
on FALSE
when not matched then insert
(`_airbyte_partition_hashid`, `id`, `_airbyte_ab_id`, `_airbyte_emitted_at`, `_airbyte_normalized_at`, `_airbyte_double_array_data_hashid`)
values
(`_airbyte_partition_hashid`, `id`, `_airbyte_ab_id`, `_airbyte_emitted_at`, `_airbyte_normalized_at`, `_airbyte_double_array_data_hashid`)

View File

@@ -0,0 +1,70 @@
name: airbyte_utils
version: '1.0'
config-version: 2
profile: normalize
model-paths:
- modified_models
docs-paths:
- docs
analysis-paths:
- analysis
test-paths:
- tests
seed-paths:
- data
macro-paths:
- macros
target-path: ../build
log-path: ../logs
packages-install-path: /dbt
clean-targets:
- build
- dbt_modules
quoting:
database: true
schema: false
identifier: true
models:
airbyte_utils:
+materialized: table
generated:
airbyte_ctes:
+tags: airbyte_internal_cte
+materialized: ephemeral
airbyte_incremental:
+tags: incremental_tables
+materialized: incremental
+on_schema_change: sync_all_columns
airbyte_tables:
+tags: normalized_tables
+materialized: table
airbyte_views:
+tags: airbyte_internal_views
+materialized: view
dispatch:
- macro_namespace: dbt_utils
search_order:
- airbyte_utils
- dbt_utils
vars:
json_column: _airbyte_data
models_to_source:
exchange_rate_ab1: test_normalization._airbyte_raw_exchange_rate
exchange_rate_ab2: test_normalization._airbyte_raw_exchange_rate
exchange_rate_ab3: test_normalization._airbyte_raw_exchange_rate
exchange_rate: test_normalization._airbyte_raw_exchange_rate
dedup_exchange_rate_ab1: test_normalization._airbyte_raw_dedup_exchange_rate
dedup_exchange_rate_ab2: test_normalization._airbyte_raw_dedup_exchange_rate
dedup_exchange_rate_stg: test_normalization._airbyte_raw_dedup_exchange_rate
dedup_exchange_rate_scd: test_normalization._airbyte_raw_dedup_exchange_rate
dedup_exchange_rate: test_normalization._airbyte_raw_dedup_exchange_rate
renamed_dedup_cdc_excluded_ab1: test_normalization._airbyte_raw_renamed_dedup_cdc_excluded
renamed_dedup_cdc_excluded_ab2: test_normalization._airbyte_raw_renamed_dedup_cdc_excluded
renamed_dedup_cdc_excluded_stg: test_normalization._airbyte_raw_renamed_dedup_cdc_excluded
renamed_dedup_cdc_excluded_scd: test_normalization._airbyte_raw_renamed_dedup_cdc_excluded
renamed_dedup_cdc_excluded: test_normalization._airbyte_raw_renamed_dedup_cdc_excluded
dedup_cdc_excluded_ab1: test_normalization._airbyte_raw_dedup_cdc_excluded
dedup_cdc_excluded_ab2: test_normalization._airbyte_raw_dedup_cdc_excluded
dedup_cdc_excluded_stg: test_normalization._airbyte_raw_dedup_cdc_excluded
dedup_cdc_excluded_scd: test_normalization._airbyte_raw_dedup_cdc_excluded
dedup_cdc_excluded: test_normalization._airbyte_raw_dedup_cdc_excluded

View File

@@ -0,0 +1,90 @@
name: airbyte_utils
version: '1.0'
config-version: 2
profile: normalize
model-paths:
- models
docs-paths:
- docs
analysis-paths:
- analysis
test-paths:
- tests
seed-paths:
- data
macro-paths:
- macros
target-path: ../build
log-path: ../logs
packages-install-path: /dbt
clean-targets:
- build
- dbt_modules
quoting:
database: true
schema: false
identifier: true
models:
airbyte_utils:
+materialized: table
generated:
airbyte_ctes:
+tags: airbyte_internal_cte
+materialized: ephemeral
airbyte_incremental:
+tags: incremental_tables
+materialized: incremental
+on_schema_change: sync_all_columns
airbyte_tables:
+tags: normalized_tables
+materialized: table
airbyte_views:
+tags: airbyte_internal_views
+materialized: view
dispatch:
- macro_namespace: dbt_utils
search_order:
- airbyte_utils
- dbt_utils
vars:
json_column: _airbyte_data
models_to_source:
exchange_rate_ab1: test_normalization._airbyte_raw_exchange_rate
exchange_rate_ab2: test_normalization._airbyte_raw_exchange_rate
exchange_rate_ab3: test_normalization._airbyte_raw_exchange_rate
exchange_rate: test_normalization._airbyte_raw_exchange_rate
dedup_exchange_rate_ab1: test_normalization._airbyte_raw_dedup_exchange_rate
dedup_exchange_rate_ab2: test_normalization._airbyte_raw_dedup_exchange_rate
dedup_exchange_rate_stg: test_normalization._airbyte_raw_dedup_exchange_rate
dedup_exchange_rate_scd: test_normalization._airbyte_raw_dedup_exchange_rate
dedup_exchange_rate: test_normalization._airbyte_raw_dedup_exchange_rate
renamed_dedup_cdc_excluded_ab1: test_normalization._airbyte_raw_renamed_dedup_cdc_excluded
renamed_dedup_cdc_excluded_ab2: test_normalization._airbyte_raw_renamed_dedup_cdc_excluded
renamed_dedup_cdc_excluded_stg: test_normalization._airbyte_raw_renamed_dedup_cdc_excluded
renamed_dedup_cdc_excluded_scd: test_normalization._airbyte_raw_renamed_dedup_cdc_excluded
renamed_dedup_cdc_excluded: test_normalization._airbyte_raw_renamed_dedup_cdc_excluded
dedup_cdc_excluded_ab1: test_normalization._airbyte_raw_dedup_cdc_excluded
dedup_cdc_excluded_ab2: test_normalization._airbyte_raw_dedup_cdc_excluded
dedup_cdc_excluded_stg: test_normalization._airbyte_raw_dedup_cdc_excluded
dedup_cdc_excluded_scd: test_normalization._airbyte_raw_dedup_cdc_excluded
dedup_cdc_excluded: test_normalization._airbyte_raw_dedup_cdc_excluded
pos_dedup_cdcx_ab1: test_normalization._airbyte_raw_pos_dedup_cdcx
pos_dedup_cdcx_ab2: test_normalization._airbyte_raw_pos_dedup_cdcx
pos_dedup_cdcx_stg: test_normalization._airbyte_raw_pos_dedup_cdcx
pos_dedup_cdcx_scd: test_normalization._airbyte_raw_pos_dedup_cdcx
pos_dedup_cdcx: test_normalization._airbyte_raw_pos_dedup_cdcx
1_prefix_startwith_number_ab1: test_normalization._airbyte_raw_1_prefix_startwith_number
1_prefix_startwith_number_ab2: test_normalization._airbyte_raw_1_prefix_startwith_number
1_prefix_startwith_number_stg: test_normalization._airbyte_raw_1_prefix_startwith_number
1_prefix_startwith_number_scd: test_normalization._airbyte_raw_1_prefix_startwith_number
1_prefix_startwith_number: test_normalization._airbyte_raw_1_prefix_startwith_number
multiple_column_names_conflicts_ab1: test_normalization._airbyte_raw_multiple_column_names_conflicts
multiple_column_names_conflicts_ab2: test_normalization._airbyte_raw_multiple_column_names_conflicts
multiple_column_names_conflicts_stg: test_normalization._airbyte_raw_multiple_column_names_conflicts
multiple_column_names_conflicts_scd: test_normalization._airbyte_raw_multiple_column_names_conflicts
multiple_column_names_conflicts: test_normalization._airbyte_raw_multiple_column_names_conflicts
types_testing_ab1: test_normalization._airbyte_raw_types_testing
types_testing_ab2: test_normalization._airbyte_raw_types_testing
types_testing_stg: test_normalization._airbyte_raw_types_testing
types_testing_scd: test_normalization._airbyte_raw_types_testing
types_testing: test_normalization._airbyte_raw_types_testing

View File

@@ -0,0 +1,108 @@
create or replace table `dataline-integration-testing`.test_normalization.`dedup_exchange_rate_scd`
partition by range_bucket(
_airbyte_active_row,
generate_array(0, 1, 1)
)
cluster by _airbyte_unique_key_scd, _airbyte_emitted_at
OPTIONS()
as (
-- depends_on: ref('dedup_exchange_rate_stg')
with
input_data as (
select *
from `dataline-integration-testing`._airbyte_test_normalization.`dedup_exchange_rate_stg`
-- dedup_exchange_rate from `dataline-integration-testing`.test_normalization._airbyte_raw_dedup_exchange_rate
),
scd_data as (
-- SQL model to build a Type 2 Slowly Changing Dimension (SCD) table for each record identified by their primary key
select
to_hex(md5(cast(concat(coalesce(cast(id as
string
), ''), '-', coalesce(cast(currency as
string
), ''), '-', coalesce(cast(NZD as
string
), '')) as
string
))) as _airbyte_unique_key,
id,
currency,
date,
timestamp_col,
HKD_special___characters,
HKD_special___characters_1,
NZD,
USD,
date as _airbyte_start_at,
lag(date) over (
partition by id, currency, cast(NZD as
string
)
order by
date is null asc,
date desc,
_airbyte_emitted_at desc
) as _airbyte_end_at,
case when row_number() over (
partition by id, currency, cast(NZD as
string
)
order by
date is null asc,
date desc,
_airbyte_emitted_at desc
) = 1 then 1 else 0 end as _airbyte_active_row,
_airbyte_ab_id,
_airbyte_emitted_at,
_airbyte_dedup_exchange_rate_hashid
from input_data
),
dedup_data as (
select
-- we need to ensure de-duplicated rows for merge/update queries
-- additionally, we generate a unique key for the scd table
row_number() over (
partition by
_airbyte_unique_key,
_airbyte_start_at,
_airbyte_emitted_at
order by _airbyte_active_row desc, _airbyte_ab_id
) as _airbyte_row_num,
to_hex(md5(cast(concat(coalesce(cast(_airbyte_unique_key as
string
), ''), '-', coalesce(cast(_airbyte_start_at as
string
), ''), '-', coalesce(cast(_airbyte_emitted_at as
string
), '')) as
string
))) as _airbyte_unique_key_scd,
scd_data.*
from scd_data
)
select
_airbyte_unique_key,
_airbyte_unique_key_scd,
id,
currency,
date,
timestamp_col,
HKD_special___characters,
HKD_special___characters_1,
NZD,
USD,
_airbyte_start_at,
_airbyte_end_at,
_airbyte_active_row,
_airbyte_ab_id,
_airbyte_emitted_at,
CURRENT_TIMESTAMP() as _airbyte_normalized_at,
_airbyte_dedup_exchange_rate_hashid
from dedup_data where _airbyte_row_num = 1
);

View File

@@ -0,0 +1,31 @@
create or replace table `dataline-integration-testing`.test_normalization.`dedup_exchange_rate`
partition by timestamp_trunc(_airbyte_emitted_at, day)
cluster by _airbyte_unique_key, _airbyte_emitted_at
OPTIONS()
as (
-- Final base SQL model
-- depends_on: `dataline-integration-testing`.test_normalization.`dedup_exchange_rate_scd`
select
_airbyte_unique_key,
id,
currency,
date,
timestamp_col,
HKD_special___characters,
HKD_special___characters_1,
NZD,
USD,
_airbyte_ab_id,
_airbyte_emitted_at,
CURRENT_TIMESTAMP() as _airbyte_normalized_at,
_airbyte_dedup_exchange_rate_hashid
from `dataline-integration-testing`.test_normalization.`dedup_exchange_rate_scd`
-- dedup_exchange_rate from `dataline-integration-testing`.test_normalization._airbyte_raw_dedup_exchange_rate
where 1 = 1
and _airbyte_active_row = 1
);

View File

@@ -0,0 +1,145 @@
create or replace table `dataline-integration-testing`.test_normalization.`exchange_rate`
partition by timestamp_trunc(_airbyte_emitted_at, day)
cluster by _airbyte_emitted_at
OPTIONS()
as (
with __dbt__cte__exchange_rate_ab1 as (
-- SQL model to parse JSON blob stored in a single column and extract into separated field columns as described by the JSON Schema
-- depends_on: `dataline-integration-testing`.test_normalization._airbyte_raw_exchange_rate
select
json_extract_scalar(_airbyte_data, "$['id']") as id,
json_extract_scalar(_airbyte_data, "$['currency']") as currency,
json_extract_scalar(_airbyte_data, "$['date']") as date,
json_extract_scalar(_airbyte_data, "$['timestamp_col']") as timestamp_col,
json_extract_scalar(_airbyte_data, "$['HKD@spéçiäl & characters']") as HKD_special___characters,
json_extract_scalar(_airbyte_data, "$['HKD_special___characters']") as HKD_special___characters_1,
json_extract_scalar(_airbyte_data, "$['NZD']") as NZD,
json_extract_scalar(_airbyte_data, "$['USD']") as USD,
json_extract_scalar(_airbyte_data, "$['column___with__quotes']") as column___with__quotes,
json_extract_scalar(_airbyte_data, "$['datetime_tz']") as datetime_tz,
json_extract_scalar(_airbyte_data, "$['datetime_no_tz']") as datetime_no_tz,
json_extract_scalar(_airbyte_data, "$['time_tz']") as time_tz,
json_extract_scalar(_airbyte_data, "$['time_no_tz']") as time_no_tz,
_airbyte_ab_id,
_airbyte_emitted_at,
CURRENT_TIMESTAMP() as _airbyte_normalized_at
from `dataline-integration-testing`.test_normalization._airbyte_raw_exchange_rate as table_alias
-- exchange_rate
where 1 = 1
), __dbt__cte__exchange_rate_ab2 as (
-- SQL model to cast each column to its adequate SQL type converted from the JSON schema type
-- depends_on: __dbt__cte__exchange_rate_ab1
select
cast(id as
int64
) as id,
cast(currency as
string
) as currency,
cast(nullif(date, '') as
date
) as date,
cast(nullif(timestamp_col, '') as
timestamp
) as timestamp_col,
cast(HKD_special___characters as
float64
) as HKD_special___characters,
cast(HKD_special___characters_1 as
string
) as HKD_special___characters_1,
cast(NZD as
float64
) as NZD,
cast(USD as
float64
) as USD,
cast(column___with__quotes as
string
) as column___with__quotes,
cast(nullif(datetime_tz, '') as
timestamp
) as datetime_tz,
cast(nullif(datetime_no_tz, '') as
datetime
) as datetime_no_tz,
cast(nullif(time_tz, '') as
STRING
) as time_tz,
cast(nullif(time_no_tz, '') as
time
) as time_no_tz,
_airbyte_ab_id,
_airbyte_emitted_at,
CURRENT_TIMESTAMP() as _airbyte_normalized_at
from __dbt__cte__exchange_rate_ab1
-- exchange_rate
where 1 = 1
), __dbt__cte__exchange_rate_ab3 as (
-- SQL model to build a hash column based on the values of this record
-- depends_on: __dbt__cte__exchange_rate_ab2
select
to_hex(md5(cast(concat(coalesce(cast(id as
string
), ''), '-', coalesce(cast(currency as
string
), ''), '-', coalesce(cast(date as
string
), ''), '-', coalesce(cast(timestamp_col as
string
), ''), '-', coalesce(cast(HKD_special___characters as
string
), ''), '-', coalesce(cast(HKD_special___characters_1 as
string
), ''), '-', coalesce(cast(NZD as
string
), ''), '-', coalesce(cast(USD as
string
), ''), '-', coalesce(cast(column___with__quotes as
string
), ''), '-', coalesce(cast(datetime_tz as
string
), ''), '-', coalesce(cast(datetime_no_tz as
string
), ''), '-', coalesce(cast(time_tz as
string
), ''), '-', coalesce(cast(time_no_tz as
string
), '')) as
string
))) as _airbyte_exchange_rate_hashid,
tmp.*
from __dbt__cte__exchange_rate_ab2 tmp
-- exchange_rate
where 1 = 1
)-- Final base SQL model
-- depends_on: __dbt__cte__exchange_rate_ab3
select
id,
currency,
date,
timestamp_col,
HKD_special___characters,
HKD_special___characters_1,
NZD,
USD,
column___with__quotes,
datetime_tz,
datetime_no_tz,
time_tz,
time_no_tz,
_airbyte_ab_id,
_airbyte_emitted_at,
CURRENT_TIMESTAMP() as _airbyte_normalized_at,
_airbyte_exchange_rate_hashid
from __dbt__cte__exchange_rate_ab3
-- exchange_rate from `dataline-integration-testing`.test_normalization._airbyte_raw_exchange_rate
where 1 = 1
);

View File

@@ -0,0 +1,89 @@
create or replace view `dataline-integration-testing`._airbyte_test_normalization.`dedup_exchange_rate_stg`
OPTIONS()
as
with __dbt__cte__dedup_exchange_rate_ab1 as (
-- SQL model to parse JSON blob stored in a single column and extract into separated field columns as described by the JSON Schema
-- depends_on: `dataline-integration-testing`.test_normalization._airbyte_raw_dedup_exchange_rate
select
json_extract_scalar(_airbyte_data, "$['id']") as id,
json_extract_scalar(_airbyte_data, "$['currency']") as currency,
json_extract_scalar(_airbyte_data, "$['date']") as date,
json_extract_scalar(_airbyte_data, "$['timestamp_col']") as timestamp_col,
json_extract_scalar(_airbyte_data, "$['HKD@spéçiäl & characters']") as HKD_special___characters,
json_extract_scalar(_airbyte_data, "$['HKD_special___characters']") as HKD_special___characters_1,
json_extract_scalar(_airbyte_data, "$['NZD']") as NZD,
json_extract_scalar(_airbyte_data, "$['USD']") as USD,
_airbyte_ab_id,
_airbyte_emitted_at,
CURRENT_TIMESTAMP() as _airbyte_normalized_at
from `dataline-integration-testing`.test_normalization._airbyte_raw_dedup_exchange_rate as table_alias
-- dedup_exchange_rate
where 1 = 1
), __dbt__cte__dedup_exchange_rate_ab2 as (
-- SQL model to cast each column to its adequate SQL type converted from the JSON schema type
-- depends_on: __dbt__cte__dedup_exchange_rate_ab1
select
cast(id as
int64
) as id,
cast(currency as
string
) as currency,
cast(nullif(date, '') as
date
) as date,
cast(nullif(timestamp_col, '') as
timestamp
) as timestamp_col,
cast(HKD_special___characters as
float64
) as HKD_special___characters,
cast(HKD_special___characters_1 as
string
) as HKD_special___characters_1,
cast(NZD as
float64
) as NZD,
cast(USD as
float64
) as USD,
_airbyte_ab_id,
_airbyte_emitted_at,
CURRENT_TIMESTAMP() as _airbyte_normalized_at
from __dbt__cte__dedup_exchange_rate_ab1
-- dedup_exchange_rate
where 1 = 1
)-- SQL model to build a hash column based on the values of this record
-- depends_on: __dbt__cte__dedup_exchange_rate_ab2
select
to_hex(md5(cast(concat(coalesce(cast(id as
string
), ''), '-', coalesce(cast(currency as
string
), ''), '-', coalesce(cast(date as
string
), ''), '-', coalesce(cast(timestamp_col as
string
), ''), '-', coalesce(cast(HKD_special___characters as
string
), ''), '-', coalesce(cast(HKD_special___characters_1 as
string
), ''), '-', coalesce(cast(NZD as
string
), ''), '-', coalesce(cast(USD as
string
), '')) as
string
))) as _airbyte_dedup_exchange_rate_hashid,
tmp.*
from __dbt__cte__dedup_exchange_rate_ab2 tmp
-- dedup_exchange_rate
where 1 = 1
;


@@ -0,0 +1,83 @@
create or replace view `dataline-integration-testing`._airbyte_test_normalization.`multiple_column_names_conflicts_stg`
OPTIONS()
as
with __dbt__cte__multiple_column_names_conflicts_ab1 as (
-- SQL model to parse JSON blob stored in a single column and extract into separated field columns as described by the JSON Schema
-- depends_on: `dataline-integration-testing`.test_normalization._airbyte_raw_multiple_column_names_conflicts
select
json_extract_scalar(_airbyte_data, "$['id']") as id,
json_extract_scalar(_airbyte_data, "$['User Id']") as User_Id,
json_extract_scalar(_airbyte_data, "$['user_id']") as user_id_1,
json_extract_scalar(_airbyte_data, "$['User id']") as User_id_2,
json_extract_scalar(_airbyte_data, "$['user id']") as user_id_3,
json_extract_scalar(_airbyte_data, "$['User@Id']") as User_Id_4,
json_extract_scalar(_airbyte_data, "$['UserId']") as UserId,
_airbyte_ab_id,
_airbyte_emitted_at,
CURRENT_TIMESTAMP() as _airbyte_normalized_at
from `dataline-integration-testing`.test_normalization._airbyte_raw_multiple_column_names_conflicts as table_alias
-- multiple_column_names_conflicts
where 1 = 1
), __dbt__cte__multiple_column_names_conflicts_ab2 as (
-- SQL model to cast each column to its adequate SQL type converted from the JSON schema type
-- depends_on: __dbt__cte__multiple_column_names_conflicts_ab1
select
cast(id as
int64
) as id,
cast(User_Id as
string
) as User_Id,
cast(user_id_1 as
float64
) as user_id_1,
cast(User_id_2 as
float64
) as User_id_2,
cast(user_id_3 as
float64
) as user_id_3,
cast(User_Id_4 as
string
) as User_Id_4,
cast(UserId as
float64
) as UserId,
_airbyte_ab_id,
_airbyte_emitted_at,
CURRENT_TIMESTAMP() as _airbyte_normalized_at
from __dbt__cte__multiple_column_names_conflicts_ab1
-- multiple_column_names_conflicts
where 1 = 1
)-- SQL model to build a hash column based on the values of this record
-- depends_on: __dbt__cte__multiple_column_names_conflicts_ab2
select
to_hex(md5(cast(concat(coalesce(cast(id as
string
), ''), '-', coalesce(cast(User_Id as
string
), ''), '-', coalesce(cast(user_id_1 as
string
), ''), '-', coalesce(cast(User_id_2 as
string
), ''), '-', coalesce(cast(user_id_3 as
string
), ''), '-', coalesce(cast(User_Id_4 as
string
), ''), '-', coalesce(cast(UserId as
string
), '')) as
string
))) as _airbyte_multiple_column_names_conflicts_hashid,
tmp.*
from __dbt__cte__multiple_column_names_conflicts_ab2 tmp
-- multiple_column_names_conflicts
where 1 = 1
;


@@ -0,0 +1,26 @@
{{ config(
cluster_by = "_airbyte_emitted_at",
partition_by = {"field": "_airbyte_emitted_at", "data_type": "timestamp", "granularity": "day"},
unique_key = '_airbyte_ab_id',
schema = "_airbyte_test_normalization",
tags = [ "top-level-intermediate" ]
) }}
-- SQL model to parse JSON blob stored in a single column and extract into separated field columns as described by the JSON Schema
-- depends_on: {{ source('test_normalization', '_airbyte_raw_dedup_exchange_rate') }}
select
{{ json_extract_scalar('_airbyte_data', ['id'], ['id']) }} as id,
{{ json_extract_scalar('_airbyte_data', ['currency'], ['currency']) }} as currency,
{{ json_extract_scalar('_airbyte_data', ['date'], ['date']) }} as date,
{{ json_extract_scalar('_airbyte_data', ['timestamp_col'], ['timestamp_col']) }} as timestamp_col,
{{ json_extract_scalar('_airbyte_data', ['HKD@spéçiäl & characters'], ['HKD@spéçiäl & characters']) }} as HKD_special___characters,
{{ json_extract_scalar('_airbyte_data', ['HKD_special___characters'], ['HKD_special___characters']) }} as HKD_special___characters_1,
{{ json_extract_scalar('_airbyte_data', ['NZD'], ['NZD']) }} as NZD,
{{ json_extract_scalar('_airbyte_data', ['USD'], ['USD']) }} as USD,
_airbyte_ab_id,
_airbyte_emitted_at,
{{ current_timestamp() }} as _airbyte_normalized_at
from {{ source('test_normalization', '_airbyte_raw_dedup_exchange_rate') }} as table_alias
-- dedup_exchange_rate
where 1 = 1
{{ incremental_clause('_airbyte_emitted_at', this) }}
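Note: the json_extract_scalar macro used above resolves, in the compiled BigQuery output earlier in this diff, to a plain JSON-path extraction over the raw _airbyte_data column. A minimal, self-contained sketch of that compiled shape (the raw_data CTE and its literal values are illustrative stand-ins for an _airbyte_raw_* source table):

-- hypothetical one-row stand-in for an _airbyte_raw_* table
with raw_data as (
  select to_json_string(struct(1 as id, 'USD' as currency)) as _airbyte_data
)
select
  json_extract_scalar(_airbyte_data, "$['id']") as id,
  json_extract_scalar(_airbyte_data, "$['currency']") as currency
from raw_data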


@@ -0,0 +1,26 @@
{{ config(
cluster_by = "_airbyte_emitted_at",
partition_by = {"field": "_airbyte_emitted_at", "data_type": "timestamp", "granularity": "day"},
unique_key = '_airbyte_ab_id',
schema = "_airbyte_test_normalization",
tags = [ "top-level-intermediate" ]
) }}
-- SQL model to cast each column to its adequate SQL type converted from the JSON schema type
-- depends_on: {{ ref('dedup_exchange_rate_ab1') }}
select
cast(id as {{ dbt_utils.type_bigint() }}) as id,
cast(currency as {{ dbt_utils.type_string() }}) as currency,
cast({{ empty_string_to_null('date') }} as {{ type_date() }}) as date,
cast({{ empty_string_to_null('timestamp_col') }} as {{ type_timestamp_with_timezone() }}) as timestamp_col,
cast(HKD_special___characters as {{ dbt_utils.type_float() }}) as HKD_special___characters,
cast(HKD_special___characters_1 as {{ dbt_utils.type_string() }}) as HKD_special___characters_1,
cast(NZD as {{ dbt_utils.type_float() }}) as NZD,
cast(USD as {{ dbt_utils.type_float() }}) as USD,
_airbyte_ab_id,
_airbyte_emitted_at,
{{ current_timestamp() }} as _airbyte_normalized_at
from {{ ref('dedup_exchange_rate_ab1') }}
-- dedup_exchange_rate
where 1 = 1
{{ incremental_clause('_airbyte_emitted_at', this) }}
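Note: on BigQuery, the type macros in this model (dbt_utils.type_bigint, type_date, type_timestamp_with_timezone, empty_string_to_null) resolve to the int64/date/timestamp casts and nullif guards visible in the compiled output earlier in this diff. A condensed, self-contained sketch of that compiled shape (the src CTE and its literal values are illustrative only):

with src as (
  -- illustrative raw values; the _ab1 model extracts everything as strings
  select '5' as id, 'USD' as currency, '' as date, '2021-01-01 00:00:00' as timestamp_col
)
select
  cast(id as int64) as id,
  cast(currency as string) as currency,
  cast(nullif(date, '') as date) as date,                         -- empty strings become NULL dates
  cast(nullif(timestamp_col, '') as timestamp) as timestamp_col   -- empty strings become NULL timestamps
from src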


@@ -0,0 +1,178 @@
{{ config(
cluster_by = ["_airbyte_unique_key_scd","_airbyte_emitted_at"],
partition_by = {"field": "_airbyte_active_row", "data_type": "int64", "range": {"start": 0, "end": 1, "interval": 1}},
unique_key = "_airbyte_unique_key_scd",
schema = "test_normalization",
post_hook = ["
{%
set final_table_relation = adapter.get_relation(
database=this.database,
schema=this.schema,
identifier='dedup_exchange_rate'
)
%}
{#
If the final table doesn't exist, then obviously we can't delete anything from it.
Also, after a reset, the final table is created without the _airbyte_unique_key column (this column is created during the first sync)
So skip this deletion if the column doesn't exist. (in this case, the table is guaranteed to be empty anyway)
#}
{%
if final_table_relation is not none and '_airbyte_unique_key' in adapter.get_columns_in_relation(final_table_relation)|map(attribute='name')
%}
-- Delete records which are no longer active:
-- This query is equivalent, but the left join version is more performant:
-- delete from final_table where unique_key in (
-- select unique_key from scd_table where 1 = 1 <incremental_clause(normalized_at, final_table)>
-- ) and unique_key not in (
-- select unique_key from scd_table where active_row = 1 <incremental_clause(normalized_at, final_table)>
-- )
-- We're incremental against normalized_at rather than emitted_at because we need to fetch the SCD
-- entries that were _updated_ recently. This is because a deleted record will have an SCD record
-- which was emitted a long time ago, but recently re-normalized to have active_row = 0.
delete from {{ final_table_relation }} final_table where final_table._airbyte_unique_key in (
select recent_records.unique_key
from (
select distinct _airbyte_unique_key as unique_key
from {{ this }}
where 1=1 {{ incremental_clause('_airbyte_normalized_at', this.schema + '.' + adapter.quote('dedup_exchange_rate')) }}
) recent_records
left join (
select _airbyte_unique_key as unique_key, count(_airbyte_unique_key) as active_count
from {{ this }}
where _airbyte_active_row = 1 {{ incremental_clause('_airbyte_normalized_at', this.schema + '.' + adapter.quote('dedup_exchange_rate')) }}
group by _airbyte_unique_key
) active_counts
on recent_records.unique_key = active_counts.unique_key
where active_count is null or active_count = 0
)
{% else %}
-- We have to have a non-empty query, so just do a noop delete
delete from {{ this }} where 1=0
{% endif %}
","drop view _airbyte_test_normalization.dedup_exchange_rate_stg"],
tags = [ "top-level" ]
) }}
-- depends_on: ref('dedup_exchange_rate_stg')
with
{% if is_incremental() %}
new_data as (
-- retrieve incremental "new" data
select
*
from {{ ref('dedup_exchange_rate_stg') }}
-- dedup_exchange_rate from {{ source('test_normalization', '_airbyte_raw_dedup_exchange_rate') }}
where 1 = 1
{{ incremental_clause('_airbyte_emitted_at', this) }}
),
new_data_ids as (
-- build a subset of _airbyte_unique_key from rows that are new
select distinct
{{ dbt_utils.surrogate_key([
'id',
'currency',
'NZD',
]) }} as _airbyte_unique_key
from new_data
),
empty_new_data as (
-- build an empty table to only keep the table's column types
select * from new_data where 1 = 0
),
previous_active_scd_data as (
-- retrieve "incomplete old" data that needs to be updated with an end date because of new changes
select
{{ star_intersect(ref('dedup_exchange_rate_stg'), this, from_alias='inc_data', intersect_alias='this_data') }}
from {{ this }} as this_data
-- make a join with new_data using primary key to filter active data that need to be updated only
join new_data_ids on this_data._airbyte_unique_key = new_data_ids._airbyte_unique_key
-- force left join to NULL values (we just need to transfer column types only for the star_intersect macro on schema changes)
left join empty_new_data as inc_data on this_data._airbyte_ab_id = inc_data._airbyte_ab_id
where _airbyte_active_row = 1
),
input_data as (
select {{ dbt_utils.star(ref('dedup_exchange_rate_stg')) }} from new_data
union all
select {{ dbt_utils.star(ref('dedup_exchange_rate_stg')) }} from previous_active_scd_data
),
{% else %}
input_data as (
select *
from {{ ref('dedup_exchange_rate_stg') }}
-- dedup_exchange_rate from {{ source('test_normalization', '_airbyte_raw_dedup_exchange_rate') }}
),
{% endif %}
scd_data as (
-- SQL model to build a Type 2 Slowly Changing Dimension (SCD) table for each record identified by their primary key
select
{{ dbt_utils.surrogate_key([
'id',
'currency',
'NZD',
]) }} as _airbyte_unique_key,
id,
currency,
date,
timestamp_col,
HKD_special___characters,
HKD_special___characters_1,
NZD,
USD,
date as _airbyte_start_at,
lag(date) over (
partition by id, currency, cast(NZD as {{ dbt_utils.type_string() }})
order by
date is null asc,
date desc,
_airbyte_emitted_at desc
) as _airbyte_end_at,
case when row_number() over (
partition by id, currency, cast(NZD as {{ dbt_utils.type_string() }})
order by
date is null asc,
date desc,
_airbyte_emitted_at desc
) = 1 then 1 else 0 end as _airbyte_active_row,
_airbyte_ab_id,
_airbyte_emitted_at,
_airbyte_dedup_exchange_rate_hashid
from input_data
),
dedup_data as (
select
-- we need to ensure de-duplicated rows for merge/update queries
-- additionally, we generate a unique key for the scd table
row_number() over (
partition by
_airbyte_unique_key,
_airbyte_start_at,
_airbyte_emitted_at
order by _airbyte_active_row desc, _airbyte_ab_id
) as _airbyte_row_num,
{{ dbt_utils.surrogate_key([
'_airbyte_unique_key',
'_airbyte_start_at',
'_airbyte_emitted_at'
]) }} as _airbyte_unique_key_scd,
scd_data.*
from scd_data
)
select
_airbyte_unique_key,
_airbyte_unique_key_scd,
id,
currency,
date,
timestamp_col,
HKD_special___characters,
HKD_special___characters_1,
NZD,
USD,
_airbyte_start_at,
_airbyte_end_at,
_airbyte_active_row,
_airbyte_ab_id,
_airbyte_emitted_at,
{{ current_timestamp() }} as _airbyte_normalized_at,
_airbyte_dedup_exchange_rate_hashid
from dedup_data where _airbyte_row_num = 1
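Note: the SCD model above derives its versioning columns from two window functions over the primary-key partition: lag(date) supplies the end date of each superseded record version, and row_number() marks the most recent version as the active row. A minimal, self-contained sketch of that window logic on a two-row toy table (table name, literal values, and the reduced partition key are illustrative only):

with versions as (
  select 1 as id, 'USD' as currency, date '2021-01-01' as date, timestamp '2021-01-01 00:00:00' as _airbyte_emitted_at
  union all
  select 1, 'USD', date '2021-02-01', timestamp '2021-02-01 00:00:00'
)
select
  id,
  currency,
  date as _airbyte_start_at,
  -- rows are ordered most-recent-first, so lag() reaches "forward" in time to the next version's start date
  lag(date) over (
    partition by id, currency
    order by date is null asc, date desc, _airbyte_emitted_at desc
  ) as _airbyte_end_at,
  case when row_number() over (
    partition by id, currency
    order by date is null asc, date desc, _airbyte_emitted_at desc
  ) = 1 then 1 else 0 end as _airbyte_active_row
from versions

On this toy input, the 2021-02-01 row comes out with _airbyte_active_row = 1 and a NULL _airbyte_end_at, while the 2021-01-01 row is closed out with _airbyte_end_at = 2021-02-01.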


@@ -0,0 +1,29 @@
{{ config(
cluster_by = ["_airbyte_unique_key","_airbyte_emitted_at"],
partition_by = {"field": "_airbyte_emitted_at", "data_type": "timestamp", "granularity": "day"},
unique_key = "_airbyte_unique_key",
schema = "test_normalization",
tags = [ "top-level" ]
) }}
-- Final base SQL model
-- depends_on: {{ ref('dedup_exchange_rate_scd') }}
select
_airbyte_unique_key,
id,
currency,
date,
timestamp_col,
HKD_special___characters,
HKD_special___characters_1,
NZD,
USD,
_airbyte_ab_id,
_airbyte_emitted_at,
{{ current_timestamp() }} as _airbyte_normalized_at,
_airbyte_dedup_exchange_rate_hashid
from {{ ref('dedup_exchange_rate_scd') }}
-- dedup_exchange_rate from {{ source('test_normalization', '_airbyte_raw_dedup_exchange_rate') }}
where 1 = 1
and _airbyte_active_row = 1
{{ incremental_clause('_airbyte_emitted_at', this) }}


@@ -0,0 +1,31 @@
{{ config(
cluster_by = "_airbyte_emitted_at",
partition_by = {"field": "_airbyte_emitted_at", "data_type": "timestamp", "granularity": "day"},
unique_key = '_airbyte_ab_id',
schema = "test_normalization",
tags = [ "top-level" ]
) }}
-- Final base SQL model
-- depends_on: {{ ref('exchange_rate_ab3') }}
select
id,
currency,
date,
timestamp_col,
HKD_special___characters,
HKD_special___characters_1,
NZD,
USD,
column___with__quotes,
datetime_tz,
datetime_no_tz,
time_tz,
time_no_tz,
_airbyte_ab_id,
_airbyte_emitted_at,
{{ current_timestamp() }} as _airbyte_normalized_at,
_airbyte_exchange_rate_hashid
from {{ ref('exchange_rate_ab3') }}
-- exchange_rate from {{ source('test_normalization', '_airbyte_raw_exchange_rate') }}
where 1 = 1


@@ -0,0 +1,26 @@
{{ config(
cluster_by = "_airbyte_emitted_at",
partition_by = {"field": "_airbyte_emitted_at", "data_type": "timestamp", "granularity": "day"},
unique_key = '_airbyte_ab_id',
schema = "_airbyte_test_normalization",
tags = [ "top-level-intermediate" ]
) }}
-- SQL model to build a hash column based on the values of this record
-- depends_on: {{ ref('dedup_exchange_rate_ab2') }}
select
{{ dbt_utils.surrogate_key([
'id',
'currency',
'date',
'timestamp_col',
'HKD_special___characters',
'HKD_special___characters_1',
'NZD',
'USD',
]) }} as _airbyte_dedup_exchange_rate_hashid,
tmp.*
from {{ ref('dedup_exchange_rate_ab2') }} tmp
-- dedup_exchange_rate
where 1 = 1
{{ incremental_clause('_airbyte_emitted_at', this) }}
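Note: on BigQuery, the dbt_utils.surrogate_key call above compiles to the md5-over-concatenated-columns expression shown in the compiled staging SQL earlier in this diff. A condensed, self-contained sketch of that compiled shape for two of the columns (the src CTE is an illustrative stand-in for the _ab2 model):

with src as (
  select 1 as id, 'USD' as currency
)
select
  -- coalesce each column to '' and join with '-' before hashing, matching the compiled output
  to_hex(md5(cast(concat(
    coalesce(cast(id as string), ''), '-',
    coalesce(cast(currency as string), '')
  ) as string))) as _airbyte_dedup_exchange_rate_hashid,
  src.*
from src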


@@ -0,0 +1,16 @@
version: 2
sources:
- name: test_normalization
quoting:
database: true
schema: false
identifier: false
tables:
- name: _airbyte_raw_1_prefix_startwith_number
- name: _airbyte_raw_dedup_cdc_excluded
- name: _airbyte_raw_dedup_exchange_rate
- name: _airbyte_raw_exchange_rate
- name: _airbyte_raw_multiple_column_names_conflicts
- name: _airbyte_raw_pos_dedup_cdcx
- name: _airbyte_raw_renamed_dedup_cdc_excluded
- name: _airbyte_raw_types_testing


@@ -0,0 +1,26 @@
{{ config(
cluster_by = "_airbyte_emitted_at",
partition_by = {"field": "_airbyte_emitted_at", "data_type": "timestamp", "granularity": "day"},
unique_key = '_airbyte_ab_id',
schema = "_airbyte_test_normalization",
tags = [ "top-level-intermediate" ]
) }}
-- SQL model to parse JSON blob stored in a single column and extract into separated field columns as described by the JSON Schema
-- depends_on: {{ source('test_normalization', '_airbyte_raw_dedup_exchange_rate') }}
select
{{ json_extract_scalar('_airbyte_data', ['id'], ['id']) }} as id,
{{ json_extract_scalar('_airbyte_data', ['currency'], ['currency']) }} as currency,
{{ json_extract_scalar('_airbyte_data', ['new_column'], ['new_column']) }} as new_column,
{{ json_extract_scalar('_airbyte_data', ['date'], ['date']) }} as date,
{{ json_extract_scalar('_airbyte_data', ['timestamp_col'], ['timestamp_col']) }} as timestamp_col,
{{ json_extract_scalar('_airbyte_data', ['HKD@spéçiäl & characters'], ['HKD@spéçiäl & characters']) }} as HKD_special___characters,
{{ json_extract_scalar('_airbyte_data', ['NZD'], ['NZD']) }} as NZD,
{{ json_extract_scalar('_airbyte_data', ['USD'], ['USD']) }} as USD,
_airbyte_ab_id,
_airbyte_emitted_at,
{{ current_timestamp() }} as _airbyte_normalized_at
from {{ source('test_normalization', '_airbyte_raw_dedup_exchange_rate') }} as table_alias
-- dedup_exchange_rate
where 1 = 1
{{ incremental_clause('_airbyte_emitted_at', this) }}


@@ -0,0 +1,26 @@
{{ config(
cluster_by = "_airbyte_emitted_at",
partition_by = {"field": "_airbyte_emitted_at", "data_type": "timestamp", "granularity": "day"},
unique_key = '_airbyte_ab_id',
schema = "_airbyte_test_normalization",
tags = [ "top-level-intermediate" ]
) }}
-- SQL model to cast each column to its adequate SQL type converted from the JSON schema type
-- depends_on: {{ ref('dedup_exchange_rate_ab1') }}
select
cast(id as {{ dbt_utils.type_float() }}) as id,
cast(currency as {{ dbt_utils.type_string() }}) as currency,
cast(new_column as {{ dbt_utils.type_float() }}) as new_column,
cast({{ empty_string_to_null('date') }} as {{ type_date() }}) as date,
cast({{ empty_string_to_null('timestamp_col') }} as {{ type_timestamp_with_timezone() }}) as timestamp_col,
cast(HKD_special___characters as {{ dbt_utils.type_float() }}) as HKD_special___characters,
cast(NZD as {{ dbt_utils.type_float() }}) as NZD,
cast(USD as {{ dbt_utils.type_bigint() }}) as USD,
_airbyte_ab_id,
_airbyte_emitted_at,
{{ current_timestamp() }} as _airbyte_normalized_at
from {{ ref('dedup_exchange_rate_ab1') }}
-- dedup_exchange_rate
where 1 = 1
{{ incremental_clause('_airbyte_emitted_at', this) }}


@@ -0,0 +1,178 @@
{{ config(
cluster_by = ["_airbyte_unique_key_scd","_airbyte_emitted_at"],
partition_by = {"field": "_airbyte_active_row", "data_type": "int64", "range": {"start": 0, "end": 1, "interval": 1}},
unique_key = "_airbyte_unique_key_scd",
schema = "test_normalization",
post_hook = ["
{%
set final_table_relation = adapter.get_relation(
database=this.database,
schema=this.schema,
identifier='dedup_exchange_rate'
)
%}
{#
If the final table doesn't exist, then obviously we can't delete anything from it.
Also, after a reset, the final table is created without the _airbyte_unique_key column (this column is created during the first sync)
So skip this deletion if the column doesn't exist. (in this case, the table is guaranteed to be empty anyway)
#}
{%
if final_table_relation is not none and '_airbyte_unique_key' in adapter.get_columns_in_relation(final_table_relation)|map(attribute='name')
%}
-- Delete records which are no longer active:
-- This query is equivalent, but the left join version is more performant:
-- delete from final_table where unique_key in (
-- select unique_key from scd_table where 1 = 1 <incremental_clause(normalized_at, final_table)>
-- ) and unique_key not in (
-- select unique_key from scd_table where active_row = 1 <incremental_clause(normalized_at, final_table)>
-- )
-- We're incremental against normalized_at rather than emitted_at because we need to fetch the SCD
-- entries that were _updated_ recently. This is because a deleted record will have an SCD record
-- which was emitted a long time ago, but recently re-normalized to have active_row = 0.
delete from {{ final_table_relation }} final_table where final_table._airbyte_unique_key in (
select recent_records.unique_key
from (
select distinct _airbyte_unique_key as unique_key
from {{ this }}
where 1=1 {{ incremental_clause('_airbyte_normalized_at', this.schema + '.' + adapter.quote('dedup_exchange_rate')) }}
) recent_records
left join (
select _airbyte_unique_key as unique_key, count(_airbyte_unique_key) as active_count
from {{ this }}
where _airbyte_active_row = 1 {{ incremental_clause('_airbyte_normalized_at', this.schema + '.' + adapter.quote('dedup_exchange_rate')) }}
group by _airbyte_unique_key
) active_counts
on recent_records.unique_key = active_counts.unique_key
where active_count is null or active_count = 0
)
{% else %}
-- We have to have a non-empty query, so just do a noop delete
delete from {{ this }} where 1=0
{% endif %}
","drop view _airbyte_test_normalization.dedup_exchange_rate_stg"],
tags = [ "top-level" ]
) }}
-- depends_on: ref('dedup_exchange_rate_stg')
with
{% if is_incremental() %}
new_data as (
-- retrieve incremental "new" data
select
*
from {{ ref('dedup_exchange_rate_stg') }}
-- dedup_exchange_rate from {{ source('test_normalization', '_airbyte_raw_dedup_exchange_rate') }}
where 1 = 1
{{ incremental_clause('_airbyte_emitted_at', this) }}
),
new_data_ids as (
-- build a subset of _airbyte_unique_key from rows that are new
select distinct
{{ dbt_utils.surrogate_key([
'id',
'currency',
'NZD',
]) }} as _airbyte_unique_key
from new_data
),
empty_new_data as (
-- build an empty table to only keep the table's column types
select * from new_data where 1 = 0
),
previous_active_scd_data as (
-- retrieve "incomplete old" data that needs to be updated with an end date because of new changes
select
{{ star_intersect(ref('dedup_exchange_rate_stg'), this, from_alias='inc_data', intersect_alias='this_data') }}
from {{ this }} as this_data
-- make a join with new_data using primary key to filter active data that need to be updated only
join new_data_ids on this_data._airbyte_unique_key = new_data_ids._airbyte_unique_key
-- force left join to NULL values (we just need to transfer column types only for the star_intersect macro on schema changes)
left join empty_new_data as inc_data on this_data._airbyte_ab_id = inc_data._airbyte_ab_id
where _airbyte_active_row = 1
),
input_data as (
select {{ dbt_utils.star(ref('dedup_exchange_rate_stg')) }} from new_data
union all
select {{ dbt_utils.star(ref('dedup_exchange_rate_stg')) }} from previous_active_scd_data
),
{% else %}
input_data as (
select *
from {{ ref('dedup_exchange_rate_stg') }}
-- dedup_exchange_rate from {{ source('test_normalization', '_airbyte_raw_dedup_exchange_rate') }}
),
{% endif %}
scd_data as (
-- SQL model to build a Type 2 Slowly Changing Dimension (SCD) table for each record identified by their primary key
select
{{ dbt_utils.surrogate_key([
'id',
'currency',
'NZD',
]) }} as _airbyte_unique_key,
id,
currency,
new_column,
date,
timestamp_col,
HKD_special___characters,
NZD,
USD,
date as _airbyte_start_at,
lag(date) over (
partition by cast(id as {{ dbt_utils.type_string() }}), currency, cast(NZD as {{ dbt_utils.type_string() }})
order by
date is null asc,
date desc,
_airbyte_emitted_at desc
) as _airbyte_end_at,
case when row_number() over (
partition by cast(id as {{ dbt_utils.type_string() }}), currency, cast(NZD as {{ dbt_utils.type_string() }})
order by
date is null asc,
date desc,
_airbyte_emitted_at desc
) = 1 then 1 else 0 end as _airbyte_active_row,
_airbyte_ab_id,
_airbyte_emitted_at,
_airbyte_dedup_exchange_rate_hashid
from input_data
),
dedup_data as (
select
-- we need to ensure de-duplicated rows for merge/update queries
-- additionally, we generate a unique key for the scd table
row_number() over (
partition by
_airbyte_unique_key,
_airbyte_start_at,
_airbyte_emitted_at
order by _airbyte_active_row desc, _airbyte_ab_id
) as _airbyte_row_num,
{{ dbt_utils.surrogate_key([
'_airbyte_unique_key',
'_airbyte_start_at',
'_airbyte_emitted_at'
]) }} as _airbyte_unique_key_scd,
scd_data.*
from scd_data
)
select
_airbyte_unique_key,
_airbyte_unique_key_scd,
id,
currency,
new_column,
date,
timestamp_col,
HKD_special___characters,
NZD,
USD,
_airbyte_start_at,
_airbyte_end_at,
_airbyte_active_row,
_airbyte_ab_id,
_airbyte_emitted_at,
{{ current_timestamp() }} as _airbyte_normalized_at,
_airbyte_dedup_exchange_rate_hashid
from dedup_data where _airbyte_row_num = 1


@@ -0,0 +1,29 @@
{{ config(
cluster_by = ["_airbyte_unique_key","_airbyte_emitted_at"],
partition_by = {"field": "_airbyte_emitted_at", "data_type": "timestamp", "granularity": "day"},
unique_key = "_airbyte_unique_key",
schema = "test_normalization",
tags = [ "top-level" ]
) }}
-- Final base SQL model
-- depends_on: {{ ref('dedup_exchange_rate_scd') }}
select
_airbyte_unique_key,
id,
currency,
new_column,
date,
timestamp_col,
HKD_special___characters,
NZD,
USD,
_airbyte_ab_id,
_airbyte_emitted_at,
{{ current_timestamp() }} as _airbyte_normalized_at,
_airbyte_dedup_exchange_rate_hashid
from {{ ref('dedup_exchange_rate_scd') }}
-- dedup_exchange_rate from {{ source('test_normalization', '_airbyte_raw_dedup_exchange_rate') }}
where 1 = 1
and _airbyte_active_row = 1
{{ incremental_clause('_airbyte_emitted_at', this) }}


@@ -0,0 +1,27 @@
{{ config(
cluster_by = "_airbyte_emitted_at",
partition_by = {"field": "_airbyte_emitted_at", "data_type": "timestamp", "granularity": "day"},
unique_key = '_airbyte_ab_id',
schema = "test_normalization",
tags = [ "top-level" ]
) }}
-- Final base SQL model
-- depends_on: {{ ref('exchange_rate_ab3') }}
select
id,
currency,
new_column,
date,
timestamp_col,
HKD_special___characters,
NZD,
USD,
column___with__quotes,
_airbyte_ab_id,
_airbyte_emitted_at,
{{ current_timestamp() }} as _airbyte_normalized_at,
_airbyte_exchange_rate_hashid
from {{ ref('exchange_rate_ab3') }}
-- exchange_rate from {{ source('test_normalization', '_airbyte_raw_exchange_rate') }}
where 1 = 1


@@ -0,0 +1,26 @@
{{ config(
cluster_by = "_airbyte_emitted_at",
partition_by = {"field": "_airbyte_emitted_at", "data_type": "timestamp", "granularity": "day"},
unique_key = '_airbyte_ab_id',
schema = "_airbyte_test_normalization",
tags = [ "top-level-intermediate" ]
) }}
-- SQL model to build a hash column based on the values of this record
-- depends_on: {{ ref('dedup_exchange_rate_ab2') }}
select
{{ dbt_utils.surrogate_key([
'id',
'currency',
'new_column',
'date',
'timestamp_col',
'HKD_special___characters',
'NZD',
'USD',
]) }} as _airbyte_dedup_exchange_rate_hashid,
tmp.*
from {{ ref('dedup_exchange_rate_ab2') }} tmp
-- dedup_exchange_rate
where 1 = 1
{{ incremental_clause('_airbyte_emitted_at', this) }}


@@ -0,0 +1,12 @@
version: 2
sources:
- name: test_normalization
quoting:
database: true
schema: false
identifier: false
tables:
- name: _airbyte_raw_dedup_cdc_excluded
- name: _airbyte_raw_dedup_exchange_rate
- name: _airbyte_raw_exchange_rate
- name: _airbyte_raw_renamed_dedup_cdc_excluded


@@ -0,0 +1,27 @@
merge into `dataline-integration-testing`.test_normalization.`dedup_exchange_rate_scd` as DBT_INTERNAL_DEST
using (
select * from `dataline-integration-testing`.test_normalization.`dedup_exchange_rate_scd__dbt_tmp`
) as DBT_INTERNAL_SOURCE
on
DBT_INTERNAL_SOURCE._airbyte_unique_key_scd = DBT_INTERNAL_DEST._airbyte_unique_key_scd
when matched then update set
`_airbyte_unique_key` = DBT_INTERNAL_SOURCE.`_airbyte_unique_key`,`_airbyte_unique_key_scd` = DBT_INTERNAL_SOURCE.`_airbyte_unique_key_scd`,`id` = DBT_INTERNAL_SOURCE.`id`,`currency` = DBT_INTERNAL_SOURCE.`currency`,`date` = DBT_INTERNAL_SOURCE.`date`,`timestamp_col` = DBT_INTERNAL_SOURCE.`timestamp_col`,`HKD_special___characters` = DBT_INTERNAL_SOURCE.`HKD_special___characters`,`HKD_special___characters_1` = DBT_INTERNAL_SOURCE.`HKD_special___characters_1`,`NZD` = DBT_INTERNAL_SOURCE.`NZD`,`USD` = DBT_INTERNAL_SOURCE.`USD`,`_airbyte_start_at` = DBT_INTERNAL_SOURCE.`_airbyte_start_at`,`_airbyte_end_at` = DBT_INTERNAL_SOURCE.`_airbyte_end_at`,`_airbyte_active_row` = DBT_INTERNAL_SOURCE.`_airbyte_active_row`,`_airbyte_ab_id` = DBT_INTERNAL_SOURCE.`_airbyte_ab_id`,`_airbyte_emitted_at` = DBT_INTERNAL_SOURCE.`_airbyte_emitted_at`,`_airbyte_normalized_at` = DBT_INTERNAL_SOURCE.`_airbyte_normalized_at`,`_airbyte_dedup_exchange_rate_hashid` = DBT_INTERNAL_SOURCE.`_airbyte_dedup_exchange_rate_hashid`
when not matched then insert
(`_airbyte_unique_key`, `_airbyte_unique_key_scd`, `id`, `currency`, `date`, `timestamp_col`, `HKD_special___characters`, `HKD_special___characters_1`, `NZD`, `USD`, `_airbyte_start_at`, `_airbyte_end_at`, `_airbyte_active_row`, `_airbyte_ab_id`, `_airbyte_emitted_at`, `_airbyte_normalized_at`, `_airbyte_dedup_exchange_rate_hashid`)
values
(`_airbyte_unique_key`, `_airbyte_unique_key_scd`, `id`, `currency`, `date`, `timestamp_col`, `HKD_special___characters`, `HKD_special___characters_1`, `NZD`, `USD`, `_airbyte_start_at`, `_airbyte_end_at`, `_airbyte_active_row`, `_airbyte_ab_id`, `_airbyte_emitted_at`, `_airbyte_normalized_at`, `_airbyte_dedup_exchange_rate_hashid`)


@@ -0,0 +1,27 @@
merge into `dataline-integration-testing`.test_normalization.`dedup_exchange_rate` as DBT_INTERNAL_DEST
using (
select * from `dataline-integration-testing`.test_normalization.`dedup_exchange_rate__dbt_tmp`
) as DBT_INTERNAL_SOURCE
on
DBT_INTERNAL_SOURCE._airbyte_unique_key = DBT_INTERNAL_DEST._airbyte_unique_key
when matched then update set
`_airbyte_unique_key` = DBT_INTERNAL_SOURCE.`_airbyte_unique_key`,`id` = DBT_INTERNAL_SOURCE.`id`,`currency` = DBT_INTERNAL_SOURCE.`currency`,`date` = DBT_INTERNAL_SOURCE.`date`,`timestamp_col` = DBT_INTERNAL_SOURCE.`timestamp_col`,`HKD_special___characters` = DBT_INTERNAL_SOURCE.`HKD_special___characters`,`HKD_special___characters_1` = DBT_INTERNAL_SOURCE.`HKD_special___characters_1`,`NZD` = DBT_INTERNAL_SOURCE.`NZD`,`USD` = DBT_INTERNAL_SOURCE.`USD`,`_airbyte_ab_id` = DBT_INTERNAL_SOURCE.`_airbyte_ab_id`,`_airbyte_emitted_at` = DBT_INTERNAL_SOURCE.`_airbyte_emitted_at`,`_airbyte_normalized_at` = DBT_INTERNAL_SOURCE.`_airbyte_normalized_at`,`_airbyte_dedup_exchange_rate_hashid` = DBT_INTERNAL_SOURCE.`_airbyte_dedup_exchange_rate_hashid`
when not matched then insert
(`_airbyte_unique_key`, `id`, `currency`, `date`, `timestamp_col`, `HKD_special___characters`, `HKD_special___characters_1`, `NZD`, `USD`, `_airbyte_ab_id`, `_airbyte_emitted_at`, `_airbyte_normalized_at`, `_airbyte_dedup_exchange_rate_hashid`)
values
(`_airbyte_unique_key`, `id`, `currency`, `date`, `timestamp_col`, `HKD_special___characters`, `HKD_special___characters_1`, `NZD`, `USD`, `_airbyte_ab_id`, `_airbyte_emitted_at`, `_airbyte_normalized_at`, `_airbyte_dedup_exchange_rate_hashid`)


@@ -0,0 +1,145 @@
create or replace table `dataline-integration-testing`.test_normalization.`exchange_rate`
partition by timestamp_trunc(_airbyte_emitted_at, day)
cluster by _airbyte_emitted_at
OPTIONS()
as (
with __dbt__cte__exchange_rate_ab1 as (
-- SQL model to parse JSON blob stored in a single column and extract into separated field columns as described by the JSON Schema
-- depends_on: `dataline-integration-testing`.test_normalization._airbyte_raw_exchange_rate
select
json_extract_scalar(_airbyte_data, "$['id']") as id,
json_extract_scalar(_airbyte_data, "$['currency']") as currency,
json_extract_scalar(_airbyte_data, "$['date']") as date,
json_extract_scalar(_airbyte_data, "$['timestamp_col']") as timestamp_col,
json_extract_scalar(_airbyte_data, "$['HKD@spéçiäl & characters']") as HKD_special___characters,
json_extract_scalar(_airbyte_data, "$['HKD_special___characters']") as HKD_special___characters_1,
json_extract_scalar(_airbyte_data, "$['NZD']") as NZD,
json_extract_scalar(_airbyte_data, "$['USD']") as USD,
json_extract_scalar(_airbyte_data, "$['column___with__quotes']") as column___with__quotes,
json_extract_scalar(_airbyte_data, "$['datetime_tz']") as datetime_tz,
json_extract_scalar(_airbyte_data, "$['datetime_no_tz']") as datetime_no_tz,
json_extract_scalar(_airbyte_data, "$['time_tz']") as time_tz,
json_extract_scalar(_airbyte_data, "$['time_no_tz']") as time_no_tz,
_airbyte_ab_id,
_airbyte_emitted_at,
CURRENT_TIMESTAMP() as _airbyte_normalized_at
from `dataline-integration-testing`.test_normalization._airbyte_raw_exchange_rate as table_alias
-- exchange_rate
where 1 = 1
), __dbt__cte__exchange_rate_ab2 as (
-- SQL model to cast each column to its adequate SQL type converted from the JSON schema type
-- depends_on: __dbt__cte__exchange_rate_ab1
select
cast(id as
int64
) as id,
cast(currency as
string
) as currency,
cast(nullif(date, '') as
date
) as date,
cast(nullif(timestamp_col, '') as
timestamp
) as timestamp_col,
cast(HKD_special___characters as
float64
) as HKD_special___characters,
cast(HKD_special___characters_1 as
string
) as HKD_special___characters_1,
cast(NZD as
float64
) as NZD,
cast(USD as
float64
) as USD,
cast(column___with__quotes as
string
) as column___with__quotes,
cast(nullif(datetime_tz, '') as
timestamp
) as datetime_tz,
cast(nullif(datetime_no_tz, '') as
datetime
) as datetime_no_tz,
cast(nullif(time_tz, '') as
STRING
) as time_tz,
cast(nullif(time_no_tz, '') as
time
) as time_no_tz,
_airbyte_ab_id,
_airbyte_emitted_at,
CURRENT_TIMESTAMP() as _airbyte_normalized_at
from __dbt__cte__exchange_rate_ab1
-- exchange_rate
where 1 = 1
), __dbt__cte__exchange_rate_ab3 as (
-- SQL model to build a hash column based on the values of this record
-- depends_on: __dbt__cte__exchange_rate_ab2
select
to_hex(md5(cast(concat(coalesce(cast(id as
string
), ''), '-', coalesce(cast(currency as
string
), ''), '-', coalesce(cast(date as
string
), ''), '-', coalesce(cast(timestamp_col as
string
), ''), '-', coalesce(cast(HKD_special___characters as
string
), ''), '-', coalesce(cast(HKD_special___characters_1 as
string
), ''), '-', coalesce(cast(NZD as
string
), ''), '-', coalesce(cast(USD as
string
), ''), '-', coalesce(cast(column___with__quotes as
string
), ''), '-', coalesce(cast(datetime_tz as
string
), ''), '-', coalesce(cast(datetime_no_tz as
string
), ''), '-', coalesce(cast(time_tz as
string
), ''), '-', coalesce(cast(time_no_tz as
string
), '')) as
string
))) as _airbyte_exchange_rate_hashid,
tmp.*
from __dbt__cte__exchange_rate_ab2 tmp
-- exchange_rate
where 1 = 1
)-- Final base SQL model
-- depends_on: __dbt__cte__exchange_rate_ab3
select
id,
currency,
date,
timestamp_col,
HKD_special___characters,
HKD_special___characters_1,
NZD,
USD,
column___with__quotes,
datetime_tz,
datetime_no_tz,
time_tz,
time_no_tz,
_airbyte_ab_id,
_airbyte_emitted_at,
CURRENT_TIMESTAMP() as _airbyte_normalized_at,
_airbyte_exchange_rate_hashid
from __dbt__cte__exchange_rate_ab3
-- exchange_rate from `dataline-integration-testing`.test_normalization._airbyte_raw_exchange_rate
where 1 = 1
);


@@ -0,0 +1,89 @@
create or replace view `dataline-integration-testing`._airbyte_test_normalization.`dedup_exchange_rate_stg`
OPTIONS()
as
with __dbt__cte__dedup_exchange_rate_ab1 as (
-- SQL model to parse JSON blob stored in a single column and extract into separated field columns as described by the JSON Schema
-- depends_on: `dataline-integration-testing`.test_normalization._airbyte_raw_dedup_exchange_rate
select
json_extract_scalar(_airbyte_data, "$['id']") as id,
json_extract_scalar(_airbyte_data, "$['currency']") as currency,
json_extract_scalar(_airbyte_data, "$['date']") as date,
json_extract_scalar(_airbyte_data, "$['timestamp_col']") as timestamp_col,
json_extract_scalar(_airbyte_data, "$['HKD@spéçiäl & characters']") as HKD_special___characters,
json_extract_scalar(_airbyte_data, "$['HKD_special___characters']") as HKD_special___characters_1,
json_extract_scalar(_airbyte_data, "$['NZD']") as NZD,
json_extract_scalar(_airbyte_data, "$['USD']") as USD,
_airbyte_ab_id,
_airbyte_emitted_at,
CURRENT_TIMESTAMP() as _airbyte_normalized_at
from `dataline-integration-testing`.test_normalization._airbyte_raw_dedup_exchange_rate as table_alias
-- dedup_exchange_rate
where 1 = 1
), __dbt__cte__dedup_exchange_rate_ab2 as (
-- SQL model to cast each column to its adequate SQL type converted from the JSON schema type
-- depends_on: __dbt__cte__dedup_exchange_rate_ab1
select
cast(id as
int64
) as id,
cast(currency as
string
) as currency,
cast(nullif(date, '') as
date
) as date,
cast(nullif(timestamp_col, '') as
timestamp
) as timestamp_col,
cast(HKD_special___characters as
float64
) as HKD_special___characters,
cast(HKD_special___characters_1 as
string
) as HKD_special___characters_1,
cast(NZD as
float64
) as NZD,
cast(USD as
float64
) as USD,
_airbyte_ab_id,
_airbyte_emitted_at,
CURRENT_TIMESTAMP() as _airbyte_normalized_at
from __dbt__cte__dedup_exchange_rate_ab1
-- dedup_exchange_rate
where 1 = 1
)-- SQL model to build a hash column based on the values of this record
-- depends_on: __dbt__cte__dedup_exchange_rate_ab2
select
to_hex(md5(cast(concat(coalesce(cast(id as
string
), ''), '-', coalesce(cast(currency as
string
), ''), '-', coalesce(cast(date as
string
), ''), '-', coalesce(cast(timestamp_col as
string
), ''), '-', coalesce(cast(HKD_special___characters as
string
), ''), '-', coalesce(cast(HKD_special___characters_1 as
string
), ''), '-', coalesce(cast(NZD as
string
), ''), '-', coalesce(cast(USD as
string
), '')) as
string
))) as _airbyte_dedup_exchange_rate_hashid,
tmp.*
from __dbt__cte__dedup_exchange_rate_ab2 tmp
-- dedup_exchange_rate
where 1 = 1
;

Some files were not shown because too many files have changed in this diff.