airbyte/airbyte-integrations/connectors/source-s3/integration_tests/integration_test_abstract.py
Artem Inzhyyants 93f3286a0d 🚨🚨Source S3: use platform-handled schema evolution (#25127)
* Source S3: Remove match_target_schema; use platform-handled schema evolution instead

* Source S3: Remove ab_additional_col

* Source S3: update docs; bump version

* Source S3: fix unit tests

* Source S3: fix expected_records

* Source S3: revert _match_target_schema

* Source S3: update expected records for parquet dataset

* Source S3: update metadata

* auto-bump connector version

---------

Co-authored-by: Octavia Squidington III <octavia-squidington-iii@users.noreply.github.com>
2023-05-15 17:14:26 +02:00

#
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
#

import time
from abc import ABC, abstractmethod
from pathlib import Path
from typing import Any, Iterator, List, Mapping
from uuid import uuid4

import jsonschema
import pytest
from airbyte_cdk.logger import AirbyteLogger
from airbyte_cdk.models import SyncMode
from source_s3.source_files_abstract.formats.csv_parser import CsvParser
from source_s3.source_files_abstract.stream import FileStream

HERE = Path(__file__).resolve().parent
SAMPLE_DIR = HERE.joinpath("sample_files/")
LOGGER = AirbyteLogger()
JSONTYPE_TO_PYTHONTYPE = {"string": str, "number": float, "integer": int, "object": dict, "array": list, "boolean": bool, "null": None}


class AbstractTestIncrementalFileStream(ABC):
"""Prefix this class with Abstract so the tests don't run here but only in the children"""
temp_bucket_prefix = "airbytetest-"
@pytest.fixture(scope="session")
def cloud_bucket_prefix(self) -> str:
return self.temp_bucket_prefix

    @pytest.fixture(scope="session")
def format(self) -> Mapping[str, Any]:
return {"filetype": "csv"}

    @pytest.fixture(scope="session")
def airbyte_system_columns(self) -> Mapping[str, str]:
return {
FileStream.ab_last_mod_col: {"type": "string", "format": "date-time"},
FileStream.ab_file_name_col: {"type": "string"}
}

    @property
@abstractmethod
def stream_class(self) -> type:
"""
:return: provider specific FileStream class (e.g. IncrementalFileStreamS3)
"""

    @property
@abstractmethod
def credentials(self) -> Mapping:
"""
These will be added automatically to the provider property
:return: mapping of provider specific credentials
"""

    @abstractmethod
def provider(self, bucket_name: str) -> Mapping:
"""
:return: provider specific provider dict as described in spec.json (leave out credentials, they will be added automatically)
"""

    @abstractmethod
def cloud_files(self, cloud_bucket_name: str, credentials: Mapping, files_to_upload: List, private: bool = True) -> Iterator[str]:
"""
        See the S3 implementation for an example of what this override needs to achieve.
:param cloud_bucket_name: name of bucket (or equivalent)
:param credentials: mapping of provider specific credentials
:param files_to_upload: list of paths to local files to upload, pass empty list to test zero files case
:param private: whether or not to make the files private and require credentials to read, defaults to True
:yield: url filepath to uploaded file
"""

    @abstractmethod
def teardown_infra(self, cloud_bucket_name: str, credentials: Mapping) -> None:
"""
Provider-specific logic to tidy up any cloud resources.
        See the S3 implementation for an example.
:param cloud_bucket_name: bucket (or equivalent) name
:param credentials: mapping of provider specific credentials
"""

    def _stream_records_test_logic(
self,
cloud_bucket_name: str,
format: Mapping[str, str],
airbyte_system_columns: Mapping[str, str],
sync_mode: Any,
files: List[str],
path_pattern: str,
private: bool,
num_columns: Any,
num_records: Any,
expected_schema: Mapping[str, Any],
user_schema: Mapping[str, Any],
fails: Any,
state: Any = None,
) -> Any:
        uploaded_files = list(self.cloud_files(cloud_bucket_name, self.credentials, files, private))
LOGGER.info(f"file(s) uploaded: {uploaded_files}")
# emulate state for incremental testing
# since we're not actually saving state out to file here, we pass schema in to our FileStream creation...
# this isn't how it will work in Airbyte but it's a close enough emulation
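        # e.g. the emulated state looks roughly like
        #     {FileStream.ab_last_mod_col: "1970-01-01T00:00:00Z", "schema": "<stringified json schema>"}
        # where the "schema" key only appears once a prior run has stored one (an illustration, not a contract)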
current_state = state if state is not None else {FileStream.ab_last_mod_col: "1970-01-01T00:00:00Z"}
if (user_schema is None) and ("schema" in current_state.keys()):
user_schema = current_state["schema"]
full_expected_schema = {
"type": "object",
"properties": {**expected_schema, **airbyte_system_columns},
}
str_user_schema = str(user_schema).replace("'", '"') if user_schema is not None else None
total_num_columns = num_columns + len(airbyte_system_columns.keys())
provider = {**self.provider(cloud_bucket_name), **self.credentials} if private else self.provider(cloud_bucket_name)
if not fails:
fs = self.stream_class("dataset", provider, format, path_pattern, str_user_schema)
LOGGER.info(f"Testing stream_records() in SyncMode:{sync_mode.value}")
# check we return correct schema from get_json_schema()
assert fs.get_json_schema() == full_expected_schema
records = []
for stream_slice in fs.stream_slices(sync_mode=sync_mode, stream_state=current_state):
if stream_slice is not None:
# we need to do this in order to work out which extra columns (if any) we expect in this stream_slice
expected_columns = []
for file_dict in stream_slice["files"]:
# TODO: if we ever test other filetypes in these tests this will need fixing
file_reader = CsvParser(format)
storage_file = file_dict["storage_file"]
with storage_file.open(file_reader.is_binary) as f:
expected_columns.extend(list(file_reader.get_inferred_schema(f, storage_file.file_info).keys()))
expected_columns = set(expected_columns) # de-dupe
for record in fs.read_records(sync_mode, stream_slice=stream_slice):
# check actual record values match expected schema
jsonschema.validate(record, full_expected_schema)
records.append(record)
assert all([len(r.keys()) == total_num_columns for r in records])
assert len(records) == num_records
# returning state by simulating call to get_updated_state() with final record so we can test incremental
return fs.get_updated_state(current_stream_state=current_state, latest_record=records[-1])
else:
with pytest.raises(Exception) as e_info:
fs = self.stream_class("dataset", provider, format, path_pattern, str_user_schema)
LOGGER.info(f"Testing EXPECTED FAILURE stream_records() in SyncMode:{sync_mode.value}")
fs.get_json_schema()
records = []
for stream_slice in fs.stream_slices(sync_mode=sync_mode, stream_state=current_state):
for record in fs.read_records(sync_mode, stream_slice=stream_slice):
records.append(record)
LOGGER.info(f"Failed as expected, error: {e_info}")

    @pytest.mark.parametrize(
# make user_schema None to test auto-inference. Exclude any _airbyte system columns in expected_schema.
"files, path_pattern, private, num_columns, num_records, expected_schema, user_schema, incremental, fails",
[
# single file tests
( # public
[SAMPLE_DIR.joinpath("simple_test.csv")],
"**",
False,
3,
8,
{"id": "integer", "name": "string", "valid": "boolean"},
None,
False,
False,
),
( # private
[SAMPLE_DIR.joinpath("simple_test.csv")],
"**",
True,
3,
8,
{"id": "integer", "name": "string", "valid": "boolean"},
None,
False,
False,
),
( # provided schema exact match to actual schema
[SAMPLE_DIR.joinpath("simple_test.csv")],
"**",
True,
3,
8,
{"id": "integer", "name": "string", "valid": "boolean"},
{"id": "integer", "name": "string", "valid": "boolean"},
False,
False,
),
( # provided schema not matching datatypes, expect successful coercion
[SAMPLE_DIR.joinpath("simple_test.csv")],
"**",
True,
3,
8,
{"id": "string", "name": "string", "valid": "string"},
{"id": "string", "name": "string", "valid": "string"},
False,
False,
),
( # provided incompatible schema, expect fail
[SAMPLE_DIR.joinpath("simple_test.csv")],
"**",
True,
3,
8,
{"id": "boolean", "name": "boolean", "valid": "boolean"},
{"id": "boolean", "name": "boolean", "valid": "boolean"},
False,
True,
),
# multiple file tests (all have identical schemas)
( # public, auto-infer
[
SAMPLE_DIR.joinpath("simple_test.csv"),
SAMPLE_DIR.joinpath("simple_test_2.csv"),
SAMPLE_DIR.joinpath("simple_test_3.csv"),
],
"**",
False,
3,
17,
{"id": "integer", "name": "string", "valid": "boolean"},
None,
False,
False,
),
( # private, auto-infer
[
SAMPLE_DIR.joinpath("simple_test.csv"),
SAMPLE_DIR.joinpath("simple_test_2.csv"),
SAMPLE_DIR.joinpath("simple_test_3.csv"),
],
"**",
True,
3,
17,
{"id": "integer", "name": "string", "valid": "boolean"},
None,
False,
False,
),
( # provided schema exact match to actual schema
[
SAMPLE_DIR.joinpath("simple_test.csv"),
SAMPLE_DIR.joinpath("simple_test_2.csv"),
SAMPLE_DIR.joinpath("simple_test_3.csv"),
],
"**",
True,
3,
17,
{"id": "integer", "name": "string", "valid": "boolean"},
{"id": "integer", "name": "string", "valid": "boolean"},
False,
False,
),
( # provided schema not matching datatypes, expect successful coercion
[
SAMPLE_DIR.joinpath("simple_test.csv"),
SAMPLE_DIR.joinpath("simple_test_2.csv"),
SAMPLE_DIR.joinpath("simple_test_3.csv"),
],
"**",
True,
3,
17,
{"id": "string", "name": "string", "valid": "string"},
{"id": "string", "name": "string", "valid": "string"},
False,
False,
),
( # provided incompatible schema, expect fail
[
SAMPLE_DIR.joinpath("simple_test.csv"),
SAMPLE_DIR.joinpath("simple_test_2.csv"),
SAMPLE_DIR.joinpath("simple_test_3.csv"),
],
"**",
True,
3,
17,
{"id": "boolean", "name": "boolean", "valid": "boolean"},
{"id": "boolean", "name": "boolean", "valid": "boolean"},
False,
True,
),
            ( # provided schema, not containing all columns; file columns absent from the schema are dropped from records
[
SAMPLE_DIR.joinpath("simple_test.csv"),
SAMPLE_DIR.joinpath("multi_file_diffschema_1.csv"),
SAMPLE_DIR.joinpath("multi_file_diffschema_2.csv"),
],
"**",
True,
3,
17,
{"id": "integer", "name": "string", "valid": "boolean"},
{"id": "integer", "name": "string", "valid": "boolean"},
False,
False,
),
# pattern matching tests with additional files present that we don't want to read
( # at top-level of bucket
[
SAMPLE_DIR.joinpath("simple_test.csv"),
SAMPLE_DIR.joinpath("simple_test_2.csv"),
SAMPLE_DIR.joinpath("file_to_skip.csv"),
SAMPLE_DIR.joinpath("file_to_skip.txt"),
],
"simple*",
True,
3,
11,
{"id": "integer", "name": "string", "valid": "boolean"},
None,
False,
False,
),
( # at multiple levels of bucket
[
SAMPLE_DIR.joinpath("simple_test.csv"),
SAMPLE_DIR.joinpath("simple_test_2.csv"),
SAMPLE_DIR.joinpath("file_to_skip.csv"),
SAMPLE_DIR.joinpath("file_to_skip.txt"),
SAMPLE_DIR.joinpath("pattern_match_test/this_folder/simple_test.csv"),
SAMPLE_DIR.joinpath("pattern_match_test/not_this_folder/file_to_skip.csv"),
SAMPLE_DIR.joinpath("pattern_match_test/not_this_folder/file_to_skip.txt"),
],
"**/simple*",
True,
3,
19,
{"id": "integer", "name": "string", "valid": "boolean"},
None,
False,
False,
),
# incremental tests (passing num_records/num_columns/fails as lists holding value for each file in order)
( # auto-infer, all same schema
[
SAMPLE_DIR.joinpath("simple_test.csv"),
SAMPLE_DIR.joinpath("simple_test_2.csv"),
SAMPLE_DIR.joinpath("simple_test_3.csv"),
],
"**",
True,
[3, 3, 3],
[8, 3, 6],
{"id": "integer", "name": "string", "valid": "boolean"},
None,
True,
[False, False, False],
),
( # provided schema, all same schema
[
SAMPLE_DIR.joinpath("simple_test.csv"),
SAMPLE_DIR.joinpath("simple_test_2.csv"),
SAMPLE_DIR.joinpath("simple_test_3.csv"),
],
"**",
True,
[3, 3, 3],
[8, 3, 6],
{"id": "integer", "name": "string", "valid": "boolean"},
{"id": "integer", "name": "string", "valid": "boolean"},
True,
[False, False, False],
),
( # auto-infer, (different but merge-able schemas)
[
SAMPLE_DIR.joinpath("simple_test.csv"),
SAMPLE_DIR.joinpath("multi_file_diffschema_1.csv"),
SAMPLE_DIR.joinpath("multi_file_diffschema_2.csv"),
],
"**",
True,
[3, 3, 3],
[8, 3, 6],
{"id": "integer", "name": "string", "valid": "boolean"},
None,
True,
[False, False, False],
),
( # same as previous but change order and expect 5 columns instead of 3 in all
[
SAMPLE_DIR.joinpath("multi_file_diffschema_2.csv"),
SAMPLE_DIR.joinpath("multi_file_diffschema_1.csv"),
SAMPLE_DIR.joinpath("simple_test.csv"),
],
"**",
True,
[5, 5, 5],
[6, 3, 8],
{"id": "integer", "name": "string", "valid": "boolean", "percentage": "number", "nullable": "string"},
None,
True,
[False, False, False],
),
( # like previous test but with a user_schema limiting columns
[
SAMPLE_DIR.joinpath("multi_file_diffschema_2.csv"),
SAMPLE_DIR.joinpath("multi_file_diffschema_1.csv"),
SAMPLE_DIR.joinpath("simple_test.csv"),
],
"**",
True,
[2, 2, 2],
[6, 3, 8],
{"id": "integer", "name": "string"},
{"id": "integer", "name": "string"},
True,
[False, False, False],
),
( # fail when 2nd file has incompatible schema, auto-infer
[
SAMPLE_DIR.joinpath("simple_test.csv"),
SAMPLE_DIR.joinpath("incompatible_schema.csv"),
],
"**",
True,
[3, 3],
[8, 8],
{"id": "integer", "name": "string", "valid": "boolean"},
None,
True,
[False, True],
),
( # fail when 2nd file has incompatible schema, provided schema
[
SAMPLE_DIR.joinpath("simple_test.csv"),
SAMPLE_DIR.joinpath("incompatible_schema.csv"),
],
"**",
True,
[3, 3],
[8, 8],
{"id": "integer", "name": "string", "valid": "boolean"},
{"id": "integer", "name": "string", "valid": "boolean"},
True,
[False, True],
),
],
)
def test_stream_records(
self,
cloud_bucket_prefix: str,
format: Mapping[str, Any],
airbyte_system_columns: Mapping[str, str],
files: List[str],
path_pattern: str,
private: bool,
num_columns: List[int],
num_records: List[int],
expected_schema: Mapping[str, Any],
user_schema: Mapping[str, Any],
incremental: bool,
fails: List[bool],
) -> None:
expected_schema = {k: {"type": ["null", v]} for k, v in expected_schema.items()}
try:
if not incremental: # we expect matching behaviour here in either sync_mode
for sync_mode in [
SyncMode("full_refresh"),
SyncMode("incremental"),
]:
cloud_bucket_name = f"{cloud_bucket_prefix}{uuid4()}"
self._stream_records_test_logic(
cloud_bucket_name,
format,
airbyte_system_columns,
sync_mode,
files,
path_pattern,
private,
num_columns,
num_records,
expected_schema,
user_schema,
fails,
)
self.teardown_infra(cloud_bucket_name, self.credentials)
else:
cloud_bucket_name = f"{cloud_bucket_prefix}{uuid4()}"
latest_state = None
for i in range(len(files)):
latest_state = self._stream_records_test_logic(
cloud_bucket_name,
format,
airbyte_system_columns,
SyncMode("incremental"),
[files[i]],
path_pattern,
private,
num_columns[i],
num_records[i],
expected_schema,
user_schema,
fails[i],
state=latest_state,
)
LOGGER.info(f"incremental state: {latest_state}")
# small delay to ensure next file gets later last_modified timestamp
time.sleep(1)
self.teardown_infra(cloud_bucket_name, self.credentials)
except Exception as e:
self.teardown_infra(cloud_bucket_name, self.credentials)
raise e
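

# A hedged sketch of how a provider-specific child class might wire the abstract members together.
# The class name and import path below are assumptions used purely for illustration; the real S3
# child lives in the S3 connector's integration tests:
#
#     class TestIncrementalFileStreamS3(AbstractTestIncrementalFileStream):
#         @property
#         def stream_class(self) -> type:
#             from source_s3.stream import IncrementalFileStreamS3  # assumed import path
#             return IncrementalFileStreamS3
#
#         @property
#         def credentials(self) -> Mapping:
#             return {"aws_access_key_id": "...", "aws_secret_access_key": "..."}
#
#         def provider(self, bucket_name: str) -> Mapping:
#             return {"storage": "S3", "bucket": bucket_name}
#
#         def cloud_files(self, cloud_bucket_name, credentials, files_to_upload, private=True):
#             ...  # upload the sample files and yield their cloud paths (see the sketch above)
#
#         def teardown_infra(self, cloud_bucket_name, credentials):
#             ...  # delete the uploaded objects and then the bucket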