352 lines
14 KiB
Python
352 lines
14 KiB
Python
#
|
|
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
|
|
#
|
|
|
|
from typing import List, Mapping
|
|
|
|
import pytest
|
|
from airbyte_cdk.models import AirbyteRecordMessage
|
|
from airbyte_cdk.utils.schema_inferrer import SchemaInferrer, SchemaValidationException
|
|
|
|
NOW = 1234567
|
|
|
|
|
|
@pytest.mark.parametrize(
|
|
"input_records,expected_schemas",
|
|
[
|
|
pytest.param(
|
|
[
|
|
{"stream": "my_stream", "data": {"field_A": "abc"}},
|
|
{"stream": "my_stream", "data": {"field_A": "def"}},
|
|
],
|
|
{"my_stream": {"field_A": {"type": ["string", "null"]}}},
|
|
id="test_basic",
|
|
),
|
|
pytest.param(
|
|
[
|
|
{"stream": "my_stream", "data": {"field_A": 1.0}},
|
|
{"stream": "my_stream", "data": {"field_A": "abc"}},
|
|
],
|
|
{"my_stream": {"field_A": {"type": ["number", "string", "null"]}}},
|
|
id="test_deriving_schema_refine",
|
|
),
|
|
pytest.param(
|
|
[
|
|
{"stream": "my_stream", "data": {"obj": {"data": [1.0, 2.0, 3.0]}}},
|
|
{"stream": "my_stream", "data": {"obj": {"other_key": "xyz"}}},
|
|
],
|
|
{
|
|
"my_stream": {
|
|
"obj": {
|
|
"type": ["object", "null"],
|
|
"properties": {
|
|
"data": {"type": ["array", "null"], "items": {"type": ["number", "null"]}},
|
|
"other_key": {"type": ["string", "null"]},
|
|
},
|
|
}
|
|
}
|
|
},
|
|
id="test_derive_schema_for_nested_structures",
|
|
),
|
|
pytest.param(
|
|
[
|
|
{"stream": "my_stream", "data": {"field_A": 1}},
|
|
{"stream": "my_stream", "data": {"field_A": 2}},
|
|
],
|
|
{"my_stream": {"field_A": {"type": ["number", "null"]}}},
|
|
id="test_integer_number",
|
|
),
|
|
pytest.param(
|
|
[
|
|
{"stream": "my_stream", "data": {"field_A": None}},
|
|
],
|
|
{"my_stream": {}},
|
|
id="test_null",
|
|
),
|
|
pytest.param(
|
|
[
|
|
{"stream": "my_stream", "data": {"field_A": None}},
|
|
{"stream": "my_stream", "data": {"field_A": "abc"}},
|
|
],
|
|
{"my_stream": {"field_A": {"type": ["string", "null"]}}},
|
|
id="test_null_optional",
|
|
),
|
|
pytest.param(
|
|
[
|
|
{"stream": "my_stream", "data": {"field_A": None}},
|
|
{"stream": "my_stream", "data": {"field_A": {"nested": "abc"}}},
|
|
],
|
|
{"my_stream": {"field_A": {"type": ["object", "null"], "properties": {"nested": {"type": ["string", "null"]}}}}},
|
|
id="test_any_of",
|
|
),
|
|
pytest.param(
|
|
[
|
|
{"stream": "my_stream", "data": {"field_A": None}},
|
|
{"stream": "my_stream", "data": {"field_A": {"nested": "abc", "nully": None}}},
|
|
],
|
|
{"my_stream": {"field_A": {"type": ["object", "null"], "properties": {"nested": {"type": ["string", "null"]}}}}},
|
|
id="test_any_of_with_null",
|
|
),
|
|
pytest.param(
|
|
[
|
|
{"stream": "my_stream", "data": {"field_A": None}},
|
|
{"stream": "my_stream", "data": {"field_A": {"nested": "abc", "nully": None}}},
|
|
{"stream": "my_stream", "data": {"field_A": {"nested": "abc", "nully": "a string"}}},
|
|
],
|
|
{
|
|
"my_stream": {
|
|
"field_A": {
|
|
"type": ["object", "null"],
|
|
"properties": {"nested": {"type": ["string", "null"]}, "nully": {"type": ["string", "null"]}},
|
|
}
|
|
}
|
|
},
|
|
id="test_any_of_with_null_union",
|
|
),
|
|
pytest.param(
|
|
[
|
|
{"stream": "my_stream", "data": {"field_A": {"nested": "abc", "nully": "a string"}}},
|
|
{"stream": "my_stream", "data": {"field_A": None}},
|
|
{"stream": "my_stream", "data": {"field_A": {"nested": "abc", "nully": None}}},
|
|
],
|
|
{
|
|
"my_stream": {
|
|
"field_A": {
|
|
"type": ["object", "null"],
|
|
"properties": {"nested": {"type": ["string", "null"]}, "nully": {"type": ["string", "null"]}},
|
|
}
|
|
}
|
|
},
|
|
id="test_any_of_with_null_union_changed_order",
|
|
),
|
|
pytest.param(
|
|
[
|
|
{"stream": "my_stream", "data": {"field_A": "abc", "nested": {"field_B": None}}},
|
|
],
|
|
{"my_stream": {"field_A": {"type": ["string", "null"]}, "nested": {"type": ["object", "null"], "properties": {}}}},
|
|
id="test_nested_null",
|
|
),
|
|
pytest.param(
|
|
[
|
|
{"stream": "my_stream", "data": {"field_A": "abc", "nested": [{"field_B": None, "field_C": "abc"}]}},
|
|
],
|
|
{
|
|
"my_stream": {
|
|
"field_A": {"type": ["string", "null"]},
|
|
"nested": {
|
|
"type": ["array", "null"],
|
|
"items": {"type": ["object", "null"], "properties": {"field_C": {"type": ["string", "null"]}}},
|
|
},
|
|
}
|
|
},
|
|
id="test_array_nested_null",
|
|
),
|
|
pytest.param(
|
|
[
|
|
{"stream": "my_stream", "data": {"field_A": "abc", "nested": None}},
|
|
{"stream": "my_stream", "data": {"field_A": "abc", "nested": [{"field_B": None, "field_C": "abc"}]}},
|
|
],
|
|
{
|
|
"my_stream": {
|
|
"field_A": {"type": ["string", "null"]},
|
|
"nested": {
|
|
"type": ["array", "null"],
|
|
"items": {"type": ["object", "null"], "properties": {"field_C": {"type": ["string", "null"]}}},
|
|
},
|
|
}
|
|
},
|
|
id="test_array_top_level_null",
|
|
),
|
|
pytest.param(
|
|
[
|
|
{"stream": "my_stream", "data": {"field_A": None}},
|
|
{"stream": "my_stream", "data": {"field_A": "abc"}},
|
|
],
|
|
{"my_stream": {"field_A": {"type": ["string", "null"]}}},
|
|
id="test_null_string",
|
|
),
|
|
pytest.param(
|
|
[
|
|
{
|
|
"stream": "data_with_nested_arrays",
|
|
"data": {
|
|
"root_property_object": {
|
|
"property_array": [
|
|
{"title": "Nested_1", "type": "multi-value", "value": ["XL"]},
|
|
{
|
|
"title": "Nested_2",
|
|
"type": "location",
|
|
"value": {"nested_key_1": "GB", "nested_key_2": "United Kingdom"},
|
|
},
|
|
],
|
|
}
|
|
},
|
|
},
|
|
],
|
|
{
|
|
"data_with_nested_arrays": {
|
|
"root_property_object": {
|
|
"type": ["object", "null"],
|
|
"properties": {
|
|
"property_array": {
|
|
"type": ["array", "null"],
|
|
"items": {
|
|
"type": ["object", "null"],
|
|
"properties": {
|
|
"title": {"type": ["string", "null"]},
|
|
"type": {"type": ["string", "null"]},
|
|
"value": {
|
|
"anyOf": [
|
|
{"type": "array", "items": {"type": "string"}},
|
|
{
|
|
"type": "object",
|
|
"properties": {"nested_key_1": {"type": "string"}, "nested_key_2": {"type": "string"}},
|
|
},
|
|
]
|
|
},
|
|
},
|
|
},
|
|
}
|
|
},
|
|
}
|
|
}
|
|
},
|
|
id="test_data_with_nested_arrays",
|
|
),
|
|
],
|
|
)
|
|
def test_schema_derivation(input_records: List, expected_schemas: Mapping):
|
|
inferrer = SchemaInferrer()
|
|
for record in input_records:
|
|
inferrer.accumulate(AirbyteRecordMessage(stream=record["stream"], data=record["data"], emitted_at=NOW))
|
|
|
|
for stream_name, expected_schema in expected_schemas.items():
|
|
assert inferrer.get_stream_schema(stream_name) == {
|
|
"$schema": "http://json-schema.org/schema#",
|
|
"type": "object",
|
|
"properties": expected_schema,
|
|
}
|
|
|
|
|
|
_STREAM_NAME = "a stream name"
|
|
_ANY_VALUE = "any value"
|
|
_IS_PK = True
|
|
_IS_CURSOR_FIELD = True
|
|
|
|
|
|
def _create_inferrer_with_required_field(is_pk: bool, field: List[List[str]]) -> SchemaInferrer:
|
|
if is_pk:
|
|
return SchemaInferrer(field)
|
|
return SchemaInferrer([[]], field)
|
|
|
|
|
|
@pytest.mark.parametrize(
|
|
"is_pk",
|
|
[
|
|
pytest.param(_IS_PK, id="required_field_is_pk"),
|
|
pytest.param(_IS_CURSOR_FIELD, id="required_field_is_cursor_field"),
|
|
],
|
|
)
|
|
def test_field_is_on_root(is_pk: bool):
|
|
inferrer = _create_inferrer_with_required_field(is_pk, [["property"]])
|
|
|
|
inferrer.accumulate(AirbyteRecordMessage(stream=_STREAM_NAME, data={"property": _ANY_VALUE}, emitted_at=NOW))
|
|
|
|
assert inferrer.get_stream_schema(_STREAM_NAME)["required"] == ["property"]
|
|
assert inferrer.get_stream_schema(_STREAM_NAME)["properties"]["property"]["type"] == "string"
|
|
|
|
|
|
@pytest.mark.parametrize(
|
|
"is_pk",
|
|
[
|
|
pytest.param(_IS_PK, id="required_field_is_pk"),
|
|
pytest.param(_IS_CURSOR_FIELD, id="required_field_is_cursor_field"),
|
|
],
|
|
)
|
|
def test_field_is_nested(is_pk: bool):
|
|
inferrer = _create_inferrer_with_required_field(is_pk, [["property", "nested_property"]])
|
|
|
|
inferrer.accumulate(AirbyteRecordMessage(stream=_STREAM_NAME, data={"property": {"nested_property": _ANY_VALUE}}, emitted_at=NOW))
|
|
|
|
assert inferrer.get_stream_schema(_STREAM_NAME)["required"] == ["property"]
|
|
assert inferrer.get_stream_schema(_STREAM_NAME)["properties"]["property"]["type"] == "object"
|
|
assert inferrer.get_stream_schema(_STREAM_NAME)["properties"]["property"]["required"] == ["nested_property"]
|
|
|
|
|
|
@pytest.mark.parametrize(
|
|
"is_pk",
|
|
[
|
|
pytest.param(_IS_PK, id="required_field_is_pk"),
|
|
pytest.param(_IS_CURSOR_FIELD, id="required_field_is_cursor_field"),
|
|
],
|
|
)
|
|
def test_field_is_composite(is_pk: bool):
|
|
inferrer = _create_inferrer_with_required_field(is_pk, [["property 1"], ["property 2"]])
|
|
inferrer.accumulate(
|
|
AirbyteRecordMessage(stream=_STREAM_NAME, data={"property 1": _ANY_VALUE, "property 2": _ANY_VALUE}, emitted_at=NOW)
|
|
)
|
|
assert inferrer.get_stream_schema(_STREAM_NAME)["required"] == ["property 1", "property 2"]
|
|
|
|
|
|
@pytest.mark.parametrize(
|
|
"is_pk",
|
|
[
|
|
pytest.param(_IS_PK, id="required_field_is_pk"),
|
|
pytest.param(_IS_CURSOR_FIELD, id="required_field_is_cursor_field"),
|
|
],
|
|
)
|
|
def test_field_is_composite_and_nested(is_pk: bool):
|
|
inferrer = _create_inferrer_with_required_field(is_pk, [["property 1", "nested"], ["property 2"]])
|
|
|
|
inferrer.accumulate(
|
|
AirbyteRecordMessage(stream=_STREAM_NAME, data={"property 1": {"nested": _ANY_VALUE}, "property 2": _ANY_VALUE}, emitted_at=NOW)
|
|
)
|
|
|
|
assert inferrer.get_stream_schema(_STREAM_NAME)["required"] == ["property 1", "property 2"]
|
|
assert inferrer.get_stream_schema(_STREAM_NAME)["properties"]["property 1"]["type"] == "object"
|
|
assert inferrer.get_stream_schema(_STREAM_NAME)["properties"]["property 2"]["type"] == "string"
|
|
assert inferrer.get_stream_schema(_STREAM_NAME)["properties"]["property 1"]["required"] == ["nested"]
|
|
assert inferrer.get_stream_schema(_STREAM_NAME)["properties"]["property 1"]["properties"]["nested"]["type"] == "string"
|
|
|
|
|
|
def test_given_pk_does_not_exist_when_get_inferred_schemas_then_raise_error():
|
|
inferrer = SchemaInferrer([["pk does not exist"]])
|
|
inferrer.accumulate(AirbyteRecordMessage(stream=_STREAM_NAME, data={"id": _ANY_VALUE}, emitted_at=NOW))
|
|
|
|
with pytest.raises(SchemaValidationException) as exception:
|
|
inferrer.get_stream_schema(_STREAM_NAME)
|
|
|
|
assert len(exception.value.validation_errors) == 1
|
|
|
|
|
|
def test_given_pk_path_is_partially_valid_when_get_inferred_schemas_then_validation_error_mentions_where_the_issue_is():
|
|
inferrer = SchemaInferrer([["id", "nested pk that does not exist"]])
|
|
inferrer.accumulate(AirbyteRecordMessage(stream=_STREAM_NAME, data={"id": _ANY_VALUE}, emitted_at=NOW))
|
|
|
|
with pytest.raises(SchemaValidationException) as exception:
|
|
inferrer.get_stream_schema(_STREAM_NAME)
|
|
|
|
assert len(exception.value.validation_errors) == 1
|
|
assert "Path ['id']" in exception.value.validation_errors[0]
|
|
|
|
|
|
def test_given_composite_pk_but_only_one_path_valid_when_get_inferred_schemas_then_valid_path_is_required():
|
|
inferrer = SchemaInferrer([["id 1"], ["id 2"]])
|
|
inferrer.accumulate(AirbyteRecordMessage(stream=_STREAM_NAME, data={"id 1": _ANY_VALUE}, emitted_at=NOW))
|
|
|
|
with pytest.raises(SchemaValidationException) as exception:
|
|
inferrer.get_stream_schema(_STREAM_NAME)
|
|
|
|
assert exception.value.schema["required"] == ["id 1"]
|
|
|
|
|
|
def test_given_composite_pk_but_only_one_path_valid_when_get_inferred_schemas_then_validation_error_mentions_where_the_issue_is():
|
|
inferrer = SchemaInferrer([["id 1"], ["id 2"]])
|
|
inferrer.accumulate(AirbyteRecordMessage(stream=_STREAM_NAME, data={"id 1": _ANY_VALUE}, emitted_at=NOW))
|
|
|
|
with pytest.raises(SchemaValidationException) as exception:
|
|
inferrer.get_stream_schema(_STREAM_NAME)
|
|
|
|
assert len(exception.value.validation_errors) == 1
|
|
assert "id 2" in exception.value.validation_errors[0]
|