airbyte/airbyte-cdk/python/airbyte_cdk/sources/file_based/schema_helpers.py

#
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
#

import json
from copy import deepcopy
from enum import Enum
from functools import total_ordering
from typing import Any, Dict, List, Literal, Mapping, Optional, Tuple, Type, Union

from airbyte_cdk.sources.file_based.exceptions import ConfigValidationError, FileBasedSourceError, SchemaInferenceError

# A JSON Schema "type" value as used in this module: a single type name or a list of type names.
JsonSchemaSupportedType = Union[List[str], Literal["string"], str]
# Maps a key (column name) to its definition, e.g. {"col": {"type": "string"}}.
SchemaType = Dict[str, Dict[str, JsonSchemaSupportedType]]

# Schema consisting of a single object-typed "data" property.
schemaless_schema = {"type": "object", "properties": {"data": {"type": "object"}}}


@total_ordering
class ComparableType(Enum):
    """
    Rank of each comparable type; a member with a larger value is the wider type.

    Only `__lt__` is written by hand. `total_ordering` derives the remaining
    rich comparisons from it together with the enum's own equality.
    """

    NULL = 0
    BOOLEAN = 1
    INTEGER = 2
    NUMBER = 3
    STRING = 4
    OBJECT = 5

    def __lt__(self, other: Any) -> bool:
        """Compare members by value; return NotImplemented for operands of another class."""
        if other.__class__ is not self.__class__:
            return NotImplemented
        return self.value < other.value  # type: ignore


# Maps each accepted type name to a (JSON Schema type name, Python type) pair.
# "float" is accepted as an additional name that maps to the JSON Schema "number" type.
TYPE_PYTHON_MAPPING: Mapping[str, Tuple[str, Optional[Type[Any]]]] = {
    "null": ("null", None),
    "array": ("array", list),
    "boolean": ("boolean", bool),
    "float": ("number", float),
    "integer": ("integer", int),
    "number": ("number", float),
    "object": ("object", dict),
    "string": ("string", str),
}
# Reverse lookup from Python type to type name. "float" and "number" share the Python type
# `float`, so the later entry wins and `float` maps to "number"; `None` maps to "null".
PYTHON_TYPE_MAPPING = {t: k for k, (_, t) in TYPE_PYTHON_MAPPING.items()}


def get_comparable_type(value: Any) -> Optional[ComparableType]:
    """
    Return the ComparableType whose name matches the given type name.

    The recognized names are "null", "boolean", "integer", "number", "string" and "object".
    Any other value (including "array") yields None.
    """
    # Equality tests (rather than a dict lookup) keep unhashable inputs such as lists working.
    for type_name in ("null", "boolean", "integer", "number", "string", "object"):
        if value == type_name:
            return ComparableType[type_name.upper()]
    return None


def get_inferred_type(value: Any) -> Optional[ComparableType]:
    """
    Return the ComparableType corresponding to the Python type of `value`.

    None maps to NULL; bool, int, float, str and dict map to BOOLEAN, INTEGER, NUMBER,
    STRING and OBJECT respectively. Values of any other type yield None.
    """
    if value is None:
        return ComparableType.NULL

    # Order matters: bool is a subclass of int, so it has to be tested first.
    python_type_ranks = (
        (bool, ComparableType.BOOLEAN),
        (int, ComparableType.INTEGER),
        (float, ComparableType.NUMBER),
        (str, ComparableType.STRING),
        (dict, ComparableType.OBJECT),
    )
    for python_type, comparable_type in python_type_ranks:
        if isinstance(value, python_type):
            return comparable_type
    return None


def merge_schemas(schema1: SchemaType, schema2: SchemaType) -> SchemaType:
    """
    Return a new dictionary that combines schema1 and schema2.

    Neither input is modified and the result shares no nested objects with them.

    Schemas are merged as follows
    - If a key is in only one schema, it is kept with its existing definition.
    - If a key is in both schemas with equal definitions, that definition is kept.
    - If a key is in both schemas with different atomic types, the wider type is used.
    - If either definition is an array or an object and the two definitions differ,
      an exception is raised.

    :param schema1: the base schema; its keys come first in the result.
    :param schema2: the schema merged into the base.
    :return: the merged schema.
    :raises SchemaInferenceError: if any definition is not a dict with a recognized "type",
        or if two definitions for the same key cannot be merged.
    """
    # Validate every definition up front so that the merge below only sees recognized types.
    for schema in (schema1, schema2):
        for key, definition in schema.items():
            if not isinstance(definition, dict) or "type" not in definition or not _is_valid_type(definition["type"]):
                raise SchemaInferenceError(FileBasedSourceError.UNRECOGNIZED_TYPE, key=key, type=definition)

    merged_schema: Dict[str, Any] = deepcopy(schema1)
    for key, incoming in schema2.items():
        existing = merged_schema.get(key)
        if existing is None:
            # Copy so that later mutation of the result cannot leak into schema2.
            merged_schema[key] = deepcopy(incoming)
        elif existing != incoming:
            merged_schema[key] = deepcopy(_choose_wider_type(key, existing, incoming))

    return merged_schema


def _is_valid_type(t: JsonSchemaSupportedType) -> bool:
    """Return True if `t` is "array" or a type name that has a ComparableType."""
    if t == "array":
        return True
    return get_comparable_type(t) is not None


def _choose_wider_type(key: str, t1: Dict[str, Any], t2: Dict[str, Any]) -> Dict[str, Any]:
    """
    Return whichever of the two definitions has the wider type.

    When both definitions rank equally, `t1` is returned.

    :param key: the schema key being merged; only used in error reporting.
    :param t1: the first definition; must contain a "type" entry.
    :param t2: the second definition; must contain a "type" entry.
    :return: `t1` or `t2` (the same object that was passed in, not a copy).
    :raises SchemaInferenceError: if the definitions differ and either one is an array or an
        object, or if either type cannot be ranked.
    """
    if (t1["type"] == "array" or t2["type"] == "array") and t1 != t2:
        raise SchemaInferenceError(
            FileBasedSourceError.SCHEMA_INFERENCE_ERROR,
            details="Cannot merge schema for unequal array types.",
            key=key,
            detected_types=f"{t1},{t2}",
        )
    if (t1["type"] == "object" or t2["type"] == "object") and t1 != t2:
        raise SchemaInferenceError(
            FileBasedSourceError.SCHEMA_INFERENCE_ERROR,
            details="Cannot merge schema for unequal object types.",
            key=key,
            detected_types=f"{t1},{t2}",
        )

    ranks = []
    for definition in (t1, t2):
        # Translate the declared name (e.g. "float") to its JSON Schema name before ranking.
        mapped = TYPE_PYTHON_MAPPING.get(definition["type"])
        ranks.append(get_comparable_type(mapped[0]) if mapped else None)
    comparable_t1, comparable_t2 = ranks

    # Explicit None checks: the previous `not a and b` test let unrankable types through.
    if comparable_t1 is None or comparable_t2 is None:
        raise SchemaInferenceError(FileBasedSourceError.UNRECOGNIZED_TYPE, key=key, detected_types=f"{t1},{t2}")

    # Strict comparison keeps t1 on ties, matching max() over [t1, t2].
    return t2 if comparable_t2 > comparable_t1 else t1


def is_equal_or_narrower_type(value: Any, expected_type: str) -> bool:
    """
    Return True iff the type inferred from `value` is equal to or narrower than `expected_type`.

    :param value: the value whose type is inferred.
    :param expected_type: a type name that has a ComparableType (e.g. "string").
    :return: False for lists and for values whose type cannot be inferred; otherwise the
        result of comparing the inferred type against the expected one.
    :raises ValueError: if `expected_type` has no ComparableType (for example "array").
    """
    if isinstance(value, list):
        # We do not compare lists directly; the individual items are compared.
        # If we hit this condition, it means that the expected type is not
        # compatible with the inferred type.
        return False

    inferred_type = get_inferred_type(value)
    if inferred_type is None:
        # Unsupported Python type. Wrapping the lookup in ComparableType(...) used to raise
        # ValueError before this check could ever run.
        return False

    return inferred_type <= ComparableType(get_comparable_type(expected_type))


def conforms_to_schema(record: Mapping[str, Any], schema: Mapping[str, Any]) -> bool:
    """
    Return true iff the record conforms to the supplied schema.

    The record conforms to the supplied schema iff:
    - All columns in the record are in the schema.
    - For every column in the record, that column's type is equal to or narrower than the same column's
      type in the schema.

    Columns that are missing from the record or whose value is None are not checked.
    For an "object" column the value must be a dict. For an "array" column the value must be
    a list whose items each conform to the type declared under "items".

    :param record: the record to validate, keyed by column name.
    :param schema: a mapping whose "properties" entry maps column names to definitions.
    :return: True if every checked column conforms, False otherwise.
    """
    properties = schema.get("properties", {})
    schema_columns = set(properties.keys())
    record_columns = set(record.keys())

    if not record_columns.issubset(schema_columns):
        return False

    for column, definition in properties.items():
        expected_type = definition.get("type")
        value = record.get(column)

        if value is None:
            continue

        if expected_type == "object":
            # Only reject here; returning early on success would skip the remaining columns.
            if not isinstance(value, dict):
                return False
        elif expected_type == "array":
            if not isinstance(value, list):
                return False
            array_type = definition.get("items", {}).get("type")
            if not all(is_equal_or_narrower_type(item, array_type) for item in value):
                return False
        elif not is_equal_or_narrower_type(value, expected_type):
            return False

    return True


def _parse_json_input(input_schema: Union[str, Mapping[str, str]]) -> Optional[Mapping[str, str]]:
    try:
        if isinstance(input_schema, str):
            schema: Mapping[str, str] = json.loads(input_schema)
        else:
            schema = input_schema
        if not all(isinstance(s, str) for s in schema.values()):
            raise ConfigValidationError(
                FileBasedSourceError.ERROR_PARSING_USER_PROVIDED_SCHEMA, details="Invalid input schema; nested schemas are not supported."
            )

    except json.decoder.JSONDecodeError:
        return None

    return schema


def type_mapping_to_jsonschema(input_schema: Optional[Union[str, Mapping[str, str]]]) -> Optional[Mapping[str, Any]]:
    """
    Convert the user input schema (a column name -> type name mapping) to JSON Schema format.

    Column names and type names are stripped of surrounding whitespace, and type names are
    matched case-insensitively against the supported type names.

    :param input_schema: a JSON string or a mapping; a falsy value yields None.
    :return: an object schema whose "properties" hold one {"type": ...} entry per column,
        or None when no input schema was given.
    :raises ConfigValidationError: if a column name or type name is empty after stripping,
        or if a type name is not a supported type.
    """
    if not input_schema:
        return None

    parsed_mapping = _parse_json_input(input_schema) or {}
    properties = {}

    for raw_column, raw_type in parsed_mapping.items():
        col_name = raw_column.strip()
        type_name = raw_type.strip()
        if not col_name or not type_name:
            raise ConfigValidationError(
                FileBasedSourceError.ERROR_PARSING_USER_PROVIDED_SCHEMA,
                details=f"Invalid input schema; expected mapping in the format column_name: type, got {input_schema}.",
            )

        mapped_type = TYPE_PYTHON_MAPPING.get(type_name.casefold())
        if not mapped_type:
            raise ConfigValidationError(
                FileBasedSourceError.ERROR_PARSING_USER_PROVIDED_SCHEMA, details=f"Invalid type '{type_name}' for property '{col_name}'."
            )

        # The first element of the mapping entry is the JSON Schema type name.
        properties[col_name] = {"type": mapped_type[0]}

    return {"type": "object", "properties": properties}