* Try running only on modified files * make a change * return something with the wrong type * Revert "return something with the wrong type" This reverts commit23b828371e. * fix typing in file-based * format * Mypy * fix * leave as Mapping * Revert "leave as Mapping" This reverts commit908f063f70. * Use Dict * update * move dict() * Revert "move dict()" This reverts commitfa347a8236. * Revert "Revert "move dict()"" This reverts commitc9237df2e4. * Revert "Revert "Revert "move dict()""" This reverts commit5ac1616414. * use Mapping * point to config file * comment * strict = False * remove -- * Revert "comment" This reverts commit6000814a82. * install types * install types in same command as mypy runs * non-interactive * freeze version * pydantic plugin * plugins * update * ignore missing import * Revert "ignore missing import" This reverts commit1da7930fb7. * Install pydantic instead * fix * this passes locally * strict = true * format * explicitly import models * Update * remove old mypy.ini config * temporarily disable mypy * format * any * format * fix tests * format * Automated Commit - Formatting Changes * Revert "temporarily disable mypy" This reverts commiteb8470fa3f. * implicit reexport * update test * fix mypy * Automated Commit - Formatting Changes * fix some errors in tests * more type fixes * more fixes * more * . * done with tests * fix last files * format * Update gradle * change source-stripe * only run mypy on cdk * remove strict * Add more rules * update * ignore missing imports * cast to string * Allow untyped decorator * reset to master * move to the cdk * derp * move explicit imports around * Automated Commit - Formatting Changes * Revert "move explicit imports around" This reverts commit56e306b72f. 
* move explicit imports around * Upgrade mypy version * point to config file * Update readme * Ignore errors in the models module * Automated Commit - Formatting Changes * move check to gradle build * Any * try checking out master too * Revert "try checking out master too" This reverts commit8a8f3e373c. * fetch master * install mypy * try without origin * fetch from the script * checkout master * ls the branches * remotes/origin/master * remove some cruft * comment * remove pydantic types * unpin mypy * fetch from the script * Update connectors base too * modify a non-cdk file to confirm it doesn't get checked by mypy * run mypy after generateComponentManifestClassFiles * run from the venv * pass files as arguments * update * fix when running without args * with subdir * path * try without / * ./ * remove filter * try resetting * Revert "try resetting" This reverts commit3a54c424de. * exclude autogen file * do not use the github action * works locally * remove extra fetch * run on connectors base * try bad typing * Revert "try bad typing" This reverts commit33b512a3e4. * reset stripe * Revert "reset stripe" This reverts commit28f23fc6dd. * Revert "Revert "reset stripe"" This reverts commit5bf5dee371. * missing return type * do not ignore the autogen file * remove extra installs * run from venv * Only check files modified on current branch * Revert "Only check files modified on current branch" This reverts commitb4b728e654. * use merge-base * Revert "use merge-base" This reverts commit3136670cbf. * try with updated mypy * bump * run other steps after mypy * reset task ordering * run mypy though * looser config * tests pass * fix mypy issues * type: ignore * optional * this is always a bool * ignore * fix typing issues * remove ignore * remove mapping * Automated Commit - Formatting Changes * Revert "remove ignore" This reverts commit9ffeeb6cb1. 
* update config --------- Co-authored-by: girarda <girarda@users.noreply.github.com> Co-authored-by: Joe Bell <joseph.bell@airbyte.io>
243 lines
8.6 KiB
Python
243 lines
8.6 KiB
Python
#
|
|
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
|
|
#
|
|
|
|
import json
|
|
from copy import deepcopy
|
|
from enum import Enum
|
|
from functools import total_ordering
|
|
from typing import Any, Dict, List, Literal, Mapping, Optional, Tuple, Type, Union
|
|
|
|
from airbyte_cdk.sources.file_based.exceptions import ConfigValidationError, FileBasedSourceError, SchemaInferenceError
|
|
|
|
# A JSON-schema "type" value: a single type name or a list of type names
# (e.g. for array item unions).
JsonSchemaSupportedType = Union[List[str], Literal["string"], str]
# A schema: maps property names to their JSON-schema type definitions.
SchemaType = Dict[str, Dict[str, JsonSchemaSupportedType]]

# Fallback schema wrapping each record in a single "data" object —
# presumably used when no real schema is inferred; confirm against callers.
schemaless_schema = {"type": "object", "properties": {"data": {"type": "object"}}}
|
|
|
|
|
|
@total_ordering
class ComparableType(Enum):
    """Total ordering over JSON-schema types, from narrowest (NULL) to widest (OBJECT).

    Used to decide which of two types is "wider" when merging inferred schemas.
    """

    NULL = 0
    BOOLEAN = 1
    INTEGER = 2
    NUMBER = 3
    STRING = 4
    OBJECT = 5

    def __lt__(self, other: Any) -> bool:
        # Members are only ordered against members of this same enum; for any
        # other operand, defer via NotImplemented so Python can try the
        # reflected operation (or raise TypeError).
        if self.__class__ is not other.__class__:
            return NotImplemented
        return bool(self.value < other.value)
|
|
|
|
|
|
# Maps a type name to a (JSON-schema type name, Python type) pair.
# Note: both "float" and "number" normalize to JSON-schema "number", and
# "null" has no corresponding Python type (None).
TYPE_PYTHON_MAPPING: Mapping[str, Tuple[str, Optional[Type[Any]]]] = {
    "null": ("null", None),
    "array": ("array", list),
    "boolean": ("boolean", bool),
    "float": ("number", float),
    "integer": ("integer", int),
    "number": ("number", float),
    "object": ("object", dict),
    "string": ("string", str),
}
|
|
|
|
|
|
def get_comparable_type(value: Any) -> Optional[ComparableType]:
    """Map a JSON-schema type name (e.g. "integer") to its ComparableType.

    Returns None for anything unrecognized ("array" included). Comparison is
    done with `==` rather than a dict lookup because `value` may be an
    unhashable list (see JsonSchemaSupportedType).
    """
    known_types = (
        ("null", ComparableType.NULL),
        ("boolean", ComparableType.BOOLEAN),
        ("integer", ComparableType.INTEGER),
        ("number", ComparableType.NUMBER),
        ("string", ComparableType.STRING),
        ("object", ComparableType.OBJECT),
    )
    for name, comparable in known_types:
        if value == name:
            return comparable
    return None
|
|
|
|
|
|
def get_inferred_type(value: Any) -> Optional[ComparableType]:
    """Infer the ComparableType of a Python value, or None if unsupported.

    Lists are intentionally not handled here; callers compare list items
    individually.
    """
    if value is None:
        return ComparableType.NULL
    # Order matters: bool must be tested before int because bool is a
    # subclass of int in Python.
    checks = (
        (bool, ComparableType.BOOLEAN),
        (int, ComparableType.INTEGER),
        (float, ComparableType.NUMBER),
        (str, ComparableType.STRING),
        (dict, ComparableType.OBJECT),
    )
    for python_type, comparable in checks:
        if isinstance(value, python_type):
            return comparable
    return None
|
|
|
|
|
|
def merge_schemas(schema1: SchemaType, schema2: SchemaType) -> SchemaType:
    """
    Return a new dictionary that contains schema1 and schema2.

    Schemas are merged as follows
    - If a key is in one schema but not the other, add it to the base schema with its existing type.
    - If a key is in both schemas but with different types, use the wider type.
    - If the type is a list in one schema but a different type of element in the other schema, raise an exception.
    - If the type is an object in both schemas but the objects are different raise an exception.
    - If the type is an object in one schema but not in the other schema, raise an exception.

    In other words, we support merging
    - any atomic type with any other atomic type (choose the wider of the two)
    - list with list (union)
    and nothing else.
    """
    # Validate every type definition up front so malformed entries surface
    # before any merging happens.
    for key, type_def in list(schema1.items()) + list(schema2.items()):
        well_formed = isinstance(type_def, dict) and "type" in type_def and _is_valid_type(type_def["type"])
        if not well_formed:
            raise SchemaInferenceError(FileBasedSourceError.UNRECOGNIZED_TYPE, key=key, type=type_def)

    # Start from a deep copy of schema1 so neither input is mutated.
    merged: Dict[str, Any] = deepcopy(schema1)
    for key, incoming in schema2.items():
        existing = merged.get(key)
        if existing is None:
            merged[key] = incoming
        elif existing != incoming:
            merged[key] = _choose_wider_type(key, existing, incoming)

    return merged
|
|
|
|
|
|
def _is_valid_type(t: JsonSchemaSupportedType) -> bool:
    """Return True iff ``t`` is "array" or a type name recognized by get_comparable_type."""
    if t == "array":
        return True
    return get_comparable_type(t) is not None
|
|
|
|
|
|
def _choose_wider_type(key: str, t1: Dict[str, Any], t2: Dict[str, Any]) -> Dict[str, Any]:
    """
    Return whichever of the two type definitions is wider per ComparableType.

    :param key: property name, used only for error reporting
    :param t1: first type definition, e.g. {"type": "integer"}
    :param t2: second type definition
    :raises SchemaInferenceError: if one side is an array or object while the
        definitions differ (those cannot be widened), or if either type name
        is not recognized.
    """
    if (t1["type"] == "array" or t2["type"] == "array") and t1 != t2:
        raise SchemaInferenceError(
            FileBasedSourceError.SCHEMA_INFERENCE_ERROR,
            details="Cannot merge schema for unequal array types.",
            key=key,
            detected_types=f"{t1},{t2}",
        )
    elif (t1["type"] == "object" or t2["type"] == "object") and t1 != t2:
        raise SchemaInferenceError(
            FileBasedSourceError.SCHEMA_INFERENCE_ERROR,
            details="Cannot merge schema for unequal object types.",
            key=key,
            detected_types=f"{t1},{t2}",
        )
    else:
        # Normalize through TYPE_PYTHON_MAPPING so aliases like "float" map to
        # the canonical JSON-schema name before comparison.
        comparable_t1 = get_comparable_type(TYPE_PYTHON_MAPPING[t1["type"]][0])
        comparable_t2 = get_comparable_type(TYPE_PYTHON_MAPPING[t2["type"]][0])
        # Bug fix: the original condition (`not comparable_t1 and comparable_t2`)
        # only raised when t1 was unrecognized while t2 was valid; an
        # unrecognized t2 fell through and crashed below with a bare
        # ValueError. Reject if EITHER side is unrecognized.
        if comparable_t1 is None or comparable_t2 is None:
            raise SchemaInferenceError(FileBasedSourceError.UNRECOGNIZED_TYPE, key=key, detected_types=f"{t1},{t2}")
        # On a tie (e.g. "float" vs "number"), prefer t1 — same as the
        # original max([t1, t2], ...) which kept the first element.
        return t1 if comparable_t1 >= comparable_t2 else t2
|
|
|
|
|
|
def is_equal_or_narrower_type(value: Any, expected_type: str) -> bool:
    """
    Return True iff the inferred type of ``value`` is equal to or narrower
    than ``expected_type`` (per the ComparableType ordering).

    :param value: a Python value from a record
    :param expected_type: a JSON-schema type name, e.g. "string"
    """
    if isinstance(value, list):
        # We do not compare lists directly; the individual items are compared.
        # If we hit this condition, it means that the expected type is not
        # compatible with the inferred type.
        return False

    inferred_type = get_inferred_type(value)
    # Bug fix: the original wrapped get_inferred_type's result in
    # ComparableType(...) *before* the None check, so an unsupported value
    # (e.g. a tuple) raised ValueError instead of returning False.
    if inferred_type is None:
        return False

    expected = get_comparable_type(expected_type)
    # An unrecognized expected type cannot match anything (the original
    # crashed here with ValueError via ComparableType(None)).
    if expected is None:
        return False

    return inferred_type <= expected
|
|
|
|
|
|
def conforms_to_schema(record: Mapping[str, Any], schema: Mapping[str, Any]) -> bool:
    """
    Return true iff the record conforms to the supplied schema.

    The record conforms to the supplied schema iff:
    - All columns in the record are in the schema.
    - For every column in the record, that column's type is equal to or narrower than the same column's
      type in the schema.

    None values are treated as conforming for any column type.
    """
    schema_columns = set(schema.get("properties", {}).keys())
    record_columns = set(record.keys())

    if not record_columns.issubset(schema_columns):
        return False

    for column, definition in schema.get("properties", {}).items():
        expected_type = definition.get("type")
        value = record.get(column)

        if value is not None:
            if expected_type == "object":
                # Bug fix: the original did `return isinstance(value, dict)`,
                # which exited the loop early and accepted the whole record as
                # soon as the first object-typed column held a dict, without
                # validating the remaining columns.
                if not isinstance(value, dict):
                    return False
            elif expected_type == "array":
                if not isinstance(value, list):
                    return False
                array_type = definition.get("items", {}).get("type")
                if not all(is_equal_or_narrower_type(v, array_type) for v in value):
                    return False
            elif not is_equal_or_narrower_type(value, expected_type):
                return False

    return True
|
|
|
|
|
|
def _parse_json_input(input_schema: Union[str, Mapping[str, str]]) -> Optional[Mapping[str, str]]:
|
|
try:
|
|
if isinstance(input_schema, str):
|
|
schema: Mapping[str, str] = json.loads(input_schema)
|
|
else:
|
|
schema = input_schema
|
|
if not all(isinstance(s, str) for s in schema.values()):
|
|
raise ConfigValidationError(
|
|
FileBasedSourceError.ERROR_PARSING_USER_PROVIDED_SCHEMA, details="Invalid input schema; nested schemas are not supported."
|
|
)
|
|
|
|
except json.decoder.JSONDecodeError:
|
|
return None
|
|
|
|
return schema
|
|
|
|
|
|
def type_mapping_to_jsonschema(input_schema: Optional[Union[str, Mapping[str, str]]]) -> Optional[Mapping[str, Any]]:
    """
    Return the user input schema (type mapping), transformed to JSON Schema format.

    Verify that the input schema:
    - is a key:value map
    - all values in the map correspond to a JsonSchema datatype
    """
    if not input_schema:
        return None

    properties: Dict[str, Any] = {}

    # Fall back to an empty mapping when the input could not be parsed.
    user_mapping = _parse_json_input(input_schema) or {}

    for raw_column, raw_type in user_mapping.items():
        column = raw_column.strip()
        type_name = raw_type.strip()
        if not column or not type_name:
            raise ConfigValidationError(
                FileBasedSourceError.ERROR_PARSING_USER_PROVIDED_SCHEMA,
                details=f"Invalid input schema; expected mapping in the format column_name: type, got {input_schema}.",
            )

        # Case-insensitive lookup of the canonical JSON-schema type name.
        mapped_type = TYPE_PYTHON_MAPPING.get(type_name.casefold())
        if mapped_type is None:
            raise ConfigValidationError(
                FileBasedSourceError.ERROR_PARSING_USER_PROVIDED_SCHEMA, details=f"Invalid type '{type_name}' for property '{column}'."
            )

        properties[column] = {"type": mapped_type[0]}

    return {"type": "object", "properties": properties}
|