## What

Migrating to Pydantic V2 for protocol messages to speed up emitting records. This gives us a 2.5x boost over V1.

Close https://github.com/airbytehq/airbyte-internal-issues/issues/8333

## How

- Switch to using protocol models generated for pydantic_v2, in a new (temporary) package, `airbyte-protocol-models-pdv2`.
- Update the CDK's pydantic dependency accordingly to v2.
- For minimal impact, still use the compatibility code `pydantic.v1` in all of the airbyte-cdk pydantic code that does not interact with the protocol models.

## Review guide

1. Check out the code and clear your CDK virtual env (either `rm -rf .venv && python -m venv .venv` or `poetry env list; poetry env remove <env>`). This is necessary to fully clean out the `airbyte_protocol` library, for some reason. Then run `poetry lock --no-update && poetry install --all-extras`. This should install the CDK with the new models.
2. Run unit tests on the CDK.
3. Take your favorite connector, point its `pyproject.toml` at the local CDK (see the example in `source-s3`), and try running its tests and its regression tests.

## User Impact

> [!warning]
> This is a major CDK change due to the pydantic dependency change - if connectors use pydantic 1.10, they will break and will need to make similar `from pydantic.v1` updates to get running again (see the sketch below). Therefore, we should release this as a major CDK version bump.

## Can this PR be safely reverted and rolled back?

- [x] YES 💚
- [ ] NO ❌

Even if sources migrate to this version, the state format should not change, so a revert should be possible.

## Follow up work - Ella to move into issues

<details>

### Source-s3 - turn this into an issue

- [ ] Update source-s3 CDK version and make any required code changes
- [ ] Fix source-s3 unit tests
- [ ] Run source-s3 regression tests
- [ ] Merge and release source-s3 by June 21st

### Docs

- [ ] Update documentation on how to build with the CDK

### CDK pieces

- [ ] Update file-based CDK format validation to use Pydantic V2
  - This is doable, and requires a breaking change to `OneOfOptionConfig`. There are a few unhandled test cases that present issues we're unsure of how to handle so far.
- [ ] Update low-code component generators to use Pydantic V2
  - This is doable; there are a few issues around custom component generation that are unhandled.

### Further CDK performance work - create issues for these

- [ ] Research if we can replace prints with buffered output (write to a byte buffer and then flush to stdout)
- [ ] Replace `json` with `orjson`

...

</details>
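For connectors pinned to pydantic 1.10, the migration is typically a one-line import change. A minimal sketch, assuming a hypothetical connector module that currently imports pydantic directly:

```python
# Hypothetical connector model code; only the import line changes.
# Before (pydantic 1.10):
#   from pydantic import BaseModel, Field
# After (CDK on pydantic v2, via the v1 compatibility layer):
from pydantic.v1 import BaseModel, Field


class ExampleSourceConfig(BaseModel):
    api_key: str = Field(..., description="API key used to authenticate")
```

Existing v1-style validators and model methods keep working unchanged under `pydantic.v1`.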
#
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
#


import importlib
import json
import os
import pkgutil
from typing import Any, ClassVar, Dict, List, Mapping, MutableMapping, Optional, Tuple

import jsonref
from airbyte_cdk.models import ConnectorSpecification, FailureType
from airbyte_cdk.utils.traced_exception import AirbyteTracedException
from jsonschema import RefResolver, validate
from jsonschema.exceptions import ValidationError
from pydantic.v1 import BaseModel, Field

class JsonFileLoader:
    """
    Custom json file loader to resolve references to resources located in the "shared" directory.
    We need this for compatibility with existing schemas because all of them have references
    pointing to shared_schema.json instead of shared/shared_schema.json.
    """

    def __init__(self, uri_base: str, shared: str):
        self.shared = shared
        self.uri_base = uri_base

    def __call__(self, uri: str) -> Dict[str, Any]:
        uri = uri.replace(self.uri_base, f"{self.uri_base}/{self.shared}/")
        with open(uri) as f:
            data = json.load(f)
        if isinstance(data, dict):
            return data
        else:
            raise ValueError(f"Expected to read a dictionary from {uri}. Got: {data}")

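# Illustrative sketch (not part of the original module; the "/pkg/" path is
# hypothetical): the loader rewrites a shared-schema URI so that a $ref to
# "shared_schema.json" is read from "schemas/shared/" instead.
#
#   loader = JsonFileLoader("/pkg/", "schemas/shared")
#   loader("/pkg/shared_schema.json")
#   # reads /pkg//schemas/shared/shared_schema.json
#   # (the doubled slash comes from the caller's trailing "/" and is harmless on POSIX)
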
def resolve_ref_links(obj: Any) -> Any:
    """
    Scan resolved schema and convert jsonref.JsonRef objects to JSON serializable dicts.

    :param obj - jsonschema object with ref field resolved.
    :return JSON serializable object with references without external dependencies.
    """
    if isinstance(obj, jsonref.JsonRef):
        obj = resolve_ref_links(obj.__subject__)
        # Omit existing definitions for the external resource since
        # we don't need them anymore.
        if isinstance(obj, dict):
            obj.pop("definitions", None)
            return obj
        else:
            raise ValueError(f"Expected obj to be a dict. Got {obj}")
    elif isinstance(obj, dict):
        return {k: resolve_ref_links(v) for k, v in obj.items()}
    elif isinstance(obj, list):
        return [resolve_ref_links(item) for item in obj]
    else:
        return obj

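# Illustrative sketch (not part of the original module): after jsonref
# resolution, nested values may still be lazy JsonRef proxies; this pass
# converts the whole tree into plain, JSON-serializable dicts and lists.
#
#   resolved = jsonref.JsonRef.replace_refs(raw_schema, base_uri=base, loader=loader)
#   plain = resolve_ref_links(resolved)
#   json.dumps(plain)  # now safe: no JsonRef proxies left
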
def _expand_refs(schema: Any, ref_resolver: Optional[RefResolver] = None) -> None:
    """Internal function to iterate over schema and replace all occurrences of $ref with their definitions. Recursive.

    :param schema: schema that will be patched
    :param ref_resolver: resolver to get definition from $ref; if None is passed it will be instantiated
    """
    ref_resolver = ref_resolver or RefResolver.from_schema(schema)

    if isinstance(schema, MutableMapping):
        if "$ref" in schema:
            ref_url = schema.pop("$ref")
            _, definition = ref_resolver.resolve(ref_url)
            _expand_refs(definition, ref_resolver=ref_resolver)  # expand refs in definitions as well
            schema.update(definition)
        else:
            for key, value in schema.items():
                _expand_refs(value, ref_resolver=ref_resolver)
    elif isinstance(schema, List):
        for value in schema:
            _expand_refs(value, ref_resolver=ref_resolver)

def expand_refs(schema: Any) -> None:
    """Iterate over schema and replace all occurrences of $ref with their definitions.

    :param schema: schema that will be patched
    """
    _expand_refs(schema)
    schema.pop("definitions", None)  # remove definitions created by $ref

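# Illustrative sketch (not part of the original module): expand_refs() patches
# a schema in place, inlining local $ref pointers and dropping "definitions".
#
#   schema = {
#       "definitions": {"id": {"type": "integer"}},
#       "properties": {"user_id": {"$ref": "#/definitions/id"}},
#   }
#   expand_refs(schema)
#   # schema == {"properties": {"user_id": {"type": "integer"}}}
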
def rename_key(schema: Any, old_key: str, new_key: str) -> None:
    """Iterate over nested dictionary and replace one key with another. Used to replace anyOf with oneOf. Recursive.

    :param schema: schema that will be patched
    :param old_key: name of the key to replace
    :param new_key: new name of the key
    """
    if not isinstance(schema, MutableMapping):
        return

    for key, value in schema.items():
        rename_key(value, old_key, new_key)
        if old_key in schema:
            schema[new_key] = schema.pop(old_key)

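# Illustrative sketch (not part of the original module): rewriting "anyOf"
# into "oneOf" everywhere in a nested schema.
#
#   schema = {"properties": {"field": {"anyOf": [{"type": "string"}, {"type": "null"}]}}}
#   rename_key(schema, old_key="anyOf", new_key="oneOf")
#   # schema == {"properties": {"field": {"oneOf": [{"type": "string"}, {"type": "null"}]}}}
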
class ResourceSchemaLoader:
    """JSONSchema loader from package resources"""

    def __init__(self, package_name: str):
        self.package_name = package_name

    def get_schema(self, name: str) -> dict[str, Any]:
        """
        This method retrieves a JSON schema from the schemas/ folder.

        The expected file structure is to have all top-level schemas (corresponding to streams) in the "schemas/" folder, with any shared $refs
        living inside the "schemas/shared/" folder. For example:

        schemas/shared/<shared_definition>.json
        schemas/<name>.json   # contains a $ref to shared_definition
        schemas/<name2>.json  # contains a $ref to shared_definition
        """

        schema_filename = f"schemas/{name}.json"
        raw_file = pkgutil.get_data(self.package_name, schema_filename)
        if not raw_file:
            raise IOError(f"Cannot find file {schema_filename}")
        try:
            raw_schema = json.loads(raw_file)
        except ValueError as err:
            raise RuntimeError(f"Invalid JSON file format for file {schema_filename}") from err

        return self._resolve_schema_references(raw_schema)

    def _resolve_schema_references(self, raw_schema: dict[str, Any]) -> dict[str, Any]:
        """
        Resolve links to external references and move them to a local "definitions" map.

        :param raw_schema jsonschema to look up external links in.
        :return JSON serializable object with references without external dependencies.
        """

        package = importlib.import_module(self.package_name)
        if package.__file__:
            base = os.path.dirname(package.__file__) + "/"
        else:
            raise ValueError(f"Package {package} does not have a valid __file__ field")
        resolved = jsonref.JsonRef.replace_refs(raw_schema, loader=JsonFileLoader(base, "schemas/shared"), base_uri=base)
        resolved = resolve_ref_links(resolved)
        if isinstance(resolved, dict):
            return resolved
        else:
            raise ValueError(f"Expected resolved to be a dict. Got {resolved}")

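# Illustrative sketch (not part of the original module; "source_example" and
# the stream name "users" are hypothetical):
#
#   loader = ResourceSchemaLoader("source_example")
#   users_schema = loader.get_schema("users")  # loads source_example/schemas/users.json
#   # $refs into schemas/shared/ are resolved and inlined in the returned dict
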
def check_config_against_spec_or_exit(config: Mapping[str, Any], spec: ConnectorSpecification) -> None:
    """
    Check config object against spec. In case the config is invalid, throws
    an exception with the validation error description.

    :param config - config loaded from file specified over command line
    :param spec - spec object generated by connector
    """
    spec_schema = spec.connectionSpecification
    try:
        validate(instance=config, schema=spec_schema)
    except ValidationError as validation_error:
        raise AirbyteTracedException(
            message="Config validation error: " + validation_error.message,
            internal_message=validation_error.message,
            failure_type=FailureType.config_error,
        ) from None  # required to prevent logging config secrets from the ValidationError's stacktrace

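# Illustrative sketch (not part of the original module; the spec contents are
# hypothetical): a config missing a required key fails validation and
# surfaces as a config_error.
#
#   spec = ConnectorSpecification(
#       connectionSpecification={"type": "object", "required": ["api_key"]}
#   )
#   check_config_against_spec_or_exit({}, spec)  # raises AirbyteTracedException
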
class InternalConfig(BaseModel):
    KEYWORDS: ClassVar[set[str]] = {"_limit", "_page_size"}
    limit: int = Field(None, alias="_limit")
    page_size: int = Field(None, alias="_page_size")

    def dict(self, *args: Any, **kwargs: Any) -> dict[str, Any]:
        kwargs["by_alias"] = True
        kwargs["exclude_unset"] = True
        return super().dict(*args, **kwargs)  # type: ignore[no-any-return]

    def is_limit_reached(self, records_counter: int) -> bool:
        """
        Check if the record count reached the limit set by internal config.

        :param records_counter - number of records already read
        :return True if limit reached, False otherwise
        """
        if self.limit:
            if records_counter >= self.limit:
                return True
        return False

def split_config(config: Mapping[str, Any]) -> Tuple[dict[str, Any], InternalConfig]:
    """
    Break config map object into 2 instances: first is a dict with user defined
    configuration and second is internal config that contains private keys for
    acceptance test configuration.

    :param config - Dict object that has been loaded from config file.
    :return tuple of user defined config dict with filtered out internal
        parameters and connector acceptance test internal config object.
    """
    main_config = {}
    internal_config = {}
    for k, v in config.items():
        if k in InternalConfig.KEYWORDS:
            internal_config[k] = v
        else:
            main_config[k] = v
    return main_config, InternalConfig.parse_obj(internal_config)
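
# Illustrative sketch (not part of the original module; the config values are
# hypothetical): internal "_"-prefixed keys are split out of the user config.
#
#   user_config, internal = split_config({"start_date": "2023-01-01", "_limit": 10})
#   # user_config == {"start_date": "2023-01-01"}
#   # internal.limit == 10
#   internal.is_limit_reached(10)  # True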