1
0
mirror of synced 2026-01-01 18:02:53 -05:00
Files
airbyte/airbyte-cdk/python/airbyte_cdk/sources/declarative/declarative_stream.py
Brian Lai 186580a6ee [low-code] replace emptySchemaLoader with DefaultSchemaLoader (#18947)
* replace emptySchemaLoader with DefaultSchemaLoader

* fix test name

* fix test

* add logging for when we default to the empty schema

* increment patch version

* fix formatting

* update changelog
2022-11-03 23:25:01 -04:00

147 lines
6.5 KiB
Python

#
# Copyright (c) 2022 Airbyte, Inc., all rights reserved.
#
from dataclasses import InitVar, dataclass, field
from typing import Any, Iterable, List, Mapping, MutableMapping, Optional, Union
from airbyte_cdk.models import SyncMode
from airbyte_cdk.sources.declarative.retrievers.retriever import Retriever
from airbyte_cdk.sources.declarative.schema import DefaultSchemaLoader
from airbyte_cdk.sources.declarative.schema.schema_loader import SchemaLoader
from airbyte_cdk.sources.declarative.transformations import RecordTransformation
from airbyte_cdk.sources.declarative.types import Config, StreamSlice
from airbyte_cdk.sources.streams.core import Stream
from dataclasses_jsonschema import JsonSchemaMixin
@dataclass
class DeclarativeStream(Stream, JsonSchemaMixin):
    """
    DeclarativeStream is a Stream that delegates most of its logic to its schema_loader and retriever

    Attributes:
        name (str): stream name
        primary_key (Optional[Union[str, List[str], List[List[str]]]]): the primary key of the stream
        schema_loader (Optional[SchemaLoader]): The schema loader. When omitted (or falsy), a DefaultSchemaLoader built from
            `config` and `options` is used instead.
        retriever (Retriever): The retriever
        config (Config): The user-provided configuration as specified by the source's spec
        options (Mapping[str, Any]): Init-only options; they are forwarded to the DefaultSchemaLoader when no schema loader is given
        stream_cursor_field (Optional[Union[List[str], str]]): The cursor field. Defaults to an empty list.
        transformations (List[RecordTransformation]): A list of transformations to be applied to each output record in the
            stream. Transformations are applied in the order in which they are defined. Defaults to an empty list.
        checkpoint_interval (Optional[int]): How often the stream will checkpoint state (i.e: emit a STATE message)
    """

    # NOTE: the order, annotations and defaults of the fields below define the generated __init__ signature;
    # keep them stable.
    retriever: Retriever
    config: Config
    options: InitVar[Mapping[str, Any]]
    # `name` and `primary_key` are re-bound to properties further down in the class body. By the time @dataclass
    # inspects the class, the "default" of these two fields is therefore the property object itself, which is what
    # the generated __init__ hands to the setters when the caller omits the argument.
    name: str
    primary_key: Optional[Union[str, List[str], List[List[str]]]]
    schema_loader: Optional[SchemaLoader] = None
    # Backing storage for the properties / resolved schema loader; never accepted by __init__.
    _name: str = field(init=False, repr=False, default="")
    _primary_key: str = field(init=False, repr=False, default="")
    _schema_loader: SchemaLoader = field(init=False, repr=False, default=None)
    stream_cursor_field: Optional[Union[List[str], str]] = None
    transformations: List[RecordTransformation] = None
    checkpoint_interval: Optional[int] = None

    def __post_init__(self, options: Mapping[str, Any]):
        """
        Normalize optional fields once the dataclass-generated __init__ has run.

        :param options: init-only options, passed on to the DefaultSchemaLoader when no schema loader was provided
        """
        self.stream_cursor_field = self.stream_cursor_field or []
        self.transformations = self.transformations or []
        # Fall back to the default loader so get_json_schema() always has something to delegate to.
        self._schema_loader = self.schema_loader or DefaultSchemaLoader(config=self.config, options=options)

    @property
    def primary_key(self) -> Optional[Union[str, List[str], List[List[str]]]]:
        """
        :return: The primary key of the stream, as last assigned through the setter (empty string if never assigned).
        """
        return self._primary_key

    @primary_key.setter
    def primary_key(self, value: Optional[Union[str, List[str], List[List[str]]]]) -> None:
        # A property instance here means "no value supplied" (it is the dataclass default for this field),
        # so the current backing value is kept.
        if isinstance(value, property):
            return
        self._primary_key = value

    @property
    def name(self) -> str:
        """
        :return: Stream name, as last assigned through the setter (empty string if never assigned).
        """
        return self._name

    @name.setter
    def name(self, value: str) -> None:
        # A property instance here means "no value supplied" (it is the dataclass default for this field),
        # so the current backing value is kept.
        if isinstance(value, property):
            return
        self._name = value

    @property
    def state_checkpoint_interval(self) -> Optional[int]:
        """
        Decides how often to checkpoint state (i.e: emit a STATE message). E.g: if this returns a value of 100, then state is persisted
        after reading 100 records, then 200, 300, etc.. A good default value is 1000 although your mileage may vary depending on the
        underlying data source.

        Checkpointing a stream avoids re-reading records in the case a sync is failed or cancelled.

        :return: The configured `checkpoint_interval`. None if state should not be checkpointed e.g: because records returned from the
            underlying data source are not returned in ascending order with respect to the cursor field. This can happen if the source
            does not support reading records in ascending order of created_at date (or whatever the cursor is). In those cases, state
            must only be saved once the full stream has been read.
        """
        return self.checkpoint_interval

    @property
    def state(self) -> MutableMapping[str, Any]:
        """
        :return: The stream state, read directly from the retriever.
        """
        return self.retriever.state

    @state.setter
    def state(self, value: MutableMapping[str, Any]):
        """State setter, accept state serialized by state getter."""
        self.retriever.state = value

    def get_updated_state(self, current_stream_state: MutableMapping[str, Any], latest_record: Mapping[str, Any]):
        """
        Return the current stream state.

        Both arguments are ignored: the state is owned by the retriever and is simply read back from it.

        :param current_stream_state: unused
        :param latest_record: unused
        :return: The retriever's state
        """
        return self.state

    @property
    def cursor_field(self) -> Union[str, List[str]]:
        """
        :return: The configured `stream_cursor_field`, i.e. the name of the field used as a cursor. If the cursor is nested, this is
            an array consisting of the path to the cursor.
        """
        return self.stream_cursor_field

    def read_records(
        self,
        sync_mode: SyncMode,
        cursor_field: List[str] = None,
        stream_slice: Mapping[str, Any] = None,
        stream_state: Mapping[str, Any] = None,
    ) -> Iterable[Mapping[str, Any]]:
        """
        Lazily read records from the retriever and apply the configured transformations to each of them.

        :param sync_mode: forwarded to the retriever
        :param cursor_field: forwarded to the retriever
        :param stream_slice: forwarded to the retriever and made available to the transformations
        :param stream_state: forwarded to the retriever
        :return: A generator of transformed records
        """
        for record in self.retriever.read_records(sync_mode, cursor_field, stream_slice, stream_state):
            yield self._apply_transformations(record, self.config, stream_slice)

    def _apply_transformations(self, record: Mapping[str, Any], config: Config, stream_slice: StreamSlice):
        """
        Run every transformation, in definition order, over a single record.

        :param record: the record as produced by the retriever
        :param config: the user-provided configuration
        :param stream_slice: the slice the record was read from
        :return: The output of the last transformation, or the record itself when no transformation is configured
        """
        output_record = record
        for transformation in self.transformations:
            # Feed each transformation the output of the previous one. Passing the original `record` every time
            # would silently drop the result of any earlier transformation that returns a new object instead of
            # mutating its input in place. This relies on transform() returning the record, as its callers here expect.
            output_record = transformation.transform(output_record, config=config, stream_state=self.state, stream_slice=stream_slice)
        return output_record

    def get_json_schema(self) -> Mapping[str, Any]:
        """
        :return: A dict of the JSON schema representing this stream, as produced by the schema loader resolved in __post_init__
            (the provided `schema_loader`, or the DefaultSchemaLoader fallback).
        """
        return self._schema_loader.get_json_schema()

    def stream_slices(
        self, *, sync_mode: SyncMode, cursor_field: List[str] = None, stream_state: Mapping[str, Any] = None
    ) -> Iterable[Optional[Mapping[str, Any]]]:
        """
        Define the slices for this stream by delegating to the retriever.

        :param sync_mode: forwarded to the retriever
        :param cursor_field: accepted for interface compatibility but not forwarded (see comment below)
        :param stream_state: forwarded to the retriever
        :return: The slices computed by the retriever
        """
        # this is not passing the cursor field because it is known at init time
        return self.retriever.stream_slices(sync_mode=sync_mode, stream_state=stream_state)