1
0
mirror of synced 2026-01-26 04:01:39 -05:00
Files
airbyte/airbyte-cdk/python/airbyte_cdk/sources/streams/checkpoint/checkpoint_reader.py

242 lines
12 KiB
Python

# Copyright (c) 2024 Airbyte, Inc., all rights reserved.
from abc import ABC, abstractmethod
from enum import Enum
from typing import Any, Iterable, Mapping, Optional
from airbyte_cdk.sources.types import StreamSlice
from .cursor import Cursor
class CheckpointMode(Enum):
    """Enumerates the checkpointing strategies; each member's value is its lowercase string identifier."""

    INCREMENTAL = "incremental"
    RESUMABLE_FULL_REFRESH = "resumable_full_refresh"
    FULL_REFRESH = "full_refresh"
# Terminal state value: the checkpoint readers in this module compare a stream's (or a slice's) state against this mapping
# to detect that it has already been synced to completion and that no further slices should be produced for it.
FULL_REFRESH_COMPLETE_STATE: Mapping[str, Any] = {"__ab_full_refresh_sync_complete": True}
class CheckpointReader(ABC):
    """
    Abstract interface that drives iteration over a stream's partitions and exposes the stream state that should be
    emitted back to the platform.
    """

    @abstractmethod
    def next(self) -> Optional[Mapping[str, Any]]:
        """
        Produce the slice to use for fetching the next group of records.

        A return value of None signals that every slice has been consumed and the reader is finished.
        """

    @abstractmethod
    def observe(self, new_state: Mapping[str, Any]) -> None:
        """
        Refresh the reader's internal state from the stream state reported by a connector.

        WARNING: This exists to stay backwards compatible with streams that still rely on the legacy get_stream_state()
        method. Connectors that want to adopt Resumable Full Refresh must migrate their streams to the state
        setter/getter methods.
        """

    @abstractmethod
    def get_checkpoint(self) -> Optional[Mapping[str, Any]]:
        """
        Return the stream's current state value.

        When the checkpoint value is None the connector does not emit a state message.
        """
class IncrementalCheckpointReader(CheckpointReader):
    """
    Checkpoint reader for streams whose data is split into partitioned windows that are all known before the sync
    starts reading records.
    """

    def __init__(self, stream_state: Mapping[str, Any], stream_slices: Iterable[Optional[Mapping[str, Any]]]):
        self._state: Optional[Mapping[str, Any]] = stream_state
        self._stream_slices = iter(stream_slices)
        # Flips to True once at least one slice has been handed out by next()
        self._has_slices = False

    def next(self) -> Optional[Mapping[str, Any]]:
        """Return the next pre-computed slice, or None once the slice iterator is exhausted."""
        try:
            upcoming_slice = next(self._stream_slices)
        except StopIteration:
            # State was already emitted at the end of every slice, so the stored state is cleared here to keep the
            # sync from sending one more, duplicate state message at the very end. Dropping this would be simpler,
            # at the cost of every sync emitting a final duplicate state.
            if self._has_slices:
                self._state = None
            return None
        else:
            self._has_slices = True
            return upcoming_slice

    def observe(self, new_state: Mapping[str, Any]) -> None:
        """Replace the stored state with the state reported by the stream."""
        self._state = new_state

    def get_checkpoint(self) -> Optional[Mapping[str, Any]]:
        """Return the stored state; None after the slices were exhausted and at least one slice had been read."""
        return self._state
class CursorBasedCheckpointReader(CheckpointReader):
    """
    CursorBasedCheckpointReader is used by streams that implement a Cursor in order to manage state. This allows the
    checkpoint reader to delegate the complexity of fetching state to the cursor and focus on the iteration over a
    stream's partitions.

    This reader supports the Cursor interface used by Python and low-code sources. Not to be confused with the Cursor
    interface that belongs to the Concurrent CDK.
    """

    def __init__(self, cursor: Cursor, stream_slices: Iterable[Optional[Mapping[str, Any]]], read_state_from_cursor: bool = False):
        """
        :param cursor: cursor that owns the stream state; queried through select_state() and get_stream_state()
        :param stream_slices: iterable of the stream's slices; each element must be a StreamSlice
        :param read_state_from_cursor: when True, the slice returned by next() is derived from the cursor's state for
            the current partition rather than by simply advancing the slice iterator
        """
        self._cursor = cursor
        self._stream_slices = iter(stream_slices)
        # read_state_from_cursor is used to delineate that partitions should determine when to stop syncing dynamically
        # according to the value of the state at runtime. This currently only applies to streams that use resumable
        # full refresh.
        self._read_state_from_cursor = read_state_from_cursor
        self._current_slice: Optional[StreamSlice] = None
        self._finished_sync = False

    def next(self) -> Optional[Mapping[str, Any]]:
        """
        Return the next slice of data that should be synced for the current stream according to its cursor.

        This function supports iterating over a stream's slices across two dimensions. The outer dimension is the
        stream's partitions, like parent records for a substream. The inner dimension is the cursor value, like a date
        range for incremental streams or a pagination checkpoint for resumable full refresh.

        The basic algorithm for iterating through a stream's slices is:
          1. The first time next() is invoked we get the first partition and return it
          2. For streams whose cursor value is determined dynamically using stream state
             1. Get the current state for the current partition
             2. If the current partition's state is complete, get the next partition
             3. If the current partition's state is still in progress, emit the next cursor value
          3. Once every partition has been processed, the underlying iterator raises StopIteration; it is caught here,
             the sync is marked as finished and None is returned.

        :return: the slice to read next, or None when there are no slices left
        :raises ValueError: if the slice iterable yields something that is not a StreamSlice
        """
        try:
            if self._read_state_from_cursor:
                self.current_slice = self._next_slice_from_cursor_state()
            else:
                # Unlike RFR cursors that iterate dynamically based on how stream state is updated, most cursors
                # operate on a fixed set of slices determined before reading records. They just move to the next slice.
                self.current_slice = self._get_next_slice()
            return self.current_slice
        except StopIteration:
            self._finished_sync = True
            return None

    def _next_slice_from_cursor_state(self) -> StreamSlice:
        """Build the next slice from the cursor's state, skipping partitions whose state is already complete."""
        # We need to check `is None` as opposed to truthiness because the current slice could be the empty
        # StreamSlice(), which behaves like the falsy empty mapping {}. Such a slice still has to keep iterating over
        # its cursor state until it reaches the terminal value.
        candidate_slice = self.current_slice
        if candidate_slice is None:
            candidate_slice = self._get_next_slice()
        state_for_slice = self._cursor.select_state(candidate_slice)
        # Skip every partition that already carries the terminal complete value, which indicates that a previous
        # attempt successfully synced it. StopIteration from the exhausted iterator propagates to next().
        while state_for_slice == FULL_REFRESH_COMPLETE_STATE:
            candidate_slice = self._get_next_slice()
            state_for_slice = self._cursor.select_state(candidate_slice)
        return StreamSlice(cursor_slice=state_for_slice or {}, partition=candidate_slice.partition)

    def _get_next_slice(self) -> StreamSlice:
        """
        Advance the slice iterator by one element and validate its type.

        :raises StopIteration: when the slice iterator is exhausted
        :raises ValueError: when the element is not a StreamSlice
        """
        next_slice = next(self._stream_slices)
        if not isinstance(next_slice, StreamSlice):
            # Report the offending value itself rather than the previously returned slice
            raise ValueError(
                f"{next_slice} should be of type StreamSlice. This is likely a bug in the CDK, please contact Airbyte support"
            )
        return next_slice

    def observe(self, new_state: Mapping[str, Any]) -> None:
        """No-op: the cursor has already updated its own state while records were being processed."""
        pass

    def get_checkpoint(self) -> Optional[Mapping[str, Any]]:
        """
        Return the cursor's stream state, or None once a sync that produced at least one slice has finished.
        """
        # This is used to avoid sending a duplicate state message at the end of a sync since the stream has already
        # emitted state at the end of each slice. State is only emitted if _current_slice is None, which indicates
        # there were no slices and no records were emitted, or if the reader is still in the process of emitting records.
        if self._current_slice is None or not self._finished_sync:
            return self._cursor.get_stream_state()
        else:
            return None

    @property
    def current_slice(self) -> Optional[StreamSlice]:
        """The slice most recently produced by next(), or None if next() has not produced one yet."""
        return self._current_slice

    @current_slice.setter
    def current_slice(self, value: StreamSlice) -> None:
        self._current_slice = value
class ResumableFullRefreshCheckpointReader(CheckpointReader):
    """
    Checkpoint reader for iterating over an unbounded set of records driven by the stream's pagination strategy. The
    number of pages is not known up front, so the stream's current state decides whether another page is fetched or
    the sync stops.
    """

    def __init__(self, stream_state: Mapping[str, Any]):
        # An RFR stream's first attempt arrives with an empty {} state; next() must still hand that state out once so
        # the first page gets read.
        self._first_page = bool(stream_state == {})
        self._state: Mapping[str, Any] = stream_state

    def next(self) -> Optional[Mapping[str, Any]]:
        """Return the current state as the next slice, or None once the state equals the terminal complete value."""
        if self._first_page:
            # The initial empty state is returned exactly once, regardless of its content
            self._first_page = False
        elif self._state == FULL_REFRESH_COMPLETE_STATE:
            return None
        return self._state

    def observe(self, new_state: Mapping[str, Any]) -> None:
        """Replace the stored state with the state reported by the stream."""
        self._state = new_state

    def get_checkpoint(self) -> Optional[Mapping[str, Any]]:
        """Return the stored state, substituting an empty dict when the stored state is falsy."""
        if self._state:
            return self._state
        return {}
class FullRefreshCheckpointReader(CheckpointReader):
    """
    Checkpoint reader for data that cannot be checkpointed incrementally during the sync because the stream is not
    capable of managing state. Once the sync is over, one final state message is emitted to signal completion.
    """

    def __init__(self, stream_slices: Iterable[Optional[Mapping[str, Any]]]):
        self._stream_slices = iter(stream_slices)
        # Becomes True once the slice iterator has been exhausted
        self._final_checkpoint = False

    def next(self) -> Optional[Mapping[str, Any]]:
        """Return the next slice, or None (and arm the final checkpoint) when the slices are exhausted."""
        try:
            upcoming_slice = next(self._stream_slices)
        except StopIteration:
            self._final_checkpoint = True
            return None
        return upcoming_slice

    def observe(self, new_state: Mapping[str, Any]) -> None:
        """No-op: incoming state is ignored by this reader."""
        pass

    def get_checkpoint(self) -> Optional[Mapping[str, Any]]:
        """Return the no-cursor marker state after the slices are exhausted; None before that."""
        return {"__ab_no_cursor_state_message": True} if self._final_checkpoint else None