1
0
mirror of synced 2026-01-31 19:01:59 -05:00

refactor!(airbyte-cdk): remove availability strategy (#40682)

Signed-off-by: Artem Inzhyyants <artem.inzhyyants@gmail.com>
This commit is contained in:
Artem Inzhyyants
2024-07-17 11:55:51 +02:00
committed by GitHub
parent 536e8e21c8
commit 2bfb2264d9
10 changed files with 18 additions and 287 deletions

View File

@@ -4,9 +4,9 @@
import concurrent
import logging
from queue import Queue
from typing import Iterable, Iterator, List, Optional, Tuple
from typing import Iterable, Iterator, List
from airbyte_cdk.models import AirbyteMessage, AirbyteStreamStatus
from airbyte_cdk.models import AirbyteMessage
from airbyte_cdk.sources.concurrent_source.concurrent_read_processor import ConcurrentReadProcessor
from airbyte_cdk.sources.concurrent_source.partition_generation_completed_sentinel import PartitionGenerationCompletedSentinel
from airbyte_cdk.sources.concurrent_source.stream_thread_exception import StreamThreadException
@@ -19,7 +19,6 @@ from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partitio
from airbyte_cdk.sources.streams.concurrent.partitions.record import Record
from airbyte_cdk.sources.streams.concurrent.partitions.types import PartitionCompleteSentinel, QueueItem
from airbyte_cdk.sources.utils.slice_logger import DebugSliceLogger, SliceLogger
from airbyte_cdk.utils.stream_status_utils import as_airbyte_message as stream_status_as_airbyte_message
class ConcurrentSource:
@@ -81,15 +80,6 @@ class ConcurrentSource:
streams: List[AbstractStream],
) -> Iterator[AirbyteMessage]:
self._logger.info("Starting syncing")
stream_instances_to_read_from, not_available_streams = self._get_streams_to_read_from(streams)
for stream, message in not_available_streams:
self._logger.warning(f"Skipped syncing stream '{stream.name}' because it was unavailable. {message}")
yield stream_status_as_airbyte_message(stream, AirbyteStreamStatus.INCOMPLETE)
# Return early if there are no streams to read from
if not stream_instances_to_read_from:
return
# We set a maxsize to for the main thread to process record items when the queue size grows. This assumes that there are less
# threads generating partitions that than are max number of workers. If it weren't the case, we could have threads only generating
@@ -97,7 +87,7 @@ class ConcurrentSource:
# information and might even need to be configurable depending on the source
queue: Queue[QueueItem] = Queue(maxsize=10_000)
concurrent_stream_processor = ConcurrentReadProcessor(
stream_instances_to_read_from,
streams,
PartitionEnqueuer(queue, self._threadpool),
self._threadpool,
self._logger,
@@ -155,22 +145,3 @@ class ConcurrentSource:
yield from concurrent_stream_processor.on_record(queue_item)
else:
raise ValueError(f"Unknown queue item type: {type(queue_item)}")
def _get_streams_to_read_from(
self, streams: List[AbstractStream]
) -> Tuple[List[AbstractStream], List[Tuple[AbstractStream, Optional[str]]]]:
"""
Iterate over the configured streams and return a list of streams to read from.
If a stream is not configured, it will be skipped.
If a stream is configured but does not exist in the source and self.raise_exception_on_missing_stream is True, an exception will be raised
If a stream is not available, it will be skipped
"""
stream_instances_to_read_from = []
not_available_streams = []
for stream in streams:
stream_availability = stream.check_availability()
if not stream_availability.is_available():
not_available_streams.append((stream, stream_availability.message()))
continue
stream_instances_to_read_from.append(stream)
return stream_instances_to_read_from, not_available_streams