1
0
mirror of synced 2026-01-03 06:02:23 -05:00
Files
airbyte/airbyte-cdk/python/airbyte_cdk/sources/streams/http/availability_strategy.py
midavadim c44c3eae48 CDK: availability check - handle HttpErrors which happen during slice extraction (#26630)
* for availability check - handle  HttError happens during slice extraction (reading of parent stream),
updated reason messages,
moved check availability call under common try/except which handles errors during usual stream read,
moved log messages which indicate start of the stream sync before availability check in to make to understand which stream is the source of errors

* why do we return here and not try next stream?

* fixed bug in CheckStream, now we try to check availability for all streams
2023-06-23 13:15:25 -04:00

144 lines
6.7 KiB
Python

#
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
#
import logging
import typing
from typing import Dict, Optional, Tuple
import requests
from airbyte_cdk.sources.streams import Stream
from airbyte_cdk.sources.streams.availability_strategy import AvailabilityStrategy
from airbyte_cdk.sources.streams.utils.stream_helper import get_first_record_for_slice, get_first_stream_slice
from requests import HTTPError
if typing.TYPE_CHECKING:
from airbyte_cdk.sources import Source
class HttpAvailabilityStrategy(AvailabilityStrategy):
def check_availability(self, stream: Stream, logger: logging.Logger, source: Optional["Source"]) -> Tuple[bool, Optional[str]]:
"""
Check stream availability by attempting to read the first record of the
stream.
:param stream: stream
:param logger: source logger
:param source: (optional) source
:return: A tuple of (boolean, str). If boolean is true, then the stream
is available, and no str is required. Otherwise, the stream is unavailable
for some reason and the str should describe what went wrong and how to
resolve the unavailability, if possible.
"""
try:
# Some streams need a stream slice to read records (e.g. if they have a SubstreamPartitionRouter)
# Streams that don't need a stream slice will return `None` as their first stream slice.
stream_slice = get_first_stream_slice(stream)
except StopIteration:
# If stream_slices has no `next()` item (Note - this is different from stream_slices returning [None]!)
# This can happen when a substream's `stream_slices` method does a `for record in parent_records: yield <something>`
# without accounting for the case in which the parent stream is empty.
reason = f"Cannot attempt to connect to stream {stream.name} - no stream slices were found, likely because the parent stream is empty."
return False, reason
except HTTPError as error:
is_available, reason = self.handle_http_error(stream, logger, source, error)
if not is_available:
reason = f"Unable to get slices for {stream.name} stream, because of error in parent stream. {reason}"
return is_available, reason
try:
get_first_record_for_slice(stream, stream_slice)
return True, None
except StopIteration:
logger.info(f"Successfully connected to stream {stream.name}, but got 0 records.")
return True, None
except HTTPError as error:
is_available, reason = self.handle_http_error(stream, logger, source, error)
if not is_available:
reason = f"Unable to read {stream.name} stream. {reason}"
return is_available, reason
def handle_http_error(
self, stream: Stream, logger: logging.Logger, source: Optional["Source"], error: HTTPError
) -> Tuple[bool, Optional[str]]:
"""
Override this method to define error handling for various `HTTPError`s
that are raised while attempting to check a stream's availability.
Checks whether an error's status_code is in a list of unavailable_error_codes,
and gets the associated reason for that error.
:param stream: stream
:param logger: source logger
:param source: optional (source)
:param error: HTTPError raised while checking stream's availability.
:return: A tuple of (boolean, str). If boolean is true, then the stream
is available, and no str is required. Otherwise, the stream is unavailable
for some reason and the str should describe what went wrong and how to
resolve the unavailability, if possible.
"""
status_code = error.response.status_code
known_status_codes = self.reasons_for_unavailable_status_codes(stream, logger, source, error)
known_reason = known_status_codes.get(status_code)
if not known_reason:
# If the HTTPError is not in the dictionary of errors we know how to handle, don't except
raise error
doc_ref = self._visit_docs_message(logger, source)
reason = f"The endpoint {error.response.url} returned {status_code}: {error.response.reason}. {known_reason}. {doc_ref} "
response_error_message = stream.parse_response_error_message(error.response)
if response_error_message:
reason += response_error_message
return False, reason
def reasons_for_unavailable_status_codes(
self, stream: Stream, logger: logging.Logger, source: Optional["Source"], error: HTTPError
) -> Dict[int, str]:
"""
Returns a dictionary of HTTP status codes that indicate stream
unavailability and reasons explaining why a given status code may
have occurred and how the user can resolve that error, if applicable.
:param stream: stream
:param logger: source logger
:param source: optional (source)
:return: A dictionary of (status code, reason) where the 'reason' explains
why 'status code' may have occurred and how the user can resolve that
error, if applicable.
"""
reasons_for_codes: Dict[int, str] = {
requests.codes.FORBIDDEN: "This is most likely due to insufficient permissions on the credentials in use. "
"Try to grant required permissions/scopes or re-authenticate"
}
return reasons_for_codes
@staticmethod
def _visit_docs_message(logger: logging.Logger, source: Optional["Source"]) -> str:
"""
Creates a message indicating where to look in the documentation for
more information on a given source by checking the spec of that source
(if provided) for a 'documentationUrl'.
:param logger: source logger
:param source: optional (source)
:return: A message telling the user where to go to learn more about the source.
"""
if not source:
return "Please visit the connector's documentation to learn more. "
try:
connector_spec = source.spec(logger)
docs_url = connector_spec.documentationUrl
if docs_url:
return f"Please visit {docs_url} to learn more. "
else:
return "Please visit the connector's documentation to learn more. "
except FileNotFoundError: # If we are unit testing without implementing spec() method in source
if source:
docs_url = f"https://docs.airbyte.com/integrations/sources/{source.name}"
else:
docs_url = "https://docs.airbyte.com/integrations/sources/test"
return f"Please visit {docs_url} to learn more."