1
0
mirror of synced 2026-01-27 16:02:00 -05:00
Files
airbyte/airbyte-integrations/connectors/source-airtable/source_airtable/source.py
Arsen Losenko 9fb3542160 Source Airtable: skip missing streams (#25946)
* Source Airtable: skip missing streams

* Move stream removal to a separate method, cover with tests

* Update changelog

* Fix flake warnings

* Update docs/integrations/sources/airtable.md

Co-authored-by: Sherif A. Nada <snadalive@gmail.com>

* Update docs/integrations/sources/airtable.md

Co-authored-by: Sherif A. Nada <snadalive@gmail.com>

* Automated Change

* Update link to docs in warning

* Automated Change

* Automated Change

* Automated Change

* “Empty-Commit”

---------

Co-authored-by: Sherif A. Nada <snadalive@gmail.com>
Co-authored-by: arsenlosenko <arsenlosenko@users.noreply.github.com>
2023-05-18 12:15:23 +03:00

106 lines
5.1 KiB
Python

#
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
#
import logging
from typing import Any, Iterable, Iterator, List, Mapping, MutableMapping, Tuple, Union
from airbyte_cdk.logger import AirbyteLogger
from airbyte_cdk.models import AirbyteCatalog, AirbyteMessage, AirbyteStateMessage, ConfiguredAirbyteCatalog
from airbyte_cdk.sources import AbstractSource
from airbyte_cdk.sources.streams import Stream
from airbyte_cdk.sources.utils.schema_helpers import split_config
from .auth import AirtableAuth
from .schema_helpers import SchemaHelpers
from .streams import AirtableBases, AirtableStream, AirtableTables
class SourceAirtable(AbstractSource):
    """Airtable source connector.

    Dynamically discovers all bases available to the authenticated account and
    every table within each base, exposing each table as a separate stream.
    """

    logger: logging.Logger = logging.getLogger("airbyte")
    # Cache of discovered streams; reset at the start of each discover() call so
    # repeated discovery never accumulates duplicate entries.
    streams_catalog: Iterable[Mapping[str, Any]] = []
    _auth: AirtableAuth = None

    def check_connection(self, logger: AirbyteLogger, config: Mapping[str, Any]) -> Tuple[bool, Any]:
        """Verify connectivity by listing bases and reading the first table of each base.

        :param logger: logger supplied by the CDK entrypoint (unused; class logger is used instead)
        :param config: user-provided connector configuration
        :return: (True, None) on success, otherwise (False, <error text>)
        """
        auth = AirtableAuth(config)
        try:
            # try reading first table from each base, to check the connectivity,
            for base in AirtableBases(authenticator=auth).read_records(sync_mode=None):
                base_id = base.get("id")
                base_name = base.get("name")
                self.logger.info(f"Reading first table info for base: {base_name}")
                # NOTE(review): a base with zero tables raises StopIteration here, which is
                # caught below and fails the check with an empty message — confirm intended.
                next(AirtableTables(base_id=base_id, authenticator=auth).read_records(sync_mode=None))
            return True, None
        except Exception as e:
            return False, str(e)

    def _remove_missed_streams_from_catalog(
        self, logger: logging.Logger, config: Mapping[str, Any], catalog: ConfiguredAirbyteCatalog
    ) -> ConfiguredAirbyteCatalog:
        """Filter out configured streams that no longer exist in the source.

        Streams disappear from the source when an Airtable table is renamed or
        deleted; syncing them would fail, so they are dropped with a warning.

        :param logger: logger used to emit the per-stream warning
        :param config: user-provided connector configuration
        :param catalog: the configured catalog; mutated in place and returned
        """
        config, _ = split_config(config)
        stream_instances = {s.name: s for s in self.streams(config)}
        # Build the kept list instead of deleting during iteration: `del` while
        # enumerating shifts the remaining elements and skips the entry after
        # each removal (two consecutive missing streams -> second one kept).
        remaining_streams = []
        for configured_stream in catalog.streams:
            stream_name = configured_stream.stream.name
            if stream_name in stream_instances:
                remaining_streams.append(configured_stream)
                continue
            # Stream names look like "<base>/<table_name>/<table_id>"; match on the
            # trailing table id to suggest streams that were likely just renamed.
            table_id = stream_name.split("/")[2]
            similar_streams = [s for s in stream_instances if s.endswith(table_id)]
            # logger.warn is the deprecated alias of logger.warning
            logger.warning(
                f"The requested stream {stream_name} was not found in the source. Please check if this stream was renamed or removed previously and reset data, removing from catalog for this sync run. For more information please refer to documentation: https://docs.airbyte.com/integrations/sources/airtable/#note-on-changed-table-names-and-deleted-tables"
                f" Similar streams: {similar_streams}"
                f" Available streams: {stream_instances.keys()}"
            )
        catalog.streams[:] = remaining_streams
        return catalog

    def read(
        self,
        logger: logging.Logger,
        config: Mapping[str, Any],
        catalog: ConfiguredAirbyteCatalog,
        state: Union[List[AirbyteStateMessage], MutableMapping[str, Any]] = None,
    ) -> Iterator[AirbyteMessage]:
        """Override to provide filtering of catalog in case if streams were renamed/removed previously"""
        catalog = self._remove_missed_streams_from_catalog(logger, config, catalog)
        return super().read(logger, config, catalog, state)

    def discover(self, logger: AirbyteLogger, config) -> AirbyteCatalog:
        """
        Override to provide the dynamic schema generation capabilities,
        using resource available for authenticated user.
        Retrieve: Bases, Tables from each Base, generate JSON Schema for each table.
        """
        auth = self._auth or AirtableAuth(config)
        # Reset the cache: appending to the (class-level) list on every call would
        # duplicate streams on repeated discovery; this also scopes it per instance.
        self.streams_catalog = []
        # list all bases available for authenticated account
        for base in AirtableBases(authenticator=auth).read_records(sync_mode=None):
            base_id = base.get("id")
            base_name = SchemaHelpers.clean_name(base.get("name"))
            # list and process each table under each base to generate the JSON Schema
            for table in list(AirtableTables(base_id, authenticator=auth).read_records(sync_mode=None)):
                self.streams_catalog.append(
                    {
                        "stream_path": f"{base_id}/{table.get('id')}",
                        "stream": SchemaHelpers.get_airbyte_stream(
                            f"{base_name}/{SchemaHelpers.clean_name(table.get('name'))}/{table.get('id')}",
                            SchemaHelpers.get_json_schema(table),
                        ),
                    }
                )
        return AirbyteCatalog(streams=[stream["stream"] for stream in self.streams_catalog])

    def streams(self, config: Mapping[str, Any]) -> List[Stream]:
        # NOTE: despite the List[Stream] annotation (kept to match the parent
        # signature), this is a generator; the CDK iterates the result.
        self._auth = AirtableAuth(config)
        # trigger discovery to populate the streams_catalog
        if not self.streams_catalog:
            self.discover(None, config)
        # build the stream class from prepared streams_catalog
        for stream in self.streams_catalog:
            yield AirtableStream(
                stream_path=stream["stream_path"],
                stream_name=stream["stream"].name,
                stream_schema=stream["stream"].json_schema,
                authenticator=self._auth,
            )