1
0
mirror of synced 2026-01-03 06:02:23 -05:00
Files
airbyte/airbyte-cdk/python/airbyte_cdk/sources/singer/singer_helpers.py
Cole Snodgrass 2e099acc52 update headers from 2022 -> 2023 (#22594)
* It's 2023!

* 2022 -> 2023

---------

Co-authored-by: evantahler <evan@airbyte.io>
2023-02-08 13:01:16 -08:00

305 lines
15 KiB
Python

#
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
#
import json
import os
import selectors
import subprocess
from dataclasses import dataclass
from datetime import datetime
from io import TextIOWrapper
from typing import Any, DefaultDict, Dict, Iterator, List, Mapping, Optional, Tuple
from airbyte_cdk.logger import log_by_prefix
from airbyte_cdk.models import (
AirbyteCatalog,
AirbyteMessage,
AirbyteRecordMessage,
AirbyteStateMessage,
AirbyteStream,
ConfiguredAirbyteCatalog,
ConfiguredAirbyteStream,
SyncMode,
Type,
)
# Singer replication-method names. They are compared against the upper-cased
# "forced-replication-method" stream metadata and written back into stream
# metadata when a catalog with selections is rendered.
_INCREMENTAL = "INCREMENTAL"
_FULL_TABLE = "FULL_TABLE"
def to_json(string):
    """
    Parse a JSON document.

    :param string: text to decode with json.loads
    :return: the decoded value, or False when decoding raises ValueError (note that the decoded value itself
        may be falsy or None, e.g. for the documents "0" or "null")
    """
    try:
        decoded = json.loads(string)
    except ValueError:
        # json.JSONDecodeError is a ValueError subclass; malformed input is reported with False instead of raising
        return False
    else:
        return decoded
def is_field_metadata(metadata):
    """
    Classify a Singer metadata entry by its "breadcrumb".

    :param metadata: metadata entry (a mapping); its "breadcrumb" value, when present, is expected to be a sequence
    :return: False when the breadcrumb does not have exactly two elements (including when it is missing or null);
        otherwise True if the first breadcrumb element differs from "property"
    """
    # A missing or null breadcrumb is treated like an empty one instead of raising TypeError on len(None).
    breadcrumb = metadata.get("breadcrumb") or []
    if len(breadcrumb) != 2:
        return False
    return breadcrumb[0] != "property"
def configured_for_incremental(configured_stream: ConfiguredAirbyteStream):
    """
    Check whether a configured stream requests incremental sync.

    :param configured_stream: configured stream whose ``sync_mode`` is inspected
    :return: the falsy ``sync_mode`` itself when it is unset, otherwise whether it equals SyncMode.incremental
    """
    mode = configured_stream.sync_mode
    if not mode:
        # Mirror short-circuit `and`: a falsy sync_mode is returned as-is rather than coerced to False
        return mode
    return mode == SyncMode.incremental
def get_stream_level_metadata(metadatas: List[Dict[str, Any]]) -> Optional[Dict[str, Any]]:
    """
    Find the first metadata entry that is not field metadata and carries a "metadata" key.

    :param metadatas: metadata entries, scanned in order
    :return: the "metadata" value of the first matching entry, or None when no entry matches
    """
    stream_level_payloads = (
        entry.get("metadata")
        for entry in metadatas
        # is_field_metadata is evaluated first, exactly as in a short-circuit `and`
        if not is_field_metadata(entry) and "metadata" in entry
    )
    return next(stream_level_payloads, None)
@dataclass
class Catalogs:
    """Pairs a Singer catalog with the Airbyte catalog built from it."""

    # Singer catalog as parsed from the discovery command output
    singer_catalog: object
    # Airbyte catalog converted from the Singer catalog
    airbyte_catalog: AirbyteCatalog
@dataclass
class SyncModeInfo:
    """Sync-mode settings that override_sync_modes applies to an AirbyteStream."""

    # Sync modes to set on the stream; the stream's value is left untouched when this is None or empty
    supported_sync_modes: Optional[List[SyncMode]] = None
    # Whether the source defines the cursor; None is applied to the stream as False
    source_defined_cursor: Optional[bool] = None
    # Cursor field to set on the stream; the stream's value is left untouched when this is None
    default_cursor_field: Optional[List[str]] = None
def set_sync_modes_from_metadata(airbyte_stream: AirbyteStream, metadatas: List[Dict[str, Any]]):
    """
    Set the sync-mode attributes of ``airbyte_stream`` from the stream-level Singer metadata.

    The stream is left untouched when there is no (or only empty) stream-level metadata, or when that metadata
    has neither replication keys nor a recognised "forced-replication-method".

    :param airbyte_stream: stream to update in place
    :param metadatas: Singer metadata entries of the stream
    """
    stream_level = get_stream_level_metadata(metadatas)
    if not stream_level:
        return

    # A stream is incremental if it declares replication keys or if forced-replication-method is set to incremental
    cursor_candidates = stream_level.get("valid-replication-keys", [])
    if len(cursor_candidates) > 0:
        airbyte_stream.source_defined_cursor = True
        airbyte_stream.supported_sync_modes = [SyncMode.incremental]
        # TODO if there are multiple replication keys, allow configuring which one is used. For now we deterministically take the first
        airbyte_stream.default_cursor_field = [sorted(cursor_candidates)[0]]
        return

    if "forced-replication-method" not in stream_level:
        return

    forced_method = stream_level["forced-replication-method"]
    if isinstance(forced_method, dict):
        # The method may be nested one level down under "replication-method"
        forced_method = forced_method.get("replication-method", "")

    normalized_method = forced_method.upper()
    if normalized_method == _INCREMENTAL:
        airbyte_stream.source_defined_cursor = True
        airbyte_stream.supported_sync_modes = [SyncMode.incremental]
    elif normalized_method == _FULL_TABLE:
        airbyte_stream.source_defined_cursor = False
        airbyte_stream.supported_sync_modes = [SyncMode.full_refresh]
def override_sync_modes(airbyte_stream: AirbyteStream, overrides: SyncModeInfo):
    """
    Apply explicit sync-mode overrides to ``airbyte_stream`` in place.

    :param airbyte_stream: stream to update
    :param overrides: values to apply; ``source_defined_cursor`` is always written (falsy values become False),
        ``supported_sync_modes`` only when non-empty and ``default_cursor_field`` only when not None
    """
    cursor_flag = overrides.source_defined_cursor
    airbyte_stream.source_defined_cursor = cursor_flag if cursor_flag else False

    sync_modes = overrides.supported_sync_modes
    if sync_modes:
        airbyte_stream.supported_sync_modes = sync_modes

    cursor_field = overrides.default_cursor_field
    if cursor_field is not None:
        airbyte_stream.default_cursor_field = cursor_field
class SingerHelper:
    """Static helpers that run a Singer command in a subprocess and convert its catalog and messages to Airbyte models."""

    @staticmethod
    def _transform_types(stream_properties: DefaultDict):
        """
        Rewrite the "type" of every field object in ``stream_properties`` in place.

        :param stream_properties: mapping of field name -> field object holding a "type" entry
        """
        for field_name in stream_properties:
            field_object = stream_properties[field_name]
            # NOTE(review): mypy reports '"Type[SingerHelper]" has no attribute "_parse_type"' here
            # (issue "CDK: typing errors #9500"). No such method is visible in this class, so unless it is
            # provided elsewhere this call raises AttributeError - confirm before relying on this helper.
            field_object["type"] = SingerHelper._parse_type(field_object["type"])  # type: ignore

    @staticmethod
    def singer_catalog_to_airbyte_catalog(
        singer_catalog: Dict[str, Any], sync_mode_overrides: Dict[str, SyncModeInfo], primary_key_overrides: Dict[str, List[str]]
    ) -> AirbyteCatalog:
        """
        Convert a Singer catalog into an Airbyte catalog.

        :param singer_catalog: Singer catalog; a missing or null "streams" entry yields an empty Airbyte catalog
        :param sync_mode_overrides: A dict from stream name to the sync modes it should use. Each stream in this dict must exist in the Singer catalog,
          but not every stream in the catalog should exist in this
        :param primary_key_overrides: A dict of stream name -> list of fields to be used as PKs.
        :return: Airbyte Catalog
        """
        airbyte_streams = []
        # `or []` guards against a catalog without "streams", which previously failed with "NoneType is not iterable"
        for stream in singer_catalog.get("streams") or []:
            name = stream.get("stream")
            schema = stream.get("schema")
            # Every stream starts as full-refresh only; overrides or Singer metadata may replace that below
            airbyte_stream = AirbyteStream(name=name, json_schema=schema, supported_sync_modes=[SyncMode.full_refresh])
            if name in sync_mode_overrides:
                override_sync_modes(airbyte_stream, sync_mode_overrides[name])
            else:
                set_sync_modes_from_metadata(airbyte_stream, stream.get("metadata", []))

            # Explicit primary-key overrides win over the catalog's own "key_properties"
            if name in primary_key_overrides:
                airbyte_stream.source_defined_primary_key = [[k] for k in primary_key_overrides[name]]
            elif stream.get("key_properties"):
                airbyte_stream.source_defined_primary_key = [[k] for k in stream["key_properties"]]

            airbyte_streams.append(airbyte_stream)
        return AirbyteCatalog(streams=airbyte_streams)

    @staticmethod
    def _read_singer_catalog(logger, shell_command: str) -> Mapping[str, Any]:
        """
        Run ``shell_command`` through the shell and parse its stdout as a JSON Singer catalog.

        Every stderr line is passed to the logger via ``log_by_prefix(line, "ERROR")``.

        :param logger: logger exposing ``log(...)``
        :param shell_command: command line executed with ``shell=True``; it must come from trusted configuration
        :return: the decoded JSON document
        :raises json.JSONDecodeError: when stdout is not valid JSON
        """
        completed_process = subprocess.run(
            shell_command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True
        )
        for line in completed_process.stderr.splitlines():
            logger.log(*log_by_prefix(line, "ERROR"))

        return json.loads(completed_process.stdout)

    @staticmethod
    def get_catalogs(
        logger,
        shell_command: str,
        sync_mode_overrides: Dict[str, SyncModeInfo],
        primary_key_overrides: Dict[str, List[str]],
        excluded_streams: List,
    ) -> Catalogs:
        """
        Discover the Singer catalog with ``shell_command`` and derive the Airbyte catalog from it.

        :param logger: logger exposing ``log(...)``
        :param shell_command: discovery command whose stdout is the Singer catalog JSON
        :param sync_mode_overrides: stream name -> sync-mode overrides
        :param primary_key_overrides: stream name -> list of primary-key fields
        :param excluded_streams: names of streams to drop from the Singer catalog before conversion
        :return: both the (filtered) Singer catalog and the Airbyte catalog
        """
        singer_catalog = SingerHelper._read_singer_catalog(logger, shell_command)
        streams = singer_catalog.get("streams", [])
        if streams and excluded_streams:
            # _read_singer_catalog is annotated as returning Mapping, which mypy does not allow to be assigned into
            # (issue "CDK: typing errors #9500"); the decoded JSON document is mutated here regardless.
            singer_catalog["streams"] = [stream for stream in streams if stream["stream"] not in excluded_streams]  # type: ignore

        # singer_catalog is typed Mapping while singer_catalog_to_airbyte_catalog expects Dict, hence the ignore
        # (issue "CDK: typing errors #9500").
        airbyte_catalog = SingerHelper.singer_catalog_to_airbyte_catalog(singer_catalog, sync_mode_overrides, primary_key_overrides)  # type: ignore
        return Catalogs(singer_catalog=singer_catalog, airbyte_catalog=airbyte_catalog)

    @staticmethod
    def read(logger, shell_command, is_message=(lambda x: True)) -> Iterator[AirbyteMessage]:
        """
        Run ``shell_command`` and yield the Airbyte messages parsed from its stdout.

        Stdout lines that are not JSON objects, or that ``is_message`` rejects, are logged via
        ``log_by_prefix(line, "INFO")``; stderr lines are logged via ``log_by_prefix(line, "ERROR")``.

        :param logger: logger exposing ``log(...)``
        :param shell_command: command line executed with ``shell=True``; it must come from trusted configuration
        :param is_message: predicate receiving the decoded JSON object; only accepted objects are converted
        :return: iterator of Airbyte messages (Singer messages that map to no Airbyte message are skipped)
        :raises Exception: propagated from ``_read_lines`` when the command hangs or exits with a non-zero code
        """
        with subprocess.Popen(shell_command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True) as p:
            for line, text_wrapper in SingerHelper._read_lines(p):
                if text_wrapper is not p.stdout:
                    logger.log(*log_by_prefix(line, "ERROR"))
                    continue

                out_json = to_json(line)
                # to_json reports unparsable text with False (not None), and a line can also be valid JSON that is
                # not an object (e.g. a bare number). Only JSON objects can be Singer messages; anything else used
                # to crash in _airbyte_message_from_json and is now logged like any other non-message line.
                if isinstance(out_json, dict) and is_message(out_json):
                    message_data = SingerHelper._airbyte_message_from_json(out_json)
                    if message_data is not None:
                        yield message_data
                else:
                    logger.log(*log_by_prefix(line, "INFO"))

    @staticmethod
    def _read_lines(process: subprocess.Popen) -> Iterator[Tuple[str, TextIOWrapper]]:
        """
        Yield lines from the stdout and stderr of ``process`` as they become readable.

        :param process: process started with text-mode pipes for both stdout and stderr
        :return: iterator of ``(line, stream)`` pairs where ``stream`` is ``process.stdout`` or ``process.stderr``
        :raises Exception: if the process has not exited 60 seconds after both streams reached end of file,
            or if it exited with a non-zero return code
        """
        sel = selectors.DefaultSelector()
        try:
            # mypy sees Popen.stdout/stderr as Optional[IO[Any]] while register expects Union[int, HasFileno]
            # (issue "CDK: typing errors #9500").
            sel.register(process.stdout, selectors.EVENT_READ)  # type: ignore
            sel.register(process.stderr, selectors.EVENT_READ)  # type: ignore

            # Each stream is unregistered at its own end of file and reading stops only once none is left.
            # A stream at EOF is always reported as readable, so treating "every ready stream returned nothing"
            # as global EOF would stop reading stdout as soon as stderr alone was closed (or vice versa).
            while sel.get_map():
                for key, _ in sel.select():
                    stream = key.fileobj
                    # key.fileobj is typed Union[int, HasFileno]; it is the text wrapper registered above
                    line = stream.readline()  # type: ignore
                    if line:
                        yield line, stream  # type: ignore
                    else:
                        # readline() returns an empty string only at end of file
                        sel.unregister(stream)
        finally:
            # Release the selector even if the consumer stops iterating early or an error occurs
            sel.close()

        try:
            process.wait(timeout=60)
        except subprocess.TimeoutExpired as err:
            raise Exception(f"Underlying command {process.args} is hanging") from err  # type: ignore

        if process.returncode != 0:
            raise Exception(f"Underlying command {process.args} failed with exit code {process.returncode}")  # type: ignore

    @staticmethod
    def _airbyte_message_from_json(transformed_json: Mapping[str, Any]) -> Optional[AirbyteMessage]:
        """
        Convert one decoded Singer message into an Airbyte message.

        :param transformed_json: decoded Singer message, or None
        :return: None for a None input and for "SCHEMA" / "ACTIVATE_VERSION" messages, a STATE message for
            "STATE", and a RECORD message for anything else
        :raises KeyError: when a "STATE" message lacks "value" or another message lacks "stream" / "record"
        """
        if transformed_json is None or transformed_json.get("type") in ("SCHEMA", "ACTIVATE_VERSION"):
            return None

        if transformed_json.get("type") == "STATE":
            state = AirbyteStateMessage(data=transformed_json["value"])
            return AirbyteMessage(type=Type.STATE, state=state)

        # todo: check that messages match the discovered schema
        record = AirbyteRecordMessage(
            stream=transformed_json["stream"],
            data=transformed_json["record"],
            # Scale to milliseconds before truncating; truncating first rounded every timestamp down to a whole second
            emitted_at=int(datetime.now().timestamp() * 1000),
        )
        return AirbyteMessage(type=Type.RECORD, record=record)

    @staticmethod
    def create_singer_catalog_with_selection(masked_airbyte_catalog: ConfiguredAirbyteCatalog, discovered_singer_catalog: object) -> str:
        """
        Write a Singer catalog that contains only the configured streams, marked as selected.

        The stream dicts of ``discovered_singer_catalog`` are modified in place, and the result is written as JSON
        to "singer_rendered_catalog.json" relative to the current working directory.

        :param masked_airbyte_catalog: configured Airbyte catalog naming the streams to keep
        :param discovered_singer_catalog: Singer catalog mapping; a missing or null "streams" entry is treated as empty
        :return: path of the written catalog file
        """
        combined_catalog_path = os.path.join("singer_rendered_catalog.json")
        masked_singer_streams = []

        stream_name_to_configured_stream = {
            configured_stream.stream.name: configured_stream for configured_stream in masked_airbyte_catalog.streams
        }

        # The parameter is annotated as `object`, so mypy rejects `.get` (issue "CDK: typing errors #9500").
        # `or []` guards against a catalog without "streams", which previously failed with "NoneType is not iterable".
        for singer_stream in discovered_singer_catalog.get("streams") or []:  # type: ignore
            stream_name = singer_stream.get("stream")
            if stream_name not in stream_name_to_configured_stream:
                continue

            # support old style catalog.
            singer_stream["schema"]["selected"] = True
            if singer_stream.get("metadata"):
                configured_stream = stream_name_to_configured_stream[stream_name]
                new_metadatas = []
                for new_metadata in singer_stream.get("metadata"):
                    new_metadata["metadata"]["selected"] = True
                    if not is_field_metadata(new_metadata):
                        # Stream-level entry: record the replication method matching the configured sync mode
                        if configured_for_incremental(configured_stream):
                            replication_method = _INCREMENTAL
                            if configured_stream.cursor_field:
                                new_metadata["metadata"]["replication-key"] = configured_stream.cursor_field[0]
                        else:
                            replication_method = _FULL_TABLE
                        new_metadata["metadata"]["forced-replication-method"] = replication_method
                        new_metadata["metadata"]["replication-method"] = replication_method
                    elif "fieldExclusions" in new_metadata["metadata"]:
                        # Field-level entry: a truthy "fieldExclusions" value deselects the field
                        new_metadata["metadata"]["selected"] = not new_metadata["metadata"]["fieldExclusions"]
                    new_metadatas.append(new_metadata)
                singer_stream["metadata"] = new_metadatas

            masked_singer_streams.append(singer_stream)

        combined_catalog = {"streams": masked_singer_streams}
        with open(combined_catalog_path, "w") as fh:
            fh.write(json.dumps(combined_catalog))

        return combined_catalog_path