#
|
|
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
|
|
#
|
|
|
|
|
|
import json
|
|
import os
|
|
import selectors
|
|
import subprocess
|
|
from dataclasses import dataclass
|
|
from datetime import datetime
|
|
from io import TextIOWrapper
|
|
from typing import Any, DefaultDict, Dict, Iterator, List, Mapping, Optional, Tuple
|
|
|
|
from airbyte_cdk.logger import log_by_prefix
|
|
from airbyte_cdk.models import (
|
|
AirbyteCatalog,
|
|
AirbyteMessage,
|
|
AirbyteRecordMessage,
|
|
AirbyteStateMessage,
|
|
AirbyteStream,
|
|
ConfiguredAirbyteCatalog,
|
|
ConfiguredAirbyteStream,
|
|
SyncMode,
|
|
Type,
|
|
)
|
|
|
|
# Singer "forced-replication-method" metadata values recognized below.
_INCREMENTAL = "INCREMENTAL"
_FULL_TABLE = "FULL_TABLE"
|
|
|
|
|
|
def to_json(string):
    """Parse *string* as JSON, returning ``None`` when it is not valid JSON.

    Fix: this previously returned ``False`` on parse failure, which is
    indistinguishable from the valid JSON document ``"false"`` and slipped
    past the ``out_json is not None`` guard in ``SingerHelper.read``, letting
    non-JSON log lines through as messages. ``None`` keeps the result falsy
    for truthiness-based callers while making that guard behave.
    """
    try:
        return json.loads(string)
    except ValueError:
        return None
|
|
|
|
|
|
def is_field_metadata(metadata):
    """Return True when this Singer metadata entry describes a single field.

    A field-level entry carries a two-element breadcrumb whose first element
    is not the literal "property"; everything else is stream-level.
    """
    breadcrumb = metadata.get("breadcrumb")
    return len(breadcrumb) == 2 and breadcrumb[0] != "property"
|
|
|
|
|
|
def configured_for_incremental(configured_stream: ConfiguredAirbyteStream):
    """Truthy when the configured stream requests an incremental sync.

    Preserves the original short-circuit: a falsy sync_mode (e.g. None) is
    returned unchanged rather than coerced to bool.
    """
    sync_mode = configured_stream.sync_mode
    return sync_mode and sync_mode == SyncMode.incremental
|
|
|
|
|
|
def get_stream_level_metadata(metadatas: List[Dict[str, Any]]) -> Optional[Dict[str, Any]]:
    """Return the payload of the first stream-level metadata entry, or None.

    Condition order matches the original: the field/stream check runs before
    the "metadata" key lookup.
    """
    return next(
        (entry["metadata"] for entry in metadatas if not is_field_metadata(entry) and "metadata" in entry),
        None,
    )
|
|
|
|
|
|
@dataclass
class Catalogs:
    """Pairs a raw Singer discover catalog with its Airbyte translation."""

    # Parsed JSON catalog exactly as emitted by the Singer tap's discover command.
    singer_catalog: object
    # The same catalog converted into Airbyte's stream model.
    airbyte_catalog: AirbyteCatalog
|
|
|
|
|
|
@dataclass
class SyncModeInfo:
    """Per-stream sync-mode overrides; a None field means "not specified"."""

    # Sync modes the stream should advertise instead of the discovered ones.
    supported_sync_modes: Optional[List[SyncMode]] = None
    # Whether the source defines the cursor for this stream.
    source_defined_cursor: Optional[bool] = None
    # Cursor field to use when the configured catalog does not pick one.
    default_cursor_field: Optional[List[str]] = None
|
|
|
|
|
|
def set_sync_modes_from_metadata(airbyte_stream: AirbyteStream, metadatas: List[Dict[str, Any]]):
    """Infer and set sync modes on *airbyte_stream* from Singer metadata.

    A stream is incremental if it declares valid replication keys, or if the
    "forced-replication-method" entry resolves to INCREMENTAL; a forced
    FULL_TABLE marks it full-refresh only. Streams without stream-level
    metadata are left untouched.
    """
    stream_metadata = get_stream_level_metadata(metadatas)
    if not stream_metadata:
        return

    replication_keys = stream_metadata.get("valid-replication-keys", [])
    if len(replication_keys) > 0:
        airbyte_stream.source_defined_cursor = True
        airbyte_stream.supported_sync_modes = [SyncMode.incremental]
        # TODO if there are multiple replication keys, allow configuring which
        # one is used. For now we deterministically take the first (sorted).
        airbyte_stream.default_cursor_field = [sorted(replication_keys)[0]]
        return

    if "forced-replication-method" not in stream_metadata:
        return

    forced = stream_metadata["forced-replication-method"]
    # The forced method may be a bare string or a {"replication-method": ...} dict.
    method = forced.get("replication-method", "") if isinstance(forced, dict) else forced
    if method.upper() == _INCREMENTAL:
        airbyte_stream.source_defined_cursor = True
        airbyte_stream.supported_sync_modes = [SyncMode.incremental]
    elif method.upper() == _FULL_TABLE:
        airbyte_stream.source_defined_cursor = False
        airbyte_stream.supported_sync_modes = [SyncMode.full_refresh]
|
|
|
|
|
|
def override_sync_modes(airbyte_stream: AirbyteStream, overrides: SyncModeInfo):
|
|
airbyte_stream.source_defined_cursor = overrides.source_defined_cursor or False
|
|
if overrides.supported_sync_modes:
|
|
airbyte_stream.supported_sync_modes = overrides.supported_sync_modes
|
|
if overrides.default_cursor_field is not None:
|
|
airbyte_stream.default_cursor_field = overrides.default_cursor_field
|
|
|
|
|
|
class SingerHelper:
    """Utilities for running a Singer tap as an Airbyte source: discovering and
    translating catalogs, and streaming the tap's stdout into Airbyte messages."""

    @staticmethod
    def _transform_types(stream_properties: DefaultDict):
        """Normalize each property's "type" entry in place via _parse_type."""
        for field_name in stream_properties:
            field_object = stream_properties[field_name]
            # _parse_type is not visible to mypy here (CDK typing-errors issue
            # #9500: '"Type[SingerHelper]" has no attribute "_parse_type"');
            # ignored for now.
            field_object["type"] = SingerHelper._parse_type(field_object["type"])  # type: ignore

    @staticmethod
    def singer_catalog_to_airbyte_catalog(
        singer_catalog: Dict[str, Any], sync_mode_overrides: Dict[str, SyncModeInfo], primary_key_overrides: Dict[str, List[str]]
    ) -> AirbyteCatalog:
        """
        Convert a Singer discover catalog into an AirbyteCatalog.

        :param singer_catalog: parsed JSON output of the tap's discover command
        :param sync_mode_overrides: A dict from stream name to the sync modes it should use. Each stream in this dict must exist in the Singer catalog,
        but not every stream in the catalog should exist in this
        :param primary_key_overrides: A dict of stream name -> list of fields to be used as PKs.
        :return: Airbyte Catalog
        """
        airbyte_streams = []
        # "streams" may be absent, making get() return None (not iterable);
        # mypy flags this (CDK #9500) — a default should be set; ignored for now.
        for stream in singer_catalog.get("streams"):  # type: ignore
            name = stream.get("stream")
            schema = stream.get("schema")
            airbyte_stream = AirbyteStream(name=name, json_schema=schema, supported_sync_modes=[SyncMode.full_refresh])
            # Explicit overrides win over modes inferred from Singer metadata.
            if name in sync_mode_overrides:
                override_sync_modes(airbyte_stream, sync_mode_overrides[name])
            else:
                set_sync_modes_from_metadata(airbyte_stream, stream.get("metadata", []))

            # Explicit PK overrides win over the tap-declared key_properties.
            if name in primary_key_overrides:
                airbyte_stream.source_defined_primary_key = [[k] for k in primary_key_overrides[name]]
            elif stream.get("key_properties"):
                airbyte_stream.source_defined_primary_key = [[k] for k in stream["key_properties"]]

            airbyte_streams += [airbyte_stream]
        return AirbyteCatalog(streams=airbyte_streams)

    @staticmethod
    def _read_singer_catalog(logger, shell_command: str) -> Mapping[str, Any]:
        """Run the tap's discover shell command and parse its stdout as JSON.

        stderr lines are forwarded to the logger via log_by_prefix (default
        level ERROR).
        """
        completed_process = subprocess.run(
            shell_command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True
        )
        for line in completed_process.stderr.splitlines():
            logger.log(*log_by_prefix(line, "ERROR"))

        return json.loads(completed_process.stdout)

    @staticmethod
    def get_catalogs(
        logger,
        shell_command: str,
        sync_mode_overrides: Dict[str, SyncModeInfo],
        primary_key_overrides: Dict[str, List[str]],
        excluded_streams: List,
    ) -> Catalogs:
        """Discover the Singer catalog, drop excluded streams, and return both
        the (filtered) Singer catalog and its Airbyte translation."""
        singer_catalog = SingerHelper._read_singer_catalog(logger, shell_command)
        streams = singer_catalog.get("streams", [])
        if streams and excluded_streams:
            # _read_singer_catalog is typed as Mapping, which forbids indexed
            # assignment (CDK #9500); should be MutableMapping — ignored for now.
            singer_catalog["streams"] = [stream for stream in streams if stream["stream"] not in excluded_streams]  # type: ignore

        # Mapping vs Dict mismatch with singer_catalog_to_airbyte_catalog's
        # signature (CDK #9500); needs checking — ignored for now.
        airbyte_catalog = SingerHelper.singer_catalog_to_airbyte_catalog(singer_catalog, sync_mode_overrides, primary_key_overrides)  # type: ignore
        return Catalogs(singer_catalog=singer_catalog, airbyte_catalog=airbyte_catalog)

    @staticmethod
    def read(logger, shell_command, is_message=(lambda x: True)) -> Iterator[AirbyteMessage]:
        """Run the tap's sync shell command and yield AirbyteMessages.

        stdout lines that parse as JSON and pass *is_message* become Airbyte
        messages; other stdout lines are logged at INFO-by-prefix, stderr lines
        at ERROR-by-prefix.
        """
        with subprocess.Popen(shell_command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True) as p:
            for line, text_wrapper in SingerHelper._read_lines(p):
                if text_wrapper is p.stdout:
                    out_json = to_json(line)
                    # NOTE(review): this guard assumes to_json signals parse
                    # failure with None — verify against to_json's failure value.
                    if out_json is not None and is_message(out_json):
                        message_data = SingerHelper._airbyte_message_from_json(out_json)
                        if message_data is not None:
                            yield message_data
                    else:
                        logger.log(*log_by_prefix(line, "INFO"))
                else:
                    logger.log(*log_by_prefix(line, "ERROR"))

    @staticmethod
    def _read_lines(process: subprocess.Popen) -> Iterator[Tuple[str, TextIOWrapper]]:
        """Yield (line, stream) pairs from the process's stdout/stderr as they
        become readable, until both pipes reach EOF; then reap the process and
        raise if it hung (60s) or exited non-zero."""
        sel = selectors.DefaultSelector()
        # process.stdout/stderr are Optional[IO] to mypy, incompatible with
        # register()'s expected type (CDK #9500); needs checking — ignored.
        sel.register(process.stdout, selectors.EVENT_READ)  # type: ignore
        sel.register(process.stderr, selectors.EVENT_READ)  # type: ignore
        eof = False
        while not eof:
            selects_list = sel.select()
            empty_line_counter = 0
            for key, _ in selects_list:
                # key.fileobj is Union[int, HasFileno] to mypy, which has no
                # readline (CDK #9500); needs checking — ignored for now.
                line = key.fileobj.readline()  # type: ignore
                if not line:
                    # Empty read means EOF on this pipe; once every ready pipe
                    # is at EOF, stop and reap the child process.
                    empty_line_counter += 1
                    if empty_line_counter >= len(selects_list):
                        eof = True

                        try:
                            process.wait(timeout=60)
                        except subprocess.TimeoutExpired:
                            # mypy format-string complaint on process.args
                            # (CDK #9500); ignored for now.
                            raise Exception(f"Underlying command {process.args} is hanging")  # type: ignore

                        if process.returncode != 0:
                            # mypy format-string complaint on process.args
                            # (CDK #9500); ignored for now.
                            raise Exception(f"Underlying command {process.args} failed with exit code {process.returncode}")  # type: ignore
                else:
                    # mypy: yield is Tuple[Any, Union[int, HasFileno]], not
                    # Tuple[str, TextIOWrapper] (CDK #9500); ignored for now.
                    yield line, key.fileobj  # type: ignore

    @staticmethod
    def _airbyte_message_from_json(transformed_json: Mapping[str, Any]) -> Optional[AirbyteMessage]:
        """Map one parsed Singer message to an AirbyteMessage.

        SCHEMA and ACTIVATE_VERSION messages have no Airbyte equivalent and
        map to None; STATE maps to a state message; anything else is treated
        as a record for the stream named in the payload.
        """
        if transformed_json is None or transformed_json.get("type") == "SCHEMA" or transformed_json.get("type") == "ACTIVATE_VERSION":
            return None
        elif transformed_json.get("type") == "STATE":
            out_record = AirbyteStateMessage(data=transformed_json["value"])
            out_message = AirbyteMessage(type=Type.STATE, state=out_record)
        else:
            # todo: check that messages match the discovered schema
            stream_name = transformed_json["stream"]
            # out_record was bound to AirbyteStateMessage above; mypy flags this
            # re-assignment to AirbyteRecordMessage (CDK #9500); ignored.
            # NOTE(review): emitted_at truncates to whole seconds before the
            # *1000 ms conversion, and uses naive local time — confirm intended.
            out_record = AirbyteRecordMessage(  # type: ignore
                stream=stream_name,
                data=transformed_json["record"],
                emitted_at=int(datetime.now().timestamp()) * 1000,
            )
            out_message = AirbyteMessage(type=Type.RECORD, record=out_record)
        return out_message

    @staticmethod
    def create_singer_catalog_with_selection(masked_airbyte_catalog: ConfiguredAirbyteCatalog, discovered_singer_catalog: object) -> str:
        """Render a Singer catalog file marking configured streams as selected.

        Writes "singer_rendered_catalog.json" into the working directory and
        returns its path. Configured streams get selected=True (both old-style
        schema flag and metadata), plus the replication method/key derived from
        the configured catalog; field-level entries with fieldExclusions are
        deselected.
        """
        combined_catalog_path = os.path.join("singer_rendered_catalog.json")
        masked_singer_streams = []

        stream_name_to_configured_stream = {
            configured_stream.stream.name: configured_stream for configured_stream in masked_airbyte_catalog.streams
        }

        # discovered_singer_catalog is typed as object, so mypy rejects .get
        # (CDK #9500); needs checking — ignored for now.
        for singer_stream in discovered_singer_catalog.get("streams"):  # type: ignore
            stream_name = singer_stream.get("stream")
            if stream_name in stream_name_to_configured_stream:
                new_metadatas = []
                # support old style catalog.
                singer_stream["schema"]["selected"] = True
                if singer_stream.get("metadata"):
                    metadatas = singer_stream.get("metadata")
                    for metadata in metadatas:
                        new_metadata = metadata
                        new_metadata["metadata"]["selected"] = True
                        if not is_field_metadata(new_metadata):
                            # Stream-level entry: record the replication method
                            # (and cursor key, if any) from the configured catalog.
                            configured_stream = stream_name_to_configured_stream[stream_name]
                            if configured_for_incremental(configured_stream):
                                replication_method = _INCREMENTAL
                                if configured_stream.cursor_field:
                                    new_metadata["metadata"]["replication-key"] = configured_stream.cursor_field[0]
                            else:
                                replication_method = _FULL_TABLE
                            new_metadata["metadata"]["forced-replication-method"] = replication_method
                            new_metadata["metadata"]["replication-method"] = replication_method
                        else:
                            # Field-level entry: deselect fields the tap excludes.
                            if "fieldExclusions" in new_metadata["metadata"]:
                                new_metadata["metadata"]["selected"] = True if not new_metadata["metadata"]["fieldExclusions"] else False
                        new_metadatas += [new_metadata]
                    singer_stream["metadata"] = new_metadatas

            # NOTE(review): every discovered stream is appended, selected or
            # not — unselected streams rely on the tap skipping them; confirm.
            masked_singer_streams += [singer_stream]

        combined_catalog = {"streams": masked_singer_streams}
        with open(combined_catalog_path, "w") as fh:
            fh.write(json.dumps(combined_catalog))

        return combined_catalog_path
|