65 lines
2.1 KiB
Python
65 lines
2.1 KiB
Python
#
|
|
# Copyright (c) 2022 Airbyte, Inc., all rights reserved.
|
|
#
|
|
|
|
import logging
|
|
import re
|
|
from collections import defaultdict
|
|
from typing import List, Mapping
|
|
|
|
import pendulum
|
|
from airbyte_cdk.models import AirbyteRecordMessage, ConfiguredAirbyteCatalog
|
|
from jsonschema import Draft7Validator, FormatChecker, FormatError, ValidationError
|
|
|
|
# fmt: off
|
|
timestamp_regex = re.compile((r"^\d{4}-\d?\d-\d?\d" # date
|
|
r"(\s|T)" # separator
|
|
r"\d?\d:\d?\d:\d?\d(.\d+)?" # time
|
|
r".*$")) # timezone
|
|
# fmt: on
|
|
|
|
|
|
class CustomFormatChecker(FormatChecker):
|
|
@staticmethod
|
|
def check_datetime(value: str) -> bool:
|
|
valid_format = timestamp_regex.match(value)
|
|
try:
|
|
pendulum.parse(value, strict=False)
|
|
except ValueError:
|
|
valid_time = False
|
|
else:
|
|
valid_time = True
|
|
return valid_format and valid_time
|
|
|
|
def check(self, instance, format):
|
|
if instance is not None and format == "date-time":
|
|
if not self.check_datetime(instance):
|
|
raise FormatError(f"{instance} has invalid datetime format")
|
|
else:
|
|
return super().check(instance, format)
|
|
|
|
|
|
def verify_records_schema(
|
|
records: List[AirbyteRecordMessage], catalog: ConfiguredAirbyteCatalog
|
|
) -> Mapping[str, Mapping[str, ValidationError]]:
|
|
"""Check records against their schemas from the catalog, yield error messages.
|
|
Only first record with error will be yielded for each stream.
|
|
"""
|
|
validators = {}
|
|
for stream in catalog.streams:
|
|
validators[stream.stream.name] = Draft7Validator(stream.stream.json_schema, format_checker=CustomFormatChecker())
|
|
|
|
stream_errors = defaultdict(dict)
|
|
|
|
for record in records:
|
|
validator = validators.get(record.stream)
|
|
if not validator:
|
|
logging.error(f"Record from the {record.stream} stream that is not in the catalog.")
|
|
continue
|
|
|
|
errors = list(validator.iter_errors(record.data))
|
|
for error in errors:
|
|
stream_errors[record.stream][str(error.schema_path)] = error
|
|
|
|
return stream_errors
|