1
0
mirror of synced 2025-12-31 15:03:11 -05:00
Files
airbyte/airbyte-cdk/python/airbyte_cdk/utils/datetime_format_inferrer.py
Joe Reuter c53d1fa29d Datetime inferrer: Improve detected formats (#27546)
* consolidate formats

* Automated Commit - Formatting Changes

* consolidate formats

* consolidate formats

---------

Co-authored-by: flash1293 <flash1293@users.noreply.github.com>
2023-06-23 05:23:33 -04:00

81 lines
3.4 KiB
Python

#
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
#
from typing import Any, Dict, Union
from airbyte_cdk.models import AirbyteRecordMessage
from airbyte_cdk.sources.declarative.datetime.datetime_parser import DatetimeParser
class DatetimeFormatInferrer:
    """
    Detects top-level fields in records that might be datetime values, along with the format they use.

    The first record passed to `accumulate` establishes the candidates: every top-level field whose value
    parses with one of the known formats is remembered together with the first format that matched.
    Every later record can only eliminate candidates - a field is dropped as soon as it is missing, `None`,
    or no longer parses with its remembered format.
    """

    def __init__(self):
        self._parser = DatetimeParser()
        # None until the first record has been seen; afterwards maps field name -> inferred format.
        self._datetime_candidates: Union[None, Dict[str, str]] = None
        # Tried in order; the first format that parses a value wins.
        self._formats = [
            "%Y-%m-%d",
            "%Y-%m-%d %H:%M:%S",
            "%Y-%m-%d %H:%M:%S.%f%z",
            "%Y-%m-%dT%H:%M:%S.%f%z",
            "%s",
            "%d/%m/%Y %H:%M",
            "%Y-%m",
            "%d-%m-%Y",
            "%Y-%m-%dT%H:%M:%SZ",
        ]
        # Integer-like values only count as timestamp candidates inside this window, so that ordinary
        # numbers (ids, counters, ...) are not mistaken for dates.
        self._timestamp_heuristic_range = range(1_000_000_000, 2_000_000_000)

    def _can_be_datetime(self, value: Any) -> bool:
        """
        Cheap pre-check that decides whether it is worth trying the formats on a value.

        A value qualifies if it is a string that does not represent an integer, or if it is an integer
        (or a string holding one) between 1_000_000_000 and 2_000_000_000. This is separate from the
        format check for performance reasons.

        :param value: the raw field value taken from a record
        :return: True if the value should be checked against the known formats
        """
        if not isinstance(value, (str, int)):
            return False
        try:
            numeric_value = int(value)
        except ValueError:
            # Not an integer at all, so it may still be a formatted date string.
            return True
        # Deciding on int() rather than str.isdecimal() also covers integer-like strings such as "-5",
        # " 12 " or "1_000": they must fall inside the heuristic range like any other integer, otherwise
        # they would slip through to the "%s" format and be reported as timestamps.
        return numeric_value in self._timestamp_heuristic_range

    def _matches_format(self, value: Any, format: str) -> bool:
        """
        Checks if the value matches the format.

        :param value: the raw field value taken from a record
        :param format: the datetime format to try
        :return: True if the parser accepts the value, False if it raises ValueError
        """
        try:
            self._parser.parse(value, format)
        except ValueError:
            return False
        return True

    def _initialize(self, record: "AirbyteRecordMessage"):
        """
        Initializes the candidate fields from the first record.

        Each top-level field that passes the pre-check is stored with the first format that parses it.

        :param record: the first record seen by this inferrer
        """
        self._datetime_candidates = {}
        for field_name, field_value in record.data.items():
            if not self._can_be_datetime(field_value):
                continue
            # Formats are tried in declaration order; stop at the first one that parses the value.
            first_match = next(
                (candidate_format for candidate_format in self._formats if self._matches_format(field_value, candidate_format)),
                None,
            )
            if first_match is not None:
                self._datetime_candidates[field_name] = first_match

    def _validate(self, record: "AirbyteRecordMessage"):
        """
        Validates that the record is consistent with the inferred datetime formats.

        Candidates whose value is missing, `None`, or no longer parses with the remembered format are removed.

        :param record: a record seen after the candidates were initialized
        """
        # Iterate over a snapshot because candidates are removed while looping.
        for field_name, field_format in list(self._datetime_candidates.items()):
            current_value = record.data.get(field_name)
            still_consistent = (
                current_value is not None
                and self._can_be_datetime(current_value)
                and self._matches_format(current_value, field_format)
            )
            if not still_consistent:
                del self._datetime_candidates[field_name]

    def accumulate(self, record: "AirbyteRecordMessage"):
        """
        Analyzes the record and updates the internal state of candidate datetime fields.

        The first record initializes the candidates; all following records only narrow them down.

        :param record: the record to analyze
        """
        if self._datetime_candidates is None:
            self._initialize(record)
        else:
            self._validate(record)

    def get_inferred_datetime_formats(self) -> Dict[str, str]:
        """
        Returns the candidate datetime fields - the keys are the field names and the values are the inferred datetime formats.
        For these fields the format was consistent across all visited records.

        :return: mapping of field name to format; empty if no record was seen or no candidate remains
        """
        return self._datetime_candidates or {}