1
0
mirror of synced 2026-01-01 09:02:59 -05:00
Files
airbyte/airbyte-integrations/connectors/source-iterable/source_iterable/slice_generators.py
Daryna Ishchenko 67381f8678 :bug:Source Iterable: fix type error (#28457)
* added __iter__ to StreamSlice

* updated change log
2023-07-20 10:58:28 +03:00

170 lines
6.3 KiB
Python

#
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
#
import dataclasses
import math
from dataclasses import dataclass
from typing import Iterable, List, Optional, Tuple
import pendulum
from pendulum.datetime import DateTime, Period
@dataclass
class StreamSlice:
    """
    A pair of datetimes delimiting one slice.

    Iterating an instance produces ``(field_name, value)`` tuples in field
    declaration order, which lets callers write ``dict(stream_slice)``.
    """

    start_date: DateTime
    end_date: DateTime

    def __iter__(self):
        # Collect the declared field names up front, then pair each name with
        # the value currently stored on this instance as the caller iterates.
        names = [spec.name for spec in dataclasses.fields(self)]
        return ((name, getattr(self, name)) for name in names)
class SliceGenerator:
    """
    Base class for slice generators.

    Holds the date window that subclasses split into slices. An instance is
    its own iterator: ``__iter__`` returns ``self`` and concrete subclasses
    provide ``__next__``.
    """

    # Class-level defaults for the instance attributes assigned in __init__.
    _start_date: Optional[DateTime] = None
    _end_date: Optional[DateTime] = None

    def __init__(self, start_date: DateTime, end_date: Optional[DateTime] = None):
        """
        Args:
            start_date: beginning of the window to slice.
            end_date: end of the window; when omitted (or falsy) the current
                UTC time is used instead.
        """
        self._start_date = start_date
        self._end_date = end_date or pendulum.now("UTC")

    def __iter__(self):
        return self

    def __next__(self):
        # The base class has no slicing strategy of its own. Failing here with
        # a clear message beats the opaque "iter() returned non-iterator"
        # TypeError that a missing __next__ would cause.
        raise NotImplementedError(f"{type(self).__name__} must implement __next__")
class RangeSliceGenerator(SliceGenerator):
    """
    Split the window from start_date up to end_date into consecutive ranges
    of RANGE_LENGTH_DAYS (90) days; the final range may be shorter.

    All slices are computed once in ``__init__`` and handed out one at a time
    by ``__next__``.
    """

    RANGE_LENGTH_DAYS: int = 90
    # Annotation only: the list is always created per instance in __init__,
    # so no mutable object is shared at class level.
    _slices: List[StreamSlice]

    def __init__(self, start_date: DateTime, end_date: Optional[DateTime] = None):
        """
        Args:
            start_date: beginning of the window to slice.
            end_date: end of the window; forwarded to the base class.
        """
        super().__init__(start_date, end_date)
        self._slices = [
            StreamSlice(start_date=start, end_date=end)
            for start, end in self.make_datetime_ranges(self._start_date, self._end_date, self.RANGE_LENGTH_DAYS)
        ]

    def __next__(self) -> StreamSlice:
        """Return the next precomputed slice; raise StopIteration when none are left."""
        if not self._slices:
            raise StopIteration()
        return self._slices.pop(0)

    @staticmethod
    def make_datetime_ranges(start: DateTime, end: DateTime, range_days: int) -> Iterable[Tuple[DateTime, DateTime]]:
        """
        Lazily generate consecutive ranges from start up to end, each lasting
        at most range_days days.

        Args:
            start (DateTime): start of the overall range
            end (DateTime): end of the overall range
            range_days (int): length in days of each subrange; must be positive.
        Yields:
            Tuple[DateTime, DateTime]: (period start, period end) pairs. Each
            period starts where the previous one ended, and the last period
            is clipped to ``end``. Nothing is yielded when start >= end.
        Raises:
            ValueError: if range_days is not positive. Because this is a
            generator, the error surfaces when iteration begins.
        """
        # A non-positive step would never move the cursor past `end`, so the
        # loop below would run forever; reject it explicitly.
        if range_days <= 0:
            raise ValueError(f"range_days must be positive, got {range_days}")
        if start > end:
            return
        next_start = start
        period = pendulum.Duration(days=range_days)
        while next_start < end:
            next_end = min(next_start + period, end)
            yield next_start, next_end
            next_start = next_end
class AdjustableSliceGenerator(SliceGenerator):
    """
    Generate slices from start_date up to end_date, where the length of each
    slice depends on whether the previous slice was processed successfully
    and on how long that took.

    The algorithm is as follows:
    1. The first slice is INITIAL_RANGE_DAYS (30 days) long.
    2. Once the stream has processed a slice, it is expected to call
       "adjust_range" with the time the previous request took.
    3. From the previous slice length we get a days-per-minute processing
       speed. Dividing that speed by REQUEST_PER_MINUTE_LIMIT (4) gives the
       next slice length, capped at MAX_RANGE_DAYS (180 days).

    If processing of the previous slice did not complete, "reduce_range"
    should be called. It moves the start back to the previous slice start
    and divides the range by RANGE_REDUCE_FACTOR (2), never going below
    INITIAL_RANGE_DAYS.

    If the range was not adjusted before the next slice is requested (which
    can happen when there were no records for the given date range), the
    next slice is MAX_RANGE_DAYS (180) long.
    """

    REQUEST_PER_MINUTE_LIMIT = 4
    INITIAL_RANGE_DAYS: int = 30
    DEFAULT_RANGE_DAYS: int = 90
    MAX_RANGE_DAYS: int = 180
    RANGE_REDUCE_FACTOR = 2

    # Serves two purposes: before an adjustment it holds the length of the
    # previous range, and after an adjustment it holds the length to use for
    # the next slice.
    _current_range: int = INITIAL_RANGE_DAYS
    # Start of the most recently issued slice, kept so a failed slice can be
    # retried from the same point.
    _prev_start_date: DateTime = None
    # False means adjust_range was not called since the last slice (no records
    # for it), in which case the next slice uses MAX_RANGE_DAYS. It starts as
    # True so that the first slice keeps the INITIAL_RANGE_DAYS (30 days) length.
    _range_adjusted = True

    def adjust_range(self, previous_request_time: Period):
        """
        Compute the next slice length in days from the previous slice length
        and the time it took to process.
        """
        elapsed_minutes = previous_request_time.total_minutes()
        if elapsed_minutes != 0:
            speed = self._current_range / elapsed_minutes
            candidate = math.floor(speed / self.REQUEST_PER_MINUTE_LIMIT)
            if not candidate:
                # A result of zero days is unusable; use the default length.
                candidate = self.DEFAULT_RANGE_DAYS
            self._current_range = min(candidate, self.MAX_RANGE_DAYS)
        else:
            # No measurable time spent, so no speed can be derived.
            self._current_range = self.DEFAULT_RANGE_DAYS
        self._range_adjusted = True

    def reduce_range(self) -> StreamSlice:
        """
        Meant to be called when processing of a slice failed.

        Divides the current range by RANGE_REDUCE_FACTOR (keeping it at least
        INITIAL_RANGE_DAYS), rewinds to the previous slice start and returns
        the rebuilt slice to try again.
        """
        shrunk = self._current_range / self.RANGE_REDUCE_FACTOR
        self._current_range = int(max(shrunk, self.INITIAL_RANGE_DAYS))
        retry_start = self._prev_start_date
        retry_end = min(self._end_date, retry_start + pendulum.Duration(days=self._current_range))
        # Later slices continue from the end of the retried one.
        self._start_date = retry_end
        return StreamSlice(start_date=retry_start, end_date=retry_end)

    def __next__(self) -> StreamSlice:
        """
        Produce the next slice based on the outcome of the previous one. Any
        call to adjust_range or reduce_range must happen before this method
        is invoked, since the range they set is consumed here.
        """
        if self._start_date >= self._end_date:
            raise StopIteration()

        if not self._range_adjusted:
            self._current_range = self.MAX_RANGE_DAYS

        slice_start = self._start_date
        slice_end = min(self._end_date, slice_start + pendulum.Duration(days=self._current_range))
        upcoming = StreamSlice(start_date=slice_start, end_date=slice_end)

        self._prev_start_date = slice_start
        self._start_date = slice_end
        self._range_adjusted = False
        return upcoming