Run mypy on airbyte-cdk as part of the build pipeline and fix typing issues in the file-based module (#27790)
* Try running only on modified files * make a change * return something with the wrong type * Revert "return something with the wrong type" This reverts commit 23b828371e. * fix typing in file-based * format * Mypy * fix * leave as Mapping * Revert "leave as Mapping" This reverts commit 908f063f70. * Use Dict * update * move dict() * Revert "move dict()" This reverts commit fa347a8236. * Revert "Revert "move dict()"" This reverts commit c9237df2e4. * Revert "Revert "Revert "move dict()""" This reverts commit 5ac1616414. * use Mapping * point to config file * comment * strict = False * remove -- * Revert "comment" This reverts commit 6000814a82. * install types * install types in same command as mypy runs * non-interactive * freeze version * pydantic plugin * plugins * update * ignore missing import * Revert "ignore missing import" This reverts commit 1da7930fb7. * Install pydantic instead * fix * this passes locally * strict = true * format * explicitly import models * Update * remove old mypy.ini config * temporarily disable mypy * format * any * format * fix tests * format * Automated Commit - Formatting Changes * Revert "temporarily disable mypy" This reverts commit eb8470fa3f. * implicit reexport * update test * fix mypy * Automated Commit - Formatting Changes * fix some errors in tests * more type fixes * more fixes * more * . * done with tests * fix last files * format * Update gradle * change source-stripe * only run mypy on cdk * remove strict * Add more rules * update * ignore missing imports * cast to string * Allow untyped decorator * reset to master * move to the cdk * derp * move explicit imports around * Automated Commit - Formatting Changes * Revert "move explicit imports around" This reverts commit 56e306b72f. 
* move explicit imports around * Upgrade mypy version * point to config file * Update readme * Ignore errors in the models module * Automated Commit - Formatting Changes * move check to gradle build * Any * try checking out master too * Revert "try checking out master too" This reverts commit 8a8f3e373c. * fetch master * install mypy * try without origin * fetch from the script * checkout master * ls the branches * remotes/origin/master * remove some cruft * comment * remove pydantic types * unpin mypy * fetch from the script * Update connectors base too * modify a non-cdk file to confirm it doesn't get checked by mypy * run mypy after generateComponentManifestClassFiles * run from the venv * pass files as arguments * update * fix when running without args * with subdir * path * try without / * ./ * remove filter * try resetting * Revert "try resetting" This reverts commit 3a54c424de. * exclude autogen file * do not use the github action * works locally * remove extra fetch * run on connectors base * try bad typing * Revert "try bad typing" This reverts commit 33b512a3e4. * reset stripe * Revert "reset stripe" This reverts commit 28f23fc6dd. * Revert "Revert "reset stripe"" This reverts commit 5bf5dee371. * missing return type * do not ignore the autogen file * remove extra installs * run from venv * Only check files modified on current branch * Revert "Only check files modified on current branch" This reverts commit b4b728e654. * use merge-base * Revert "use merge-base" This reverts commit 3136670cbf. * try with updated mypy * bump * run other steps after mypy * reset task ordering * run mypy though * looser config * tests pass * fix mypy issues * type: ignore * optional * this is always a bool * ignore * fix typing issues * remove ignore * remove mapping * Automated Commit - Formatting Changes * Revert "remove ignore" This reverts commit 9ffeeb6cb1. 
* update config --------- Co-authored-by: girarda <girarda@users.noreply.github.com> Co-authored-by: Joe Bell <joseph.bell@airbyte.io>
This commit is contained in:
@@ -3,7 +3,7 @@
|
||||
#
|
||||
|
||||
from abc import abstractmethod
|
||||
from functools import cached_property
|
||||
from functools import cached_property, lru_cache
|
||||
from typing import Any, Dict, Iterable, List, Mapping, Optional
|
||||
|
||||
from airbyte_cdk.models import ConfiguredAirbyteCatalog, SyncMode
|
||||
@@ -14,7 +14,7 @@ from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFile
|
||||
from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser
|
||||
from airbyte_cdk.sources.file_based.remote_file import RemoteFile
|
||||
from airbyte_cdk.sources.file_based.schema_validation_policies import AbstractSchemaValidationPolicy
|
||||
from airbyte_cdk.sources.file_based.types import StreamSlice, StreamState
|
||||
from airbyte_cdk.sources.file_based.types import StreamSlice
|
||||
from airbyte_cdk.sources.streams import Stream
|
||||
from airbyte_cdk.sources.streams.availability_strategy import AvailabilityStrategy
|
||||
|
||||
@@ -69,15 +69,17 @@ class AbstractFileBasedStream(Stream):
|
||||
def read_records(
|
||||
self,
|
||||
sync_mode: SyncMode,
|
||||
cursor_field: List[str] = None,
|
||||
cursor_field: Optional[List[str]] = None,
|
||||
stream_slice: Optional[StreamSlice] = None,
|
||||
stream_state: Optional[StreamState] = None,
|
||||
stream_state: Optional[Mapping[str, Any]] = None,
|
||||
) -> Iterable[Mapping[str, Any]]:
|
||||
"""
|
||||
Yield all records from all remote files in `list_files_for_this_sync`.
|
||||
This method acts as an adapter between the generic Stream interface and the file-based's
|
||||
stream since file-based streams manage their own states.
|
||||
"""
|
||||
if stream_slice is None:
|
||||
raise ValueError("stream_slice must be set")
|
||||
return self.read_records_from_slice(stream_slice)
|
||||
|
||||
@abstractmethod
|
||||
@@ -88,7 +90,7 @@ class AbstractFileBasedStream(Stream):
|
||||
...
|
||||
|
||||
def stream_slices(
|
||||
self, *, sync_mode: SyncMode, cursor_field: List[str] = None, stream_state: StreamState = None
|
||||
self, *, sync_mode: SyncMode, cursor_field: Optional[List[str]] = None, stream_state: Optional[Mapping[str, Any]] = None
|
||||
) -> Iterable[Optional[Mapping[str, Any]]]:
|
||||
"""
|
||||
This method acts as an adapter between the generic Stream interface and the file-based's
|
||||
@@ -105,6 +107,7 @@ class AbstractFileBasedStream(Stream):
|
||||
...
|
||||
|
||||
@abstractmethod
|
||||
@lru_cache(maxsize=None)
|
||||
def get_json_schema(self) -> Mapping[str, Any]:
|
||||
"""
|
||||
Return the JSON Schema for a stream.
|
||||
@@ -133,7 +136,7 @@ class AbstractFileBasedStream(Stream):
|
||||
)
|
||||
|
||||
@cached_property
|
||||
def availability_strategy(self):
|
||||
def availability_strategy(self) -> AvailabilityStrategy:
|
||||
return self._availability_strategy
|
||||
|
||||
@property
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
|
||||
import logging
|
||||
from datetime import datetime, timedelta
|
||||
from typing import Iterable, Mapping, Optional
|
||||
from typing import Iterable, MutableMapping, Optional
|
||||
|
||||
from airbyte_cdk.sources.file_based.remote_file import RemoteFile
|
||||
from airbyte_cdk.sources.file_based.stream.cursor.file_based_cursor import FileBasedCursor
|
||||
@@ -16,7 +16,7 @@ class DefaultFileBasedCursor(FileBasedCursor):
|
||||
DATE_TIME_FORMAT = "%Y-%m-%dT%H:%M:%S.%fZ"
|
||||
|
||||
def __init__(self, max_history_size: int, days_to_sync_if_history_is_full: Optional[int]):
|
||||
self._file_to_datetime_history: Mapping[str:datetime] = {}
|
||||
self._file_to_datetime_history: MutableMapping[str, str] = {}
|
||||
self._max_history_size = max_history_size
|
||||
self._time_window_if_history_is_full = timedelta(
|
||||
days=days_to_sync_if_history_is_full or self.DEFAULT_DAYS_TO_SYNC_IF_HISTORY_IS_FULL
|
||||
@@ -62,7 +62,7 @@ class DefaultFileBasedCursor(FileBasedCursor):
|
||||
def _should_sync_file(self, file: RemoteFile, logger: logging.Logger) -> bool:
|
||||
if file.uri in self._file_to_datetime_history:
|
||||
# If the file's uri is in the history, we should sync the file if it has been modified since it was synced
|
||||
updated_at_from_history = datetime.strptime(self._file_to_datetime_history.get(file.uri), self.DATE_TIME_FORMAT)
|
||||
updated_at_from_history = datetime.strptime(self._file_to_datetime_history[file.uri], self.DATE_TIME_FORMAT)
|
||||
if file.last_modified < updated_at_from_history:
|
||||
logger.warning(
|
||||
f"The file {file.uri}'s last modified date is older than the last time it was synced. This is unexpected. Skipping the file."
|
||||
@@ -71,6 +71,8 @@ class DefaultFileBasedCursor(FileBasedCursor):
|
||||
return file.last_modified > updated_at_from_history
|
||||
return file.last_modified > updated_at_from_history
|
||||
if self._is_history_full():
|
||||
if self._initial_earliest_file_in_history is None:
|
||||
return True
|
||||
if file.last_modified > self._initial_earliest_file_in_history.last_modified:
|
||||
# If the history is partial and the file's datetime is strictly greater than the earliest file in the history,
|
||||
# we should sync it
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
import logging
|
||||
from abc import ABC, abstractmethod
|
||||
from datetime import datetime
|
||||
from typing import Any, Iterable, Mapping
|
||||
from typing import Any, Iterable, MutableMapping
|
||||
|
||||
from airbyte_cdk.sources.file_based.remote_file import RemoteFile
|
||||
from airbyte_cdk.sources.file_based.types import StreamState
|
||||
@@ -32,7 +32,7 @@ class FileBasedCursor(ABC):
|
||||
"""
|
||||
|
||||
@abstractmethod
|
||||
def get_state(self) -> Mapping[str, Any]:
|
||||
def get_state(self) -> MutableMapping[str, Any]:
|
||||
"""
|
||||
Get the state of the cursor.
|
||||
"""
|
||||
|
||||
@@ -6,9 +6,11 @@ import asyncio
|
||||
import itertools
|
||||
import traceback
|
||||
from functools import cache
|
||||
from typing import Any, Dict, Iterable, List, Mapping, MutableMapping, Optional, Union
|
||||
from typing import Any, Dict, Iterable, List, Mapping, MutableMapping, Optional, Set, Union
|
||||
|
||||
from airbyte_cdk.models import AirbyteLogMessage, AirbyteMessage, Level, Type
|
||||
from airbyte_cdk.models import AirbyteLogMessage, AirbyteMessage, Level
|
||||
from airbyte_cdk.models import Type as MessageType
|
||||
from airbyte_cdk.sources.file_based.config.file_based_stream_config import PrimaryKeyType
|
||||
from airbyte_cdk.sources.file_based.exceptions import (
|
||||
FileBasedSourceError,
|
||||
InvalidSchemaError,
|
||||
@@ -36,7 +38,7 @@ class DefaultFileBasedStream(AbstractFileBasedStream, IncrementalMixin):
|
||||
ab_file_name_col = "_ab_source_file_url"
|
||||
airbyte_columns = [ab_last_mod_col, ab_file_name_col]
|
||||
|
||||
def __init__(self, cursor: FileBasedCursor, **kwargs):
|
||||
def __init__(self, cursor: FileBasedCursor, **kwargs: Any):
|
||||
super().__init__(**kwargs)
|
||||
self._cursor = cursor
|
||||
|
||||
@@ -45,12 +47,12 @@ class DefaultFileBasedStream(AbstractFileBasedStream, IncrementalMixin):
|
||||
return self._cursor.get_state()
|
||||
|
||||
@state.setter
|
||||
def state(self, value: MutableMapping[str, Any]):
|
||||
def state(self, value: MutableMapping[str, Any]) -> None:
|
||||
"""State setter, accept state serialized by state getter."""
|
||||
self._cursor.set_initial_state(value)
|
||||
|
||||
@property
|
||||
def primary_key(self) -> Optional[Union[str, List[str], List[List[str]]]]:
|
||||
def primary_key(self) -> PrimaryKeyType:
|
||||
return self.config.primary_key
|
||||
|
||||
def compute_slices(self) -> Iterable[Optional[Mapping[str, Any]]]:
|
||||
@@ -93,7 +95,7 @@ class DefaultFileBasedStream(AbstractFileBasedStream, IncrementalMixin):
|
||||
|
||||
except StopSyncPerValidationPolicy:
|
||||
yield AirbyteMessage(
|
||||
type=Type.LOG,
|
||||
type=MessageType.LOG,
|
||||
log=AirbyteLogMessage(
|
||||
level=Level.WARN,
|
||||
message=f"Stopping sync in accordance with the configured validation policy. Records in file did not conform to the schema. stream={self.name} file={file.uri} validation_policy={self.config.validation_policy} n_skipped={n_skipped}",
|
||||
@@ -103,7 +105,7 @@ class DefaultFileBasedStream(AbstractFileBasedStream, IncrementalMixin):
|
||||
|
||||
except Exception:
|
||||
yield AirbyteMessage(
|
||||
type=Type.LOG,
|
||||
type=MessageType.LOG,
|
||||
log=AirbyteLogMessage(
|
||||
level=Level.ERROR,
|
||||
message=f"{FileBasedSourceError.ERROR_PARSING_RECORD.value} stream={self.name} file={file.uri} line_no={line_no} n_skipped={n_skipped}",
|
||||
@@ -115,7 +117,7 @@ class DefaultFileBasedStream(AbstractFileBasedStream, IncrementalMixin):
|
||||
else:
|
||||
if n_skipped:
|
||||
yield AirbyteMessage(
|
||||
type=Type.LOG,
|
||||
type=MessageType.LOG,
|
||||
log=AirbyteLogMessage(
|
||||
level=Level.WARN,
|
||||
message=f"Records in file did not pass validation policy. stream={self.name} file={file.uri} n_skipped={n_skipped} validation_policy={self.validation_policy.name}",
|
||||
@@ -141,12 +143,11 @@ class DefaultFileBasedStream(AbstractFileBasedStream, IncrementalMixin):
|
||||
except Exception as exc:
|
||||
raise SchemaInferenceError(FileBasedSourceError.SCHEMA_INFERENCE_ERROR, stream=self.name) from exc
|
||||
else:
|
||||
schema["properties"] = {**extra_fields, **schema["properties"]}
|
||||
return schema
|
||||
return {"type": "object", "properties": {**extra_fields, **schema["properties"]}}
|
||||
|
||||
def _get_raw_json_schema(self) -> JsonSchema:
|
||||
if self.config.input_schema:
|
||||
return self.config.input_schema
|
||||
return self.config.input_schema # type: ignore
|
||||
elif self.config.schemaless:
|
||||
return schemaless_schema
|
||||
else:
|
||||
@@ -180,7 +181,7 @@ class DefaultFileBasedStream(AbstractFileBasedStream, IncrementalMixin):
|
||||
The output of this method is cached so we don't need to list the files more than once.
|
||||
This means we won't pick up changes to the files during a sync.
|
||||
"""
|
||||
return list(self._stream_reader.get_matching_files(self.config.globs))
|
||||
return list(self._stream_reader.get_matching_files(self.config.globs or []))
|
||||
|
||||
def infer_schema(self, files: List[RemoteFile]) -> Mapping[str, Any]:
|
||||
loop = asyncio.get_event_loop()
|
||||
@@ -193,13 +194,13 @@ class DefaultFileBasedStream(AbstractFileBasedStream, IncrementalMixin):
|
||||
Each file type has a corresponding `infer_schema` handler.
|
||||
Dispatch on file type.
|
||||
"""
|
||||
base_schema: Dict[str, str] = {}
|
||||
pending_tasks = set()
|
||||
base_schema: Dict[str, Any] = {}
|
||||
pending_tasks: Set[asyncio.tasks.Task[Dict[str, Any]]] = set()
|
||||
|
||||
n_started, n_files = 0, len(files)
|
||||
files = iter(files)
|
||||
files_iterator = iter(files)
|
||||
while pending_tasks or n_started < n_files:
|
||||
while len(pending_tasks) <= self._discovery_policy.n_concurrent_requests and (file := next(files, None)):
|
||||
while len(pending_tasks) <= self._discovery_policy.n_concurrent_requests and (file := next(files_iterator, None)):
|
||||
pending_tasks.add(asyncio.create_task(self._infer_file_schema(file)))
|
||||
n_started += 1
|
||||
# Return when the first task is completed so that we can enqueue a new task as soon as the
|
||||
@@ -210,7 +211,7 @@ class DefaultFileBasedStream(AbstractFileBasedStream, IncrementalMixin):
|
||||
|
||||
return base_schema
|
||||
|
||||
async def _infer_file_schema(self, file: RemoteFile) -> Mapping[str, Any]:
|
||||
async def _infer_file_schema(self, file: RemoteFile) -> Dict[str, Any]:
|
||||
try:
|
||||
return await self.get_parser(self.config.file_type).infer_schema(self.config, file, self._stream_reader, self.logger)
|
||||
except Exception as exc:
|
||||
|
||||
Reference in New Issue
Block a user