1
0
mirror synced 2026-01-07 09:05:45 -05:00

Run mypy on airbyte-cdk as part of the build pipeline and fix typing issues in the file-based module (#27790)

* Try running only on modified files

* make a change

* return something with the wrong type

* Revert "return something with the wrong type"

This reverts commit 23b828371e.

* fix typing in file-based

* format

* Mypy

* fix

* leave as Mapping

* Revert "leave as Mapping"

This reverts commit 908f063f70.

* Use Dict

* update

* move dict()

* Revert "move dict()"

This reverts commit fa347a8236.

* Revert "Revert "move dict()""

This reverts commit c9237df2e4.

* Revert "Revert "Revert "move dict()"""

This reverts commit 5ac1616414.

* use Mapping

* point to config file

* comment

* strict = False

* remove --

* Revert "comment"

This reverts commit 6000814a82.

* install types

* install types in same command as mypy runs

* non-interactive

* freeze version

* pydantic plugin

* plugins

* update

* ignore missing import

* Revert "ignore missing import"

This reverts commit 1da7930fb7.

* Install pydantic instead

* fix

* this passes locally

* strict = true

* format

* explicitly import models

* Update

* remove old mypy.ini config

* temporarily disable mypy

* format

* any

* format

* fix tests

* format

* Automated Commit - Formatting Changes

* Revert "temporarily disable mypy"

This reverts commit eb8470fa3f.

* implicit reexport

* update test

* fix mypy

* Automated Commit - Formatting Changes

* fix some errors in tests

* more type fixes

* more fixes

* more

* .

* done with tests

* fix last files

* format

* Update gradle

* change source-stripe

* only run mypy on cdk

* remove strict

* Add more rules

* update

* ignore missing imports

* cast to string

* Allow untyped decorator

* reset to master

* move to the cdk

* derp

* move explicit imports around

* Automated Commit - Formatting Changes

* Revert "move explicit imports around"

This reverts commit 56e306b72f.

* move explicit imports around

* Upgrade mypy version

* point to config file

* Update readme

* Ignore errors in the models module

* Automated Commit - Formatting Changes

* move check to gradle build

* Any

* try checking out master too

* Revert "try checking out master too"

This reverts commit 8a8f3e373c.

* fetch master

* install mypy

* try without origin

* fetch from the script

* checkout master

* ls the branches

* remotes/origin/master

* remove some cruft

* comment

* remove pydantic types

* unpin mypy

* fetch from the script

* Update connectors base too

* modify a non-cdk file to confirm it doesn't get checked by mypy

* run mypy after generateComponentManifestClassFiles

* run from the venv

* pass files as arguments

* update

* fix when running without args

* with subdir

* path

* try without /

* ./

* remove filter

* try resetting

* Revert "try resetting"

This reverts commit 3a54c424de.

* exclude autogen file

* do not use the github action

* works locally

* remove extra fetch

* run on connectors base

* try bad  typing

* Revert "try bad  typing"

This reverts commit 33b512a3e4.

* reset stripe

* Revert "reset stripe"

This reverts commit 28f23fc6dd.

* Revert "Revert "reset stripe""

This reverts commit 5bf5dee371.

* missing return type

* do not ignore the autogen file

* remove extra installs

* run from venv

* Only check files modified on current branch

* Revert "Only check files modified on current branch"

This reverts commit b4b728e654.

* use merge-base

* Revert "use merge-base"

This reverts commit 3136670cbf.

* try with updated mypy

* bump

* run other steps after mypy

* reset task ordering

* run mypy though

* looser config

* tests pass

* fix mypy issues

* type: ignore

* optional

* this is always a bool

* ignore

* fix typing issues

* remove ignore

* remove mapping

* Automated Commit - Formatting Changes

* Revert "remove ignore"

This reverts commit 9ffeeb6cb1.

* update config

---------

Co-authored-by: girarda <girarda@users.noreply.github.com>
Co-authored-by: Joe Bell <joseph.bell@airbyte.io>
This commit is contained in:
Alexandre Girard
2023-07-13 16:55:48 -07:00
committed by GitHub
parent 792878b253
commit 97a353d5c5
38 changed files with 355 additions and 214 deletions

View File

@@ -3,7 +3,7 @@
#
from abc import abstractmethod
from functools import cached_property
from functools import cached_property, lru_cache
from typing import Any, Dict, Iterable, List, Mapping, Optional
from airbyte_cdk.models import ConfiguredAirbyteCatalog, SyncMode
@@ -14,7 +14,7 @@ from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFile
from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser
from airbyte_cdk.sources.file_based.remote_file import RemoteFile
from airbyte_cdk.sources.file_based.schema_validation_policies import AbstractSchemaValidationPolicy
from airbyte_cdk.sources.file_based.types import StreamSlice, StreamState
from airbyte_cdk.sources.file_based.types import StreamSlice
from airbyte_cdk.sources.streams import Stream
from airbyte_cdk.sources.streams.availability_strategy import AvailabilityStrategy
@@ -69,15 +69,17 @@ class AbstractFileBasedStream(Stream):
def read_records(
self,
sync_mode: SyncMode,
cursor_field: List[str] = None,
cursor_field: Optional[List[str]] = None,
stream_slice: Optional[StreamSlice] = None,
stream_state: Optional[StreamState] = None,
stream_state: Optional[Mapping[str, Any]] = None,
) -> Iterable[Mapping[str, Any]]:
"""
Yield all records from all remote files in `list_files_for_this_sync`.
This method acts as an adapter between the generic Stream interface and the file-based's
stream since file-based streams manage their own states.
"""
if stream_slice is None:
raise ValueError("stream_slice must be set")
return self.read_records_from_slice(stream_slice)
@abstractmethod
@@ -88,7 +90,7 @@ class AbstractFileBasedStream(Stream):
...
def stream_slices(
self, *, sync_mode: SyncMode, cursor_field: List[str] = None, stream_state: StreamState = None
self, *, sync_mode: SyncMode, cursor_field: Optional[List[str]] = None, stream_state: Optional[Mapping[str, Any]] = None
) -> Iterable[Optional[Mapping[str, Any]]]:
"""
This method acts as an adapter between the generic Stream interface and the file-based's
@@ -105,6 +107,7 @@ class AbstractFileBasedStream(Stream):
...
@abstractmethod
@lru_cache(maxsize=None)
def get_json_schema(self) -> Mapping[str, Any]:
"""
Return the JSON Schema for a stream.
@@ -133,7 +136,7 @@ class AbstractFileBasedStream(Stream):
)
@cached_property
def availability_strategy(self):
def availability_strategy(self) -> AvailabilityStrategy:
return self._availability_strategy
@property

View File

@@ -4,7 +4,7 @@
import logging
from datetime import datetime, timedelta
from typing import Iterable, Mapping, Optional
from typing import Iterable, MutableMapping, Optional
from airbyte_cdk.sources.file_based.remote_file import RemoteFile
from airbyte_cdk.sources.file_based.stream.cursor.file_based_cursor import FileBasedCursor
@@ -16,7 +16,7 @@ class DefaultFileBasedCursor(FileBasedCursor):
DATE_TIME_FORMAT = "%Y-%m-%dT%H:%M:%S.%fZ"
def __init__(self, max_history_size: int, days_to_sync_if_history_is_full: Optional[int]):
self._file_to_datetime_history: Mapping[str:datetime] = {}
self._file_to_datetime_history: MutableMapping[str, str] = {}
self._max_history_size = max_history_size
self._time_window_if_history_is_full = timedelta(
days=days_to_sync_if_history_is_full or self.DEFAULT_DAYS_TO_SYNC_IF_HISTORY_IS_FULL
@@ -62,7 +62,7 @@ class DefaultFileBasedCursor(FileBasedCursor):
def _should_sync_file(self, file: RemoteFile, logger: logging.Logger) -> bool:
if file.uri in self._file_to_datetime_history:
# If the file's uri is in the history, we should sync the file if it has been modified since it was synced
updated_at_from_history = datetime.strptime(self._file_to_datetime_history.get(file.uri), self.DATE_TIME_FORMAT)
updated_at_from_history = datetime.strptime(self._file_to_datetime_history[file.uri], self.DATE_TIME_FORMAT)
if file.last_modified < updated_at_from_history:
logger.warning(
f"The file {file.uri}'s last modified date is older than the last time it was synced. This is unexpected. Skipping the file."
@@ -71,6 +71,8 @@ class DefaultFileBasedCursor(FileBasedCursor):
return file.last_modified > updated_at_from_history
return file.last_modified > updated_at_from_history
if self._is_history_full():
if self._initial_earliest_file_in_history is None:
return True
if file.last_modified > self._initial_earliest_file_in_history.last_modified:
# If the history is partial and the file's datetime is strictly greater than the earliest file in the history,
# we should sync it

View File

@@ -5,7 +5,7 @@
import logging
from abc import ABC, abstractmethod
from datetime import datetime
from typing import Any, Iterable, Mapping
from typing import Any, Iterable, MutableMapping
from airbyte_cdk.sources.file_based.remote_file import RemoteFile
from airbyte_cdk.sources.file_based.types import StreamState
@@ -32,7 +32,7 @@ class FileBasedCursor(ABC):
"""
@abstractmethod
def get_state(self) -> Mapping[str, Any]:
def get_state(self) -> MutableMapping[str, Any]:
"""
Get the state of the cursor.
"""

View File

@@ -6,9 +6,11 @@ import asyncio
import itertools
import traceback
from functools import cache
from typing import Any, Dict, Iterable, List, Mapping, MutableMapping, Optional, Union
from typing import Any, Dict, Iterable, List, Mapping, MutableMapping, Optional, Set, Union
from airbyte_cdk.models import AirbyteLogMessage, AirbyteMessage, Level, Type
from airbyte_cdk.models import AirbyteLogMessage, AirbyteMessage, Level
from airbyte_cdk.models import Type as MessageType
from airbyte_cdk.sources.file_based.config.file_based_stream_config import PrimaryKeyType
from airbyte_cdk.sources.file_based.exceptions import (
FileBasedSourceError,
InvalidSchemaError,
@@ -36,7 +38,7 @@ class DefaultFileBasedStream(AbstractFileBasedStream, IncrementalMixin):
ab_file_name_col = "_ab_source_file_url"
airbyte_columns = [ab_last_mod_col, ab_file_name_col]
def __init__(self, cursor: FileBasedCursor, **kwargs):
def __init__(self, cursor: FileBasedCursor, **kwargs: Any):
super().__init__(**kwargs)
self._cursor = cursor
@@ -45,12 +47,12 @@ class DefaultFileBasedStream(AbstractFileBasedStream, IncrementalMixin):
return self._cursor.get_state()
@state.setter
def state(self, value: MutableMapping[str, Any]):
def state(self, value: MutableMapping[str, Any]) -> None:
"""State setter, accept state serialized by state getter."""
self._cursor.set_initial_state(value)
@property
def primary_key(self) -> Optional[Union[str, List[str], List[List[str]]]]:
def primary_key(self) -> PrimaryKeyType:
return self.config.primary_key
def compute_slices(self) -> Iterable[Optional[Mapping[str, Any]]]:
@@ -93,7 +95,7 @@ class DefaultFileBasedStream(AbstractFileBasedStream, IncrementalMixin):
except StopSyncPerValidationPolicy:
yield AirbyteMessage(
type=Type.LOG,
type=MessageType.LOG,
log=AirbyteLogMessage(
level=Level.WARN,
message=f"Stopping sync in accordance with the configured validation policy. Records in file did not conform to the schema. stream={self.name} file={file.uri} validation_policy={self.config.validation_policy} n_skipped={n_skipped}",
@@ -103,7 +105,7 @@ class DefaultFileBasedStream(AbstractFileBasedStream, IncrementalMixin):
except Exception:
yield AirbyteMessage(
type=Type.LOG,
type=MessageType.LOG,
log=AirbyteLogMessage(
level=Level.ERROR,
message=f"{FileBasedSourceError.ERROR_PARSING_RECORD.value} stream={self.name} file={file.uri} line_no={line_no} n_skipped={n_skipped}",
@@ -115,7 +117,7 @@ class DefaultFileBasedStream(AbstractFileBasedStream, IncrementalMixin):
else:
if n_skipped:
yield AirbyteMessage(
type=Type.LOG,
type=MessageType.LOG,
log=AirbyteLogMessage(
level=Level.WARN,
message=f"Records in file did not pass validation policy. stream={self.name} file={file.uri} n_skipped={n_skipped} validation_policy={self.validation_policy.name}",
@@ -141,12 +143,11 @@ class DefaultFileBasedStream(AbstractFileBasedStream, IncrementalMixin):
except Exception as exc:
raise SchemaInferenceError(FileBasedSourceError.SCHEMA_INFERENCE_ERROR, stream=self.name) from exc
else:
schema["properties"] = {**extra_fields, **schema["properties"]}
return schema
return {"type": "object", "properties": {**extra_fields, **schema["properties"]}}
def _get_raw_json_schema(self) -> JsonSchema:
if self.config.input_schema:
return self.config.input_schema
return self.config.input_schema # type: ignore
elif self.config.schemaless:
return schemaless_schema
else:
@@ -180,7 +181,7 @@ class DefaultFileBasedStream(AbstractFileBasedStream, IncrementalMixin):
The output of this method is cached so we don't need to list the files more than once.
This means we won't pick up changes to the files during a sync.
"""
return list(self._stream_reader.get_matching_files(self.config.globs))
return list(self._stream_reader.get_matching_files(self.config.globs or []))
def infer_schema(self, files: List[RemoteFile]) -> Mapping[str, Any]:
loop = asyncio.get_event_loop()
@@ -193,13 +194,13 @@ class DefaultFileBasedStream(AbstractFileBasedStream, IncrementalMixin):
Each file type has a corresponding `infer_schema` handler.
Dispatch on file type.
"""
base_schema: Dict[str, str] = {}
pending_tasks = set()
base_schema: Dict[str, Any] = {}
pending_tasks: Set[asyncio.tasks.Task[Dict[str, Any]]] = set()
n_started, n_files = 0, len(files)
files = iter(files)
files_iterator = iter(files)
while pending_tasks or n_started < n_files:
while len(pending_tasks) <= self._discovery_policy.n_concurrent_requests and (file := next(files, None)):
while len(pending_tasks) <= self._discovery_policy.n_concurrent_requests and (file := next(files_iterator, None)):
pending_tasks.add(asyncio.create_task(self._infer_file_schema(file)))
n_started += 1
# Return when the first task is completed so that we can enqueue a new task as soon as the
@@ -210,7 +211,7 @@ class DefaultFileBasedStream(AbstractFileBasedStream, IncrementalMixin):
return base_schema
async def _infer_file_schema(self, file: RemoteFile) -> Mapping[str, Any]:
async def _infer_file_schema(self, file: RemoteFile) -> Dict[str, Any]:
try:
return await self.get_parser(self.config.file_type).infer_schema(self.config, file, self._stream_reader, self.logger)
except Exception as exc: