1
0
mirror of synced 2026-01-24 16:01:55 -05:00
Files
airbyte/airbyte-cdk/python/unit_tests/sources/file_based/helpers.py
Alexandre Girard 0aa86cf156 File-based CDK + Source S3 (v4): Pass configured file encoding to stream reader (#29110)
* Add encoding to open_file interface

* pass the encoding set in the config

* cleanup

* cleanup

* Automated Commit - Formatting Changes

* Add missing test

* Automated Commit - Formatting Changes

* Update infer_schema too

* Automated Commit - Formatting Changes

* Update unit test

* add a unit test

* fix

* format

* format

* remove newline

* use a mock

* fix

* format

---------

Co-authored-by: girarda <girarda@users.noreply.github.com>
2023-08-09 09:05:06 -05:00

67 lines
2.5 KiB
Python

#
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
#
import logging
from datetime import datetime
from io import IOBase
from typing import Any, Dict, List, Mapping, Optional
from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig
from airbyte_cdk.sources.file_based.discovery_policy import DefaultDiscoveryPolicy
from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader, FileReadMode
from airbyte_cdk.sources.file_based.file_types.csv_parser import CsvParser
from airbyte_cdk.sources.file_based.file_types.jsonl_parser import JsonlParser
from airbyte_cdk.sources.file_based.remote_file import RemoteFile
from airbyte_cdk.sources.file_based.schema_validation_policies import AbstractSchemaValidationPolicy
from airbyte_cdk.sources.file_based.stream.cursor import DefaultFileBasedCursor
from unit_tests.sources.file_based.in_memory_files_source import InMemoryFilesStreamReader
class EmptySchemaParser(CsvParser):
async def infer_schema(self, config: FileBasedStreamConfig, file: RemoteFile, stream_reader: AbstractFileBasedStreamReader, logger: logging.Logger) -> Dict[str, Any]:
return {}
class LowInferenceLimitDiscoveryPolicy(DefaultDiscoveryPolicy):
@property
def max_n_files_for_schema_inference(self) -> int:
return 1
class LowInferenceBytesJsonlParser(JsonlParser):
MAX_BYTES_PER_FILE_FOR_SCHEMA_INFERENCE = 1
class TestErrorListMatchingFilesInMemoryFilesStreamReader(InMemoryFilesStreamReader):
def get_matching_files(
self,
globs: List[str],
from_date: Optional[datetime] = None,
) -> List[RemoteFile]:
raise Exception("Error listing files")
class TestErrorOpenFileInMemoryFilesStreamReader(InMemoryFilesStreamReader):
def open_file(self, file: RemoteFile, file_read_mode: FileReadMode, encoding: Optional[str], logger: logging.Logger) -> IOBase:
raise Exception("Error opening file")
class FailingSchemaValidationPolicy(AbstractSchemaValidationPolicy):
ALWAYS_FAIL = "always_fail"
validate_schema_before_sync = True
def record_passes_validation_policy(self, record: Mapping[str, Any], schema: Optional[Mapping[str, Any]]) -> bool:
return False
class LowHistoryLimitCursor(DefaultFileBasedCursor):
DEFAULT_MAX_HISTORY_SIZE = 3
def make_remote_files(files: List[str]) -> List[RemoteFile]:
return [
RemoteFile(uri=f, last_modified=datetime.strptime("2023-06-05T03:54:07.000Z", "%Y-%m-%dT%H:%M:%S.%fZ"))
for f in files
]