82 lines
2.6 KiB
Python
82 lines
2.6 KiB
Python
#
|
|
# Copyright (c) 2022 Airbyte, Inc., all rights reserved.
|
|
#
|
|
|
|
import json
|
|
import logging
|
|
from pathlib import Path
|
|
|
|
from airbyte_cdk.models import AirbyteStream, ConfiguredAirbyteCatalog, ConfiguredAirbyteStream, DestinationSyncMode, SyncMode
|
|
from source_file.source import SourceFile
|
|
|
|
HERE = Path(__file__).parent.absolute()
|
|
|
|
|
|
def test_csv_with_utf16_encoding():
|
|
|
|
config_local_csv_utf16 = {
|
|
"dataset_name": "AAA",
|
|
"format": "csv",
|
|
"reader_options": '{"encoding":"utf_16"}',
|
|
"url": f"{HERE}/../integration_tests/sample_files/test_utf16.csv",
|
|
"provider": {"storage": "local"},
|
|
}
|
|
expected_schema = {
|
|
"$schema": "http://json-schema.org/draft-07/schema#",
|
|
"properties": {
|
|
"header1": {"type": ["string", "null"]},
|
|
"header2": {"type": ["number", "null"]},
|
|
"header3": {"type": ["number", "null"]},
|
|
"header4": {"type": ["boolean", "null"]},
|
|
},
|
|
"type": "object",
|
|
}
|
|
|
|
catalog = SourceFile().discover(logger=logging.getLogger("airbyte"), config=config_local_csv_utf16)
|
|
stream = next(iter(catalog.streams))
|
|
assert stream.json_schema == expected_schema
|
|
|
|
|
|
def get_catalog(properties):
|
|
return ConfiguredAirbyteCatalog(
|
|
streams=[
|
|
ConfiguredAirbyteStream(
|
|
stream=AirbyteStream(
|
|
name="test",
|
|
json_schema={"$schema": "http://json-schema.org/draft-07/schema#", "type": "object", "properties": properties},
|
|
),
|
|
sync_mode=SyncMode.full_refresh,
|
|
destination_sync_mode=DestinationSyncMode.overwrite,
|
|
)
|
|
]
|
|
)
|
|
|
|
|
|
def test_nan_to_null():
|
|
"""make sure numpy.nan converted to None"""
|
|
config = {
|
|
"dataset_name": "test",
|
|
"format": "csv",
|
|
"reader_options": json.dumps({"sep": ";"}),
|
|
"url": f"{HERE}/../integration_tests/sample_files/test_nan.csv",
|
|
"provider": {"storage": "local"},
|
|
}
|
|
|
|
catalog = get_catalog(
|
|
{
|
|
"col1": {"type": ["string", "null"]},
|
|
"col2": {"type": ["number", "null"]},
|
|
"col3": {"type": ["number", "null"]},
|
|
}
|
|
)
|
|
|
|
source = SourceFile()
|
|
records = source.read(logger=logging.getLogger("airbyte"), config=config, catalog=catalog)
|
|
records = [r.record.data for r in records]
|
|
assert records == [
|
|
{"col1": "key1", "col2": 1.11, "col3": None},
|
|
{"col1": "key2", "col2": None, "col3": 2.22},
|
|
{"col1": "key3", "col2": None, "col3": None},
|
|
{"col1": "key4", "col2": 3.33, "col3": None},
|
|
]
|