[file-based cdk] add excel file type support (#43346)
This commit is contained in:
@@ -0,0 +1,17 @@
|
||||
#
|
||||
# Copyright (c) 2024 Airbyte, Inc., all rights reserved.
|
||||
#
|
||||
|
||||
from airbyte_cdk.utils.oneof_option_config import OneOfOptionConfig
|
||||
from pydantic.v1 import BaseModel, Field
|
||||
|
||||
|
||||
class ExcelFormat(BaseModel):
    """Format options for streams whose files are Excel workbooks.

    The model has no tunable settings: its only field is the constant
    ``filetype`` value that identifies this format among the other options.
    """

    class Config(OneOfOptionConfig):
        # Name of the field used to tell format options apart, and the display title.
        discriminator = "filetype"
        title = "Excel Format"

    # Always "excel"; declared as a constant field.
    filetype: str = Field(default="excel", const=True)
|
||||
@@ -1,5 +1,5 @@
|
||||
#
|
||||
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
|
||||
# Copyright (c) 2024 Airbyte, Inc., all rights reserved.
|
||||
#
|
||||
|
||||
from enum import Enum
|
||||
@@ -7,6 +7,7 @@ from typing import Any, List, Mapping, Optional, Union
|
||||
|
||||
from airbyte_cdk.sources.file_based.config.avro_format import AvroFormat
|
||||
from airbyte_cdk.sources.file_based.config.csv_format import CsvFormat
|
||||
from airbyte_cdk.sources.file_based.config.excel_format import ExcelFormat
|
||||
from airbyte_cdk.sources.file_based.config.jsonl_format import JsonlFormat
|
||||
from airbyte_cdk.sources.file_based.config.parquet_format import ParquetFormat
|
||||
from airbyte_cdk.sources.file_based.config.unstructured_format import UnstructuredFormat
|
||||
@@ -55,7 +56,7 @@ class FileBasedStreamConfig(BaseModel):
|
||||
description="When the state history of the file store is full, syncs will only read files that were last modified in the provided day range.",
|
||||
default=3,
|
||||
)
|
||||
format: Union[AvroFormat, CsvFormat, JsonlFormat, ParquetFormat, UnstructuredFormat] = Field(
|
||||
format: Union[AvroFormat, CsvFormat, JsonlFormat, ParquetFormat, UnstructuredFormat, ExcelFormat] = Field(
|
||||
title="Format",
|
||||
description="The configuration options that are used to alter how to read incoming files that deviate from the standard formatting.",
|
||||
)
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
from typing import Any, Mapping, Type
|
||||
|
||||
from airbyte_cdk.sources.file_based.config.avro_format import AvroFormat
|
||||
from airbyte_cdk.sources.file_based.config.excel_format import ExcelFormat
|
||||
from airbyte_cdk.sources.file_based.config.csv_format import CsvFormat
|
||||
from airbyte_cdk.sources.file_based.config.jsonl_format import JsonlFormat
|
||||
from airbyte_cdk.sources.file_based.config.parquet_format import ParquetFormat
|
||||
@@ -8,6 +9,7 @@ from airbyte_cdk.sources.file_based.config.unstructured_format import Unstructur
|
||||
|
||||
from .avro_parser import AvroParser
|
||||
from .csv_parser import CsvParser
|
||||
from .excel_parser import ExcelParser
|
||||
from .file_type_parser import FileTypeParser
|
||||
from .jsonl_parser import JsonlParser
|
||||
from .parquet_parser import ParquetParser
|
||||
@@ -16,9 +18,10 @@ from .unstructured_parser import UnstructuredParser
|
||||
default_parsers: Mapping[Type[Any], FileTypeParser] = {
|
||||
AvroFormat: AvroParser(),
|
||||
CsvFormat: CsvParser(),
|
||||
ExcelFormat: ExcelParser(),
|
||||
JsonlFormat: JsonlParser(),
|
||||
ParquetFormat: ParquetParser(),
|
||||
UnstructuredFormat: UnstructuredParser(),
|
||||
}
|
||||
|
||||
__all__ = ["AvroParser", "CsvParser", "JsonlParser", "ParquetParser", "UnstructuredParser", "default_parsers"]
|
||||
__all__ = ["AvroParser", "CsvParser", "ExcelParser", "JsonlParser", "ParquetParser", "UnstructuredParser", "default_parsers"]
|
||||
|
||||
@@ -0,0 +1,168 @@
|
||||
#
|
||||
# Copyright (c) 2024 Airbyte, Inc., all rights reserved.
|
||||
#
|
||||
|
||||
import logging
|
||||
from io import IOBase
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, Iterable, Mapping, Optional, Tuple, Union
|
||||
|
||||
import pandas as pd
|
||||
from airbyte_cdk.sources.file_based.config.file_based_stream_config import ExcelFormat, FileBasedStreamConfig
|
||||
from airbyte_cdk.sources.file_based.exceptions import ConfigValidationError, FileBasedSourceError, RecordParseError
|
||||
from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader, FileReadMode
|
||||
from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser
|
||||
from airbyte_cdk.sources.file_based.remote_file import RemoteFile
|
||||
from airbyte_cdk.sources.file_based.schema_helpers import SchemaType
|
||||
from numpy import datetime64
|
||||
from numpy import dtype as dtype_
|
||||
from numpy import issubdtype
|
||||
from pydantic.v1 import BaseModel
|
||||
|
||||
|
||||
class ExcelParser(FileTypeParser):
    """File-type parser that reads Excel workbooks into pandas DataFrames via the calamine engine."""

    # Files are opened in binary mode (see file_read_mode), so no text encoding is passed to the reader.
    ENCODING = None

    def check_config(self, config: FileBasedStreamConfig) -> Tuple[bool, Optional[str]]:
        """
        ExcelParser does not require config checks, implicit pydantic validation is enough.

        Returns:
            Tuple[bool, Optional[str]]: Always ``(True, None)``.
        """
        return True, None

    async def infer_schema(
        self,
        config: FileBasedStreamConfig,
        file: RemoteFile,
        stream_reader: AbstractFileBasedStreamReader,
        logger: logging.Logger,
    ) -> SchemaType:
        """
        Infers the schema of the Excel file by examining its contents.

        Args:
            config (FileBasedStreamConfig): Configuration for the file-based stream.
            file (RemoteFile): The remote file to be read.
            stream_reader (AbstractFileBasedStreamReader): Reader to read the file.
            logger (logging.Logger): Logger for logging information and errors.

        Returns:
            SchemaType: Inferred schema of the Excel file.

        Raises:
            ConfigValidationError: If ``config.format`` is not an ExcelFormat.
        """

        # Validate the format of the config
        self.validate_format(config.format, logger)

        fields: Dict[str, str] = {}

        with stream_reader.open_file(file, self.file_read_mode, self.ENCODING, logger) as fp:
            df = self.open_and_parse_file(fp)
            for column, df_type in df.dtypes.items():
                # Merge with any type already recorded under this column name so the broadest type wins.
                prev_frame_column_type = fields.get(column)
                fields[column] = self.dtype_to_json_type(prev_frame_column_type, df_type)

        # "date-time" is an internal marker: it is emitted as a string with a date-time format.
        schema = {
            field: ({"type": "string", "format": "date-time"} if fields[field] == "date-time" else {"type": fields[field]})
            for field in fields
        }
        return schema

    def parse_records(
        self,
        config: FileBasedStreamConfig,
        file: RemoteFile,
        stream_reader: AbstractFileBasedStreamReader,
        logger: logging.Logger,
        discovered_schema: Optional[Mapping[str, SchemaType]] = None,
    ) -> Iterable[Dict[str, Any]]:
        """
        Parses records from an Excel file based on the provided configuration.

        Args:
            config (FileBasedStreamConfig): Configuration for the file-based stream.
            file (RemoteFile): The remote file to be read.
            stream_reader (AbstractFileBasedStreamReader): Reader to read the file.
            logger (logging.Logger): Logger for logging information and errors.
            discovered_schema (Optional[Mapping[str, SchemaType]]): Discovered schema for validation.
                Not used by this parser.

        Yields:
            Iterable[Dict[str, Any]]: Parsed records from the Excel file.

        Raises:
            ConfigValidationError: If ``config.format`` is not an ExcelFormat.
            RecordParseError: If opening or parsing the file fails; the original error is chained as the cause.
        """

        # Validate the format of the config
        self.validate_format(config.format, logger)

        try:
            # Open and parse the file using the stream reader
            with stream_reader.open_file(file, self.file_read_mode, self.ENCODING, logger) as fp:
                df = self.open_and_parse_file(fp)
                # Yield records as dictionaries
                yield from df.to_dict(orient="records")

        except Exception as exc:
            # Raise a RecordParseError if any exception occurs during parsing
            raise RecordParseError(FileBasedSourceError.ERROR_PARSING_RECORD, filename=file.uri) from exc

    @property
    def file_read_mode(self) -> FileReadMode:
        """
        Returns the file read mode for the Excel file.

        Returns:
            FileReadMode: The file read mode (binary).
        """
        return FileReadMode.READ_BINARY

    @staticmethod
    def dtype_to_json_type(current_type: Optional[str], dtype: dtype_) -> str:
        """
        Convert Pandas DataFrame types to Airbyte Types.

        A specific type (number, boolean, date-time) is only returned when no type was
        recorded before or the earlier type is the same; any disagreement widens to "string".

        Args:
            current_type (Optional[str]): One of the previous types based on earlier dataframes.
            dtype: Pandas DataFrame type.

        Returns:
            str: Corresponding Airbyte Type.
        """
        number_types = ("int64", "float64")
        if current_type == "string":
            # Previous column values were of the string type, no need to look further.
            return current_type
        if dtype == object:
            return "string"
        if dtype in number_types and (not current_type or current_type == "number"):
            return "number"
        if dtype == "bool" and (not current_type or current_type == "boolean"):
            return "boolean"
        # Same compatibility rule as the branches above: a datetime column must not override
        # an earlier number/boolean verdict, it has to widen to string instead.
        if issubdtype(dtype, datetime64) and (not current_type or current_type == "date-time"):
            return "date-time"
        return "string"

    @staticmethod
    def validate_format(excel_format: BaseModel, logger: logging.Logger) -> None:
        """
        Validates if the given format is of type ExcelFormat.

        Args:
            excel_format (BaseModel): The format to be validated.
            logger (logging.Logger): Logger that receives a message describing the unexpected format.

        Raises:
            ConfigValidationError: If the format is not ExcelFormat.
        """
        if not isinstance(excel_format, ExcelFormat):
            logger.info(f"Expected ExcelFormat, got {excel_format}")
            raise ConfigValidationError(FileBasedSourceError.CONFIG_VALIDATION_ERROR)

    @staticmethod
    def open_and_parse_file(fp: Union[IOBase, str, Path]) -> pd.DataFrame:
        """
        Opens and parses the Excel file.

        ``parse()`` is called without arguments, so pandas' defaults decide which sheet is read.

        Args:
            fp: File pointer to the Excel file.

        Returns:
            pd.DataFrame: Parsed data from the Excel file.
        """
        excel_file = pd.ExcelFile(fp, engine="calamine")
        try:
            return excel_file.parse()
        finally:
            # Release the workbook held by the reader even when parsing fails.
            excel_file.close()
|
||||
1685
airbyte-cdk/python/poetry.lock
generated
1685
airbyte-cdk/python/poetry.lock
generated
File diff suppressed because it is too large
Load Diff
@@ -50,13 +50,16 @@ langchain = { version = "0.1.16", optional = true }
|
||||
langchain_core = { version = "0.1.42", optional = true }
|
||||
markdown = { version = "*", optional = true }
|
||||
openai = { version = "0.27.9", extras = ["embeddings"], optional = true }
|
||||
pandas = { version = "2.2.0", optional = true }
|
||||
pdf2image = { version = "1.16.3", optional = true }
|
||||
"pdfminer.six" = { version = "20221105", optional = true }
|
||||
pyarrow = { version = "~15.0.0", optional = true }
|
||||
pytesseract = { version = "0.3.10", optional = true }
|
||||
python-calamine = { version = "0.2.3", optional = true }
|
||||
Sphinx = { version = "~4.2", optional = true }
|
||||
sphinx-rtd-theme = { version = "~1.0", optional = true }
|
||||
tiktoken = { version = "0.4.0", optional = true }
|
||||
nltk = { version = "3.8.1", optional = true }
|
||||
unstructured = { version = "0.10.27", extras = ["docx", "pptx"], optional = true }
|
||||
"unstructured.pytesseract" = { version = ">=0.3.12", optional = true }
|
||||
pyjwt = "^2.8.0"
|
||||
@@ -66,7 +69,7 @@ pytz = "2024.1"
|
||||
[tool.poetry.group.dev.dependencies]
|
||||
freezegun = "*"
|
||||
mypy = "*"
|
||||
pandas = "2.0.3"
|
||||
asyncio = "3.4.3"
|
||||
poethepoet = "^0.24.2"
|
||||
pyproject-flake8 = "^6.1.0"
|
||||
pytest = "^7"
|
||||
@@ -77,7 +80,7 @@ pytest-mock = "*"
|
||||
requests-mock = "*"
|
||||
|
||||
[tool.poetry.extras]
|
||||
file-based = ["avro", "fastavro", "pyarrow", "unstructured", "pdf2image", "pdfminer.six", "unstructured.pytesseract", "pytesseract", "markdown"]
|
||||
file-based = ["avro", "fastavro", "pyarrow", "unstructured", "pdf2image", "pdfminer.six", "unstructured.pytesseract", "pytesseract", "markdown", "python-calamine", "pandas"]
|
||||
sphinx-docs = ["Sphinx", "sphinx-rtd-theme"]
|
||||
vector-db-based = ["langchain", "openai", "cohere", "tiktoken"]
|
||||
|
||||
|
||||
@@ -0,0 +1,120 @@
|
||||
#
|
||||
# Copyright (c) 2024 Airbyte, Inc., all rights reserved.
|
||||
#
|
||||
|
||||
|
||||
import datetime
|
||||
from io import BytesIO
|
||||
from unittest.mock import MagicMock, Mock, mock_open, patch
|
||||
|
||||
import pandas as pd
|
||||
import pytest
|
||||
from airbyte_cdk.sources.file_based.config.file_based_stream_config import ExcelFormat, FileBasedStreamConfig, ValidationPolicy
|
||||
from airbyte_cdk.sources.file_based.exceptions import ConfigValidationError, RecordParseError
|
||||
from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader
|
||||
from airbyte_cdk.sources.file_based.file_types.excel_parser import ExcelParser
|
||||
from airbyte_cdk.sources.file_based.remote_file import RemoteFile
|
||||
from airbyte_cdk.sources.file_based.schema_helpers import SchemaType
|
||||
|
||||
|
||||
@pytest.fixture
def mock_stream_reader():
    """Provide a plain Mock constrained to the AbstractFileBasedStreamReader interface."""
    reader = Mock(spec=AbstractFileBasedStreamReader)
    return reader
|
||||
|
||||
|
||||
@pytest.fixture
def mock_logger():
    """Provide an unconstrained Mock standing in for a logger."""
    logger = Mock()
    return logger
|
||||
|
||||
|
||||
@pytest.fixture
def file_config():
    """Provide a stream config that selects the Excel format."""
    # NOTE(review): ExcelFormat declares only `filetype`; confirm `sheet_name` is meant to be passed here.
    excel_format = ExcelFormat(sheet_name="Sheet1")
    stream_config = FileBasedStreamConfig(
        name="test.xlsx",
        file_type="excel",
        format=excel_format,
        validation_policy=ValidationPolicy.emit_record,
    )
    return stream_config
|
||||
|
||||
|
||||
@pytest.fixture
def remote_file():
    """Provide a RemoteFile pointing at an .xlsx object, stamped with the current local time."""
    modified_at = datetime.datetime.now()
    return RemoteFile(uri="s3://mybucket/test.xlsx", last_modified=modified_at)
|
||||
|
||||
|
||||
@pytest.fixture
def setup_parser(remote_file):
    """Bundle parser, stream config, remote file, stream reader, logger and source DataFrame for a test."""
    # Four columns: integers, strings, booleans and datetimes.
    frame = pd.DataFrame(
        {
            "column1": [1, 2, 3],
            "column2": ["a", "b", "c"],
            "column3": [True, False, True],
            "column4": pd.to_datetime(["2021-01-01", "2022-01-01", "2023-01-01"]),
        }
    )

    # Serialise the frame into an in-memory .xlsx workbook.
    workbook = BytesIO()
    with pd.ExcelWriter(workbook, engine="xlsxwriter") as writer:
        frame.to_excel(writer, index=False)
    workbook.seek(0)

    # The mocked reader hands back a fresh stream over the workbook bytes.
    reader = MagicMock(spec=AbstractFileBasedStreamReader)
    reader.open_file.return_value = BytesIO(workbook.read())

    stream_config = FileBasedStreamConfig(name="test_stream", format=ExcelFormat())
    return ExcelParser(), stream_config, remote_file, reader, MagicMock(), frame
|
||||
|
||||
|
||||
@patch("pandas.ExcelFile")
@pytest.mark.asyncio
async def test_infer_schema(mock_excel_file, setup_parser):
    """infer_schema maps int/object/bool/datetime columns to number/string/boolean/date-time."""
    parser, config, file, stream_reader, logger, data = setup_parser

    # What the schema should look like for the fixture's four columns.
    expected_schema: SchemaType = {
        "column1": {"type": "number"},
        "column2": {"type": "string"},
        "column3": {"type": "boolean"},
        "column4": {"type": "string", "format": "date-time"},
    }

    # The patched ExcelFile returns the fixture DataFrame from parse().
    mock_excel_file.return_value.parse.return_value = data

    inferred = await parser.infer_schema(config, file, stream_reader, logger)

    assert inferred == expected_schema

    # The file must have been opened exactly once, with the parser's read mode and encoding.
    stream_reader.open_file.assert_called_once_with(file, parser.file_read_mode, parser.ENCODING, logger)

    # Nothing should have been logged at info or error level.
    logger.info.assert_not_called()
    logger.error.assert_not_called()
|
||||
|
||||
|
||||
def test_invalid_format(mock_stream_reader, mock_logger, remote_file):
    """parse_records rejects a stream config whose format is not ExcelFormat."""
    csv_config = FileBasedStreamConfig(
        name="test.xlsx",
        file_type="csv",
        format={"filetype": "csv"},
        validation_policy=ValidationPolicy.emit_record,
    )
    excel_parser = ExcelParser()

    with pytest.raises(ConfigValidationError):
        # parse_records is a generator, so it has to be consumed for validation to run.
        records = excel_parser.parse_records(csv_config, remote_file, mock_stream_reader, mock_logger)
        list(records)
|
||||
|
||||
|
||||
def test_file_read_error(mock_stream_reader, mock_logger, file_config, remote_file):
    """A failure raised while parsing the workbook is surfaced as RecordParseError."""
    parser = ExcelParser()

    # Give the parser a stream that supports the context-manager protocol. The value returned by a
    # plain Mock does not, so without this the `with` statement itself fails and the test passes
    # without ever reaching the patched ExcelFile.parse().
    mock_stream_reader.open_file.return_value = BytesIO(b"corrupted data")

    with patch("pandas.ExcelFile") as mock_excel:
        mock_excel.return_value.parse.side_effect = ValueError("Failed to parse file")

        with pytest.raises(RecordParseError) as exc_info:
            list(parser.parse_records(file_config, remote_file, mock_stream_reader, mock_logger))

    # The parse failure must be the chained cause, proving the error came from the parsing step.
    assert isinstance(exc_info.value.__cause__, ValueError)
|
||||
@@ -209,3 +209,23 @@ class TemporaryAvroFilesStreamReader(InMemoryFilesStreamReader):
|
||||
file_writer.flush()
|
||||
fp.seek(0)
|
||||
return fp.read()
|
||||
|
||||
|
||||
class TemporaryExcelFilesStreamReader(InMemoryFilesStreamReader):
    """
    A file reader that serializes each file's in-memory "contents" to an .xlsx workbook and serves the bytes.
    """

    def open_file(self, file: RemoteFile, mode: FileReadMode, encoding: Optional[str], logger: logging.Logger) -> IOBase:
        """Return a binary stream over the workbook built for ``file.uri``; mode, encoding and logger are unused."""
        return io.BytesIO(self._make_file_contents(file.uri))

    def _make_file_contents(self, file_name: str) -> bytes:
        """Build a workbook (sheet "Sheet1", no index column) from the file's "contents" and return its bytes."""
        contents = self.files[file_name]["contents"]
        df = pd.DataFrame(contents)

        with io.BytesIO() as fp:
            # Leaving the writer's context saves and closes the workbook through the public API,
            # so the private ExcelWriter._save() is not needed.
            with pd.ExcelWriter(fp, engine="xlsxwriter") as writer:
                df.to_excel(writer, index=False, sheet_name="Sheet1")
            return fp.getvalue()
|
||||
|
||||
@@ -411,6 +411,14 @@ single_csv_scenario: TestScenario[InMemoryFilesSource] = (
|
||||
"description": "Extract text from document formats (.pdf, .docx, .md, .pptx) and emit as one record per file.",
|
||||
"required": ["filetype"],
|
||||
},
|
||||
{
|
||||
"title": "Excel Format",
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"filetype": {"title": "Filetype", "default": "excel", "const": "excel", "type": "string"}
|
||||
},
|
||||
"required": ["filetype"]
|
||||
}
|
||||
],
|
||||
},
|
||||
"schemaless": {
|
||||
|
||||
@@ -0,0 +1,424 @@
|
||||
#
|
||||
# Copyright (c) 2024 Airbyte, Inc., all rights reserved.
|
||||
#
|
||||
|
||||
import datetime
|
||||
|
||||
from unit_tests.sources.file_based.in_memory_files_source import TemporaryExcelFilesStreamReader
|
||||
from unit_tests.sources.file_based.scenarios.file_based_source_builder import FileBasedSourceBuilder
|
||||
from unit_tests.sources.file_based.scenarios.scenario_builder import TestScenarioBuilder
|
||||
|
||||
# One workbook, keyed by file name: "contents" holds one dict per row (two rows, two string columns)
# and "last_modified" is the file's timestamp string.
_single_excel_file = {
    "a.xlsx": {
        "contents": [
            {"col1": "val11", "col2": "val12"},
            {"col1": "val21", "col2": "val22"},
        ],
        "last_modified": "2023-06-05T03:54:07.000Z",
    }
}
|
||||
|
||||
# Two workbooks that share col_double and col_string but each add a column of their own
# (col_album in a.xlsx, col_song in b.xlsx).
_multiple_excel_combine_schema_file = {
    "a.xlsx": {
        "contents": [
            {"col_double": 20.02, "col_string": "Robbers", "col_album": "The 1975"},
            {"col_double": 20.23, "col_string": "Somebody Else", "col_album": "I Like It When You Sleep, for You Are So Beautiful yet So Unaware of It"},
        ],
        "last_modified": "2023-06-05T03:54:07.000Z",
    },
    "b.xlsx": {
        "contents": [
            {"col_double": 1975.1975, "col_string": "It's Not Living (If It's Not with You)", "col_song": "Love It If We Made It"},
            {"col_double": 5791.5791, "col_string": "The 1975", "col_song": "About You"},
        ],
        "last_modified": "2023-06-06T03:54:07.000Z",
    },
}
|
||||
|
||||
# One workbook with a single row covering bool, int, float, string, date and time values.
_excel_all_types_file = {
    "a.xlsx": {
        "contents": [
            {
                "col_bool": True,
                "col_int": 27,
                "col_long": 1992,
                "col_float": 999.09723456,
                "col_string": "Love It If We Made It",
                "col_date": datetime.date(2022, 5, 29),
                "col_time_millis": datetime.time(6, 0, 0, 456000),
                "col_time_micros": datetime.time(12, 0, 0, 456789),
            }
        ],
        "last_modified": "2023-06-05T03:54:07.000Z",
    }
}
|
||||
|
||||
# Two workbooks with unrelated columns, named so they can be matched by separate globs
# ("*_songs.xlsx" and "*_festivals.xlsx"). col_location holds a nested dict per row.
_multiple_excel_stream_file = {
    "odesza_songs.xlsx": {
        "contents": [
            {"col_title": "Late Night", "col_album": "A_MOMENT_APART", "col_year": 2017, "col_vocals": False},
            {"col_title": "White Lies", "col_album": "IN_RETURN", "col_year": 2014, "col_vocals": True},
            {"col_title": "Wide Awake", "col_album": "THE_LAST_GOODBYE", "col_year": 2022, "col_vocals": True},
        ],
        "last_modified": "2023-06-05T03:54:07.000Z"
    },
    "california_festivals.xlsx": {
        "contents": [
            {"col_name": "Lightning in a Bottle", "col_location": {"country": "USA", "state": "California", "city": "Buena Vista Lake"}, "col_attendance": 18000},
            {"col_name": "Outside Lands", "col_location": {"country": "USA", "state": "California", "city": "San Francisco"}, "col_attendance": 220000},
        ],
        "last_modified": "2023-06-06T03:54:07.000Z"
    },
}
|
||||
|
||||
# Scenario: one stream reading a single workbook. Expects the check to succeed, both rows to be
# emitted with the _ab_source_file_* metadata fields, and a catalog with two nullable string columns.
single_excel_scenario = (
    TestScenarioBuilder()
    .set_name("single_excel_stream")
    .set_config(
        {
            "streams": [
                {
                    "name": "stream1",
                    "format": {"filetype": "excel"},
                    "globs": ["*"],
                    "validation_policy": "Emit Record",
                }
            ]
        }
    )
    .set_source_builder(
        FileBasedSourceBuilder()
        .set_stream_reader(TemporaryExcelFilesStreamReader(files=_single_excel_file, file_type="excel"))
        .set_file_type("excel")
    )
    .set_expected_check_status("SUCCEEDED")
    .set_expected_records(
        [
            {
                "data": {
                    "col1": "val11",
                    "col2": "val12",
                    "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z",
                    "_ab_source_file_url": "a.xlsx",
                },
                "stream": "stream1",
            },
            {
                "data": {
                    "col1": "val21",
                    "col2": "val22",
                    "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z",
                    "_ab_source_file_url": "a.xlsx",
                },
                "stream": "stream1",
            },
        ]
    )
    .set_expected_catalog(
        {
            "streams": [
                {
                    "default_cursor_field": ["_ab_source_file_last_modified"],
                    "json_schema": {
                        "type": "object",
                        "properties": {
                            "col1": {"type": ["null", "string"]},
                            "col2": {"type": ["null", "string"]},
                            "_ab_source_file_last_modified": {"type": "string"},
                            "_ab_source_file_url": {"type": "string"},
                        },
                    },
                    "name": "stream1",
                    "source_defined_cursor": True,
                    "supported_sync_modes": ["full_refresh", "incremental"],
                    "is_resumable": True,
                }
            ]
        }
    )
).build()
|
||||
|
||||
# Scenario: one stream over two workbooks with partly different columns. Expects the records of
# both files and a catalog containing the union of their columns (col_album and col_song included).
multiple_excel_combine_schema_scenario = (
    TestScenarioBuilder()
    .set_name("multiple_excel_combine_schema_stream")
    .set_config(
        {
            "streams": [
                {
                    "name": "stream1",
                    "format": {"filetype": "excel"},
                    "globs": ["*"],
                    "validation_policy": "Emit Record",
                }
            ]
        }
    )
    .set_source_builder(
        FileBasedSourceBuilder()
        .set_stream_reader(TemporaryExcelFilesStreamReader(files=_multiple_excel_combine_schema_file, file_type="excel"))
        .set_file_type("excel")
    )
    .set_expected_records(
        [
            {
                "data": {
                    "col_double": 20.02,
                    "col_string": "Robbers",
                    "col_album": "The 1975",
                    "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z",
                    "_ab_source_file_url": "a.xlsx",
                },
                "stream": "stream1",
            },
            {
                "data": {
                    "col_double": 20.23,
                    "col_string": "Somebody Else",
                    "col_album": "I Like It When You Sleep, for You Are So Beautiful yet So Unaware of It",
                    "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z",
                    "_ab_source_file_url": "a.xlsx",
                },
                "stream": "stream1",
            },
            {
                "data": {
                    "col_double": 1975.1975,
                    "col_string": "It's Not Living (If It's Not with You)",
                    "col_song": "Love It If We Made It",
                    "_ab_source_file_last_modified": "2023-06-06T03:54:07.000000Z",
                    "_ab_source_file_url": "b.xlsx",
                },
                "stream": "stream1",
            },
            {
                "data": {
                    "col_double": 5791.5791,
                    "col_string": "The 1975",
                    "col_song": "About You",
                    "_ab_source_file_last_modified": "2023-06-06T03:54:07.000000Z",
                    "_ab_source_file_url": "b.xlsx",
                },
                "stream": "stream1",
            },
        ]
    )
    .set_expected_catalog(
        {
            "streams": [
                {
                    "default_cursor_field": ["_ab_source_file_last_modified"],
                    "json_schema": {
                        "type": "object",
                        "properties": {
                            "col_double": {"type": ["null", "number"]},
                            "col_string": {"type": ["null", "string"]},
                            "col_album": {"type": ["null", "string"]},
                            "col_song": {"type": ["null", "string"]},
                            "_ab_source_file_last_modified": {"type": "string"},
                            "_ab_source_file_url": {"type": "string"},
                        },
                    },
                    "name": "stream1",
                    "source_defined_cursor": True,
                    "supported_sync_modes": ["full_refresh", "incremental"],
                    "is_resumable": True,
                }
            ]
        }
    )
).build()
|
||||
|
||||
# Scenario: one row covering every value type in _excel_all_types_file. Expects the date to be
# emitted as an ISO date-time string, the time values as plain strings, and ints/floats typed "number".
excel_all_types_scenario = (
    TestScenarioBuilder()
    .set_name("excel_all_types_stream")
    .set_config(
        {
            "streams": [
                {
                    "name": "stream1",
                    "format": {"filetype": "excel"},
                    "globs": ["*"],
                    "validation_policy": "Emit Record",
                }
            ]
        }
    )
    .set_source_builder(
        FileBasedSourceBuilder()
        .set_stream_reader(TemporaryExcelFilesStreamReader(files=_excel_all_types_file, file_type="excel"))
        .set_file_type("excel")
    )
    .set_expected_records(
        [
            {
                "data": {
                    "col_bool": True,
                    "col_int": 27,
                    "col_long": 1992,
                    "col_float": 999.09723456,
                    "col_string": "Love It If We Made It",
                    "col_date": "2022-05-29T00:00:00",
                    "col_time_millis": "06:00:00.456000",
                    "col_time_micros": "12:00:00.456789",
                    "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z",
                    "_ab_source_file_url": "a.xlsx",
                },
                "stream": "stream1",
            },
        ]
    )
    .set_expected_catalog(
        {
            "streams": [
                {
                    "default_cursor_field": ["_ab_source_file_last_modified"],
                    "json_schema": {
                        "type": "object",
                        "properties": {
                            "col_bool": {"type": ["null", "boolean"]},
                            "col_int": {"type": ["null", "number"]},
                            "col_long": {"type": ["null", "number"]},
                            "col_float": {"type": ["null", "number"]},
                            "col_string": {"type": ["null", "string"]},
                            "col_date": {"format": "date-time", "type": ["null", "string"]},
                            "col_time_millis": {"type": ["null", "string"]},
                            "col_time_micros": {"type": ["null", "string"]},
                            "_ab_source_file_last_modified": {"type": "string"},
                            "_ab_source_file_url": {"type": "string"},
                        },
                    },
                    "name": "stream1",
                    "source_defined_cursor": True,
                    "supported_sync_modes": ["full_refresh", "incremental"],
                    "is_resumable": True,
                }
            ]
        }
    )
).build()
|
||||
|
||||
# Scenario: two streams, each selecting one workbook through its own glob. Expects each stream to
# emit only its file's rows; the nested col_location dict is expected back as its string form.
multiple_streams_excel_scenario = (
    TestScenarioBuilder()
    .set_name("multiple_streams_excel_stream")
    .set_config(
        {
            "streams": [
                {
                    "name": "songs_stream",
                    "format": {"filetype": "excel"},
                    "globs": ["*_songs.xlsx"],
                    "validation_policy": "Emit Record",
                },
                {
                    "name": "festivals_stream",
                    "format": {"filetype": "excel"},
                    "globs": ["*_festivals.xlsx"],
                    "validation_policy": "Emit Record",
                },
            ]
        }
    )
    .set_source_builder(
        FileBasedSourceBuilder()
        .set_stream_reader(TemporaryExcelFilesStreamReader(files=_multiple_excel_stream_file, file_type="excel"))
        .set_file_type("excel")
    )
    .set_expected_records(
        [
            {
                "data": {
                    "col_title": "Late Night",
                    "col_album": "A_MOMENT_APART",
                    "col_year": 2017,
                    "col_vocals": False,
                    "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z",
                    "_ab_source_file_url": "odesza_songs.xlsx",
                },
                "stream": "songs_stream",
            },
            {
                "data": {
                    "col_title": "White Lies",
                    "col_album": "IN_RETURN",
                    "col_year": 2014,
                    "col_vocals": True,
                    "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z",
                    "_ab_source_file_url": "odesza_songs.xlsx",
                },
                "stream": "songs_stream",
            },
            {
                "data": {
                    "col_title": "Wide Awake",
                    "col_album": "THE_LAST_GOODBYE",
                    "col_year": 2022,
                    "col_vocals": True,
                    "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z",
                    "_ab_source_file_url": "odesza_songs.xlsx",
                },
                "stream": "songs_stream",
            },
            {
                "data": {
                    "col_name": "Lightning in a Bottle",
                    "col_location": "{'country': 'USA', 'state': 'California', 'city': 'Buena Vista Lake'}",
                    "col_attendance": 18000,
                    "_ab_source_file_last_modified": "2023-06-06T03:54:07.000000Z",
                    "_ab_source_file_url": "california_festivals.xlsx",
                },
                "stream": "festivals_stream",
            },
            {
                "data": {
                    "col_name": "Outside Lands",
                    "col_location": "{'country': 'USA', 'state': 'California', 'city': 'San Francisco'}",
                    "col_attendance": 220000,
                    "_ab_source_file_last_modified": "2023-06-06T03:54:07.000000Z",
                    "_ab_source_file_url": "california_festivals.xlsx",
                },
                "stream": "festivals_stream",
            },
        ]
    )
    .set_expected_catalog(
        {
            "streams": [
                {
                    "default_cursor_field": ["_ab_source_file_last_modified"],
                    "json_schema": {
                        "type": "object",
                        "properties": {
                            "col_title": {"type": ["null", "string"]},
                            "col_album": {"type": ["null", "string"]},
                            "col_year": {"type": ["null", "number"]},
                            "col_vocals": {"type": ["null", "boolean"]},
                            "_ab_source_file_last_modified": {"type": "string"},
                            "_ab_source_file_url": {"type": "string"},
                        },
                    },
                    "name": "songs_stream",
                    "source_defined_cursor": True,
                    "supported_sync_modes": ["full_refresh", "incremental"],
                    "is_resumable": True,
                },
                {
                    "default_cursor_field": ["_ab_source_file_last_modified"],
                    "json_schema": {
                        "type": "object",
                        "properties": {
                            "col_name": {"type": ["null", "string"]},
                            "col_location": {"type": ["null", "string"]},
                            "col_attendance": {"type": ["null", "number"]},
                            "_ab_source_file_last_modified": {"type": "string"},
                            "_ab_source_file_url": {"type": "string"},
                        },
                    },
                    "name": "festivals_stream",
                    "source_defined_cursor": True,
                    "supported_sync_modes": ["full_refresh", "incremental"],
                    "is_resumable": True,
                },
            ]
        }
    )
).build()
|
||||
@@ -86,6 +86,12 @@ from unit_tests.sources.file_based.scenarios.csv_scenarios import (
|
||||
schemaless_with_user_input_schema_fails_connection_check_scenario,
|
||||
single_csv_scenario,
|
||||
)
|
||||
from unit_tests.sources.file_based.scenarios.excel_scenarios import (
|
||||
excel_all_types_scenario,
|
||||
multiple_excel_combine_schema_scenario,
|
||||
multiple_streams_excel_scenario,
|
||||
single_excel_scenario,
|
||||
)
|
||||
from unit_tests.sources.file_based.scenarios.incremental_scenarios import (
|
||||
multi_csv_different_timestamps_scenario,
|
||||
multi_csv_include_missing_files_within_history_range,
|
||||
@@ -232,6 +238,10 @@ discover_success_scenarios = [
|
||||
multiple_avro_combine_schema_scenario,
|
||||
multiple_streams_avro_scenario,
|
||||
avro_file_with_double_as_number_scenario,
|
||||
excel_all_types_scenario,
|
||||
multiple_excel_combine_schema_scenario,
|
||||
multiple_streams_excel_scenario,
|
||||
single_excel_scenario,
|
||||
csv_newline_in_values_not_quoted_scenario,
|
||||
csv_autogenerate_column_names_scenario,
|
||||
parquet_with_invalid_config_scenario,
|
||||
@@ -299,6 +309,7 @@ check_scenarios = [
|
||||
schemaless_with_user_input_schema_fails_connection_check_scenario,
|
||||
valid_single_stream_user_input_schema_scenario,
|
||||
single_avro_scenario,
|
||||
single_excel_scenario,
|
||||
earlier_csv_scenario,
|
||||
csv_no_files_scenario,
|
||||
]
|
||||
|
||||
Reference in New Issue
Block a user