
[file-based cdk] add excel file type support (#43346)

Serhii Lazebnyi
2024-08-14 15:05:15 +02:00
committed by GitHub
parent 7bb7a74129
commit aaaf12e055
11 changed files with 1712 additions and 758 deletions

View File

@@ -0,0 +1,17 @@
#
# Copyright (c) 2024 Airbyte, Inc., all rights reserved.
#

from airbyte_cdk.utils.oneof_option_config import OneOfOptionConfig
from pydantic.v1 import BaseModel, Field


class ExcelFormat(BaseModel):
    class Config(OneOfOptionConfig):
        title = "Excel Format"
        discriminator = "filetype"

    filetype: str = Field(
        "excel",
        const=True,
    )
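
A quick sketch of how this model behaves, assuming only the pydantic v1 API used above: `filetype` defaults to "excel", and `const=True` makes any other value a validation error, which is what lets it serve as the discriminator for the format union.

# Sketch, not part of the commit: const=True pins the discriminator value.
from pydantic.v1 import ValidationError

assert ExcelFormat().filetype == "excel"
try:
    ExcelFormat(filetype="csv")
except ValidationError:
    pass  # any filetype other than "excel" is rejected at validation time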

View File

@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
+# Copyright (c) 2024 Airbyte, Inc., all rights reserved.
 #

 from enum import Enum
@@ -7,6 +7,7 @@ from typing import Any, List, Mapping, Optional, Union
 from airbyte_cdk.sources.file_based.config.avro_format import AvroFormat
 from airbyte_cdk.sources.file_based.config.csv_format import CsvFormat
+from airbyte_cdk.sources.file_based.config.excel_format import ExcelFormat
 from airbyte_cdk.sources.file_based.config.jsonl_format import JsonlFormat
 from airbyte_cdk.sources.file_based.config.parquet_format import ParquetFormat
 from airbyte_cdk.sources.file_based.config.unstructured_format import UnstructuredFormat
@@ -55,7 +56,7 @@ class FileBasedStreamConfig(BaseModel):
         description="When the state history of the file store is full, syncs will only read files that were last modified in the provided day range.",
         default=3,
     )
-    format: Union[AvroFormat, CsvFormat, JsonlFormat, ParquetFormat, UnstructuredFormat] = Field(
+    format: Union[AvroFormat, CsvFormat, JsonlFormat, ParquetFormat, UnstructuredFormat, ExcelFormat] = Field(
         title="Format",
         description="The configuration options that are used to alter how to read incoming files that deviate from the standard formatting.",
     )
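
With ExcelFormat added to the union, a raw stream config whose `format.filetype` is "excel" should now validate into an ExcelFormat instance, since every other union member pins a different `filetype` constant. A minimal sketch, assuming the pydantic v1 `parse_obj` entry point and defaults for the remaining stream fields:

# Sketch, not part of the commit: the filetype discriminator selects ExcelFormat.
config = FileBasedStreamConfig.parse_obj({"name": "my_stream", "format": {"filetype": "excel"}})
assert isinstance(config.format, ExcelFormat)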

View File

@@ -1,6 +1,7 @@
 from typing import Any, Mapping, Type

 from airbyte_cdk.sources.file_based.config.avro_format import AvroFormat
+from airbyte_cdk.sources.file_based.config.excel_format import ExcelFormat
 from airbyte_cdk.sources.file_based.config.csv_format import CsvFormat
 from airbyte_cdk.sources.file_based.config.jsonl_format import JsonlFormat
 from airbyte_cdk.sources.file_based.config.parquet_format import ParquetFormat
@@ -8,6 +9,7 @@ from airbyte_cdk.sources.file_based.config.unstructured_format import Unstructur
 from .avro_parser import AvroParser
 from .csv_parser import CsvParser
+from .excel_parser import ExcelParser
 from .file_type_parser import FileTypeParser
 from .jsonl_parser import JsonlParser
 from .parquet_parser import ParquetParser
@@ -16,9 +18,10 @@ from .unstructured_parser import UnstructuredParser

 default_parsers: Mapping[Type[Any], FileTypeParser] = {
     AvroFormat: AvroParser(),
     CsvFormat: CsvParser(),
+    ExcelFormat: ExcelParser(),
     JsonlFormat: JsonlParser(),
     ParquetFormat: ParquetParser(),
     UnstructuredFormat: UnstructuredParser(),
 }

-__all__ = ["AvroParser", "CsvParser", "JsonlParser", "ParquetParser", "UnstructuredParser", "default_parsers"]
+__all__ = ["AvroParser", "CsvParser", "ExcelParser", "JsonlParser", "ParquetParser", "UnstructuredParser", "default_parsers"]

View File

@@ -0,0 +1,168 @@
#
# Copyright (c) 2024 Airbyte, Inc., all rights reserved.
#

import logging
from io import IOBase
from pathlib import Path
from typing import Any, Dict, Iterable, Mapping, Optional, Tuple, Union

import pandas as pd
from airbyte_cdk.sources.file_based.config.file_based_stream_config import ExcelFormat, FileBasedStreamConfig
from airbyte_cdk.sources.file_based.exceptions import ConfigValidationError, FileBasedSourceError, RecordParseError
from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader, FileReadMode
from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser
from airbyte_cdk.sources.file_based.remote_file import RemoteFile
from airbyte_cdk.sources.file_based.schema_helpers import SchemaType
from numpy import datetime64
from numpy import dtype as dtype_
from numpy import issubdtype
from pydantic.v1 import BaseModel


class ExcelParser(FileTypeParser):
    ENCODING = None

    def check_config(self, config: FileBasedStreamConfig) -> Tuple[bool, Optional[str]]:
        """
        ExcelParser does not require config checks; implicit pydantic validation is enough.
        """
        return True, None

    async def infer_schema(
        self,
        config: FileBasedStreamConfig,
        file: RemoteFile,
        stream_reader: AbstractFileBasedStreamReader,
        logger: logging.Logger,
    ) -> SchemaType:
        """
        Infers the schema of the Excel file by examining its contents.

        Args:
            config (FileBasedStreamConfig): Configuration for the file-based stream.
            file (RemoteFile): The remote file to be read.
            stream_reader (AbstractFileBasedStreamReader): Reader to read the file.
            logger (logging.Logger): Logger for logging information and errors.

        Returns:
            SchemaType: Inferred schema of the Excel file.
        """
        # Validate the format of the config
        self.validate_format(config.format, logger)

        fields: Dict[str, str] = {}

        with stream_reader.open_file(file, self.file_read_mode, self.ENCODING, logger) as fp:
            df = self.open_and_parse_file(fp)
            for column, df_type in df.dtypes.items():
                # Choose the broadest data type if the column's data type differs in dataframes
                prev_frame_column_type = fields.get(column)
                fields[column] = self.dtype_to_json_type(prev_frame_column_type, df_type)

        schema = {
            field: ({"type": "string", "format": "date-time"} if fields[field] == "date-time" else {"type": fields[field]})
            for field in fields
        }
        return schema

    def parse_records(
        self,
        config: FileBasedStreamConfig,
        file: RemoteFile,
        stream_reader: AbstractFileBasedStreamReader,
        logger: logging.Logger,
        discovered_schema: Optional[Mapping[str, SchemaType]] = None,
    ) -> Iterable[Dict[str, Any]]:
        """
        Parses records from an Excel file based on the provided configuration.

        Args:
            config (FileBasedStreamConfig): Configuration for the file-based stream.
            file (RemoteFile): The remote file to be read.
            stream_reader (AbstractFileBasedStreamReader): Reader to read the file.
            logger (logging.Logger): Logger for logging information and errors.
            discovered_schema (Optional[Mapping[str, SchemaType]]): Discovered schema for validation.

        Yields:
            Iterable[Dict[str, Any]]: Parsed records from the Excel file.
        """
        # Validate the format of the config
        self.validate_format(config.format, logger)

        try:
            # Open and parse the file using the stream reader
            with stream_reader.open_file(file, self.file_read_mode, self.ENCODING, logger) as fp:
                df = self.open_and_parse_file(fp)
                # Yield records as dictionaries
                yield from df.to_dict(orient="records")
        except Exception as exc:
            # Raise a RecordParseError if any exception occurs during parsing
            raise RecordParseError(FileBasedSourceError.ERROR_PARSING_RECORD, filename=file.uri) from exc

    @property
    def file_read_mode(self) -> FileReadMode:
        """
        Returns the file read mode for the Excel file.

        Returns:
            FileReadMode: The file read mode (binary).
        """
        return FileReadMode.READ_BINARY

    @staticmethod
    def dtype_to_json_type(current_type: Optional[str], dtype: dtype_) -> str:
        """
        Convert Pandas DataFrame types to Airbyte Types.

        Args:
            current_type (Optional[str]): One of the previous types based on earlier dataframes.
            dtype: Pandas DataFrame type.

        Returns:
            str: Corresponding Airbyte Type.
        """
        number_types = ("int64", "float64")
        if current_type == "string":
            # Previous column values were of the string type; no need to look further.
            return current_type
        if dtype == object:
            return "string"
        if dtype in number_types and (not current_type or current_type == "number"):
            return "number"
        if dtype == "bool" and (not current_type or current_type == "boolean"):
            return "boolean"
        if issubdtype(dtype, datetime64):
            return "date-time"
        return "string"

    @staticmethod
    def validate_format(excel_format: BaseModel, logger: logging.Logger) -> None:
        """
        Validates if the given format is of type ExcelFormat.

        Args:
            excel_format (BaseModel): The format to be validated.

        Raises:
            ConfigValidationError: If the format is not ExcelFormat.
        """
        if not isinstance(excel_format, ExcelFormat):
            logger.info(f"Expected ExcelFormat, got {excel_format}")
            raise ConfigValidationError(FileBasedSourceError.CONFIG_VALIDATION_ERROR)

    @staticmethod
    def open_and_parse_file(fp: Union[IOBase, str, Path]) -> pd.DataFrame:
        """
        Opens and parses the Excel file.

        Args:
            fp: File pointer to the Excel file.

        Returns:
            pd.DataFrame: Parsed data from the Excel file.
        """
        return pd.ExcelFile(fp, engine="calamine").parse()
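
The widening rules in `dtype_to_json_type` are easiest to see with concrete dtypes; a sketch of the behavior the method above implies when the same column shows up with different types across files:

# Sketch, not part of the commit: type widening across dataframes.
import numpy as np
from airbyte_cdk.sources.file_based.file_types.excel_parser import ExcelParser

assert ExcelParser.dtype_to_json_type(None, np.dtype("int64")) == "number"
assert ExcelParser.dtype_to_json_type("number", np.dtype("float64")) == "number"
# A column that was numeric in one file but textual in another widens to string...
assert ExcelParser.dtype_to_json_type("number", np.dtype("object")) == "string"
# ...and once a column is string, it stays string.
assert ExcelParser.dtype_to_json_type("string", np.dtype("datetime64[ns]")) == "string"
assert ExcelParser.dtype_to_json_type(None, np.dtype("datetime64[ns]")) == "date-time"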

File diff suppressed because it is too large

View File

@@ -50,13 +50,16 @@ langchain = { version = "0.1.16", optional = true }
 langchain_core = { version = "0.1.42", optional = true }
 markdown = { version = "*", optional = true }
 openai = { version = "0.27.9", extras = ["embeddings"], optional = true }
+pandas = { version = "2.2.0", optional = true }
 pdf2image = { version = "1.16.3", optional = true }
 "pdfminer.six" = { version = "20221105", optional = true }
 pyarrow = { version = "~15.0.0", optional = true }
 pytesseract = { version = "0.3.10", optional = true }
+python-calamine = { version = "0.2.3", optional = true }
 Sphinx = { version = "~4.2", optional = true }
 sphinx-rtd-theme = { version = "~1.0", optional = true }
 tiktoken = { version = "0.4.0", optional = true }
+nltk = { version = "3.8.1", optional = true }
 unstructured = { version = "0.10.27", extras = ["docx", "pptx"], optional = true }
 "unstructured.pytesseract" = { version = ">=0.3.12", optional = true }
 pyjwt = "^2.8.0"
@@ -66,7 +69,7 @@ pytz = "2024.1"
 [tool.poetry.group.dev.dependencies]
 freezegun = "*"
 mypy = "*"
-pandas = "2.0.3"
+asyncio = "3.4.3"
 poethepoet = "^0.24.2"
 pyproject-flake8 = "^6.1.0"
 pytest = "^7"
@@ -77,7 +80,7 @@ pytest-mock = "*"
 requests-mock = "*"

 [tool.poetry.extras]
-file-based = ["avro", "fastavro", "pyarrow", "unstructured", "pdf2image", "pdfminer.six", "unstructured.pytesseract", "pytesseract", "markdown"]
+file-based = ["avro", "fastavro", "pyarrow", "unstructured", "pdf2image", "pdfminer.six", "unstructured.pytesseract", "pytesseract", "markdown", "python-calamine", "pandas"]
 sphinx-docs = ["Sphinx", "sphinx-rtd-theme"]
 vector-db-based = ["langchain", "openai", "cohere", "tiktoken"]

View File

@@ -0,0 +1,120 @@
#
# Copyright (c) 2024 Airbyte, Inc., all rights reserved.
#

import datetime
from io import BytesIO
from unittest.mock import MagicMock, Mock, mock_open, patch

import pandas as pd
import pytest
from airbyte_cdk.sources.file_based.config.file_based_stream_config import ExcelFormat, FileBasedStreamConfig, ValidationPolicy
from airbyte_cdk.sources.file_based.exceptions import ConfigValidationError, RecordParseError
from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader
from airbyte_cdk.sources.file_based.file_types.excel_parser import ExcelParser
from airbyte_cdk.sources.file_based.remote_file import RemoteFile
from airbyte_cdk.sources.file_based.schema_helpers import SchemaType


@pytest.fixture
def mock_stream_reader():
    return Mock(spec=AbstractFileBasedStreamReader)


@pytest.fixture
def mock_logger():
    return Mock()


@pytest.fixture
def file_config():
    return FileBasedStreamConfig(
        name="test.xlsx",
        file_type="excel",
        format=ExcelFormat(sheet_name="Sheet1"),
        validation_policy=ValidationPolicy.emit_record,
    )


@pytest.fixture
def remote_file():
    return RemoteFile(uri="s3://mybucket/test.xlsx", last_modified=datetime.datetime.now())


@pytest.fixture
def setup_parser(remote_file):
    parser = ExcelParser()

    # Sample data for the mock Excel file
    data = pd.DataFrame(
        {
            "column1": [1, 2, 3],
            "column2": ["a", "b", "c"],
            "column3": [True, False, True],
            "column4": pd.to_datetime(["2021-01-01", "2022-01-01", "2023-01-01"]),
        }
    )

    # Convert the DataFrame to an Excel byte stream
    excel_bytes = BytesIO()
    with pd.ExcelWriter(excel_bytes, engine="xlsxwriter") as writer:
        data.to_excel(writer, index=False)
    excel_bytes.seek(0)

    # Mock the stream_reader's open_file method to return the Excel byte stream
    stream_reader = MagicMock(spec=AbstractFileBasedStreamReader)
    stream_reader.open_file.return_value = BytesIO(excel_bytes.read())

    return parser, FileBasedStreamConfig(name="test_stream", format=ExcelFormat()), remote_file, stream_reader, MagicMock(), data


@patch("pandas.ExcelFile")
@pytest.mark.asyncio
async def test_infer_schema(mock_excel_file, setup_parser):
    parser, config, file, stream_reader, logger, data = setup_parser

    # Mock the parse method of the pandas ExcelFile object
    mock_excel_file.return_value.parse.return_value = data

    # Call infer_schema
    schema = await parser.infer_schema(config, file, stream_reader, logger)

    # Define the expected schema
    expected_schema: SchemaType = {
        "column1": {"type": "number"},
        "column2": {"type": "string"},
        "column3": {"type": "boolean"},
        "column4": {"type": "string", "format": "date-time"},
    }

    # Validate the schema
    assert schema == expected_schema

    # Assert that the stream_reader's open_file was called correctly
    stream_reader.open_file.assert_called_once_with(file, parser.file_read_mode, parser.ENCODING, logger)

    # Assert that the logger was not used for warnings/errors
    logger.info.assert_not_called()
    logger.error.assert_not_called()


def test_invalid_format(mock_stream_reader, mock_logger, remote_file):
    parser = ExcelParser()
    invalid_config = FileBasedStreamConfig(
        name="test.xlsx",
        file_type="csv",
        format={"filetype": "csv"},
        validation_policy=ValidationPolicy.emit_record,
    )

    with pytest.raises(ConfigValidationError):
        list(parser.parse_records(invalid_config, remote_file, mock_stream_reader, mock_logger))


def test_file_read_error(mock_stream_reader, mock_logger, file_config, remote_file):
    parser = ExcelParser()

    with patch("builtins.open", mock_open(read_data=b"corrupted data")):
        with patch("pandas.ExcelFile") as mock_excel:
            mock_excel.return_value.parse.side_effect = ValueError("Failed to parse file")

            with pytest.raises(RecordParseError):
                list(parser.parse_records(file_config, remote_file, mock_stream_reader, mock_logger))

View File

@@ -209,3 +209,23 @@ class TemporaryAvroFilesStreamReader(InMemoryFilesStreamReader):
         file_writer.flush()
         fp.seek(0)
         return fp.read()
+
+
+class TemporaryExcelFilesStreamReader(InMemoryFilesStreamReader):
+    """
+    A file reader that writes RemoteFiles to a temporary file and then reads them back.
+    """
+
+    def open_file(self, file: RemoteFile, mode: FileReadMode, encoding: Optional[str], logger: logging.Logger) -> IOBase:
+        return io.BytesIO(self._make_file_contents(file.uri))
+
+    def _make_file_contents(self, file_name: str) -> bytes:
+        contents = self.files[file_name]["contents"]
+        df = pd.DataFrame(contents)
+
+        with io.BytesIO() as fp:
+            # Closing the writer flushes the workbook into the buffer
+            with pd.ExcelWriter(fp, engine="xlsxwriter") as writer:
+                df.to_excel(writer, index=False, sheet_name="Sheet1")
+            fp.seek(0)
+            return fp.read()
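
A round-trip sketch using the reader above (assuming the `files`/`file_type` constructor arguments shown in the scenario builders below):

# Sketch, not part of the commit: write an in-memory workbook and read it back.
import datetime
import logging

import pandas as pd
from airbyte_cdk.sources.file_based.file_based_stream_reader import FileReadMode
from airbyte_cdk.sources.file_based.remote_file import RemoteFile

reader = TemporaryExcelFilesStreamReader(
    files={"a.xlsx": {"contents": [{"col1": "val11"}], "last_modified": "2023-06-05T03:54:07.000Z"}},
    file_type="excel",
)
fp = reader.open_file(RemoteFile(uri="a.xlsx", last_modified=datetime.datetime.now()), FileReadMode.READ_BINARY, None, logging.getLogger())
assert pd.ExcelFile(fp).parse()["col1"][0] == "val11"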

View File

@@ -411,6 +411,14 @@ single_csv_scenario: TestScenario[InMemoryFilesSource] = (
"description": "Extract text from document formats (.pdf, .docx, .md, .pptx) and emit as one record per file.",
"required": ["filetype"],
},
{
"title": "Excel Format",
"type": "object",
"properties": {
"filetype": {"title": "Filetype", "default": "excel", "const": "excel", "type": "string"}
},
"required": ["filetype"]
}
],
},
"schemaless": {

View File

@@ -0,0 +1,424 @@
#
# Copyright (c) 2024 Airbyte, Inc., all rights reserved.
#

import datetime

from unit_tests.sources.file_based.in_memory_files_source import TemporaryExcelFilesStreamReader
from unit_tests.sources.file_based.scenarios.file_based_source_builder import FileBasedSourceBuilder
from unit_tests.sources.file_based.scenarios.scenario_builder import TestScenarioBuilder

_single_excel_file = {
    "a.xlsx": {
        "contents": [
            {"col1": "val11", "col2": "val12"},
            {"col1": "val21", "col2": "val22"},
        ],
        "last_modified": "2023-06-05T03:54:07.000Z",
    }
}

_multiple_excel_combine_schema_file = {
    "a.xlsx": {
        "contents": [
            {"col_double": 20.02, "col_string": "Robbers", "col_album": "The 1975"},
            {"col_double": 20.23, "col_string": "Somebody Else", "col_album": "I Like It When You Sleep, for You Are So Beautiful yet So Unaware of It"},
        ],
        "last_modified": "2023-06-05T03:54:07.000Z",
    },
    "b.xlsx": {
        "contents": [
            {"col_double": 1975.1975, "col_string": "It's Not Living (If It's Not with You)", "col_song": "Love It If We Made It"},
            {"col_double": 5791.5791, "col_string": "The 1975", "col_song": "About You"},
        ],
        "last_modified": "2023-06-06T03:54:07.000Z",
    },
}

_excel_all_types_file = {
    "a.xlsx": {
        "contents": [
            {
                "col_bool": True,
                "col_int": 27,
                "col_long": 1992,
                "col_float": 999.09723456,
                "col_string": "Love It If We Made It",
                "col_date": datetime.date(2022, 5, 29),
                "col_time_millis": datetime.time(6, 0, 0, 456000),
                "col_time_micros": datetime.time(12, 0, 0, 456789),
            }
        ],
        "last_modified": "2023-06-05T03:54:07.000Z",
    }
}

_multiple_excel_stream_file = {
    "odesza_songs.xlsx": {
        "contents": [
            {"col_title": "Late Night", "col_album": "A_MOMENT_APART", "col_year": 2017, "col_vocals": False},
            {"col_title": "White Lies", "col_album": "IN_RETURN", "col_year": 2014, "col_vocals": True},
            {"col_title": "Wide Awake", "col_album": "THE_LAST_GOODBYE", "col_year": 2022, "col_vocals": True},
        ],
        "last_modified": "2023-06-05T03:54:07.000Z",
    },
    "california_festivals.xlsx": {
        "contents": [
            {"col_name": "Lightning in a Bottle", "col_location": {"country": "USA", "state": "California", "city": "Buena Vista Lake"}, "col_attendance": 18000},
            {"col_name": "Outside Lands", "col_location": {"country": "USA", "state": "California", "city": "San Francisco"}, "col_attendance": 220000},
        ],
        "last_modified": "2023-06-06T03:54:07.000Z",
    },
}

single_excel_scenario = (
    TestScenarioBuilder()
    .set_name("single_excel_stream")
    .set_config(
        {
            "streams": [
                {
                    "name": "stream1",
                    "format": {"filetype": "excel"},
                    "globs": ["*"],
                    "validation_policy": "Emit Record",
                }
            ]
        }
    )
    .set_source_builder(
        FileBasedSourceBuilder()
        .set_stream_reader(TemporaryExcelFilesStreamReader(files=_single_excel_file, file_type="excel"))
        .set_file_type("excel")
    )
    .set_expected_check_status("SUCCEEDED")
    .set_expected_records(
        [
            {
                "data": {
                    "col1": "val11",
                    "col2": "val12",
                    "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z",
                    "_ab_source_file_url": "a.xlsx",
                },
                "stream": "stream1",
            },
            {
                "data": {
                    "col1": "val21",
                    "col2": "val22",
                    "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z",
                    "_ab_source_file_url": "a.xlsx",
                },
                "stream": "stream1",
            },
        ]
    )
    .set_expected_catalog(
        {
            "streams": [
                {
                    "default_cursor_field": ["_ab_source_file_last_modified"],
                    "json_schema": {
                        "type": "object",
                        "properties": {
                            "col1": {"type": ["null", "string"]},
                            "col2": {"type": ["null", "string"]},
                            "_ab_source_file_last_modified": {"type": "string"},
                            "_ab_source_file_url": {"type": "string"},
                        },
                    },
                    "name": "stream1",
                    "source_defined_cursor": True,
                    "supported_sync_modes": ["full_refresh", "incremental"],
                    "is_resumable": True,
                }
            ]
        }
    )
).build()

multiple_excel_combine_schema_scenario = (
    TestScenarioBuilder()
    .set_name("multiple_excel_combine_schema_stream")
    .set_config(
        {
            "streams": [
                {
                    "name": "stream1",
                    "format": {"filetype": "excel"},
                    "globs": ["*"],
                    "validation_policy": "Emit Record",
                }
            ]
        }
    )
    .set_source_builder(
        FileBasedSourceBuilder()
        .set_stream_reader(TemporaryExcelFilesStreamReader(files=_multiple_excel_combine_schema_file, file_type="excel"))
        .set_file_type("excel")
    )
    .set_expected_records(
        [
            {
                "data": {
                    "col_double": 20.02,
                    "col_string": "Robbers",
                    "col_album": "The 1975",
                    "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z",
                    "_ab_source_file_url": "a.xlsx",
                },
                "stream": "stream1",
            },
            {
                "data": {
                    "col_double": 20.23,
                    "col_string": "Somebody Else",
                    "col_album": "I Like It When You Sleep, for You Are So Beautiful yet So Unaware of It",
                    "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z",
                    "_ab_source_file_url": "a.xlsx",
                },
                "stream": "stream1",
            },
            {
                "data": {
                    "col_double": 1975.1975,
                    "col_string": "It's Not Living (If It's Not with You)",
                    "col_song": "Love It If We Made It",
                    "_ab_source_file_last_modified": "2023-06-06T03:54:07.000000Z",
                    "_ab_source_file_url": "b.xlsx",
                },
                "stream": "stream1",
            },
            {
                "data": {
                    "col_double": 5791.5791,
                    "col_string": "The 1975",
                    "col_song": "About You",
                    "_ab_source_file_last_modified": "2023-06-06T03:54:07.000000Z",
                    "_ab_source_file_url": "b.xlsx",
                },
                "stream": "stream1",
            },
        ]
    )
    .set_expected_catalog(
        {
            "streams": [
                {
                    "default_cursor_field": ["_ab_source_file_last_modified"],
                    "json_schema": {
                        "type": "object",
                        "properties": {
                            "col_double": {"type": ["null", "number"]},
                            "col_string": {"type": ["null", "string"]},
                            "col_album": {"type": ["null", "string"]},
                            "col_song": {"type": ["null", "string"]},
                            "_ab_source_file_last_modified": {"type": "string"},
                            "_ab_source_file_url": {"type": "string"},
                        },
                    },
                    "name": "stream1",
                    "source_defined_cursor": True,
                    "supported_sync_modes": ["full_refresh", "incremental"],
                    "is_resumable": True,
                }
            ]
        }
    )
).build()

excel_all_types_scenario = (
    TestScenarioBuilder()
    .set_name("excel_all_types_stream")
    .set_config(
        {
            "streams": [
                {
                    "name": "stream1",
                    "format": {"filetype": "excel"},
                    "globs": ["*"],
                    "validation_policy": "Emit Record",
                }
            ]
        }
    )
    .set_source_builder(
        FileBasedSourceBuilder()
        .set_stream_reader(TemporaryExcelFilesStreamReader(files=_excel_all_types_file, file_type="excel"))
        .set_file_type("excel")
    )
    .set_expected_records(
        [
            {
                "data": {
                    "col_bool": True,
                    "col_int": 27,
                    "col_long": 1992,
                    "col_float": 999.09723456,
                    "col_string": "Love It If We Made It",
                    "col_date": "2022-05-29T00:00:00",
                    "col_time_millis": "06:00:00.456000",
                    "col_time_micros": "12:00:00.456789",
                    "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z",
                    "_ab_source_file_url": "a.xlsx",
                },
                "stream": "stream1",
            },
        ]
    )
    .set_expected_catalog(
        {
            "streams": [
                {
                    "default_cursor_field": ["_ab_source_file_last_modified"],
                    "json_schema": {
                        "type": "object",
                        "properties": {
                            "col_bool": {"type": ["null", "boolean"]},
                            "col_int": {"type": ["null", "number"]},
                            "col_long": {"type": ["null", "number"]},
                            "col_float": {"type": ["null", "number"]},
                            "col_string": {"type": ["null", "string"]},
                            "col_date": {"format": "date-time", "type": ["null", "string"]},
                            "col_time_millis": {"type": ["null", "string"]},
                            "col_time_micros": {"type": ["null", "string"]},
                            "_ab_source_file_last_modified": {"type": "string"},
                            "_ab_source_file_url": {"type": "string"},
                        },
                    },
                    "name": "stream1",
                    "source_defined_cursor": True,
                    "supported_sync_modes": ["full_refresh", "incremental"],
                    "is_resumable": True,
                }
            ]
        }
    )
).build()

multiple_streams_excel_scenario = (
    TestScenarioBuilder()
    .set_name("multiple_streams_excel_stream")
    .set_config(
        {
            "streams": [
                {
                    "name": "songs_stream",
                    "format": {"filetype": "excel"},
                    "globs": ["*_songs.xlsx"],
                    "validation_policy": "Emit Record",
                },
                {
                    "name": "festivals_stream",
                    "format": {"filetype": "excel"},
                    "globs": ["*_festivals.xlsx"],
                    "validation_policy": "Emit Record",
                },
            ]
        }
    )
    .set_source_builder(
        FileBasedSourceBuilder()
        .set_stream_reader(TemporaryExcelFilesStreamReader(files=_multiple_excel_stream_file, file_type="excel"))
        .set_file_type("excel")
    )
    .set_expected_records(
        [
            {
                "data": {
                    "col_title": "Late Night",
                    "col_album": "A_MOMENT_APART",
                    "col_year": 2017,
                    "col_vocals": False,
                    "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z",
                    "_ab_source_file_url": "odesza_songs.xlsx",
                },
                "stream": "songs_stream",
            },
            {
                "data": {
                    "col_title": "White Lies",
                    "col_album": "IN_RETURN",
                    "col_year": 2014,
                    "col_vocals": True,
                    "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z",
                    "_ab_source_file_url": "odesza_songs.xlsx",
                },
                "stream": "songs_stream",
            },
            {
                "data": {
                    "col_title": "Wide Awake",
                    "col_album": "THE_LAST_GOODBYE",
                    "col_year": 2022,
                    "col_vocals": True,
                    "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z",
                    "_ab_source_file_url": "odesza_songs.xlsx",
                },
                "stream": "songs_stream",
            },
            {
                "data": {
                    "col_name": "Lightning in a Bottle",
                    "col_location": "{'country': 'USA', 'state': 'California', 'city': 'Buena Vista Lake'}",
                    "col_attendance": 18000,
                    "_ab_source_file_last_modified": "2023-06-06T03:54:07.000000Z",
                    "_ab_source_file_url": "california_festivals.xlsx",
                },
                "stream": "festivals_stream",
            },
            {
                "data": {
                    "col_name": "Outside Lands",
                    "col_location": "{'country': 'USA', 'state': 'California', 'city': 'San Francisco'}",
                    "col_attendance": 220000,
                    "_ab_source_file_last_modified": "2023-06-06T03:54:07.000000Z",
                    "_ab_source_file_url": "california_festivals.xlsx",
                },
                "stream": "festivals_stream",
            },
        ]
    )
    .set_expected_catalog(
        {
            "streams": [
                {
                    "default_cursor_field": ["_ab_source_file_last_modified"],
                    "json_schema": {
                        "type": "object",
                        "properties": {
                            "col_title": {"type": ["null", "string"]},
                            "col_album": {"type": ["null", "string"]},
                            "col_year": {"type": ["null", "number"]},
                            "col_vocals": {"type": ["null", "boolean"]},
                            "_ab_source_file_last_modified": {"type": "string"},
                            "_ab_source_file_url": {"type": "string"},
                        },
                    },
                    "name": "songs_stream",
                    "source_defined_cursor": True,
                    "supported_sync_modes": ["full_refresh", "incremental"],
                    "is_resumable": True,
                },
                {
                    "default_cursor_field": ["_ab_source_file_last_modified"],
                    "json_schema": {
                        "type": "object",
                        "properties": {
                            "col_name": {"type": ["null", "string"]},
                            "col_location": {"type": ["null", "string"]},
                            "col_attendance": {"type": ["null", "number"]},
                            "_ab_source_file_last_modified": {"type": "string"},
                            "_ab_source_file_url": {"type": "string"},
                        },
                    },
                    "name": "festivals_stream",
                    "source_defined_cursor": True,
                    "supported_sync_modes": ["full_refresh", "incremental"],
                    "is_resumable": True,
                },
            ]
        }
    )
).build()

View File

@@ -86,6 +86,12 @@ from unit_tests.sources.file_based.scenarios.csv_scenarios import (
     schemaless_with_user_input_schema_fails_connection_check_scenario,
     single_csv_scenario,
 )
+from unit_tests.sources.file_based.scenarios.excel_scenarios import (
+    excel_all_types_scenario,
+    multiple_excel_combine_schema_scenario,
+    multiple_streams_excel_scenario,
+    single_excel_scenario,
+)
 from unit_tests.sources.file_based.scenarios.incremental_scenarios import (
     multi_csv_different_timestamps_scenario,
     multi_csv_include_missing_files_within_history_range,
@@ -232,6 +238,10 @@ discover_success_scenarios = [
     multiple_avro_combine_schema_scenario,
     multiple_streams_avro_scenario,
     avro_file_with_double_as_number_scenario,
+    excel_all_types_scenario,
+    multiple_excel_combine_schema_scenario,
+    multiple_streams_excel_scenario,
+    single_excel_scenario,
     csv_newline_in_values_not_quoted_scenario,
     csv_autogenerate_column_names_scenario,
     parquet_with_invalid_config_scenario,
@@ -299,6 +309,7 @@ check_scenarios = [
     schemaless_with_user_input_schema_fails_connection_check_scenario,
     valid_single_stream_user_input_schema_scenario,
     single_avro_scenario,
+    single_excel_scenario,
     earlier_csv_scenario,
     csv_no_files_scenario,
 ]