[file-based cdk] add excel file type support (#43346)

2025-12-25 02:09:19 -05:00 · 2024-08-14 15:05:15 +02:00
parent 7bb7a74129
commit aaaf12e055
11 changed files with 1712 additions and 758 deletions
--- a/airbyte-cdk/python/airbyte_cdk/sources/file_based/config/excel_format.py
+++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/config/excel_format.py
@@ -0,0 +1,17 @@
+#
+# Copyright (c) 2024 Airbyte, Inc., all rights reserved.
+#
+
+from airbyte_cdk.utils.oneof_option_config import OneOfOptionConfig
+from pydantic.v1 import BaseModel, Field
+
+
+class ExcelFormat(BaseModel):  # Excel stream format option; it carries nothing but the fixed `filetype` marker
+    class Config(OneOfOptionConfig):
+        title = "Excel Format"
+        discriminator = "filetype"
+
+    filetype: str = Field(
+        "excel",
+        const=True,  # the value cannot be overridden; the expected spec renders it as "const": "excel"
+    )
--- a/airbyte-cdk/python/airbyte_cdk/sources/file_based/config/file_based_stream_config.py
+++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/config/file_based_stream_config.py
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
+# Copyright (c) 2024 Airbyte, Inc., all rights reserved.
 #

 from enum import Enum
@@ -7,6 +7,7 @@ from typing import Any, List, Mapping, Optional, Union

 from airbyte_cdk.sources.file_based.config.avro_format import AvroFormat
 from airbyte_cdk.sources.file_based.config.csv_format import CsvFormat
+from airbyte_cdk.sources.file_based.config.excel_format import ExcelFormat
 from airbyte_cdk.sources.file_based.config.jsonl_format import JsonlFormat
 from airbyte_cdk.sources.file_based.config.parquet_format import ParquetFormat
 from airbyte_cdk.sources.file_based.config.unstructured_format import UnstructuredFormat
@@ -55,7 +56,7 @@ class FileBasedStreamConfig(BaseModel):
        description="When the state history of the file store is full, syncs will only read files that were last modified in the provided day range.",
        default=3,
    )
-    format: Union[AvroFormat, CsvFormat, JsonlFormat, ParquetFormat, UnstructuredFormat] = Field(
+    format: Union[AvroFormat, CsvFormat, JsonlFormat, ParquetFormat, UnstructuredFormat, ExcelFormat] = Field(
        title="Format",
        description="The configuration options that are used to alter how to read incoming files that deviate from the standard formatting.",
    )
--- a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/init.py
+++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/init.py
@@ -1,6 +1,7 @@
 from typing import Any, Mapping, Type

 from airbyte_cdk.sources.file_based.config.avro_format import AvroFormat
+from airbyte_cdk.sources.file_based.config.excel_format import ExcelFormat
 from airbyte_cdk.sources.file_based.config.csv_format import CsvFormat
 from airbyte_cdk.sources.file_based.config.jsonl_format import JsonlFormat
 from airbyte_cdk.sources.file_based.config.parquet_format import ParquetFormat
@@ -8,6 +9,7 @@ from airbyte_cdk.sources.file_based.config.unstructured_format import Unstructur

 from .avro_parser import AvroParser
 from .csv_parser import CsvParser
+from .excel_parser import ExcelParser
 from .file_type_parser import FileTypeParser
 from .jsonl_parser import JsonlParser
 from .parquet_parser import ParquetParser
@@ -16,9 +18,10 @@ from .unstructured_parser import UnstructuredParser
 default_parsers: Mapping[Type[Any], FileTypeParser] = {
    AvroFormat: AvroParser(),
    CsvFormat: CsvParser(),
+    ExcelFormat: ExcelParser(),
    JsonlFormat: JsonlParser(),
    ParquetFormat: ParquetParser(),
    UnstructuredFormat: UnstructuredParser(),
 }

-__all__ = ["AvroParser", "CsvParser", "JsonlParser", "ParquetParser", "UnstructuredParser", "default_parsers"]
+__all__ = ["AvroParser", "CsvParser", "ExcelParser", "JsonlParser", "ParquetParser", "UnstructuredParser", "default_parsers"]
--- a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/excel_parser.py
+++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/excel_parser.py
@@ -0,0 +1,168 @@
+#
+# Copyright (c) 2024 Airbyte, Inc., all rights reserved.
+#
+
+import logging
+from io import IOBase
+from pathlib import Path
+from typing import Any, Dict, Iterable, Mapping, Optional, Tuple, Union
+
+import pandas as pd
+from airbyte_cdk.sources.file_based.config.file_based_stream_config import ExcelFormat, FileBasedStreamConfig
+from airbyte_cdk.sources.file_based.exceptions import ConfigValidationError, FileBasedSourceError, RecordParseError
+from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader, FileReadMode
+from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser
+from airbyte_cdk.sources.file_based.remote_file import RemoteFile
+from airbyte_cdk.sources.file_based.schema_helpers import SchemaType
+from numpy import datetime64
+from numpy import dtype as dtype_
+from numpy import issubdtype
+from pydantic.v1 import BaseModel
+
+
+class ExcelParser(FileTypeParser):
+    """File type parser that loads Excel workbooks into pandas DataFrames using the calamine engine."""
+
+    ENCODING = None
+
+    def check_config(self, config: FileBasedStreamConfig) -> Tuple[bool, Optional[str]]:
+        """ExcelParser does not require config checks, implicit pydantic validation is enough."""
+        return True, None
+
+    async def infer_schema(
+        self,
+        config: FileBasedStreamConfig,
+        file: RemoteFile,
+        stream_reader: AbstractFileBasedStreamReader,
+        logger: logging.Logger,
+    ) -> SchemaType:
+        """
+        Infers the schema of the Excel file by examining its contents.
+
+        Args:
+            config (FileBasedStreamConfig): Configuration for the file-based stream.
+            file (RemoteFile): The remote file to be read.
+            stream_reader (AbstractFileBasedStreamReader): Reader to read the file.
+            logger (logging.Logger): Logger for logging information and errors.
+
+        Returns:
+            SchemaType: Mapping of column name to its JSON schema type definition.
+
+        Raises:
+            ConfigValidationError: If config.format is not an ExcelFormat.
+        """
+        self.validate_format(config.format, logger)
+
+        fields: Dict[str, str] = {}
+
+        with stream_reader.open_file(file, self.file_read_mode, self.ENCODING, logger) as fp:
+            df = self.open_and_parse_file(fp)
+            for column, df_type in df.dtypes.items():
+                # A column seen for the first time has no previous type, so the lookup yields None.
+                prev_frame_column_type = fields.get(column)
+                fields[column] = self.dtype_to_json_type(prev_frame_column_type, df_type)
+
+        schema = {
+            field: ({"type": "string", "format": "date-time"} if fields[field] == "date-time" else {"type": fields[field]})
+            for field in fields
+        }
+        return schema
+
+    def parse_records(
+        self,
+        config: FileBasedStreamConfig,
+        file: RemoteFile,
+        stream_reader: AbstractFileBasedStreamReader,
+        logger: logging.Logger,
+        discovered_schema: Optional[Mapping[str, SchemaType]] = None,
+    ) -> Iterable[Dict[str, Any]]:
+        """
+        Parses records from an Excel file based on the provided configuration.
+
+        Args:
+            config (FileBasedStreamConfig): Configuration for the file-based stream.
+            file (RemoteFile): The remote file to be read.
+            stream_reader (AbstractFileBasedStreamReader): Reader to read the file.
+            logger (logging.Logger): Logger for logging information and errors.
+            discovered_schema (Optional[Mapping[str, SchemaType]]): Discovered schema; not used by this parser.
+
+        Yields:
+            Dict[str, Any]: One dictionary per row of the parsed dataframe, keyed by column name.
+
+        Raises:
+            ConfigValidationError: If config.format is not an ExcelFormat (this is a generator, so on first iteration).
+            RecordParseError: If opening the file, parsing it or converting it to records raises any exception.
+        """
+        self.validate_format(config.format, logger)
+
+        try:
+            with stream_reader.open_file(file, self.file_read_mode, self.ENCODING, logger) as fp:
+                df = self.open_and_parse_file(fp)
+                yield from df.to_dict(orient="records")
+        except Exception as exc:
+            raise RecordParseError(FileBasedSourceError.ERROR_PARSING_RECORD, filename=file.uri) from exc
+
+    @property
+    def file_read_mode(self) -> FileReadMode:
+        """Returns the file read mode for the Excel file (binary)."""
+        return FileReadMode.READ_BINARY
+
+    @staticmethod
+    def dtype_to_json_type(current_type: Optional[str], dtype: dtype_) -> str:
+        """
+        Convert Pandas DataFrame types to Airbyte Types.
+
+        Args:
+            current_type (Optional[str]): Type already recorded for the column, or None when there is none.
+            dtype: Pandas DataFrame type.
+
+        Returns:
+            str: Corresponding Airbyte Type: "string", "number", "boolean" or "date-time".
+        """
+        number_types = ("int64", "float64")
+        if current_type == "string":
+            # Previous column values were of the string type, no need to look further.
+            return current_type
+        if dtype == object:
+            return "string"
+        if dtype in number_types and (not current_type or current_type == "number"):
+            return "number"
+        if dtype == "bool" and (not current_type or current_type == "boolean"):
+            return "boolean"
+        if issubdtype(dtype, datetime64):
+            return "date-time"
+        return "string"
+
+    @staticmethod
+    def validate_format(excel_format: BaseModel, logger: logging.Logger) -> None:
+        """
+        Validates if the given format is of type ExcelFormat.
+
+        Args:
+            excel_format (BaseModel): The format to be validated.
+            logger (logging.Logger): Logger that records the unexpected format before the error is raised.
+
+        Raises:
+            ConfigValidationError: If the format is not ExcelFormat.
+        """
+        if not isinstance(excel_format, ExcelFormat):
+            logger.info(f"Expected ExcelFormat, got {excel_format}")
+            raise ConfigValidationError(FileBasedSourceError.CONFIG_VALIDATION_ERROR)
+
+    @staticmethod
+    def open_and_parse_file(fp: Union[IOBase, str, Path]) -> pd.DataFrame:
+        """
+        Opens the Excel file with the calamine engine and parses it with pandas' default parse() arguments.
+
+        Args:
+            fp: File pointer (or path) to the Excel file.
+
+        Returns:
+            pd.DataFrame: Parsed data from the Excel file.
+        """
+        excel_file = pd.ExcelFile(fp, engine="calamine")
+        try:
+            return excel_file.parse()
+        finally:
+            # Close the workbook reader even when parsing raises, instead of leaving it to garbage collection.
+            excel_file.close()
--- a/airbyte-cdk/python/poetry.lock
+++ b/airbyte-cdk/python/poetry.lock
--- a/airbyte-cdk/python/pyproject.toml
+++ b/airbyte-cdk/python/pyproject.toml
@@ -50,13 +50,16 @@ langchain = { version = "0.1.16", optional = true }
 langchain_core = { version = "0.1.42", optional = true }
 markdown = { version = "*", optional = true }
 openai = { version = "0.27.9", extras = ["embeddings"], optional = true }
+pandas = { version = "2.2.0", optional = true }
 pdf2image = { version = "1.16.3", optional = true }
 "pdfminer.six" = { version = "20221105", optional = true }
 pyarrow = { version = "~15.0.0", optional = true }
 pytesseract = { version = "0.3.10", optional = true }
+python-calamine = { version = "0.2.3", optional = true }
 Sphinx = { version = "~4.2", optional = true }
 sphinx-rtd-theme = { version = "~1.0", optional = true }
 tiktoken = { version = "0.4.0", optional = true }
+nltk = { version = "3.8.1", optional = true }
 unstructured = { version = "0.10.27", extras = ["docx", "pptx"], optional = true }
 "unstructured.pytesseract" = { version = ">=0.3.12", optional = true }
 pyjwt = "^2.8.0"
@@ -66,7 +69,7 @@ pytz = "2024.1"
 [tool.poetry.group.dev.dependencies]
 freezegun = "*"
 mypy = "*"
-pandas = "2.0.3"
+asyncio = "3.4.3"
 poethepoet = "^0.24.2"
 pyproject-flake8 = "^6.1.0"
 pytest = "^7"
@@ -77,7 +80,7 @@ pytest-mock = "*"
 requests-mock = "*"

 [tool.poetry.extras]
-file-based = ["avro", "fastavro", "pyarrow", "unstructured", "pdf2image", "pdfminer.six", "unstructured.pytesseract", "pytesseract", "markdown"]
+file-based = ["avro", "fastavro", "pyarrow", "unstructured", "pdf2image", "pdfminer.six", "unstructured.pytesseract", "pytesseract", "markdown", "python-calamine", "pandas"]
 sphinx-docs = ["Sphinx", "sphinx-rtd-theme"]
 vector-db-based = ["langchain", "openai", "cohere", "tiktoken"]

--- a/airbyte-cdk/python/unit_tests/sources/file_based/file_types/test_excel_parser.py
+++ b/airbyte-cdk/python/unit_tests/sources/file_based/file_types/test_excel_parser.py
@@ -0,0 +1,120 @@
+#
+# Copyright (c) 2024 Airbyte, Inc., all rights reserved.
+#
+
+
+import datetime
+from io import BytesIO
+from unittest.mock import MagicMock, Mock, mock_open, patch
+
+import pandas as pd
+import pytest
+from airbyte_cdk.sources.file_based.config.file_based_stream_config import ExcelFormat, FileBasedStreamConfig, ValidationPolicy
+from airbyte_cdk.sources.file_based.exceptions import ConfigValidationError, RecordParseError
+from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader
+from airbyte_cdk.sources.file_based.file_types.excel_parser import ExcelParser
+from airbyte_cdk.sources.file_based.remote_file import RemoteFile
+from airbyte_cdk.sources.file_based.schema_helpers import SchemaType
+
+
+@pytest.fixture
+def mock_stream_reader():  # non-magic Mock limited to the attribute set of AbstractFileBasedStreamReader
+    return Mock(spec=AbstractFileBasedStreamReader)
+
+
+@pytest.fixture
+def mock_logger():  # stand-in logger that records calls so tests can assert on them
+    return Mock()
+
+
+@pytest.fixture
+def file_config():
+    return FileBasedStreamConfig(
+        name="test.xlsx",
+        file_type="excel",
+        format=ExcelFormat(),  # ExcelFormat declares no field besides `filetype`, so no sheet name can be configured
+        validation_policy=ValidationPolicy.emit_record,
+    )
+
+
+@pytest.fixture
+def remote_file():  # RemoteFile pointing at a made-up S3 URI, stamped with the current local time
+    return RemoteFile(uri="s3://mybucket/test.xlsx", last_modified=datetime.datetime.now())
+
+
+@pytest.fixture
+def setup_parser(remote_file):
+    """Bundle a parser, a stream config, the remote file, a stubbed reader, a logger double and the source frame."""
+
+    # One column per dtype family handled by the parser: integer, text, boolean and datetime.
+    frame = pd.DataFrame({
+        "column1": [1, 2, 3],
+        "column2": ["a", "b", "c"],
+        "column3": [True, False, True],
+        "column4": pd.to_datetime(["2021-01-01", "2022-01-01", "2023-01-01"]),
+    })
+
+    # Serialize the frame into an in-memory .xlsx workbook.
+    workbook = BytesIO()
+    with pd.ExcelWriter(workbook, engine="xlsxwriter") as xlsx_writer:
+        frame.to_excel(xlsx_writer, index=False)
+
+    # The stubbed reader hands out a separate buffer holding the workbook bytes.
+    reader = MagicMock(spec=AbstractFileBasedStreamReader)
+    reader.open_file.return_value = BytesIO(workbook.getvalue())
+
+    stream_config = FileBasedStreamConfig(name="test_stream", format=ExcelFormat())
+    return ExcelParser(), stream_config, remote_file, reader, MagicMock(), frame
+
+
+@patch("pandas.ExcelFile")
+@pytest.mark.asyncio
+async def test_infer_schema(mock_excel_file, setup_parser):
+    """Each dtype family of the fixture frame maps to the expected JSON schema type."""
+
+    parser, config, file, stream_reader, logger, data = setup_parser
+
+    # pandas.ExcelFile is patched, so the parser receives the fixture frame from the mocked parse() call.
+    mock_excel_file.return_value.parse.return_value = data
+
+    inferred = await parser.infer_schema(config, file, stream_reader, logger)
+
+    expected: SchemaType = {
+        "column1": {"type": "number"},
+        "column2": {"type": "string"},
+        "column3": {"type": "boolean"},
+        "column4": {"type": "string", "format": "date-time"},
+    }
+
+    assert inferred == expected
+
+    # The file must be opened exactly once, with the parser's own read mode and encoding.
+    stream_reader.open_file.assert_called_once_with(file, parser.file_read_mode, parser.ENCODING, logger)
+
+    # Inference on a valid ExcelFormat config must not log at info or error level.
+    logger.info.assert_not_called()
+    logger.error.assert_not_called()
+    logger.error.assert_not_called()
+
+
+def test_invalid_format(mock_stream_reader, mock_logger, remote_file):
+    """A stream configured with a non-Excel format is rejected once the record generator is consumed."""
+    csv_config = FileBasedStreamConfig(
+        name="test.xlsx",
+        file_type="csv",
+        format={"filetype": "csv"},
+        validation_policy=ValidationPolicy.emit_record,
+    )
+    records = ExcelParser().parse_records(csv_config, remote_file, mock_stream_reader, mock_logger)
+    with pytest.raises(ConfigValidationError):
+        list(records)
+
+
+def test_file_read_error(mock_stream_reader, mock_logger, file_config, remote_file):
+    # open_file must return a context manager; a bare Mock return value would fail in `with` before parse() runs.
+    mock_stream_reader.open_file = mock_open(read_data=b"corrupted data")
+    with patch("pandas.ExcelFile") as mock_excel:
+        mock_excel.return_value.parse.side_effect = ValueError("Failed to parse file")
+        with pytest.raises(RecordParseError):
+            list(ExcelParser().parse_records(file_config, remote_file, mock_stream_reader, mock_logger))
+        mock_excel.return_value.parse.assert_called_once()
--- a/airbyte-cdk/python/unit_tests/sources/file_based/in_memory_files_source.py
+++ b/airbyte-cdk/python/unit_tests/sources/file_based/in_memory_files_source.py
@@ -209,3 +209,23 @@ class TemporaryAvroFilesStreamReader(InMemoryFilesStreamReader):
            file_writer.flush()
            fp.seek(0)
            return fp.read()
+
+
+class TemporaryExcelFilesStreamReader(InMemoryFilesStreamReader):
+    """
+    A file reader that serializes the configured in-memory contents to an Excel workbook and serves the resulting bytes.
+    """
+
+    def open_file(self, file: RemoteFile, mode: FileReadMode, encoding: Optional[str], logger: logging.Logger) -> IOBase:
+        return io.BytesIO(self._make_file_contents(file.uri))
+
+    def _make_file_contents(self, file_name: str) -> bytes:
+        contents = self.files[file_name]["contents"]
+        df = pd.DataFrame(contents)
+
+        with io.BytesIO() as fp:
+            # Leaving the writer context saves the workbook through the public API instead of the private `_save()`.
+            with pd.ExcelWriter(fp, engine="xlsxwriter") as writer:
+                df.to_excel(writer, index=False, sheet_name="Sheet1")
+            fp.seek(0)
+            return fp.read()
--- a/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/csv_scenarios.py
+++ b/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/csv_scenarios.py
@@ -411,6 +411,14 @@ single_csv_scenario: TestScenario[InMemoryFilesSource] = (
                                            "description": "Extract text from document formats (.pdf, .docx, .md, .pptx) and emit as one record per file.",
                                            "required": ["filetype"],
                                        },
+                                        {
+                                            "title": "Excel Format",
+                                            "type": "object",
+                                            "properties": {
+                                                "filetype": {"title": "Filetype", "default": "excel", "const": "excel", "type": "string"}
+                                            },
+                                            "required": ["filetype"]
+                                        }
                                    ],
                                },
                                "schemaless": {
--- a/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/excel_scenarios.py
+++ b/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/excel_scenarios.py
@@ -0,0 +1,424 @@
+#
+# Copyright (c) 2024 Airbyte, Inc., all rights reserved.
+#
+
+import datetime
+
+from unit_tests.sources.file_based.in_memory_files_source import TemporaryExcelFilesStreamReader
+from unit_tests.sources.file_based.scenarios.file_based_source_builder import FileBasedSourceBuilder
+from unit_tests.sources.file_based.scenarios.scenario_builder import TestScenarioBuilder
+
+_single_excel_file = {  # one workbook: two string columns, two rows
+    "a.xlsx": {
+        "contents": [
+            {"col1": "val11", "col2": "val12"},
+            {"col1": "val21", "col2": "val22"},
+        ],
+        "last_modified": "2023-06-05T03:54:07.000Z",
+    }
+}
+
+_multiple_excel_combine_schema_file = {  # two workbooks sharing col_double/col_string; col_album and col_song each occur in one
+    "a.xlsx": {
+        "contents": [
+            {"col_double": 20.02, "col_string": "Robbers", "col_album": "The 1975"},
+            {"col_double": 20.23, "col_string": "Somebody Else", "col_album": "I Like It When You Sleep, for You Are So Beautiful yet So Unaware of It"},
+        ],
+        "last_modified": "2023-06-05T03:54:07.000Z",
+    },
+    "b.xlsx": {
+        "contents": [
+            {"col_double": 1975.1975, "col_string": "It's Not Living (If It's Not with You)", "col_song": "Love It If We Made It"},
+            {"col_double": 5791.5791, "col_string": "The 1975", "col_song": "About You"},
+        ],
+        "last_modified": "2023-06-06T03:54:07.000Z",
+    },
+}
+
+_excel_all_types_file = {  # one workbook with a single row mixing bool, int, float, str, date and time values
+    "a.xlsx": {
+        "contents": [
+            {
+                "col_bool": True,
+                "col_int": 27,
+                "col_long": 1992,
+                "col_float": 999.09723456,
+                "col_string": "Love It If We Made It",
+                "col_date": datetime.date(2022, 5, 29),
+                "col_time_millis": datetime.time(6, 0, 0, 456000),
+                "col_time_micros": datetime.time(12, 0, 0, 456789),
+            }
+        ],
+        "last_modified": "2023-06-05T03:54:07.000Z",
+    }
+}
+
+_multiple_excel_stream_file = {  # two workbooks with unrelated columns; col_location holds nested dicts
+    "odesza_songs.xlsx": {
+        "contents": [
+            {"col_title": "Late Night", "col_album": "A_MOMENT_APART", "col_year": 2017, "col_vocals": False},
+            {"col_title": "White Lies", "col_album": "IN_RETURN", "col_year": 2014, "col_vocals": True},
+            {"col_title": "Wide Awake", "col_album": "THE_LAST_GOODBYE", "col_year": 2022, "col_vocals": True},
+        ],
+        "last_modified": "2023-06-05T03:54:07.000Z"
+    },
+    "california_festivals.xlsx": {
+        "contents": [
+            {"col_name": "Lightning in a Bottle", "col_location": {"country": "USA", "state": "California", "city": "Buena Vista Lake"}, "col_attendance": 18000},
+            {"col_name": "Outside Lands", "col_location": {"country": "USA", "state": "California", "city": "San Francisco"}, "col_attendance": 220000},
+        ],
+        "last_modified": "2023-06-06T03:54:07.000Z"
+    },
+}
+
+single_excel_scenario = (  # one stream over one workbook; the only Excel scenario that also sets an expected check status
+    TestScenarioBuilder()
+    .set_name("single_excel_stream")
+    .set_config(
+        {
+            "streams": [
+                {
+                    "name": "stream1",
+                    "format": {"filetype": "excel"},
+                    "globs": ["*"],
+                    "validation_policy": "Emit Record",
+                }
+            ]
+        }
+    )
+    .set_source_builder(
+        FileBasedSourceBuilder()
+        .set_stream_reader(TemporaryExcelFilesStreamReader(files=_single_excel_file, file_type="excel"))
+        .set_file_type("excel")
+    )
+    .set_expected_check_status("SUCCEEDED")
+    .set_expected_records(
+        [
+            {
+                "data": {
+                    "col1": "val11",
+                    "col2": "val12",
+                    "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z",
+                    "_ab_source_file_url": "a.xlsx",
+                },
+                "stream": "stream1",
+            },
+            {
+                "data": {
+                    "col1": "val21",
+                    "col2": "val22",
+                    "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z",
+                    "_ab_source_file_url": "a.xlsx",
+                },
+                "stream": "stream1",
+            },
+        ]
+    )
+    .set_expected_catalog(
+        {
+            "streams": [
+                {
+                    "default_cursor_field": ["_ab_source_file_last_modified"],
+                    "json_schema": {
+                        "type": "object",
+                        "properties": {
+                            "col1": {"type": ["null", "string"]},
+                            "col2": {"type": ["null", "string"]},
+                            "_ab_source_file_last_modified": {"type": "string"},
+                            "_ab_source_file_url": {"type": "string"},
+                        },
+                    },
+                    "name": "stream1",
+                    "source_defined_cursor": True,
+                    "supported_sync_modes": ["full_refresh", "incremental"],
+                    "is_resumable": True,
+                }
+            ]
+        }
+    )
+).build()
+
+multiple_excel_combine_schema_scenario = (  # one stream over two workbooks whose column sets only partly overlap
+    TestScenarioBuilder()
+    .set_name("multiple_excel_combine_schema_stream")
+    .set_config(
+        {
+            "streams": [
+                {
+                    "name": "stream1",
+                    "format": {"filetype": "excel"},
+                    "globs": ["*"],
+                    "validation_policy": "Emit Record",
+                }
+            ]
+        }
+    )
+    .set_source_builder(
+        FileBasedSourceBuilder()
+        .set_stream_reader(TemporaryExcelFilesStreamReader(files=_multiple_excel_combine_schema_file, file_type="excel"))
+        .set_file_type("excel")
+    )
+    .set_expected_records(
+        [
+            {
+                "data": {
+                    "col_double": 20.02,
+                    "col_string": "Robbers",
+                    "col_album": "The 1975",
+                    "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z",
+                    "_ab_source_file_url": "a.xlsx",
+                },
+                "stream": "stream1",
+            },
+            {
+                "data": {
+                    "col_double": 20.23,
+                    "col_string": "Somebody Else",
+                    "col_album": "I Like It When You Sleep, for You Are So Beautiful yet So Unaware of It",
+                    "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z",
+                    "_ab_source_file_url": "a.xlsx",
+                },
+                "stream": "stream1",
+            },
+            {
+                "data": {
+                    "col_double": 1975.1975,
+                    "col_string": "It's Not Living (If It's Not with You)",
+                    "col_song": "Love It If We Made It",
+                    "_ab_source_file_last_modified": "2023-06-06T03:54:07.000000Z",
+                    "_ab_source_file_url": "b.xlsx",
+                },
+                "stream": "stream1",
+            },
+            {
+                "data": {
+                    "col_double": 5791.5791,
+                    "col_string": "The 1975",
+                    "col_song": "About You",
+                    "_ab_source_file_last_modified": "2023-06-06T03:54:07.000000Z",
+                    "_ab_source_file_url": "b.xlsx",
+                },
+                "stream": "stream1",
+            },
+        ]
+    )
+    .set_expected_catalog(
+        {
+            "streams": [
+                {
+                    "default_cursor_field": ["_ab_source_file_last_modified"],
+                    "json_schema": {
+                        "type": "object",
+                        "properties": {
+                            "col_double": {"type": ["null", "number"]},
+                            "col_string": {"type": ["null", "string"]},
+                            "col_album": {"type": ["null", "string"]},  # column only present in a.xlsx
+                            "col_song": {"type": ["null", "string"]},  # column only present in b.xlsx
+                            "_ab_source_file_last_modified": {"type": "string"},
+                            "_ab_source_file_url": {"type": "string"},
+                        },
+                    },
+                    "name": "stream1",
+                    "source_defined_cursor": True,
+                    "supported_sync_modes": ["full_refresh", "incremental"],
+                    "is_resumable": True,
+                }
+            ]
+        }
+    )
+).build()
+
+excel_all_types_scenario = (  # one stream over a single-row workbook that mixes bool, numeric, text, date and time cells
+    TestScenarioBuilder()
+    .set_name("excel_all_types_stream")
+    .set_config(
+        {
+            "streams": [
+                {
+                    "name": "stream1",
+                    "format": {"filetype": "excel"},
+                    "globs": ["*"],
+                    "validation_policy": "Emit Record",
+                }
+            ]
+        }
+    )
+    .set_source_builder(
+        FileBasedSourceBuilder()
+        .set_stream_reader(TemporaryExcelFilesStreamReader(files=_excel_all_types_file, file_type="excel"))
+        .set_file_type("excel")
+    )
+    .set_expected_records(
+        [
+            {
+                "data": {
+                    "col_bool": True,
+                    "col_int": 27,
+                    "col_long": 1992,
+                    "col_float": 999.09723456,
+                    "col_string": "Love It If We Made It",
+                    "col_date": "2022-05-29T00:00:00",  # the datetime.date input is expected back as a midnight timestamp string
+                    "col_time_millis": "06:00:00.456000",  # datetime.time inputs are expected back as plain strings
+                    "col_time_micros": "12:00:00.456789",
+                    "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z",
+                    "_ab_source_file_url": "a.xlsx",
+                },
+                "stream": "stream1",
+            },
+        ]
+    )
+    .set_expected_catalog(
+        {
+            "streams": [
+                {
+                    "default_cursor_field": ["_ab_source_file_last_modified"],
+                    "json_schema": {
+                        "type": "object",
+                        "properties": {
+                            "col_bool": {"type": ["null", "boolean"]},
+                            "col_int": {"type": ["null", "number"]},  # integer columns are expected as "number", like floats
+                            "col_long": {"type": ["null", "number"]},
+                            "col_float": {"type": ["null", "number"]},
+                            "col_string": {"type": ["null", "string"]},
+                            "col_date": {"format": "date-time", "type": ["null", "string"]},
+                            "col_time_millis": {"type": ["null", "string"]},
+                            "col_time_micros": {"type": ["null", "string"]},
+                            "_ab_source_file_last_modified": {"type": "string"},
+                            "_ab_source_file_url": {"type": "string"},
+                        },
+                    },
+                    "name": "stream1",
+                    "source_defined_cursor": True,
+                    "supported_sync_modes": ["full_refresh", "incremental"],
+                    "is_resumable": True,
+                }
+            ]
+        }
+    )
+).build()
+
+multiple_streams_excel_scenario = (  # two streams, each selecting its own workbook through a different glob
+    TestScenarioBuilder()
+    .set_name("multiple_streams_excel_stream")
+    .set_config(
+        {
+            "streams": [
+                {
+                    "name": "songs_stream",
+                    "format": {"filetype": "excel"},
+                    "globs": ["*_songs.xlsx"],
+                    "validation_policy": "Emit Record",
+                },
+                {
+                    "name": "festivals_stream",
+                    "format": {"filetype": "excel"},
+                    "globs": ["*_festivals.xlsx"],
+                    "validation_policy": "Emit Record",
+                },
+            ]
+        }
+    )
+    .set_source_builder(
+        FileBasedSourceBuilder()
+        .set_stream_reader(TemporaryExcelFilesStreamReader(files=_multiple_excel_stream_file, file_type="excel"))
+        .set_file_type("excel")
+    )
+    .set_expected_records(
+        [
+            {
+                "data": {
+                    "col_title": "Late Night",
+                    "col_album": "A_MOMENT_APART",
+                    "col_year": 2017,
+                    "col_vocals": False,
+                    "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z",
+                    "_ab_source_file_url": "odesza_songs.xlsx",
+                },
+                "stream": "songs_stream",
+            },
+            {
+                "data": {
+                    "col_title": "White Lies",
+                    "col_album": "IN_RETURN",
+                    "col_year": 2014,
+                    "col_vocals": True,
+                    "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z",
+                    "_ab_source_file_url": "odesza_songs.xlsx",
+                },
+                "stream": "songs_stream",
+            },
+            {
+                "data": {
+                    "col_title": "Wide Awake",
+                    "col_album": "THE_LAST_GOODBYE",
+                    "col_year": 2022,
+                    "col_vocals": True,
+                    "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z",
+                    "_ab_source_file_url": "odesza_songs.xlsx",
+                },
+                "stream": "songs_stream",
+            },
+            {
+                "data": {
+                    "col_name": "Lightning in a Bottle",
+                    "col_location": "{'country': 'USA', 'state': 'California', 'city': 'Buena Vista Lake'}",  # nested dict expected back as its Python repr string
+                    "col_attendance": 18000,
+                    "_ab_source_file_last_modified": "2023-06-06T03:54:07.000000Z",
+                    "_ab_source_file_url": "california_festivals.xlsx",
+                },
+                "stream": "festivals_stream",
+            },
+            {
+                "data": {
+                    "col_name": "Outside Lands",
+                    "col_location": "{'country': 'USA', 'state': 'California', 'city': 'San Francisco'}",
+                    "col_attendance": 220000,
+                    "_ab_source_file_last_modified": "2023-06-06T03:54:07.000000Z",
+                    "_ab_source_file_url": "california_festivals.xlsx",
+                },
+                "stream": "festivals_stream",
+            },
+        ]
+    )
+    .set_expected_catalog(
+        {
+            "streams": [
+                {
+                    "default_cursor_field": ["_ab_source_file_last_modified"],
+                    "json_schema": {
+                        "type": "object",
+                        "properties": {
+                            "col_title": {"type": ["null", "string"]},
+                            "col_album": {"type": ["null", "string"]},
+                            "col_year": {"type": ["null", "number"]},
+                            "col_vocals": {"type": ["null", "boolean"]},
+                            "_ab_source_file_last_modified": {"type": "string"},
+                            "_ab_source_file_url": {"type": "string"},
+                        },
+                    },
+                    "name": "songs_stream",
+                    "source_defined_cursor": True,
+                    "supported_sync_modes": ["full_refresh", "incremental"],
+                    "is_resumable": True,
+                },
+                {
+                    "default_cursor_field": ["_ab_source_file_last_modified"],
+                    "json_schema": {
+                        "type": "object",
+                        "properties": {
+                            "col_name": {"type": ["null", "string"]},
+                            "col_location":  {"type": ["null", "string"]},
+                            "col_attendance": {"type": ["null", "number"]},
+                            "_ab_source_file_last_modified": {"type": "string"},
+                            "_ab_source_file_url": {"type": "string"},
+                        },
+                    },
+                    "name": "festivals_stream",
+                    "source_defined_cursor": True,
+                    "supported_sync_modes": ["full_refresh", "incremental"],
+                    "is_resumable": True,
+                },
+            ]
+        }
+    )
+).build()
--- a/airbyte-cdk/python/unit_tests/sources/file_based/test_file_based_scenarios.py
+++ b/airbyte-cdk/python/unit_tests/sources/file_based/test_file_based_scenarios.py
@@ -86,6 +86,12 @@ from unit_tests.sources.file_based.scenarios.csv_scenarios import (
    schemaless_with_user_input_schema_fails_connection_check_scenario,
    single_csv_scenario,
 )
+from unit_tests.sources.file_based.scenarios.excel_scenarios import (
+    excel_all_types_scenario,
+    multiple_excel_combine_schema_scenario,
+    multiple_streams_excel_scenario,
+    single_excel_scenario,
+)
 from unit_tests.sources.file_based.scenarios.incremental_scenarios import (
    multi_csv_different_timestamps_scenario,
    multi_csv_include_missing_files_within_history_range,
@@ -232,6 +238,10 @@ discover_success_scenarios = [
    multiple_avro_combine_schema_scenario,
    multiple_streams_avro_scenario,
    avro_file_with_double_as_number_scenario,
+    excel_all_types_scenario,
+    multiple_excel_combine_schema_scenario,
+    multiple_streams_excel_scenario,
+    single_excel_scenario,
    csv_newline_in_values_not_quoted_scenario,
    csv_autogenerate_column_names_scenario,
    parquet_with_invalid_config_scenario,
@@ -299,6 +309,7 @@ check_scenarios = [
    schemaless_with_user_input_schema_fails_connection_check_scenario,
    valid_single_stream_user_input_schema_scenario,
    single_avro_scenario,
+    single_excel_scenario,
    earlier_csv_scenario,
    csv_no_files_scenario,
 ]