airbyte/airbyte-integrations/connectors/source-s3/source_s3/source.py
Artem Inzhyyants 3080f65429 Source S3: Add start date filter for files (#25010)
* Source S3: Add start date filter for files

* Source S3: add docs

* Source S3: add unittest

* Source S3: add unittest

* Source S3: add unittest

* Source S3: Fix spec test

* Source S3: bump version

* Source S3: fix tests

* Source S3: fix description

* auto-bump connector version

* Source S3: refactor start_date filtering

* Source S3: update setup

* Source S3: serialize state for cache

* Source S3: refactor skip file filter

* Source S3: bump version + update docs

* auto-bump connector version

---------

Co-authored-by: Octavia Squidington III <octavia-squidington-iii@users.noreply.github.com>
2023-04-18 14:07:15 +02:00
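The headline change in this commit is a start-date cutoff: files whose last-modified timestamp predates the configured start_date are skipped during replication. The actual filter lives in the stream code (see "refactor skip file filter" above), not in this file; the following is only a minimal sketch of the idea, using a hypothetical helper over boto3-style object listings:

    from datetime import datetime, timezone
    from typing import Any, Iterable, Iterator, Mapping, Optional

    def filter_by_start_date(
        objects: Iterable[Mapping[str, Any]], start_date: Optional[str]
    ) -> Iterator[Mapping[str, Any]]:
        """Yield only objects modified at or after start_date, e.g. "2021-01-01T00:00:00Z"."""
        if not start_date:
            yield from objects
            return
        cutoff = datetime.strptime(start_date, "%Y-%m-%dT%H:%M:%SZ").replace(tzinfo=timezone.utc)
        for obj in objects:
            # boto3's list_objects_v2 entries carry a timezone-aware LastModified datetime
            if obj["LastModified"] >= cutoff:
                yield obj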


#
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
#


from typing import Any, Mapping, Optional

from pydantic import BaseModel, Field

from .source_files_abstract.source import SourceFilesAbstract
from .source_files_abstract.spec import SourceFilesAbstractSpec
from .stream import IncrementalFileStreamS3


class SourceS3Spec(SourceFilesAbstractSpec, BaseModel):
    class Config:
        title = "S3 Source Spec"

    class S3Provider(BaseModel):
        class Config:
            title = "S3: Amazon Web Services"
            # SourceFilesAbstractSpec fields are ordered 10 apart to allow subclasses to insert their own spec's fields interspersed
            schema_extra = {"order": 11, "description": "Use this to load files from S3 or S3-compatible services"}

        bucket: str = Field(description="Name of the S3 bucket where the file(s) exist.", order=0)
        aws_access_key_id: Optional[str] = Field(
            title="AWS Access Key ID",
            default=None,
            description="In order to access private Buckets stored on AWS S3, this connector requires credentials with the proper "
            "permissions. If accessing publicly available data, this field is not necessary.",
            airbyte_secret=True,
            order=1,
        )
        aws_secret_access_key: Optional[str] = Field(
            title="AWS Secret Access Key",
            default=None,
            description="In order to access private Buckets stored on AWS S3, this connector requires credentials with the proper "
            "permissions. If accessing publicly available data, this field is not necessary.",
            airbyte_secret=True,
            order=2,
        )
        path_prefix: str = Field(
            default="",
            description="By providing a path-like prefix (e.g. myFolder/thisTable/) under which all the relevant files sit, "
            "we can optimize finding these in S3. This is optional but recommended if your bucket contains many "
            "folders/files which you don't need to replicate.",
            order=3,
        )
        endpoint: str = Field("", description="Endpoint to an S3 compatible service. Leave empty to use AWS.", order=4)
        start_date: Optional[str] = Field(
            title="Start Date",
            description="UTC date and time in the format 2017-01-25T00:00:00Z. Any file modified before this date will not be replicated.",
            examples=["2021-01-01T00:00:00Z"],
            format="date-time",
            pattern="^[0-9]{4}-[0-9]{2}-[0-9]{2}T[0-9]{2}:[0-9]{2}:[0-9]{2}Z$",
            order=5,
        )

    provider: S3Provider


class SourceS3(SourceFilesAbstract):
    stream_class = IncrementalFileStreamS3
    spec_class = SourceS3Spec
    documentation_url = "https://docs.airbyte.com/integrations/sources/s3"

    def read_config(self, config_path: str) -> Mapping[str, Any]:
        config: Mapping[str, Any] = super().read_config(config_path)
        # The JSON config stores a tab delimiter as the two-character escape
        # sequence "\t"; replace it with a literal tab for the CSV parser.
        if config.get("format", {}).get("delimiter") == r"\t":
            config["format"]["delimiter"] = "\t"
        return config
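For reference, the normalization in read_config exists because a tab typed into the connector UI is persisted in the JSON config as the two-character sequence backslash-t, while the CSV parser needs the single tab character. A quick illustration of the effect, using a hand-built config dict instead of reading from disk:

    config = {"format": {"delimiter": r"\t"}}  # as stored: backslash followed by "t" (two characters)
    if config.get("format", {}).get("delimiter") == r"\t":
        config["format"]["delimiter"] = "\t"  # the single tab character the CSV reader expects
    assert len(config["format"]["delimiter"]) == 1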