airbyte/airbyte-integrations/connectors/source-s3/integration_tests/spec.json
Artem Inzhyyants 3080f65429 Source S3: Add start date filter for files (#25010)
* Source S3: Add start date filter for files

* Source S3: add docs

* Source S3: add unittest

* Source S3: add unittest

* Source S3: add unittest

* Source S3: Fix spec test

* Source S3: bump version

* Source S3: fix tests

* Source S3: fix description

* auto-bump connector version

* Source S3: refactor start_date filtering

* Source S3: update setup

* Source S3: serialize state for cache

* Source S3: refactor skip file filter

* Source S3: bump version + update docs

* auto-bump connector version

---------

Co-authored-by: Octavia Squidington III <octavia-squidington-iii@users.noreply.github.com>
2023-04-18 14:07:15 +02:00
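
The commit above adds a start-date filter for files. As a rough sketch of the idea (not the connector's actual implementation; list_files_modified_since is a hypothetical helper name), skipping S3 objects modified before the configured start_date could look like this, assuming boto3 and the Zulu-format timestamp the spec requires:

from datetime import datetime, timezone

import boto3

def list_files_modified_since(bucket: str, prefix: str, start_date: str):
    """Yield keys of S3 objects modified at or after start_date (UTC)."""
    # start_date uses the spec's pattern, e.g. "2021-01-01T00:00:00Z".
    cutoff = datetime.strptime(start_date, "%Y-%m-%dT%H:%M:%SZ").replace(
        tzinfo=timezone.utc
    )
    s3 = boto3.client("s3")
    paginator = s3.get_paginator("list_objects_v2")
    for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
        for obj in page.get("Contents", []):
            # boto3 returns LastModified as a timezone-aware datetime.
            if obj["LastModified"] >= cutoff:
                yield obj["Key"]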

{
"documentationUrl": "https://docs.airbyte.com/integrations/sources/s3",
"changelogUrl": "https://docs.airbyte.com/integrations/sources/s3",
"connectionSpecification": {
"title": "S3 Source Spec",
"type": "object",
"properties": {
"dataset": {
"title": "Output Stream Name",
"description": "The name of the stream you would like this source to output. Can contain letters, numbers, or underscores.",
"pattern": "^([A-Za-z0-9-_]+)$",
"order": 0,
"type": "string"
},
"path_pattern": {
"title": "Pattern of files to replicate",
"description": "A glob pattern which tells the connector which files to replicate. All files which match this pattern will be replicated. Use | to separate multiple patterns. See <a href=\"https://facelessuser.github.io/wcmatch/glob/\" target=\"_blank\">this page</a> to understand pattern syntax (GLOBSTAR and SPLIT flags are enabled). Use pattern <strong>**</strong> to pick up all files.",
"examples": [
"**",
"myFolder/myTableFiles/*.csv|myFolder/myOtherTableFiles/*.csv"
],
"order": 10,
"type": "string"
},
"format": {
"title": "File Format",
"description": "The format of the files you'd like to replicate",
"default": "csv",
"order": 20,
"type": "object",
"oneOf": [
{
"title": "CSV",
"description": "This connector utilises <a href=\"https://arrow.apache.org/docs/python/generated/pyarrow.csv.open_csv.html\" target=\"_blank\">PyArrow (Apache Arrow)</a> for CSV parsing.",
"type": "object",
"properties": {
"filetype": {
"title": "Filetype",
"const": "csv",
"type": "string"
},
"delimiter": {
"title": "Delimiter",
"description": "The character delimiting individual cells in the CSV data. This may only be a 1-character string. For tab-delimited data enter '\\t'.",
"default": ",",
"minLength": 1,
"order": 0,
"type": "string"
},
"infer_datatypes": {
"title": "Infer Datatypes",
"description": "Configures whether a schema for the source should be inferred from the current data. If set to false and a custom schema is set, the manually enforced schema is used. If set to false and no schema is set, all fields will be read as strings.",
"default": true,
"order": 1,
"type": "boolean"
},
"quote_char": {
"title": "Quote Character",
"description": "The character used for quoting CSV values. To disallow quoting, leave this field blank.",
"default": "\"",
"order": 2,
"type": "string"
},
"escape_char": {
"title": "Escape Character",
"description": "The character used for escaping special characters. To disallow escaping, leave this field blank.",
"order": 3,
"type": "string"
},
"encoding": {
"title": "Encoding",
"description": "The character encoding of the CSV data. Leave blank to default to <strong>UTF8</strong>. See <a href=\"https://docs.python.org/3/library/codecs.html#standard-encodings\" target=\"_blank\">list of python encodings</a> for allowable options.",
"default": "utf8",
"order": 4,
"type": "string"
},
"double_quote": {
"title": "Double Quote",
"description": "Whether two quotes in a quoted CSV value denote a single quote in the data.",
"default": true,
"order": 5,
"type": "boolean"
},
"newlines_in_values": {
"title": "Allow newlines in values",
"description": "Whether newline characters are allowed in CSV values. Turning this on may affect performance. Leave blank to default to False.",
"default": false,
"order": 6,
"type": "boolean"
},
"additional_reader_options": {
"title": "Additional Reader Options",
"description": "Optionally add a valid JSON string here to provide additional options to the csv reader. Mappings must correspond to options <a href=\"https://arrow.apache.org/docs/python/generated/pyarrow.csv.ConvertOptions.html#pyarrow.csv.ConvertOptions\" target=\"_blank\">detailed here</a>. 'column_types' is used internally to handle schema, so overriding it would likely cause problems.",
"examples": [
"{\"timestamp_parsers\": [\"%m/%d/%Y %H:%M\", \"%Y/%m/%d %H:%M\"], \"strings_can_be_null\": true, \"null_values\": [\"NA\", \"NULL\"]}"
],
"order": 7,
"type": "string"
},
"advanced_options": {
"title": "Advanced Options",
"description": "Optionally add a valid JSON string here to provide additional <a href=\"https://arrow.apache.org/docs/python/generated/pyarrow.csv.ReadOptions.html#pyarrow.csv.ReadOptions\" target=\"_blank\">Pyarrow ReadOptions</a>. Specify 'column_names' here if your CSV doesn't have a header, or if you want to use custom column names. 'block_size' and 'encoding' are already used above; specifying them again here will override the values above.",
"examples": ["{\"column_names\": [\"column1\", \"column2\"]}"],
"order": 8,
"type": "string"
},
"block_size": {
"title": "Block Size",
"description": "The chunk size in bytes to process at a time in memory from each file. If your data is particularly wide and failing during schema detection, increasing this should solve it. Beware of raising this too high as you could hit OOM errors.",
"default": 10000,
"minimum": 1,
"maximum": 2147483647,
"order": 9,
"type": "integer"
}
}
},
{
"title": "Parquet",
"description": "This connector utilises <a href=\"https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetFile.html\" target=\"_blank\">PyArrow (Apache Arrow)</a> for Parquet parsing.",
"type": "object",
"properties": {
"filetype": {
"title": "Filetype",
"const": "parquet",
"type": "string"
},
"columns": {
"title": "Selected Columns",
"description": "If you only want to sync a subset of the columns from the file(s), add the columns you want here as a comma-delimited list. Leave it empty to sync all columns.",
"order": 0,
"type": "array",
"items": {
"type": "string"
}
},
"batch_size": {
"title": "Record batch size",
"description": "Maximum number of records per batch read from the input files. Batches may be smaller if there aren't enough rows in the file. This option can help avoid out-of-memory errors if your data is particularly wide.",
"default": 65536,
"order": 1,
"type": "integer"
},
"buffer_size": {
"title": "Buffer Size",
"description": "Perform read buffering when deserializing individual column chunks. By default every column chunk will be loaded fully into memory. This option can help avoid out-of-memory errors if your data is particularly wide.",
"default": 2,
"type": "integer"
}
}
},
{
"title": "Avro",
"description": "This connector utilises <a href=\"https://fastavro.readthedocs.io/en/latest/\" target=\"_blank\">fastavro</a> for Avro parsing.",
"type": "object",
"properties": {
"filetype": {
"title": "Filetype",
"const": "avro",
"type": "string"
}
}
},
{
"title": "Jsonl",
"description": "This connector uses <a href=\"https://arrow.apache.org/docs/python/json.html\" target=\"_blank\">PyArrow</a> for JSON Lines (jsonl) file parsing.",
"type": "object",
"properties": {
"filetype": {
"title": "Filetype",
"const": "jsonl",
"type": "string"
},
"newlines_in_values": {
"title": "Allow newlines in values",
"description": "Whether newline characters are allowed in JSON values. Turning this on may affect performance. Leave blank to default to False.",
"default": false,
"order": 0,
"type": "boolean"
},
"unexpected_field_behavior": {
"title": "Unexpected field behavior",
"description": "How JSON fields outside of explicit_schema (if given) are treated. Check the <a href=\"https://arrow.apache.org/docs/python/generated/pyarrow.json.ParseOptions.html\" target=\"_blank\">PyArrow documentation</a> for details.",
"default": "infer",
"examples": ["ignore", "infer", "error"],
"order": 1,
"enum": ["ignore", "infer", "error"]
},
"block_size": {
"title": "Block Size",
"description": "The chunk size in bytes to process at a time in memory from each file. If your data is particularly wide and failing during schema detection, increasing this should solve it. Beware of raising this too high as you could hit OOM errors.",
"default": 0,
"order": 2,
"type": "integer"
}
}
}
]
},
"schema": {
"title": "Manually enforced data schema",
"description": "Optionally provide a schema to enforce, as a valid JSON string. Ensure this is a mapping of <strong>{ \"column\" : \"type\" }</strong>, where types are valid <a href=\"https://json-schema.org/understanding-json-schema/reference/type.html\" target=\"_blank\">JSON Schema datatypes</a>. Leave as {} to auto-infer the schema.",
"default": "{}",
"examples": [
"{\"column_1\": \"number\", \"column_2\": \"string\", \"column_3\": \"array\", \"column_4\": \"object\", \"column_5\": \"boolean\"}"
],
"order": 30,
"type": "string"
},
"provider": {
"title": "S3: Amazon Web Services",
"type": "object",
"properties": {
"bucket": {
"title": "Bucket",
"description": "Name of the S3 bucket where the file(s) exist.",
"order": 0,
"type": "string"
},
"aws_access_key_id": {
"title": "AWS Access Key ID",
"description": "In order to access private Buckets stored on AWS S3, this connector requires credentials with the proper permissions. If accessing publicly available data, this field is not necessary.",
"airbyte_secret": true,
"order": 1,
"type": "string"
},
"aws_secret_access_key": {
"title": "AWS Secret Access Key",
"description": "In order to access private Buckets stored on AWS S3, this connector requires credentials with the proper permissions. If accessing publicly available data, this field is not necessary.",
"airbyte_secret": true,
"order": 2,
"type": "string"
},
"path_prefix": {
"title": "Path Prefix",
"description": "By providing a path-like prefix (e.g. myFolder/thisTable/) under which all the relevant files sit, we can optimize finding these in S3. This is optional but recommended if your bucket contains many folders/files which you don't need to replicate.",
"default": "",
"order": 3,
"type": "string"
},
"endpoint": {
"title": "Endpoint",
"description": "Endpoint to an S3-compatible service. Leave empty to use AWS.",
"default": "",
"order": 4,
"type": "string"
},
"start_date": {
"title": "Start Date",
"description": "UTC date and time in the format 2017-01-25T00:00:00Z. Any file modified before this date will not be replicated.",
"examples": ["2021-01-01T00:00:00Z"],
"format": "date-time",
"pattern": "^[0-9]{4}-[0-9]{2}-[0-9]{2}T[0-9]{2}:[0-9]{2}:[0-9]{2}Z$",
"order": 5,
"type": "string"
}
},
"required": ["bucket"],
"order": 11,
"description": "Use this to load files from S3 or S3-compatible services."
}
},
"required": ["dataset", "path_pattern", "provider"]
},
"supportsIncremental": true,
"supported_destination_sync_modes": ["overwrite", "append", "append_dedup"]
}
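
The path_pattern field above is matched with wcmatch glob semantics, with the GLOBSTAR and SPLIT flags the description mentions. A small, hedged sketch of how that matching behaves (the file paths here are made up):

from wcmatch import glob

pattern = "myFolder/myTableFiles/*.csv|myFolder/myOtherTableFiles/*.csv"
flags = glob.GLOBSTAR | glob.SPLIT  # SPLIT makes '|' separate sub-patterns

# Matches the first sub-pattern.
print(glob.globmatch("myFolder/myTableFiles/a.csv", pattern, flags=flags))  # True
# Matches neither sub-pattern.
print(glob.globmatch("otherFolder/a.csv", pattern, flags=flags))            # False
# With GLOBSTAR, '**' crosses directory boundaries, picking up all files.
print(glob.globmatch("a/b/c.csv", "**", flags=flags))                       # True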
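
And for reference, a minimal sketch of a config that satisfies this spec's required fields (dataset, path_pattern, provider.bucket). Every value below is hypothetical, and the jsonschema library is used purely to illustrate validation against connectionSpecification:

import json

from jsonschema import validate

# Assumes this spec has been saved locally as spec.json.
with open("spec.json") as f:
    spec = json.load(f)

config = {
    "dataset": "my_table",                     # hypothetical stream name
    "path_pattern": "myFolder/myTableFiles/*.csv",
    "format": {"filetype": "csv", "delimiter": ","},
    "provider": {
        "bucket": "my-example-bucket",         # hypothetical bucket
        "start_date": "2021-01-01T00:00:00Z",  # files modified earlier are skipped
    },
}

# Raises jsonschema.exceptions.ValidationError if the config violates the spec.
validate(instance=config, schema=spec["connectionSpecification"])
print("config OK")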