🎉 Add YAML format to source-file reader (#14588)
* Add yaml reader * Update docs * Bump version of connector * bump docs * Update pyarrow dependency * Upgrade pandas dependency * auto-bump connector version Co-authored-by: Octavia Squidington III <octavia-squidington-iii@users.noreply.github.com>
This commit is contained in:
@@ -271,7 +271,7 @@
|
||||
- name: File
|
||||
sourceDefinitionId: 778daa7c-feaf-4db6-96f3-70fd645acc77
|
||||
dockerRepository: airbyte/source-file
|
||||
dockerImageTag: 0.2.10
|
||||
dockerImageTag: 0.2.11
|
||||
documentationUrl: https://docs.airbyte.io/integrations/sources/file
|
||||
icon: file.svg
|
||||
sourceType: file
|
||||
|
||||
@@ -2261,7 +2261,7 @@
|
||||
supportsNormalization: false
|
||||
supportsDBT: false
|
||||
supported_destination_sync_modes: []
|
||||
- dockerImage: "airbyte/source-file:0.2.10"
|
||||
- dockerImage: "airbyte/source-file:0.2.11"
|
||||
spec:
|
||||
documentationUrl: "https://docs.airbyte.io/integrations/sources/file"
|
||||
connectionSpecification:
|
||||
@@ -2289,6 +2289,7 @@
|
||||
- "excel"
|
||||
- "feather"
|
||||
- "parquet"
|
||||
- "yaml"
|
||||
default: "csv"
|
||||
title: "File Format"
|
||||
description: "The Format of the file which should be replicated (Warning:\
|
||||
@@ -2300,7 +2301,7 @@
|
||||
\ chosen file format to provide additional options and tune its behavior."
|
||||
examples:
|
||||
- "{}"
|
||||
- "{'sep': ' '}"
|
||||
- "{\"sep\": \" \"}"
|
||||
url:
|
||||
type: "string"
|
||||
title: "URL"
|
||||
|
||||
@@ -17,5 +17,5 @@ COPY source_file ./source_file
|
||||
ENV AIRBYTE_ENTRYPOINT "python /airbyte/integration_code/main.py"
|
||||
ENTRYPOINT ["python", "/airbyte/integration_code/main.py"]
|
||||
|
||||
LABEL io.airbyte.version=0.2.10
|
||||
LABEL io.airbyte.version=0.2.11
|
||||
LABEL io.airbyte.name=airbyte/source-file
|
||||
|
||||
@@ -31,6 +31,7 @@ def check_read(config, expected_columns=10, expected_rows=42):
|
||||
("excel", "xlsx", 8, 50, "demo"),
|
||||
("feather", "feather", 9, 3, "demo"),
|
||||
("parquet", "parquet", 9, 3, "demo"),
|
||||
("yaml", "yaml", 8, 3, "demo"),
|
||||
],
|
||||
)
|
||||
def test_local_file_read(file_format, extension, expected_columns, expected_rows, filename):
|
||||
|
||||
@@ -0,0 +1,29 @@
|
||||
{
|
||||
"streams": [
|
||||
{
|
||||
"stream": {
|
||||
"name": "test",
|
||||
"json_schema": {
|
||||
"$schema": "http://json-schema.org/draft-07/schema#",
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"$schema": "http://json-schema.org/schema#",
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"name": {"type": "string"},
|
||||
"sourceDefinitionId": {"type": "string"},
|
||||
"dockerRepository": {"type": "string"},
|
||||
"dockerImageTag": {"type": "string"},
|
||||
"documentationUrl": {"type": "string"},
|
||||
"icon": {"type": "string"},
|
||||
"sourceType": {"type": "string"},
|
||||
"releaseStage": {"type": "string"}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"sync_mode": "full_refresh",
|
||||
"destination_sync_mode": "overwrite"
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -0,0 +1,23 @@
|
||||
- name: Facebook Pages
|
||||
sourceDefinitionId: 010eb12f-837b-4685-892d-0a39f76a98f5
|
||||
dockerRepository: airbyte/source-facebook-pages
|
||||
dockerImageTag: 0.1.6
|
||||
documentationUrl: https://docs.airbyte.com/integrations/sources/facebook-pages
|
||||
icon: facebook.svg
|
||||
sourceType: api
|
||||
releaseStage: alpha
|
||||
- name: Faker
|
||||
sourceDefinitionId: dfd88b22-b603-4c3d-aad7-3701784586b1
|
||||
dockerRepository: airbyte/source-faker
|
||||
dockerImageTag: 0.1.5
|
||||
documentationUrl: https://docs.airbyte.com/integrations/source-faker
|
||||
sourceType: api
|
||||
releaseStage: alpha
|
||||
- name: File
|
||||
sourceDefinitionId: 778daa7c-feaf-4db6-96f3-70fd645acc77
|
||||
dockerRepository: airbyte/source-file
|
||||
dockerImageTag: 0.2.10
|
||||
documentationUrl: https://docs.airbyte.io/integrations/sources/file
|
||||
icon: file.svg
|
||||
sourceType: file
|
||||
releaseStage: alpha
|
||||
@@ -10,14 +10,14 @@ MAIN_REQUIREMENTS = [
|
||||
"gcsfs==0.7.1",
|
||||
"genson==1.2.2",
|
||||
"google-cloud-storage==1.35.0",
|
||||
"pandas==1.2.0",
|
||||
"pandas==1.4.3",
|
||||
"paramiko==2.7.2",
|
||||
"s3fs==0.4.2",
|
||||
"smart-open[all]==4.1.2",
|
||||
"lxml==4.6.5",
|
||||
"html5lib==1.1",
|
||||
"beautifulsoup4==4.9.3",
|
||||
"pyarrow==3.0.0",
|
||||
"pyarrow==8.0.0",
|
||||
"xlrd==2.0.1",
|
||||
"openpyxl==3.0.6",
|
||||
"pyxlsb==1.0.8",
|
||||
|
||||
@@ -19,6 +19,7 @@ from botocore.config import Config
|
||||
from genson import SchemaBuilder
|
||||
from google.cloud.storage import Client as GCSClient
|
||||
from google.oauth2 import service_account
|
||||
from yaml import safe_load
|
||||
|
||||
|
||||
class ConfigurationError(Exception):
|
||||
@@ -265,6 +266,10 @@ class Client:
|
||||
result = [result]
|
||||
return result
|
||||
|
||||
def load_yaml(self, fp):
|
||||
if self._reader_format == "yaml":
|
||||
return pd.DataFrame(safe_load(fp))
|
||||
|
||||
def load_dataframes(self, fp, skip_data=False) -> Iterable:
|
||||
"""load and return the appropriate pandas dataframe.
|
||||
|
||||
@@ -334,6 +339,12 @@ class Client:
|
||||
with self.reader.open(binary=self.binary_source) as fp:
|
||||
if self._reader_format == "json" or self._reader_format == "jsonl":
|
||||
yield from self.load_nested_json(fp)
|
||||
elif self._reader_format == "yaml":
|
||||
fields = set(fields) if fields else None
|
||||
df = self.load_yaml(fp)
|
||||
columns = fields.intersection(set(df.columns)) if fields else df.columns
|
||||
df = df.where(pd.notnull(df), None)
|
||||
yield from df[columns].to_dict(orient="records")
|
||||
else:
|
||||
fields = set(fields) if fields else None
|
||||
for df in self.load_dataframes(fp):
|
||||
@@ -345,8 +356,10 @@ class Client:
|
||||
with self.reader.open(binary=self.binary_source) as fp:
|
||||
if self._reader_format == "json" or self._reader_format == "jsonl":
|
||||
return self.load_nested_json_schema(fp)
|
||||
|
||||
df_list = self.load_dataframes(fp, skip_data=False)
|
||||
elif self._reader_format == "yaml":
|
||||
df_list = [self.load_yaml(fp)]
|
||||
else:
|
||||
df_list = self.load_dataframes(fp, skip_data=False)
|
||||
fields = {}
|
||||
for df in df_list:
|
||||
for col in df.columns:
|
||||
|
||||
@@ -15,7 +15,7 @@
|
||||
},
|
||||
"format": {
|
||||
"type": "string",
|
||||
"enum": ["csv", "json", "jsonl", "excel", "feather", "parquet"],
|
||||
"enum": ["csv", "json", "jsonl", "excel", "feather", "parquet", "yaml"],
|
||||
"default": "csv",
|
||||
"title": "File Format",
|
||||
"description": "The Format of the file which should be replicated (Warning: some formats may be experimental, please refer to the docs)."
|
||||
|
||||
@@ -47,6 +47,7 @@ This source produces a single table for the target file as it replicates only on
|
||||
| Feather | Yes |
|
||||
| Parquet | Yes |
|
||||
| Pickle | No |
|
||||
| YAML | Yes |
|
||||
|
||||
**This connector does not support syncing unstructured data files such as raw text, audio, or videos.**
|
||||
|
||||
@@ -126,6 +127,7 @@ In order to read large files from a remote location, this connector uses the [sm
|
||||
|
||||
| Version | Date | Pull Request | Subject |
|
||||
| ------- | ---------- | ------------------------------------------------------ | ------------------------------------------------- |
|
||||
| 0.2.11 | 2022-07-12 | [14588](https://github.com/airbytehq/airbyte/pull/14588) | Add support for YAML format |
|
||||
| 0.2.9 | 2022-02-01 | [9974](https://github.com/airbytehq/airbyte/pull/9974) | Update airbyte-cdk 0.1.47 |
|
||||
| 0.2.8 | 2021-12-06 | [8524](https://github.com/airbytehq/airbyte/pull/8524) | Update connector fields title/description |
|
||||
| 0.2.7 | 2021-10-28 | [7387](https://github.com/airbytehq/airbyte/pull/7387) | Migrate source to CDK structure, add SAT testing. |
|
||||
|
||||
Reference in New Issue
Block a user