1
0
mirror of the upstream repository, synced 2025-12-25 02:09:19 -05:00

S3 and Azure Blob Storage: Update File CDK to support document file types (#31904)

Co-authored-by: alafanechere <augustin.lafanechere@gmail.com>
This commit is contained in:
Joe Reuter
2023-10-31 11:21:22 +01:00
committed by GitHub
parent df98303ff5
commit 7c7acade71
15 changed files with 56 additions and 395 deletions

View File

@@ -53,6 +53,11 @@ acceptance_tests:
expect_records:
path: integration_tests/expected_records/jsonl_newlines.jsonl
exact_order: true
- config_path: secrets/unstructured_config.json
expect_records:
path: integration_tests/expected_records/unstructured.jsonl
exact_order: true
timeout_seconds: 1800
connection:
tests:
- config_path: secrets/config.json

View File

@@ -0,0 +1,2 @@
{"stream": "airbyte-source-azure-blob-storage-test", "data": {"content": "# Heading\n\nThis is the content which is not just a single word", "document_key": "Testdoc.pdf", "_ab_source_file_last_modified": "2023-10-30T11:38:48.000000Z", "_ab_source_file_url": "Testdoc.pdf"}, "emitted_at": 1698666216334}
{"stream": "airbyte-source-azure-blob-storage-test", "data": {"content": "This is a test", "document_key": "Testdoc_OCR.pdf", "_ab_source_file_last_modified": "2023-10-30T11:38:48.000000Z", "_ab_source_file_url": "Testdoc_OCR.pdf"}, "emitted_at": 1698666218048}

View File

@@ -268,6 +268,19 @@
"type": "boolean"
}
}
},
{
"title": "Document File Type Format (Experimental)",
"type": "object",
"properties": {
"filetype": {
"title": "Filetype",
"default": "unstructured",
"const": "unstructured",
"type": "string"
}
},
"description": "Extract text from document formats (.pdf, .docx, .md, .pptx) and emit as one record per file."
}
]
},

View File

@@ -3,11 +3,11 @@ data:
ql: 100
sl: 100
connectorBuildOptions:
baseImage: docker.io/airbyte/python-connector-base:1.1.0@sha256:bd98f6505c6764b1b5f99d3aedc23dfc9e9af631a62533f60eb32b1d3dbab20c
baseImage: docker.io/airbyte/python-connector-base:1.2.0@sha256:c22a9d97464b69d6ef01898edf3f8612dc11614f05a84984451dde195f337db9
connectorSubtype: file
connectorType: source
definitionId: fdaaba68-4875-4ed9-8fcd-4ae1e0a25093
dockerImageTag: 0.2.1
dockerImageTag: 0.2.2
dockerRepository: airbyte/source-azure-blob-storage
documentationUrl: https://docs.airbyte.com/integrations/sources/azure-blob-storage
githubIssueLabel: source-azure-blob-storage

View File

@@ -5,7 +5,11 @@
from setuptools import find_packages, setup
MAIN_REQUIREMENTS = ["airbyte-cdk>=0.51.17", "smart_open[azure]", "pytz", "fastavro==1.4.11", "pyarrow"]
MAIN_REQUIREMENTS = [
"airbyte-cdk[file-based]>=0.52.7",
"smart_open[azure]",
"pytz",
]
TEST_REQUIREMENTS = ["requests-mock~=1.9.3", "pytest-mock~=3.6.1", "pytest~=6.2"]