S3 and Azure Blob Storage: Update File CDK to support document file types (#31904)
Co-authored-by: alafanechere <augustin.lafanechere@gmail.com>
This commit is contained in:
@@ -53,6 +53,11 @@ acceptance_tests:
|
||||
expect_records:
|
||||
path: integration_tests/expected_records/jsonl_newlines.jsonl
|
||||
exact_order: true
|
||||
- config_path: secrets/unstructured_config.json
|
||||
expect_records:
|
||||
path: integration_tests/expected_records/unstructured.jsonl
|
||||
exact_order: true
|
||||
timeout_seconds: 1800
|
||||
connection:
|
||||
tests:
|
||||
- config_path: secrets/config.json
|
||||
|
||||
@@ -0,0 +1,2 @@
|
||||
{"stream": "airbyte-source-azure-blob-storage-test", "data": {"content": "# Heading\n\nThis is the content which is not just a single word", "document_key": "Testdoc.pdf", "_ab_source_file_last_modified": "2023-10-30T11:38:48.000000Z", "_ab_source_file_url": "Testdoc.pdf"}, "emitted_at": 1698666216334}
|
||||
{"stream": "airbyte-source-azure-blob-storage-test", "data": {"content": "This is a test", "document_key": "Testdoc_OCR.pdf", "_ab_source_file_last_modified": "2023-10-30T11:38:48.000000Z", "_ab_source_file_url": "Testdoc_OCR.pdf"}, "emitted_at": 1698666218048}
|
||||
@@ -268,6 +268,19 @@
|
||||
"type": "boolean"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"title": "Document File Type Format (Experimental)",
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"filetype": {
|
||||
"title": "Filetype",
|
||||
"default": "unstructured",
|
||||
"const": "unstructured",
|
||||
"type": "string"
|
||||
}
|
||||
},
|
||||
"description": "Extract text from document formats (.pdf, .docx, .md, .pptx) and emit as one record per file."
|
||||
}
|
||||
]
|
||||
},
|
||||
|
||||
@@ -3,11 +3,11 @@ data:
|
||||
ql: 100
|
||||
sl: 100
|
||||
connectorBuildOptions:
|
||||
baseImage: docker.io/airbyte/python-connector-base:1.1.0@sha256:bd98f6505c6764b1b5f99d3aedc23dfc9e9af631a62533f60eb32b1d3dbab20c
|
||||
baseImage: docker.io/airbyte/python-connector-base:1.2.0@sha256:c22a9d97464b69d6ef01898edf3f8612dc11614f05a84984451dde195f337db9
|
||||
connectorSubtype: file
|
||||
connectorType: source
|
||||
definitionId: fdaaba68-4875-4ed9-8fcd-4ae1e0a25093
|
||||
dockerImageTag: 0.2.1
|
||||
dockerImageTag: 0.2.2
|
||||
dockerRepository: airbyte/source-azure-blob-storage
|
||||
documentationUrl: https://docs.airbyte.com/integrations/sources/azure-blob-storage
|
||||
githubIssueLabel: source-azure-blob-storage
|
||||
|
||||
@@ -5,7 +5,11 @@
|
||||
|
||||
from setuptools import find_packages, setup
|
||||
|
||||
MAIN_REQUIREMENTS = ["airbyte-cdk>=0.51.17", "smart_open[azure]", "pytz", "fastavro==1.4.11", "pyarrow"]
|
||||
MAIN_REQUIREMENTS = [
|
||||
"airbyte-cdk[file-based]>=0.52.7",
|
||||
"smart_open[azure]",
|
||||
"pytz",
|
||||
]
|
||||
|
||||
TEST_REQUIREMENTS = ["requests-mock~=1.9.3", "pytest-mock~=3.6.1", "pytest~=6.2"]
|
||||
|
||||
|
||||
Reference in New Issue
Block a user