airbyte/airbyte-integrations/connectors/source-google-sheets/source_google_sheets/source.py

#
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
#

import socket
from typing import Any, Generator, List, MutableMapping, Union

from airbyte_cdk.logger import AirbyteLogger
from airbyte_cdk.models import FailureType
from airbyte_cdk.models.airbyte_protocol import (
    AirbyteCatalog,
    AirbyteConnectionStatus,
    AirbyteMessage,
    AirbyteStateMessage,
    ConfiguredAirbyteCatalog,
    Status,
    Type,
)
from airbyte_cdk.sources.source import Source
from airbyte_cdk.utils import AirbyteTracedException
from apiclient import errors
from google.auth import exceptions as google_exceptions
from requests.status_codes import codes as status_codes

from .client import GoogleSheetsClient
from .helpers import Helpers
from .models.spreadsheet import Spreadsheet
from .models.spreadsheet_values import SpreadsheetValues
from .utils import safe_name_conversion

# set the default batch read size
ROW_BATCH_SIZE = 200
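# this default can be overridden per connection via config["row_batch_size"] (see read() below)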
# override the default socket timeout to be 10 mins instead of 60 sec,
# per https://github.com/airbytehq/oncall/issues/242
DEFAULT_SOCKET_TIMEOUT: int = 600
socket.setdefaulttimeout(DEFAULT_SOCKET_TIMEOUT)


class SourceGoogleSheets(Source):
    """
    Spreadsheets API Reference: https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets
    """
    def check(self, logger: AirbyteLogger, config: MutableMapping[str, Any]) -> AirbyteConnectionStatus:
        # Check involves verifying that the specified spreadsheet is reachable with our credentials.
        try:
            client = GoogleSheetsClient(self.get_credentials(config))
        except Exception as e:
            return AirbyteConnectionStatus(status=Status.FAILED, message=f"Please use a valid credentials JSON file. Error: {e}")

        spreadsheet_id = Helpers.get_spreadsheet_id(config["spreadsheet_id"])
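        # NOTE (assumption): config["spreadsheet_id"] may be either a bare spreadsheet ID or the full
        # spreadsheet URL; Helpers.get_spreadsheet_id presumably extracts the ID from a URL such as
        # https://docs.google.com/spreadsheets/d/<id>/edit (see the 404 message below, which asks for a URL).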
        try:
            spreadsheet = client.get(spreadsheetId=spreadsheet_id, includeGridData=False)
        except errors.HttpError as err:
            message = "Config error: "
            # Give a clearer message if it's a common error like 404.
            if err.resp.status == status_codes.NOT_FOUND:
                message += "The spreadsheet link is not valid. Enter the URL of the Google spreadsheet you want to sync."
            raise AirbyteTracedException(
                message=message,
                internal_message=message,
                failure_type=FailureType.config_error,
            ) from err
        except google_exceptions.GoogleAuthError as err:
            message = "Access to the spreadsheet expired or was revoked. Re-authenticate to restore access."
            raise AirbyteTracedException(
                message=message,
                internal_message=message,
                failure_type=FailureType.config_error,
            ) from err
        # Check for duplicate headers
        spreadsheet_metadata = Spreadsheet.parse_obj(spreadsheet)
        grid_sheets = Helpers.get_grid_sheets(spreadsheet_metadata)
        duplicate_headers_in_sheet = {}
        for sheet_name in grid_sheets:
            try:
                header_row_data = Helpers.get_first_row(client, spreadsheet_id, sheet_name)
                if config.get("names_conversion"):
                    header_row_data = [safe_name_conversion(h) for h in header_row_data]
                _, duplicate_headers = Helpers.get_valid_headers_and_duplicates(header_row_data)
                if duplicate_headers:
                    duplicate_headers_in_sheet[sheet_name] = duplicate_headers
            except Exception as err:
                if str(err).startswith("Expected data for exactly one row for sheet"):
                    logger.warn(f"Skip empty sheet: {sheet_name}")
                else:
                    logger.error(str(err))
                    return AirbyteConnectionStatus(
                        status=Status.FAILED, message=f"Unable to read the schema of sheet {sheet_name}. Error: {str(err)}"
                    )
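        # Illustrative example: a sheet whose header row is ["id", "name", "id"] would presumably
        # make duplicate_headers_in_sheet look like {"Sheet1": ["id"]} (names are hypothetical).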
        if duplicate_headers_in_sheet:
            duplicate_headers_error_message = ", ".join(
                [
                    f"[sheet:{sheet_name}, headers:{duplicate_sheet_headers}]"
                    for sheet_name, duplicate_sheet_headers in duplicate_headers_in_sheet.items()
                ]
            )
            return AirbyteConnectionStatus(
                status=Status.FAILED,
                message="Duplicate headers found in the following sheets. Please fix them to continue: " + duplicate_headers_error_message,
            )

        return AirbyteConnectionStatus(status=Status.SUCCEEDED)
    def discover(self, logger: AirbyteLogger, config: MutableMapping[str, Any]) -> AirbyteCatalog:
        client = GoogleSheetsClient(self.get_credentials(config))
        spreadsheet_id = Helpers.get_spreadsheet_id(config["spreadsheet_id"])
        try:
            logger.info(f"Running discovery on spreadsheet {spreadsheet_id}")
            spreadsheet_metadata = Spreadsheet.parse_obj(client.get(spreadsheetId=spreadsheet_id, includeGridData=False))
            grid_sheets = Helpers.get_grid_sheets(spreadsheet_metadata)
            streams = []
            for sheet_name in grid_sheets:
                try:
                    header_row_data = Helpers.get_first_row(client, spreadsheet_id, sheet_name)
                    if config.get("names_conversion"):
                        header_row_data = [safe_name_conversion(h) for h in header_row_data]
                    stream = Helpers.headers_to_airbyte_stream(logger, sheet_name, header_row_data)
                    streams.append(stream)
                except Exception as err:
                    if str(err).startswith("Expected data for exactly one row for sheet"):
                        logger.warn(f"Skip empty sheet: {sheet_name}")
                    else:
                        logger.error(str(err))
            return AirbyteCatalog(streams=streams)
        except errors.HttpError as err:
            reason = str(err)
            if err.resp.status == status_codes.NOT_FOUND:
                reason = "Requested spreadsheet was not found."
            raise Exception(f"Could not run discovery: {reason}")
    def read(
        self,
        logger: AirbyteLogger,
        config: MutableMapping[str, Any],
        catalog: ConfiguredAirbyteCatalog,
        state: Union[List[AirbyteStateMessage], MutableMapping[str, Any]] = None,
    ) -> Generator[AirbyteMessage, None, None]:
        client = GoogleSheetsClient(self.get_credentials(config))
        sheet_to_column_name = Helpers.parse_sheet_and_column_names_from_catalog(catalog)
        spreadsheet_id = Helpers.get_spreadsheet_id(config["spreadsheet_id"])
        row_batch_size = config.get("row_batch_size", ROW_BATCH_SIZE)

        logger.info(f"Starting syncing spreadsheet {spreadsheet_id}")
        # For each sheet in the spreadsheet, fetch rows in batches and emit them
        # until a blank row is encountered.
        sheet_to_column_index_to_name = Helpers.get_available_sheets_to_column_index_to_name(
            client, spreadsheet_id, sheet_to_column_name, config.get("names_conversion")
        )
        sheet_row_counts = Helpers.get_sheet_row_count(client, spreadsheet_id)
        logger.info(f"Row counts: {sheet_row_counts}")
        for sheet in sheet_to_column_index_to_name.keys():
            logger.info(f"Syncing sheet {sheet}")
            column_index_to_name = sheet_to_column_index_to_name[sheet]
            row_cursor = 2  # start syncing past the header row
            # The loop only requires that the first row of the requested interval exists. If the end of
            # the interval extends past the last row of the sheet, the API simply returns the rows that
            # do exist, and the cursor update below ends the loop on the next iteration.
            while row_cursor <= sheet_row_counts[sheet]:
                row_range = f"{sheet}!{row_cursor}:{row_cursor + row_batch_size}"
logger.info(f"Fetching range {range}")
row_batch = SpreadsheetValues.parse_obj(
client.get_values(spreadsheetId=spreadsheet_id, ranges=range, majorDimension="ROWS")
)
row_cursor += row_batch_size + 1
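                # the A1 row range above is inclusive on both ends, so it covered
                # row_batch_size + 1 rows; advance the cursor past all of them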
                # there should always be exactly one range since we requested only one
                value_ranges = row_batch.valueRanges[0]
                if not value_ranges.values:
                    break
                row_values = value_ranges.values
                for row in row_values:
                    if not Helpers.is_row_empty(row) and Helpers.row_contains_relevant_data(row, column_index_to_name.keys()):
                        yield AirbyteMessage(type=Type.RECORD, record=Helpers.row_data_to_record_message(sheet, row, column_index_to_name))

        logger.info(f"Finished syncing spreadsheet {spreadsheet_id}")

    @staticmethod
    def get_credentials(config):
        # backward compatible with the old-style config
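        # Illustrative config shapes (values are placeholders):
        #   old style: {"spreadsheet_id": "...", "credentials_json": "<service account JSON>"}
        #   new style: {"spreadsheet_id": "...", "credentials": {"auth_type": "Service", ...}}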
if config.get("credentials_json"):
credentials = {"auth_type": "Service", "service_account_info": config.get("credentials_json")}
return credentials
return config.get("credentials")