airbyte/airbyte-integrations/connectors/source-google-sheets/source_google_sheets/source.py
#
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
#
import json
import logging
import socket
from typing import Any, Generator, List, Mapping, MutableMapping, Optional, Union
from airbyte_cdk.models import FailureType
from airbyte_cdk.models.airbyte_protocol import (
AirbyteCatalog,
AirbyteConnectionStatus,
AirbyteMessage,
AirbyteStateMessage,
AirbyteStreamStatus,
ConfiguredAirbyteCatalog,
Status,
Type,
)
from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager
from airbyte_cdk.sources.source import Source
from airbyte_cdk.sources.streams.checkpoint import FullRefreshCheckpointReader
from airbyte_cdk.utils import AirbyteTracedException
from airbyte_cdk.utils.stream_status_utils import as_airbyte_message
from apiclient import errors
from google.auth import exceptions as google_exceptions
from requests.status_codes import codes as status_codes
from .client import GoogleSheetsClient
from .helpers import Helpers
from .models.spreadsheet import Spreadsheet
from .models.spreadsheet_values import SpreadsheetValues
from .utils import exception_description_by_status_code, safe_name_conversion
# Override the default socket timeout to be 10 minutes instead of 60 seconds.
# See https://github.com/airbytehq/oncall/issues/242 for context.
DEFAULT_SOCKET_TIMEOUT: int = 600
socket.setdefaulttimeout(DEFAULT_SOCKET_TIMEOUT)
class SourceGoogleSheets(Source):
"""
Spreadsheets API Reference: https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets
"""
    def check(self, logger: logging.Logger, config: Mapping[str, Any]) -> AirbyteConnectionStatus:
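        """
        Verifies that the configured credentials can reach the spreadsheet and that no sheet
        contains duplicate headers. Returns SUCCEEDED on success, otherwise FAILED with a message.
        """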
# Check involves verifying that the specified spreadsheet is reachable with our credentials.
try:
client = GoogleSheetsClient(self.get_credentials(config))
except Exception as e:
            return AirbyteConnectionStatus(status=Status.FAILED, message=f"Please use a valid credentials JSON file. Error: {e}")
spreadsheet_id = Helpers.get_spreadsheet_id(config["spreadsheet_id"])
try:
spreadsheet = client.get(spreadsheetId=spreadsheet_id, includeGridData=False)
except errors.HttpError as err:
message = "Config error: "
# Give a clearer message if it's a common error like 404.
if err.resp.status == status_codes.NOT_FOUND:
message += "The spreadsheet link is not valid. Enter the URL of the Google spreadsheet you want to sync."
raise AirbyteTracedException(
message=message,
internal_message=message,
failure_type=FailureType.config_error,
) from err
except google_exceptions.GoogleAuthError as err:
message = "Access to the spreadsheet expired or was revoked. Re-authenticate to restore access."
raise AirbyteTracedException(
message=message,
internal_message=message,
failure_type=FailureType.config_error,
) from err
# Check for duplicate headers
spreadsheet_metadata = Spreadsheet.parse_obj(spreadsheet)
grid_sheets = Helpers.get_grid_sheets(spreadsheet_metadata)
duplicate_headers_in_sheet = {}
for sheet_name in grid_sheets:
try:
header_row_data = Helpers.get_first_row(client, spreadsheet_id, sheet_name)
if config.get("names_conversion"):
header_row_data = [safe_name_conversion(h) for h in header_row_data]
_, duplicate_headers = Helpers.get_valid_headers_and_duplicates(header_row_data)
if duplicate_headers:
duplicate_headers_in_sheet[sheet_name] = duplicate_headers
except Exception as err:
if str(err).startswith("Expected data for exactly one row for sheet"):
logger.warn(f"Skip empty sheet: {sheet_name}")
else:
logger.error(str(err))
return AirbyteConnectionStatus(
status=Status.FAILED, message=f"Unable to read the schema of sheet {sheet_name}. Error: {str(err)}"
)
if duplicate_headers_in_sheet:
duplicate_headers_error_message = ", ".join(
[
f"[sheet:{sheet_name}, headers:{duplicate_sheet_headers}]"
for sheet_name, duplicate_sheet_headers in duplicate_headers_in_sheet.items()
]
)
return AirbyteConnectionStatus(
status=Status.FAILED,
message="The following duplicate headers were found in the following sheets. Please fix them to continue: "
+ duplicate_headers_error_message,
)
return AirbyteConnectionStatus(status=Status.SUCCEEDED)
    def discover(self, logger: logging.Logger, config: Mapping[str, Any]) -> AirbyteCatalog:
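        """
        Builds an AirbyteCatalog with one stream per grid sheet in the spreadsheet, using the
        first row of each sheet as the stream's field names. Empty sheets are skipped.
        """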
client = GoogleSheetsClient(self.get_credentials(config))
spreadsheet_id = Helpers.get_spreadsheet_id(config["spreadsheet_id"])
try:
logger.info(f"Running discovery on sheet {spreadsheet_id}")
spreadsheet_metadata = Spreadsheet.parse_obj(client.get(spreadsheetId=spreadsheet_id, includeGridData=False))
grid_sheets = Helpers.get_grid_sheets(spreadsheet_metadata)
streams = []
for sheet_name in grid_sheets:
try:
header_row_data = Helpers.get_first_row(client, spreadsheet_id, sheet_name)
if config.get("names_conversion"):
header_row_data = [safe_name_conversion(h) for h in header_row_data]
stream = Helpers.headers_to_airbyte_stream(logger, sheet_name, header_row_data)
streams.append(stream)
except Exception as err:
if str(err).startswith("Expected data for exactly one row for sheet"):
logger.warn(f"Skip empty sheet: {sheet_name}")
else:
logger.error(str(err))
return AirbyteCatalog(streams=streams)
except errors.HttpError as err:
error_description = exception_description_by_status_code(err.resp.status, spreadsheet_id)
config_error_status_codes = [status_codes.NOT_FOUND, status_codes.FORBIDDEN]
if err.resp.status in config_error_status_codes:
message = f"{error_description}. {err.reason}."
raise AirbyteTracedException(
message=message,
internal_message=message,
failure_type=FailureType.config_error,
) from err
raise Exception(f"Could not discover the schema of your spreadsheet. {error_description}. {err.reason}.")
except google_exceptions.GoogleAuthError as err:
message = "Access to the spreadsheet expired or was revoked. Re-authenticate to restore access."
raise AirbyteTracedException(
message=message,
internal_message=message,
failure_type=FailureType.config_error,
) from err
def _read(
self,
logger: logging.Logger,
        config: Mapping[str, Any],
catalog: ConfiguredAirbyteCatalog,
state: Union[List[AirbyteStateMessage], MutableMapping[str, Any]] = None,
) -> Generator[AirbyteMessage, None, None]:
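        """
        Reads each configured sheet in batches of rows, yielding stream status messages,
        RECORD messages for non-empty relevant rows, and a STATE message after each sheet.
        """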
client = GoogleSheetsClient(self.get_credentials(config))
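        # Row batch size used when fetching values; configurable via `batch_size`, defaulting to 200.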
client.Backoff.row_batch_size = config.get("batch_size", 200)
sheet_to_column_name = Helpers.parse_sheet_and_column_names_from_catalog(catalog)
stream_instances = {s.stream.name: s.stream for s in catalog.streams}
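        # Every sheet is synced as a full refresh, but a state message is still emitted after
        # each sheet so that progress is checkpointed per stream.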
state_manager = ConnectorStateManager(stream_instance_map=stream_instances, state=state or {})
spreadsheet_id = Helpers.get_spreadsheet_id(config["spreadsheet_id"])
logger.info(f"Starting syncing spreadsheet {spreadsheet_id}")
        # For each sheet in the spreadsheet, fetch rows in batches and emit them,
        # stopping once a blank row (an empty batch) is encountered.
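        # Map each requested sheet to {column index: header name}, restricted to the columns
        # present in the configured catalog (with names_conversion applied when enabled).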
sheet_to_column_index_to_name = Helpers.get_available_sheets_to_column_index_to_name(
client, spreadsheet_id, sheet_to_column_name, config.get("names_conversion")
)
sheet_row_counts = Helpers.get_sheet_row_count(client, spreadsheet_id)
logger.info(f"Row counts: {sheet_row_counts}")
for sheet in sheet_to_column_index_to_name.keys():
logger.info(f"Syncing sheet {sheet}")
stream = stream_instances.get(sheet)
yield as_airbyte_message(stream, AirbyteStreamStatus.STARTED)
checkpoint_reader = FullRefreshCheckpointReader([])
_ = checkpoint_reader.next()
# We revalidate the sheet here to avoid errors in case the sheet was changed after the sync started
is_valid, reason = Helpers.check_sheet_is_valid(client, spreadsheet_id, sheet)
if not is_valid:
logger.info(f"Skipping syncing sheet {sheet}: {reason}")
yield self._checkpoint_state(checkpoint_reader.get_checkpoint(), state_manager, sheet, None)
yield as_airbyte_message(stream, AirbyteStreamStatus.INCOMPLETE)
continue
column_index_to_name = sheet_to_column_index_to_name[sheet]
row_cursor = 2 # we start syncing past the header row
            # The loop only requires that the first row of the requested interval exists.
            # If the interval runs past the end of the sheet, the API returns just the rows that
            # actually exist, and the loop exits on the next iteration.
while row_cursor <= sheet_row_counts[sheet]:
row_batch = SpreadsheetValues.parse_obj(
client.get_values(
sheet=sheet,
row_cursor=row_cursor,
spreadsheetId=spreadsheet_id,
majorDimension="ROWS",
)
)
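                # Advance the cursor past the batch just requested; the requested row range is
                # inclusive of both endpoints, hence the extra +1.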
row_cursor += client.Backoff.row_batch_size + 1
# there should always be one range since we requested only one
value_ranges = row_batch.valueRanges[0]
if not value_ranges.values:
break
row_values = value_ranges.values
if len(row_values) == 0:
break
yield as_airbyte_message(stream, AirbyteStreamStatus.RUNNING)
for row in row_values:
if not Helpers.is_row_empty(row) and Helpers.row_contains_relevant_data(row, column_index_to_name.keys()):
yield AirbyteMessage(type=Type.RECORD, record=Helpers.row_data_to_record_message(sheet, row, column_index_to_name))
yield self._checkpoint_state(checkpoint_reader.get_checkpoint(), state_manager, sheet, None)
yield as_airbyte_message(stream, AirbyteStreamStatus.COMPLETE)
def _checkpoint_state(
self,
stream_state: Mapping[str, Any],
state_manager,
stream_name: str,
stream_namespace: Optional[str],
) -> AirbyteMessage:
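        """
        Updates the state manager with the given stream state and returns the resulting
        STATE message for that stream.
        """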
state_manager.update_state_for_stream(stream_name, stream_namespace, stream_state)
return state_manager.create_state_message(stream_name, stream_namespace)
def read(
self,
logger: logging.Logger,
        config: Mapping[str, Any],
catalog: ConfiguredAirbyteCatalog,
state: Union[List[AirbyteStateMessage], MutableMapping[str, Any]] = None,
) -> Generator[AirbyteMessage, None, None]:
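        """
        Delegates to `_read` and converts Google API HttpErrors into AirbyteTracedExceptions
        with the appropriate failure type (config error for permission issues, transient otherwise).
        """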
spreadsheet_id = Helpers.get_spreadsheet_id(config["spreadsheet_id"])
try:
yield from self._read(logger, config, catalog, state)
except errors.HttpError as e:
error_description = exception_description_by_status_code(e.status_code, spreadsheet_id)
if e.status_code == status_codes.FORBIDDEN:
raise AirbyteTracedException(
message=f"Stopped syncing process. {error_description}",
internal_message=error_description,
failure_type=FailureType.config_error,
) from e
if e.status_code == status_codes.TOO_MANY_REQUESTS:
raise AirbyteTracedException(
message=f"Stopped syncing process due to rate limits. {error_description}",
internal_message=error_description,
failure_type=FailureType.transient_error,
) from e
else:
logger.info(f"{e.status_code}: {e.reason}. {error_description}")
raise AirbyteTracedException(
message=f"Stopped syncing process. {error_description}",
internal_message=error_description,
failure_type=FailureType.transient_error,
) from e
finally:
logger.info(f"Finished syncing spreadsheet {spreadsheet_id}")
@staticmethod
def get_credentials(config):
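        """
        Normalizes the credentials section of the config. A legacy config that provides
        `credentials_json` is mapped to the service-account shape expected by the client;
        otherwise the `credentials` object is returned as-is. For illustration (shape taken
        from the handling below):
            {"credentials_json": "..."}  ->  {"auth_type": "Service", "service_account_info": "..."}
        """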
        # Backwards compatibility with the old-style config, which passed the service account
        # info via `credentials_json`.
if config.get("credentials_json"):
credentials = {"auth_type": "Service", "service_account_info": config.get("credentials_json")}
return credentials
return config.get("credentials")