#
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
#


import json
import socket
from typing import Any, Generator, List, MutableMapping, Union

from airbyte_cdk.logger import AirbyteLogger
from airbyte_cdk.models import FailureType
from airbyte_cdk.models.airbyte_protocol import (
    AirbyteCatalog,
    AirbyteConnectionStatus,
    AirbyteMessage,
    AirbyteStateMessage,
    ConfiguredAirbyteCatalog,
    Status,
    Type,
)
from airbyte_cdk.sources.source import Source
from airbyte_cdk.utils import AirbyteTracedException
from apiclient import errors
from google.auth import exceptions as google_exceptions
from requests.status_codes import codes as status_codes

from .client import GoogleSheetsClient
from .helpers import Helpers
from .models.spreadsheet import Spreadsheet
from .models.spreadsheet_values import SpreadsheetValues
from .utils import safe_name_conversion

# set default batch read size
ROW_BATCH_SIZE = 200
# override default socket timeout to be 10 mins instead of 60 sec.
# on behalf of https://github.com/airbytehq/oncall/issues/242
DEFAULT_SOCKET_TIMEOUT: int = 600
socket.setdefaulttimeout(DEFAULT_SOCKET_TIMEOUT)

# Prefix of the error text that `check` and `discover` treat as "this sheet is empty, skip it".
_EMPTY_SHEET_ERROR_PREFIX = "Expected data for exactly one row for sheet"


class SourceGoogleSheets(Source):
    """
    Spreadsheets API Reference: https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets
    """

    @staticmethod
    def _get_header_row(client: GoogleSheetsClient, spreadsheet_id: str, sheet_name: str, config: json):
        """Fetch the first row of `sheet_name`, applying `safe_name_conversion` when `names_conversion` is set."""
        header_row_data = Helpers.get_first_row(client, spreadsheet_id, sheet_name)
        if config.get("names_conversion"):
            header_row_data = [safe_name_conversion(h) for h in header_row_data]
        return header_row_data

    def check(self, logger: AirbyteLogger, config: json) -> AirbyteConnectionStatus:
        """
        Verify that the configured spreadsheet is reachable with the configured credentials
        and that no grid sheet has duplicate headers.

        :param logger: logger used for warnings (skipped empty sheets) and errors
        :param config: mapping-like connector config; reads "spreadsheet_id", "names_conversion"
            and the credential keys consumed by `get_credentials`
        :return: SUCCEEDED, or FAILED with a message when the client cannot be built, a sheet's
            header row cannot be read, or duplicate headers are found
        :raises AirbyteTracedException: (config_error) when fetching the spreadsheet raises an
            HttpError or a GoogleAuthError
        """
        try:
            client = GoogleSheetsClient(self.get_credentials(config))
        except Exception as e:
            return AirbyteConnectionStatus(status=Status.FAILED, message=f"Please use valid credentials json file. Error: {e}")

        spreadsheet_id = Helpers.get_spreadsheet_id(config["spreadsheet_id"])

        try:
            spreadsheet = client.get(spreadsheetId=spreadsheet_id, includeGridData=False)
        except errors.HttpError as err:
            # Give a clearer message if it's a common error like 404; otherwise surface the
            # underlying error text so the message is never just the bare "Config error: " prefix.
            if err.resp.status == status_codes.NOT_FOUND:
                reason = "The spreadsheet link is not valid. Enter the URL of the Google spreadsheet you want to sync."
            else:
                reason = str(err)
            message = f"Config error: {reason}"
            raise AirbyteTracedException(
                message=message,
                internal_message=message,
                failure_type=FailureType.config_error,
            ) from err
        except google_exceptions.GoogleAuthError as err:
            message = "Access to the spreadsheet expired or was revoked. Re-authenticate to restore access."
            raise AirbyteTracedException(
                message=message,
                internal_message=message,
                failure_type=FailureType.config_error,
            ) from err

        # Check for duplicate headers
        spreadsheet_metadata = Spreadsheet.parse_obj(spreadsheet)
        grid_sheets = Helpers.get_grid_sheets(spreadsheet_metadata)

        duplicate_headers_in_sheet = {}
        for sheet_name in grid_sheets:
            try:
                header_row_data = self._get_header_row(client, spreadsheet_id, sheet_name, config)
                _, duplicate_headers = Helpers.get_valid_headers_and_duplicates(header_row_data)
                if duplicate_headers:
                    duplicate_headers_in_sheet[sheet_name] = duplicate_headers
            except Exception as err:
                if str(err).startswith(_EMPTY_SHEET_ERROR_PREFIX):
                    logger.warn(f"Skip empty sheet: {sheet_name}")
                else:
                    logger.error(str(err))
                    return AirbyteConnectionStatus(
                        status=Status.FAILED, message=f"Unable to read the schema of sheet {sheet_name}. Error: {str(err)}"
                    )

        if duplicate_headers_in_sheet:
            duplicate_headers_error_message = ", ".join(
                f"[sheet:{sheet_name}, headers:{duplicate_sheet_headers}]"
                for sheet_name, duplicate_sheet_headers in duplicate_headers_in_sheet.items()
            )
            return AirbyteConnectionStatus(
                status=Status.FAILED,
                message="The following duplicate headers were found in the following sheets. Please fix them to continue: "
                + duplicate_headers_error_message,
            )

        return AirbyteConnectionStatus(status=Status.SUCCEEDED)

    def discover(self, logger: AirbyteLogger, config: json) -> AirbyteCatalog:
        """
        Build a catalog with one stream per grid sheet whose header row can be read.

        Sheets whose header row cannot be read are logged and left out of the catalog
        (empty sheets as a warning, anything else as an error).

        :param logger: logger used for progress, warnings and errors
        :param config: mapping-like connector config; reads "spreadsheet_id" and "names_conversion"
        :return: catalog of the discovered streams
        :raises Exception: when the Sheets API raises an HttpError during discovery
        """
        client = GoogleSheetsClient(self.get_credentials(config))
        spreadsheet_id = Helpers.get_spreadsheet_id(config["spreadsheet_id"])
        try:
            logger.info(f"Running discovery on sheet {spreadsheet_id}")
            spreadsheet_metadata = Spreadsheet.parse_obj(client.get(spreadsheetId=spreadsheet_id, includeGridData=False))
            grid_sheets = Helpers.get_grid_sheets(spreadsheet_metadata)
            streams = []
            for sheet_name in grid_sheets:
                try:
                    header_row_data = self._get_header_row(client, spreadsheet_id, sheet_name, config)
                    stream = Helpers.headers_to_airbyte_stream(logger, sheet_name, header_row_data)
                    streams.append(stream)
                except Exception as err:
                    if str(err).startswith(_EMPTY_SHEET_ERROR_PREFIX):
                        logger.warn(f"Skip empty sheet: {sheet_name}")
                    else:
                        logger.error(str(err))
            return AirbyteCatalog(streams=streams)

        except errors.HttpError as err:
            reason = str(err)
            if err.resp.status == status_codes.NOT_FOUND:
                reason = "Requested spreadsheet was not found."
            # Chain the original error so the HTTP details stay in the traceback.
            raise Exception(f"Could not run discovery: {reason}") from err

    def read(
        self,
        logger: AirbyteLogger,
        config: json,
        catalog: ConfiguredAirbyteCatalog,
        state: Union[List[AirbyteStateMessage], MutableMapping[str, Any]] = None,
    ) -> Generator[AirbyteMessage, None, None]:
        """
        Yield one RECORD message per non-empty, relevant row of every available configured sheet.

        Rows are fetched in batches; reading of a sheet stops at the first batch that
        returns no values or once the cursor passes the sheet's row count.

        :param logger: logger used for progress messages
        :param config: mapping-like connector config; reads "spreadsheet_id", "row_batch_size"
            (defaults to ROW_BATCH_SIZE) and "names_conversion"
        :param catalog: configured catalog naming the sheets and columns to sync
        :param state: accepted for interface compatibility; not used by this method
        :return: generator of AirbyteMessage records
        """
        client = GoogleSheetsClient(self.get_credentials(config))

        sheet_to_column_name = Helpers.parse_sheet_and_column_names_from_catalog(catalog)
        spreadsheet_id = Helpers.get_spreadsheet_id(config["spreadsheet_id"])
        row_batch_size = config.get("row_batch_size", ROW_BATCH_SIZE)
        logger.info(f"Starting syncing spreadsheet {spreadsheet_id}")
        # For each sheet in the spreadsheet, get a batch of rows, and as long as there hasn't been
        # a blank row, emit the row batch
        sheet_to_column_index_to_name = Helpers.get_available_sheets_to_column_index_to_name(
            client, spreadsheet_id, sheet_to_column_name, config.get("names_conversion")
        )
        sheet_row_counts = Helpers.get_sheet_row_count(client, spreadsheet_id)
        logger.info(f"Row counts: {sheet_row_counts}")
        for sheet, column_index_to_name in sheet_to_column_index_to_name.items():
            logger.info(f"Syncing sheet {sheet}")
            row_cursor = 2  # we start syncing past the header row
            # For the loop, it is necessary that the initial row exists when we send a request to the API,
            # if the last row of the interval goes outside the sheet - this is normal, we will return
            # only the real data of the sheet and in the next iteration we will loop out.
            while row_cursor <= sheet_row_counts[sheet]:
                batch_range = f"{sheet}!{row_cursor}:{row_cursor + row_batch_size}"
                logger.info(f"Fetching range {batch_range}")
                row_batch = SpreadsheetValues.parse_obj(
                    client.get_values(spreadsheetId=spreadsheet_id, ranges=batch_range, majorDimension="ROWS")
                )

                # The requested range is inclusive on both ends (row_batch_size + 1 rows),
                # so the next batch starts one past its last row.
                row_cursor += row_batch_size + 1
                # there should always be one range since we requested only one
                row_values = row_batch.valueRanges[0].values
                if not row_values:
                    break

                for row in row_values:
                    if not Helpers.is_row_empty(row) and Helpers.row_contains_relevant_data(row, column_index_to_name.keys()):
                        yield AirbyteMessage(type=Type.RECORD, record=Helpers.row_data_to_record_message(sheet, row, column_index_to_name))
        logger.info(f"Finished syncing spreadsheet {spreadsheet_id}")

    @staticmethod
    def get_credentials(config):
        """
        Return the credentials mapping to pass to GoogleSheetsClient.

        A truthy legacy "credentials_json" entry is wrapped as service-account credentials;
        otherwise the value under "credentials" is returned as-is (None when absent).
        """
        # backward compatible with old style config
        credentials_json = config.get("credentials_json")
        if credentials_json:
            return {"auth_type": "Service", "service_account_info": credentials_json}

        return config.get("credentials")