* skip non grid sheets * added PR number to docs * fixed Dockerfile * fix doc * fix doc * updated seed file * updated docs * auto-bump connector version --------- Co-authored-by: Octavia Squidington III <octavia-squidington-iii@users.noreply.github.com>
221 lines
9.7 KiB
Python
221 lines
9.7 KiB
Python
#
|
|
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
|
|
#
|
|
|
|
import json
|
|
import logging
|
|
import re
|
|
from collections import defaultdict
|
|
from datetime import datetime
|
|
from typing import Dict, FrozenSet, Iterable, List
|
|
|
|
from airbyte_cdk.logger import AirbyteLogger
|
|
from airbyte_cdk.models.airbyte_protocol import AirbyteRecordMessage, AirbyteStream, ConfiguredAirbyteCatalog, SyncMode
|
|
from google.oauth2 import credentials as client_account
|
|
from google.oauth2 import service_account
|
|
from googleapiclient import discovery
|
|
|
|
from .models.spreadsheet import RowData, Spreadsheet
|
|
from .utils import safe_name_conversion
|
|
|
|
# Read-only OAuth scopes requested for the Sheets and Drive APIs; used as the
# default `scopes` argument of the client/credential helpers below.
SCOPES = [
    "https://www.googleapis.com/auth/spreadsheets.readonly",
    "https://www.googleapis.com/auth/drive.readonly",
]

# Module-level logger registered under the "airbyte" name.
logger = logging.getLogger("airbyte")
|
|
|
|
|
|
class Helpers(object):
|
|
@staticmethod
|
|
def get_authenticated_sheets_client(credentials: Dict[str, str], scopes: List[str] = SCOPES) -> discovery.Resource:
    """
    Build a Sheets API v4 service with the given credentials and return its ``spreadsheets()`` resource.

    :param credentials: connector credentials, forwarded to Helpers.get_authenticated_google_credentials
    :param scopes: OAuth scopes to request (defaults to SCOPES)
    """
    sheets_service = discovery.build(
        "sheets",
        "v4",
        credentials=Helpers.get_authenticated_google_credentials(credentials, scopes),
    )
    return sheets_service.spreadsheets()
|
|
|
|
@staticmethod
|
|
def get_authenticated_drive_client(credentials: Dict[str, str], scopes: List[str] = SCOPES) -> discovery.Resource:
    """
    Build and return a Drive API v3 service authenticated with the given credentials.

    :param credentials: connector credentials, forwarded to Helpers.get_authenticated_google_credentials
    :param scopes: OAuth scopes to request (defaults to SCOPES)
    """
    google_credentials = Helpers.get_authenticated_google_credentials(credentials, scopes)
    drive_service = discovery.build("drive", "v3", credentials=google_credentials)
    return drive_service
|
|
|
|
@staticmethod
|
|
def get_authenticated_google_credentials(credentials: Dict[str, str], scopes: List[str] = SCOPES):
    """
    Turn the connector's credentials mapping into a google-auth credentials object.

    The mapping must carry an "auth_type" key:
      * "Service": "service_account_info" holds the service-account key as a JSON string; the
        resulting credentials are restricted to `scopes`.
      * "Client": the remaining keys are handed to ``Credentials.from_authorized_user_info``.

    The caller's mapping is left untouched.

    :param credentials: connector credentials mapping
    :param scopes: OAuth scopes applied to service-account credentials (defaults to SCOPES)
    :raises KeyError: if "auth_type" (or, for "Service", "service_account_info") is missing
    :raises ValueError: if "auth_type" is neither "Service" nor "Client"
    """
    # Work on a shallow copy: popping from the caller's dict would make a second call with the
    # same config fail with KeyError("auth_type").
    remaining = dict(credentials)
    auth_type = remaining.pop("auth_type")

    if auth_type == "Service":
        service_account_info = json.loads(remaining["service_account_info"])
        return service_account.Credentials.from_service_account_info(service_account_info, scopes=scopes)
    if auth_type == "Client":
        return client_account.Credentials.from_authorized_user_info(info=remaining)

    # Previously an unknown type fell through and returned None, which only surfaced later
    # as an unrelated-looking failure when the API client was built.
    raise ValueError(f"Unsupported auth_type {auth_type!r}: expected 'Service' or 'Client'")
|
|
|
|
@staticmethod
|
|
def headers_to_airbyte_stream(logger: AirbyteLogger, sheet_name: str, header_row_values: List[str]) -> AirbyteStream:
    """
    Build the AirbyteStream describing a sheet from its header row.

    Headers are assumed to be contiguous: the first cell without a value marks the end of the
    headers, so a first row of "One | Two | | Three" yields the headers ["One", "Two"].
    Headers that occur more than once are left out of the schema and reported with a warning.
    Every column is typed as a string and the stream only supports full refresh.
    """
    valid_headers, repeated_headers = Helpers.get_valid_headers_and_duplicates(header_row_values)
    if repeated_headers:
        logger.warn(f"Duplicate headers found in {sheet_name}. Ignoring them :{repeated_headers}")

    # For simplicity, the type of every cell is a string
    column_properties = {}
    for header in valid_headers:
        column_properties[header] = {"type": "string"}

    stream_schema = {
        "$schema": "http://json-schema.org/draft-07/schema#",
        "type": "object",
        "properties": column_properties,
    }
    return AirbyteStream(name=sheet_name, json_schema=stream_schema, supported_sync_modes=[SyncMode.full_refresh])
|
|
|
|
@staticmethod
|
|
def get_valid_headers_and_duplicates(header_row_values: List[str]) -> (List[str], List[str]):
    """
    Split a header row into usable headers and headers that occur more than once.

    Scanning stops at the first empty (falsy) cell. Any header seen more than once before that
    point is removed from the usable headers entirely and reported as a duplicate.

    :param header_row_values: cell values of the header row, left to right
    :return: (headers in sheet order that occur exactly once, duplicated headers in the order
             their first repetition was encountered)
    """
    seen = set()  # O(1) membership instead of scanning the list for every cell
    ordered_headers = []
    duplicates = {}  # dict used as an insertion-ordered set so the result is deterministic

    for cell_value in header_row_values:
        if not cell_value:
            # The first blank cell ends the contiguous header block.
            break
        if cell_value in seen:
            duplicates[cell_value] = None
        else:
            seen.add(cell_value)
            ordered_headers.append(cell_value)

    # Drop every occurrence of a duplicated header, including its first one.
    fields = [header for header in ordered_headers if header not in duplicates]
    return fields, list(duplicates)
|
|
|
|
@staticmethod
|
|
def get_formatted_row_values(row_data: RowData) -> List[str]:
    """
    Collect the ``formattedValue`` of every cell in the row, in order.

    A formatted value is the final value a user sees in the spreadsheet: either the raw string
    the user typed or the result of a sheets function call.
    """
    formatted_values = []
    for cell in row_data.values:
        formatted_values.append(cell.formattedValue)
    return formatted_values
|
|
|
|
@staticmethod
|
|
def get_first_row(client, spreadsheet_id: str, sheet_name: str) -> List[str]:
    """
    Fetch the formatted values of the first row of `sheet_name`.

    Requests the range ``<sheet_name>!1:1`` with grid data and validates that the response holds
    exactly one sheet, one range and at most one row.

    :return: the formatted cell values of row 1, or an empty list when the sheet has no row data
    :raises Exception: when the response does not contain exactly one sheet, range or row
    """
    response = client.get(spreadsheetId=spreadsheet_id, includeGridData=True, ranges=f"{sheet_name}!1:1")
    sheets = Spreadsheet.parse_obj(response).sheets

    # There is only one sheet since we are specifying the sheet in the requested ranges.
    if len(sheets) != 1:
        raise Exception(f"Unexpected return result: Sheet {sheet_name} was expected to contain data on exactly 1 sheet. ")

    ranges = sheets[0].data
    if len(ranges) != 1:
        raise Exception(f"Expected data for exactly one range for sheet {sheet_name}")

    rows = ranges[0].rowData
    if not rows:
        # the sheet is empty
        logger.warning(f"The sheet {sheet_name} (ID {spreadsheet_id}) is empty!")
        return []

    if len(rows) != 1:
        raise Exception(f"Expected data for exactly one row for sheet {sheet_name}")

    return Helpers.get_formatted_row_values(rows[0])
|
|
|
|
@staticmethod
|
|
def parse_sheet_and_column_names_from_catalog(catalog: ConfiguredAirbyteCatalog) -> Dict[str, FrozenSet[str]]:
    """
    Map each configured stream's name to the frozen set of property names in its JSON schema.

    :param catalog: configured catalog whose streams are read
    :return: {stream name: frozenset of the keys under json_schema["properties"]}
    """
    return {
        configured.stream.name: frozenset(configured.stream.json_schema["properties"].keys())
        for configured in catalog.streams
    }
|
|
|
|
@staticmethod
|
|
def row_data_to_record_message(sheet_name: str, cell_values: List[str], column_index_to_name: Dict[int, str]) -> AirbyteRecordMessage:
    """
    Convert one row of cell values into an AirbyteRecordMessage for the stream `sheet_name`.

    Only the columns listed in `column_index_to_name` are considered; cells that are missing,
    None, or blank after stripping whitespace are omitted from the record data.

    :param sheet_name: stream name put on the record
    :param cell_values: the row's cell values, indexed by column position
    :param column_index_to_name: column position -> field name for the columns to emit
    :return: record whose `emitted_at` is the current time in epoch milliseconds
    """
    data = {}
    row_length = len(cell_values)
    for column_index in sorted(column_index_to_name):
        if column_index >= row_length:
            # Indices are visited in ascending order, so every remaining one is out of range too.
            break

        cell_value = cell_values[column_index]
        # NOTE(review): a cell without a formatted value may arrive as None rather than "";
        # treat it as blank instead of failing on .strip() - confirm against the RowData model.
        if cell_value is not None and cell_value.strip() != "":
            data[column_index_to_name[column_index]] = cell_value

    # Scale to milliseconds *before* truncating; int(ts) * 1000 discarded the sub-second part.
    emitted_at_ms = int(datetime.now().timestamp() * 1000)
    return AirbyteRecordMessage(stream=sheet_name, data=data, emitted_at=emitted_at_ms)
|
|
|
|
@staticmethod
|
|
def get_available_sheets_to_column_index_to_name(
    client, spreadsheet_id: str, requested_sheets_and_columns: Dict[str, FrozenSet[str]], names_conversion: bool = False
) -> Dict[str, Dict[int, str]]:
    """
    For every requested sheet that exists in the spreadsheet, locate the requested columns in its first row.

    :param client: object used to query the spreadsheet
    :param spreadsheet_id: id of the spreadsheet to inspect
    :param requested_sheets_and_columns: sheet name -> set of wanted column names
    :param names_conversion: when True, header values go through safe_name_conversion before matching
    :return: defaultdict of sheet name -> {column position in the first row: header value};
             a sheet only gets an entry when at least one of its headers matched
    """
    existing_sheets = Helpers.get_sheets_in_spreadsheet(client, spreadsheet_id)
    logger.info(f"Available sheets: {existing_sheets}")

    index_to_name_by_sheet = defaultdict(dict)
    for sheet, columns in requested_sheets_and_columns.items():
        if sheet not in existing_sheets:
            continue

        headers = Helpers.get_first_row(client, spreadsheet_id, sheet)
        if names_conversion:
            headers = [safe_name_conversion(header) for header in headers]

        # Find the column index of each header value
        for position, header in enumerate(headers):
            if header in columns:
                index_to_name_by_sheet[sheet][position] = header

    return index_to_name_by_sheet
|
|
|
|
@staticmethod
|
|
def get_sheets_in_spreadsheet(client, spreadsheet_id: str) -> List[str]:
    """
    Return the title of every sheet in the spreadsheet, in the order the metadata lists them.

    The metadata is requested without grid data.
    """
    response = client.get(spreadsheetId=spreadsheet_id, includeGridData=False)
    titles = []
    for sheet in Spreadsheet.parse_obj(response).sheets:
        titles.append(sheet.properties.title)
    return titles
|
|
|
|
@staticmethod
|
|
def get_sheet_row_count(client, spreadsheet_id: str) -> Dict[str, int]:
    """
    Map each sheet title to the ``rowCount`` found in its grid properties.

    Sheets whose properties carry no ``gridProperties`` attribute are left out.
    """
    response = client.get(spreadsheetId=spreadsheet_id, includeGridData=False)
    metadata = Spreadsheet.parse_obj(response)

    # filter out sheets without gridProperties (like in diagram sheets)
    sheets_with_grid = []
    for sheet in metadata.sheets:
        if hasattr(sheet.properties, "gridProperties"):
            sheets_with_grid.append(sheet)

    row_counts = {}
    for sheet in sheets_with_grid:
        row_counts[sheet.properties.title] = sheet.properties.gridProperties["rowCount"]
    return row_counts
|
|
|
|
@staticmethod
|
|
def get_grid_sheets(spreadsheet_metadata) -> List[str]:
    """Return the titles of grid sheets only, skipping non-grid sheets (e.g. image/diagram-only sheets).

    A sheet counts as a grid sheet when its properties expose both ``gridProperties`` and
    ``sheetType`` and the sheet type equals "GRID". Skipped titles are reported in one WARN log.

    https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/sheets#sheetproperties
    """
    kept_titles = []
    skipped_titles = []
    for sheet in spreadsheet_metadata.sheets:
        title = sheet.properties.title
        is_grid = (
            hasattr(sheet.properties, "gridProperties")
            and hasattr(sheet.properties, "sheetType")
            and sheet.properties.sheetType == "GRID"
        )
        target = kept_titles if is_grid else skipped_titles
        target.append(title)

    if skipped_titles:
        AirbyteLogger().log("WARN", "Skip non-grid sheets: " + ", ".join(skipped_titles))

    return kept_titles
|
|
|
|
@staticmethod
|
|
def is_row_empty(cell_values: List[str]) -> bool:
    """
    Tell whether a row holds no data.

    :param cell_values: the row's cell values
    :return: True when every cell is None or blank after stripping whitespace (an empty row
             list is also empty), False as soon as one cell carries content
    """
    # NOTE(review): a cell without a formatted value may arrive as None rather than "";
    # count it as blank instead of failing on .strip() - confirm against the cell model.
    return all(cell is None or cell.strip() == "" for cell in cell_values)
|
|
|
|
@staticmethod
|
|
def row_contains_relevant_data(cell_values: List[str], relevant_indices: Iterable[int]) -> bool:
    """
    Tell whether the row has content in at least one of the columns of interest.

    :param cell_values: the row's cell values, indexed by column position
    :param relevant_indices: column positions to inspect; positions past the end of the row are ignored
    :return: True if any inspected cell is neither None nor blank after stripping whitespace
    """
    row_length = len(cell_values)
    for idx in relevant_indices:
        if idx >= row_length:
            # Short rows simply have no cell at this position.
            continue
        cell = cell_values[idx]
        # NOTE(review): a cell without a formatted value may arrive as None rather than "";
        # count it as blank instead of failing on .strip() - confirm against the cell model.
        if cell is not None and cell.strip() != "":
            return True
    return False
|
|
|
|
@staticmethod
|
|
def get_spreadsheet_id(id_or_url: str) -> str:
    """
    Extract the spreadsheet id from a spreadsheet URL, or pass a bare id through unchanged.

    A value starting with ``http://`` or ``https://`` is treated as a URL; the id is taken to be
    the first path segment made of at least 20 word characters or dashes.

    :param id_or_url: a spreadsheet id or a URL containing one
    :return: the extracted id; the input itself when it is not a URL or when no id-like
             segment is found (so the function always returns a string, never None)
    """
    if not re.match(r"https?://", id_or_url):
        # Not a URL: assume the caller already passed the id.
        return id_or_url

    id_match = re.search(r"/([-\w]{20,})", id_or_url)
    if id_match is not None:
        return id_match.group(1)

    # A URL without an id-like segment used to fall through and return None; hand the value
    # back instead so the failure surfaces where the spreadsheet is actually requested.
    return id_or_url
|