1
0
mirror of synced 2026-01-22 19:01:47 -05:00
Files
airbyte/airbyte-integrations/connectors/source-google-sheets/google_sheets_source/helpers.py
Mahdi Dibaiee 44ac470d28 Google Sheets: add row_id to rows and use as primary key (#19215)
* source-google-sheets: add row_id to rows and use as primary key

* Update Dockerfile

* Update google-sheets.md

* Update Dockerfile

* Update google-sheets.md

* auto-bump connector version

Co-authored-by: Vincent Koc <vincentkoc@ieee.org>
Co-authored-by: Octavia Squidington III <octavia-squidington-iii@users.noreply.github.com>
2022-11-14 13:42:00 -05:00

219 lines
9.5 KiB
Python

#
# Copyright (c) 2022 Airbyte, Inc., all rights reserved.
#
import json
import logging
import re
from collections import defaultdict
from datetime import datetime
from typing import Dict, FrozenSet, Iterable, List
from airbyte_cdk.logger import AirbyteLogger
from airbyte_cdk.models.airbyte_protocol import AirbyteRecordMessage, AirbyteStream, ConfiguredAirbyteCatalog, SyncMode
from google.oauth2 import credentials as client_account
from google.oauth2 import service_account
from googleapiclient import discovery
from .models.spreadsheet import RowData, Spreadsheet
SCOPES = ["https://www.googleapis.com/auth/spreadsheets.readonly", "https://www.googleapis.com/auth/drive.readonly"]
logger = logging.getLogger("airbyte")
class Helpers(object):
@staticmethod
def get_authenticated_sheets_client(credentials: Dict[str, str], scopes: List[str] = SCOPES) -> discovery.Resource:
creds = Helpers.get_authenticated_google_credentials(credentials, scopes)
return discovery.build("sheets", "v4", credentials=creds).spreadsheets()
@staticmethod
def get_authenticated_drive_client(credentials: Dict[str, str], scopes: List[str] = SCOPES) -> discovery.Resource:
creds = Helpers.get_authenticated_google_credentials(credentials, scopes)
return discovery.build("drive", "v3", credentials=creds)
@staticmethod
def get_authenticated_google_credentials(credentials: Dict[str, str], scopes: List[str] = SCOPES):
auth_type = credentials.pop("auth_type")
if auth_type == "Service":
return service_account.Credentials.from_service_account_info(json.loads(credentials["service_account_info"]), scopes=scopes)
elif auth_type == "Client":
return client_account.Credentials.from_authorized_user_info(info=credentials)
@staticmethod
def headers_to_airbyte_stream(logger: AirbyteLogger, sheet_name: str, header_row_values: List[str]) -> AirbyteStream:
"""
Parses sheet headers from the provided row. This method assumes that data is contiguous
i.e: every cell contains a value and the first cell which does not contain a value denotes the end
of the headers. For example, if the first row contains "One | Two | | Three" then this method
will parse the headers as ["One", "Two"]. This assumption is made for simplicity and can be modified later.
"""
fields, duplicate_fields = Helpers.get_valid_headers_and_duplicates(header_row_values)
if duplicate_fields:
logger.warn(f"Duplicate headers found in {sheet_name}. Ignoring them :{duplicate_fields}")
props = {field: {"type": "string"} for field in fields}
props["row_id"] = {"type": "integer"}
sheet_json_schema = {
"$schema": "http://json-schema.org/draft-07/schema#",
"type": "object",
"required": ["row_id"],
# For simplicity, the type of every cell is a string
"properties": props,
}
return AirbyteStream(
name=sheet_name,
json_schema=sheet_json_schema,
supported_sync_modes=[SyncMode.full_refresh],
source_defined_primary_key=[["row_id"]],
)
@staticmethod
def get_valid_headers_and_duplicates(header_row_values: List[str]) -> (List[str], List[str]):
fields = []
duplicate_fields = set()
for cell_value in header_row_values:
if cell_value:
if cell_value in fields:
duplicate_fields.add(cell_value)
else:
fields.append(cell_value)
else:
break
# Removing all duplicate fields
if duplicate_fields:
fields = [field for field in fields if field not in duplicate_fields]
return fields, list(duplicate_fields)
@staticmethod
def get_formatted_row_values(row_data: RowData) -> List[str]:
"""
Gets the formatted values of all cell data in this row. A formatted value is the final value a user sees in a spreadsheet. It can be a raw
string input by the user, or the result of a sheets function call.
"""
return [value.formattedValue for value in row_data.values]
@staticmethod
def get_first_row(client, spreadsheet_id: str, sheet_name: str) -> List[str]:
spreadsheet = Spreadsheet.parse_obj(client.get(spreadsheetId=spreadsheet_id, includeGridData=True, ranges=f"{sheet_name}!1:1"))
# There is only one sheet since we are specifying the sheet in the requested ranges.
returned_sheets = spreadsheet.sheets
if len(returned_sheets) != 1:
raise Exception(f"Unexpected return result: Sheet {sheet_name} was expected to contain data on exactly 1 sheet. ")
range_data = returned_sheets[0].data
if len(range_data) != 1:
raise Exception(f"Expected data for exactly one range for sheet {sheet_name}")
all_row_data = range_data[0].rowData
if not all_row_data or len(all_row_data) != 1:
raise Exception(f"Expected data for exactly one row for sheet {sheet_name}")
first_row_data = all_row_data[0]
return Helpers.get_formatted_row_values(first_row_data)
@staticmethod
def parse_sheet_and_column_names_from_catalog(catalog: ConfiguredAirbyteCatalog) -> Dict[str, FrozenSet[str]]:
sheet_to_column_name = {}
for configured_stream in catalog.streams:
stream = configured_stream.stream
sheet_name = stream.name
sheet_to_column_name[sheet_name] = frozenset(stream.json_schema["properties"].keys())
return sheet_to_column_name
@staticmethod
def row_data_to_record_message(
sheet_name: str, row_id: int, cell_values: List[str], column_index_to_name: Dict[int, str]
) -> AirbyteRecordMessage:
data = {"row_id": row_id}
for relevant_index in sorted(column_index_to_name.keys()):
if relevant_index >= len(cell_values):
break
cell_value = cell_values[relevant_index]
if cell_value.strip() != "":
data[column_index_to_name[relevant_index]] = cell_value
return AirbyteRecordMessage(stream=sheet_name, data=data, emitted_at=int(datetime.now().timestamp()) * 1000)
@staticmethod
def get_available_sheets_to_column_index_to_name(
client, spreadsheet_id: str, requested_sheets_and_columns: Dict[str, FrozenSet[str]]
) -> Dict[str, Dict[int, str]]:
available_sheets = Helpers.get_sheets_in_spreadsheet(client, spreadsheet_id)
logger.info(f"Available sheets: {available_sheets}")
available_sheets_to_column_index_to_name = defaultdict(dict)
for sheet, columns in requested_sheets_and_columns.items():
if sheet in available_sheets:
first_row = Helpers.get_first_row(client, spreadsheet_id, sheet)
# Find the column index of each header value
idx = 0
for cell_value in first_row:
if cell_value in columns:
available_sheets_to_column_index_to_name[sheet][idx] = cell_value
idx += 1
return available_sheets_to_column_index_to_name
@staticmethod
def get_sheets_in_spreadsheet(client, spreadsheet_id: str) -> List[str]:
spreadsheet_metadata = Spreadsheet.parse_obj(client.get(spreadsheetId=spreadsheet_id, includeGridData=False))
return [sheet.properties.title for sheet in spreadsheet_metadata.sheets]
@staticmethod
def get_sheet_row_count(client, spreadsheet_id: str) -> Dict[str, int]:
spreadsheet_metadata = Spreadsheet.parse_obj(client.get(spreadsheetId=spreadsheet_id, includeGridData=False))
# filter out sheets without gridProperties (like in diagram sheets)
data_sheets = [sheet for sheet in spreadsheet_metadata.sheets if hasattr(sheet.properties, "gridProperties")]
return {sheet.properties.title: sheet.properties.gridProperties["rowCount"] for sheet in data_sheets}
@staticmethod
def get_grid_sheets(spreadsheet_metadata) -> List[str]:
"""Return grid only diagram, filter out sheets with image/diagram only
https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/sheets#sheetproperties
"""
grid_sheets = []
non_grid_sheets = []
for sheet in spreadsheet_metadata.sheets:
sheet_title = sheet.properties.title
if hasattr(sheet.properties, "gridProperties"):
grid_sheets.append(sheet_title)
else:
non_grid_sheets.append(sheet_title)
if non_grid_sheets:
AirbyteLogger().log("WARN", "Skip non-grid sheets: " + "".join(non_grid_sheets))
return grid_sheets
@staticmethod
def is_row_empty(cell_values: List[str]) -> bool:
for cell in cell_values:
if cell.strip() != "":
return False
return True
@staticmethod
def row_contains_relevant_data(cell_values: List[str], relevant_indices: Iterable[int]) -> bool:
for idx in relevant_indices:
if len(cell_values) > idx and cell_values[idx].strip() != "":
return True
return False
@staticmethod
def get_spreadsheet_id(id_or_url: str) -> str:
if re.match(r"(http://)|(https://)", id_or_url):
# This is a URL
m = re.search(r"(/)([-\w]{40,})([/]?)", id_or_url)
if m is not None and m.group(2):
return m.group(2)
else:
return id_or_url