
Source Google Sheets: removed row_batch_size from spec (#29826)

Author: Daryna Ishchenko
Date: 2023-08-25 22:02:51 +03:00 (committed by GitHub)
parent 3cea2d7d25
commit 45c90dd2b8
9 changed files with 48 additions and 33 deletions

airbyte-integrations/connectors/source-google-sheets/Dockerfile

@@ -36,5 +36,5 @@ COPY source_google_sheets ./source_google_sheets
 ENV AIRBYTE_ENTRYPOINT "python /airbyte/integration_code/main.py"
 ENTRYPOINT ["python", "/airbyte/integration_code/main.py"]
 
-LABEL io.airbyte.version=0.3.6
+LABEL io.airbyte.version=0.3.7
 LABEL io.airbyte.name=airbyte/source-google-sheets

airbyte-integrations/connectors/source-google-sheets/acceptance-test-config.yml

@@ -28,4 +28,6 @@ acceptance_tests:
   spec:
     tests:
       - spec_path: source_google_sheets/spec.yaml
+        backward_compatibility_tests_config:
+          disable_for_version: "0.3.6"

airbyte-integrations/connectors/source-google-sheets/metadata.yaml

@@ -5,7 +5,7 @@ data:
   connectorSubtype: file
   connectorType: source
   definitionId: 71607ba1-c0ac-4799-8049-7f4b90dd50f7
-  dockerImageTag: 0.3.6
+  dockerImageTag: 0.3.7
   dockerRepository: airbyte/source-google-sheets
   githubIssueLabel: source-google-sheets
   icon: google-sheets.svg

airbyte-integrations/connectors/source-google-sheets/source_google_sheets/client.py

@@ -2,7 +2,7 @@
 #
 # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
 #
 
+import logging
 from typing import Dict, List
 
 import backoff
@@ -11,29 +11,44 @@ from requests import codes as status_codes
 
 from .helpers import SCOPES, Helpers
 
-def give_up(error):
-    code = error.resp.status
-    # Stop retrying if it's not a problem with the rate limit or on the server end
-    return not (code == status_codes.TOO_MANY_REQUESTS or 500 <= code < 600)
+logger = logging.getLogger("airbyte")
 
 
 class GoogleSheetsClient:
+    class Backoff:
+        row_batch_size = 200
+
+        @classmethod
+        def increase_row_batch_size(cls, details):
+            if details["exception"].status_code == status_codes.TOO_MANY_REQUESTS and cls.row_batch_size < 1000:
+                cls.row_batch_size = cls.row_batch_size + 10
+                logger.info(f"Increasing number of records fetching due to rate limits. Current value: {cls.row_batch_size}")
+
+        @staticmethod
+        def give_up(error):
+            code = error.resp.status
+            # Stop retrying if it's not a problem with the rate limit or on the server end
+            return not (code == status_codes.TOO_MANY_REQUESTS or 500 <= code < 600)
+
     def __init__(self, credentials: Dict[str, str], scopes: List[str] = SCOPES):
         self.client = Helpers.get_authenticated_sheets_client(credentials, scopes)
 
-    @backoff.on_exception(backoff.expo, errors.HttpError, max_time=120, giveup=give_up)
+    def create_range(self, sheet, row_cursor):
+        range = f"{sheet}!{row_cursor}:{row_cursor + self.Backoff.row_batch_size}"
+        return range
+
+    @backoff.on_exception(backoff.expo, errors.HttpError, max_time=120, giveup=Backoff.give_up, on_backoff=Backoff.increase_row_batch_size)
     def get(self, **kwargs):
         return self.client.get(**kwargs).execute()
 
-    @backoff.on_exception(backoff.expo, errors.HttpError, max_time=120, giveup=give_up)
+    @backoff.on_exception(backoff.expo, errors.HttpError, max_time=120, giveup=Backoff.give_up, on_backoff=Backoff.increase_row_batch_size)
     def create(self, **kwargs):
         return self.client.create(**kwargs).execute()
 
-    @backoff.on_exception(backoff.expo, errors.HttpError, max_time=120, giveup=give_up)
+    @backoff.on_exception(backoff.expo, errors.HttpError, max_time=120, giveup=Backoff.give_up, on_backoff=Backoff.increase_row_batch_size)
     def get_values(self, **kwargs):
         return self.client.values().batchGet(**kwargs).execute()
 
-    @backoff.on_exception(backoff.expo, errors.HttpError, max_time=120, giveup=give_up)
+    @backoff.on_exception(backoff.expo, errors.HttpError, max_time=120, giveup=Backoff.give_up, on_backoff=Backoff.increase_row_batch_size)
     def update_values(self, **kwargs):
         return self.client.values().batchUpdate(**kwargs).execute()

airbyte-integrations/connectors/source-google-sheets/source_google_sheets/source.py

@@ -30,8 +30,6 @@ from .models.spreadsheet import Spreadsheet
 from .models.spreadsheet_values import SpreadsheetValues
 from .utils import exception_description_by_status_code, safe_name_conversion
 
-# set default batch read size
-ROW_BATCH_SIZE = 200
 # override default socket timeout to be 10 mins instead of 60 sec.
 # on behalf of https://github.com/airbytehq/oncall/issues/242
 DEFAULT_SOCKET_TIMEOUT: int = 600
@@ -154,7 +152,6 @@ class SourceGoogleSheets(Source):
         sheet_to_column_name = Helpers.parse_sheet_and_column_names_from_catalog(catalog)
         spreadsheet_id = Helpers.get_spreadsheet_id(config["spreadsheet_id"])
-        row_batch_size = config.get("row_batch_size", ROW_BATCH_SIZE)
 
         logger.info(f"Starting syncing spreadsheet {spreadsheet_id}")
         # For each sheet in the spreadsheet, get a batch of rows, and as long as there hasn't been
         # a blank row, emit the row batch
@@ -174,13 +171,13 @@
             # if the last row of the interval goes outside the sheet - this is normal, we will return
             # only the real data of the sheet and in the next iteration we will loop out.
             while row_cursor <= sheet_row_counts[sheet]:
-                range = f"{sheet}!{row_cursor}:{row_cursor + row_batch_size}"
+                range = client.create_range(sheet, row_cursor)
                 logger.info(f"Fetching range {range}")
                 row_batch = SpreadsheetValues.parse_obj(
                     client.get_values(spreadsheetId=spreadsheet_id, ranges=range, majorDimension="ROWS")
                 )
-                row_cursor += row_batch_size + 1
+                row_cursor += client.Backoff.row_batch_size + 1
 
                 # there should always be one range since we requested only one
                 value_ranges = row_batch.valueRanges[0]
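Note how the loop now reads the batch size from the class on every pass: `create_range` builds the range from the current `Backoff.row_batch_size`, and the cursor advance re-reads it too, so a batch size grown during a rate-limited `get_values` call takes effect on the very next iteration. A toy sketch of just that loop dynamics, with the API and backoff machinery stripped out and a growth event simulated inline:

class Backoff:
    row_batch_size = 200  # grows at runtime when requests get rate limited


def create_range(sheet: str, row_cursor: int) -> str:
    # Same shape as GoogleSheetsClient.create_range in the diff above.
    return f"{sheet}!{row_cursor}:{row_cursor + Backoff.row_batch_size}"


row_cursor = 1
sheet_row_count = 700
while row_cursor <= sheet_row_count:
    print("fetching", create_range("Sheet1", row_cursor))
    if row_cursor == 1:
        Backoff.row_batch_size += 10  # pretend the first fetch was rate limited
    row_cursor += Backoff.row_batch_size + 1  # re-reads the current size

# Output:
# fetching Sheet1!1:201
# fetching Sheet1!212:422   <- ranges widen after the simulated rate limit
# fetching Sheet1!423:633
# fetching Sheet1!634:844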

airbyte-integrations/connectors/source-google-sheets/source_google_sheets/spec.yaml

@@ -15,15 +15,6 @@ connectionSpecification:
         Enter the link to the Google spreadsheet you want to sync. To copy the link, click the 'Share' button in the top-right corner of the spreadsheet, then click 'Copy link'.
       examples:
         - https://docs.google.com/spreadsheets/d/1hLd9Qqti3UyLXZB2aFfUWDT7BG-arw2xy4HR3D-dwUb/edit
-    row_batch_size:
-      type: integer
-      title: Row Batch Size
-      description: The number of rows fetched when making a Google Sheet API call. Defaults to 200. You can increase this value to avoid rate limits if your data is particularly wide.
-      default: 200
-      examples:
-        - 50
-        - 100
-        - 200
     names_conversion:
       type: boolean
       title: Convert Column Names to SQL-Compliant Format

airbyte-integrations/connectors/source-google-sheets/unit_tests/test_client.py

@@ -4,7 +4,7 @@
 import pytest
 import requests
-from source_google_sheets.client import give_up
+from source_google_sheets.client import GoogleSheetsClient
 
 
 @pytest.mark.parametrize(
@@ -13,7 +13,18 @@ from source_google_sheets.client import give_up
         (429, False), (500, False), (404, True)
     ]
 )
-def test_give_up(status, need_give_up, mocker):
+def test_backoff_give_up(status, need_give_up, mocker):
     e = requests.HTTPError('error')
     e.resp = mocker.Mock(status=status)
-    assert need_give_up is give_up(e)
+    assert need_give_up is GoogleSheetsClient.Backoff.give_up(e)
+
+
+def test_backoff_increase_row_batch_size(mocker):
+    assert GoogleSheetsClient.Backoff.row_batch_size == 200
+    e = requests.HTTPError('error')
+    e.status_code = 429
+    GoogleSheetsClient.Backoff.increase_row_batch_size({"exception": e})
+    assert GoogleSheetsClient.Backoff.row_batch_size == 210
+    GoogleSheetsClient.Backoff.row_batch_size = 1000
+    GoogleSheetsClient.Backoff.increase_row_batch_size({"exception": e})
+    assert GoogleSheetsClient.Backoff.row_batch_size == 1000

docs/integrations/sources/google-sheets.inapp.md

@@ -36,8 +36,7 @@ For detailed instructions on how to generate a service account key or OAuth cred
 3. For **Spreadsheet Link**, enter the link to the Google spreadsheet. To get the link, go to the Google spreadsheet you want to sync, click **Share** in the top right corner, and click **Copy Link**.
 4. (Optional) You may enable the option to **Convert Column Names to SQL-Compliant Format**. Enabling this option will allow the connector to convert column names to a standardized, SQL-friendly format. For example, a column name of `Café Earnings 2022` will be converted to `cafe_earnings_2022`. We recommend enabling this option if your target destination is SQL-based (ie Postgres, MySQL). Set to false by default.
-5. (Optional) For **Row Batch Size**, you may specify the number of records you want to fetch per request to the Google API. By adjusting this value, you can balance the efficiency of the data retrieval process with [Google's request quotas](#performance-consideration). The default value of 200 should suffice for most use cases.
-6. Click **Set up source** and wait for the tests to complete.
+5. Click **Set up source** and wait for the tests to complete.
### Output schema

docs/integrations/sources/google-sheets.md

@@ -95,8 +95,7 @@ To set up Google Sheets as a source in Airbyte Cloud:
 6. For **Spreadsheet Link**, enter the link to the Google spreadsheet. To get the link, go to the Google spreadsheet you want to sync, click **Share** in the top right corner, and click **Copy Link**.
 7. (Optional) You may enable the option to **Convert Column Names to SQL-Compliant Format**. Enabling this option will allow the connector to convert column names to a standardized, SQL-friendly format. For example, a column name of `Café Earnings 2022` will be converted to `cafe_earnings_2022`. We recommend enabling this option if your target destination is SQL-based (ie Postgres, MySQL). Set to false by default.
-8. (Optional) For **Row Batch Size**, you may specify the number of records you want to fetch per request to the Google API. By adjusting this value, you can balance the efficiency of the data retrieval process with [Google's request quotas](#performance-consideration). The default value of 200 should suffice for most use cases.
-9. Click **Set up source** and wait for the tests to complete.
+8. Click **Set up source** and wait for the tests to complete.
 
 ### Output schema
@@ -133,6 +132,7 @@ Airbyte batches requests to the API in order to efficiently pull data and respec
 | Version | Date       | Pull Request                                             | Subject                                                                         |
 |---------|------------|----------------------------------------------------------|---------------------------------------------------------------------------------|
+| 0.3.7   | 2023-08-25 | [29826](https://github.com/airbytehq/airbyte/pull/29826) | Remove row batch size from spec, add auto increase this value when rate limits |
 | 0.3.6   | 2023-08-16 | [29491](https://github.com/airbytehq/airbyte/pull/29491) | Update to latest CDK                                                            |
 | 0.3.5   | 2023-08-16 | [29427](https://github.com/airbytehq/airbyte/pull/29427) | Add stop reading in case of 429 error                                           |
 | 0.3.4   | 2023-05-15 | [29453](https://github.com/airbytehq/airbyte/pull/29453) | Update spec descriptions                                                        |