
Source Google Sheets: removed row_batch_size from spec (#29826)

Author: Daryna Ishchenko
Date: 2023-08-25 22:02:51 +03:00 (committed by GitHub)
parent 3cea2d7d25
commit 45c90dd2b8
9 changed files with 48 additions and 33 deletions

airbyte-integrations/connectors/source-google-sheets/Dockerfile

@@ -36,5 +36,5 @@ COPY source_google_sheets ./source_google_sheets
 ENV AIRBYTE_ENTRYPOINT "python /airbyte/integration_code/main.py"
 ENTRYPOINT ["python", "/airbyte/integration_code/main.py"]
 
-LABEL io.airbyte.version=0.3.6
+LABEL io.airbyte.version=0.3.7
 LABEL io.airbyte.name=airbyte/source-google-sheets

airbyte-integrations/connectors/source-google-sheets/acceptance-test-config.yml

@@ -28,4 +28,6 @@ acceptance_tests:
   spec:
     tests:
       - spec_path: source_google_sheets/spec.yaml
+        backward_compatibility_tests_config:
+          disable_for_version: "0.3.6"

airbyte-integrations/connectors/source-google-sheets/metadata.yaml

@@ -5,7 +5,7 @@ data:
   connectorSubtype: file
   connectorType: source
   definitionId: 71607ba1-c0ac-4799-8049-7f4b90dd50f7
-  dockerImageTag: 0.3.6
+  dockerImageTag: 0.3.7
   dockerRepository: airbyte/source-google-sheets
   githubIssueLabel: source-google-sheets
   icon: google-sheets.svg

airbyte-integrations/connectors/source-google-sheets/source_google_sheets/client.py

@@ -2,7 +2,7 @@
 #
 # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
 #
 
+import logging
 from typing import Dict, List
 
 import backoff
@@ -11,29 +11,44 @@ from requests import codes as status_codes
 
 from .helpers import SCOPES, Helpers
 
-def give_up(error):
-    code = error.resp.status
-    # Stop retrying if it's not a problem with the rate limit or on the server end
-    return not (code == status_codes.TOO_MANY_REQUESTS or 500 <= code < 600)
+logger = logging.getLogger("airbyte")
 
 
 class GoogleSheetsClient:
+    class Backoff:
+        row_batch_size = 200
+
+        @classmethod
+        def increase_row_batch_size(cls, details):
+            if details["exception"].status_code == status_codes.TOO_MANY_REQUESTS and cls.row_batch_size < 1000:
+                cls.row_batch_size = cls.row_batch_size + 10
+                logger.info(f"Increasing number of records fetching due to rate limits. Current value: {cls.row_batch_size}")
+
+        @staticmethod
+        def give_up(error):
+            code = error.resp.status
+            # Stop retrying if it's not a problem with the rate limit or on the server end
+            return not (code == status_codes.TOO_MANY_REQUESTS or 500 <= code < 600)
+
     def __init__(self, credentials: Dict[str, str], scopes: List[str] = SCOPES):
         self.client = Helpers.get_authenticated_sheets_client(credentials, scopes)
 
-    @backoff.on_exception(backoff.expo, errors.HttpError, max_time=120, giveup=give_up)
+    def create_range(self, sheet, row_cursor):
+        range = f"{sheet}!{row_cursor}:{row_cursor + self.Backoff.row_batch_size}"
+        return range
+
+    @backoff.on_exception(backoff.expo, errors.HttpError, max_time=120, giveup=Backoff.give_up, on_backoff=Backoff.increase_row_batch_size)
     def get(self, **kwargs):
         return self.client.get(**kwargs).execute()
 
-    @backoff.on_exception(backoff.expo, errors.HttpError, max_time=120, giveup=give_up)
+    @backoff.on_exception(backoff.expo, errors.HttpError, max_time=120, giveup=Backoff.give_up, on_backoff=Backoff.increase_row_batch_size)
     def create(self, **kwargs):
         return self.client.create(**kwargs).execute()
 
-    @backoff.on_exception(backoff.expo, errors.HttpError, max_time=120, giveup=give_up)
+    @backoff.on_exception(backoff.expo, errors.HttpError, max_time=120, giveup=Backoff.give_up, on_backoff=Backoff.increase_row_batch_size)
     def get_values(self, **kwargs):
         return self.client.values().batchGet(**kwargs).execute()
 
-    @backoff.on_exception(backoff.expo, errors.HttpError, max_time=120, giveup=give_up)
+    @backoff.on_exception(backoff.expo, errors.HttpError, max_time=120, giveup=Backoff.give_up, on_backoff=Backoff.increase_row_batch_size)
     def update_values(self, **kwargs):
         return self.client.values().batchUpdate(**kwargs).execute()

airbyte-integrations/connectors/source-google-sheets/source_google_sheets/source.py

@@ -30,8 +30,6 @@ from .models.spreadsheet import Spreadsheet
 from .models.spreadsheet_values import SpreadsheetValues
 from .utils import exception_description_by_status_code, safe_name_conversion
 
-# set default batch read size
-ROW_BATCH_SIZE = 200
 # override default socket timeout to be 10 mins instead of 60 sec.
 # on behalf of https://github.com/airbytehq/oncall/issues/242
 DEFAULT_SOCKET_TIMEOUT: int = 600
@@ -154,7 +152,6 @@ class SourceGoogleSheets(Source):
         sheet_to_column_name = Helpers.parse_sheet_and_column_names_from_catalog(catalog)
         spreadsheet_id = Helpers.get_spreadsheet_id(config["spreadsheet_id"])
-        row_batch_size = config.get("row_batch_size", ROW_BATCH_SIZE)
 
         logger.info(f"Starting syncing spreadsheet {spreadsheet_id}")
         # For each sheet in the spreadsheet, get a batch of rows, and as long as there hasn't been
         # a blank row, emit the row batch
@@ -174,13 +171,13 @@
             # if the last row of the interval goes outside the sheet - this is normal, we will return
             # only the real data of the sheet and in the next iteration we will loop out.
             while row_cursor <= sheet_row_counts[sheet]:
-                range = f"{sheet}!{row_cursor}:{row_cursor + row_batch_size}"
+                range = client.create_range(sheet, row_cursor)
                 logger.info(f"Fetching range {range}")
                 row_batch = SpreadsheetValues.parse_obj(
                     client.get_values(spreadsheetId=spreadsheet_id, ranges=range, majorDimension="ROWS")
                 )
-                row_cursor += row_batch_size + 1
+                row_cursor += client.Backoff.row_batch_size + 1
 
                 # there should always be one range since we requested only one
                 value_ranges = row_batch.valueRanges[0]
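Note how the loop now reads the batch size from the class on every pass: `create_range` builds the range from the current `Backoff.row_batch_size`, and the cursor advance re-reads it too, so a batch size grown during a rate-limited `get_values` call takes effect on the very next iteration. A toy sketch of just that loop dynamics, with the API and backoff machinery stripped out and a growth event simulated inline:

class Backoff:
    row_batch_size = 200  # grows at runtime when requests get rate limited


def create_range(sheet: str, row_cursor: int) -> str:
    # Same shape as GoogleSheetsClient.create_range in the diff above.
    return f"{sheet}!{row_cursor}:{row_cursor + Backoff.row_batch_size}"


row_cursor = 1
sheet_row_count = 700
while row_cursor <= sheet_row_count:
    print("fetching", create_range("Sheet1", row_cursor))
    if row_cursor == 1:
        Backoff.row_batch_size += 10  # pretend the first fetch was rate limited
    row_cursor += Backoff.row_batch_size + 1  # re-reads the current size

# Output:
# fetching Sheet1!1:201
# fetching Sheet1!212:422   <- ranges widen after the simulated rate limit
# fetching Sheet1!423:633
# fetching Sheet1!634:844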

airbyte-integrations/connectors/source-google-sheets/source_google_sheets/spec.yaml

@@ -15,15 +15,6 @@ connectionSpecification:
         Enter the link to the Google spreadsheet you want to sync. To copy the link, click the 'Share' button in the top-right corner of the spreadsheet, then click 'Copy link'.
       examples:
         - https://docs.google.com/spreadsheets/d/1hLd9Qqti3UyLXZB2aFfUWDT7BG-arw2xy4HR3D-dwUb/edit
-    row_batch_size:
-      type: integer
-      title: Row Batch Size
-      description: The number of rows fetched when making a Google Sheet API call. Defaults to 200. You can increase this value to avoid rate limits if your data is particularly wide.
-      default: 200
-      examples:
-        - 50
-        - 100
-        - 200
     names_conversion:
       type: boolean
       title: Convert Column Names to SQL-Compliant Format

airbyte-integrations/connectors/source-google-sheets/unit_tests/test_client.py

@@ -4,7 +4,7 @@
 import pytest
 import requests
-from source_google_sheets.client import give_up
+from source_google_sheets.client import GoogleSheetsClient
 
 
 @pytest.mark.parametrize(
@@ -13,7 +13,18 @@ from source_google_sheets.client import give_up
         (429, False), (500, False), (404, True)
     ]
 )
-def test_give_up(status, need_give_up, mocker):
+def test_backoff_give_up(status, need_give_up, mocker):
     e = requests.HTTPError('error')
     e.resp = mocker.Mock(status=status)
-    assert need_give_up is give_up(e)
+    assert need_give_up is GoogleSheetsClient.Backoff.give_up(e)
+
+
+def test_backoff_increase_row_batch_size(mocker):
+    assert GoogleSheetsClient.Backoff.row_batch_size == 200
+    e = requests.HTTPError('error')
+    e.status_code = 429
+    GoogleSheetsClient.Backoff.increase_row_batch_size({"exception": e})
+    assert GoogleSheetsClient.Backoff.row_batch_size == 210
+    GoogleSheetsClient.Backoff.row_batch_size = 1000
+    GoogleSheetsClient.Backoff.increase_row_batch_size({"exception": e})
+    assert GoogleSheetsClient.Backoff.row_batch_size == 1000

docs/integrations/sources/google-sheets.inapp.md

@@ -36,8 +36,7 @@ For detailed instructions on how to generate a service account key or OAuth cred
 3. For **Spreadsheet Link**, enter the link to the Google spreadsheet. To get the link, go to the Google spreadsheet you want to sync, click **Share** in the top right corner, and click **Copy Link**.
 4. (Optional) You may enable the option to **Convert Column Names to SQL-Compliant Format**. Enabling this option will allow the connector to convert column names to a standardized, SQL-friendly format. For example, a column name of `Café Earnings 2022` will be converted to `cafe_earnings_2022`. We recommend enabling this option if your target destination is SQL-based (ie Postgres, MySQL). Set to false by default.
-5. (Optional) For **Row Batch Size**, you may specify the number of records you want to fetch per request to the Google API. By adjusting this value, you can balance the efficiency of the data retrieval process with [Google's request quotas](#performance-consideration). The default value of 200 should suffice for most use cases.
-6. Click **Set up source** and wait for the tests to complete.
+5. Click **Set up source** and wait for the tests to complete.
### Output schema

docs/integrations/sources/google-sheets.md

@@ -95,8 +95,7 @@ To set up Google Sheets as a source in Airbyte Cloud:
 6. For **Spreadsheet Link**, enter the link to the Google spreadsheet. To get the link, go to the Google spreadsheet you want to sync, click **Share** in the top right corner, and click **Copy Link**.
 7. (Optional) You may enable the option to **Convert Column Names to SQL-Compliant Format**. Enabling this option will allow the connector to convert column names to a standardized, SQL-friendly format. For example, a column name of `Café Earnings 2022` will be converted to `cafe_earnings_2022`. We recommend enabling this option if your target destination is SQL-based (ie Postgres, MySQL). Set to false by default.
-8. (Optional) For **Row Batch Size**, you may specify the number of records you want to fetch per request to the Google API. By adjusting this value, you can balance the efficiency of the data retrieval process with [Google's request quotas](#performance-consideration). The default value of 200 should suffice for most use cases.
-9. Click **Set up source** and wait for the tests to complete.
+8. Click **Set up source** and wait for the tests to complete.
 
 ### Output schema
@@ -133,6 +132,7 @@ Airbyte batches requests to the API in order to efficiently pull data and respec
 | Version | Date       | Pull Request                                             | Subject                                                                         |
 |---------|------------|----------------------------------------------------------|---------------------------------------------------------------------------------|
+| 0.3.7   | 2023-08-25 | [29826](https://github.com/airbytehq/airbyte/pull/29826) | Remove row batch size from spec, add auto increase this value when rate limits |
 | 0.3.6   | 2023-08-16 | [29491](https://github.com/airbytehq/airbyte/pull/29491) | Update to latest CDK                                                            |
 | 0.3.5   | 2023-08-16 | [29427](https://github.com/airbytehq/airbyte/pull/29427) | Add stop reading in case of 429 error                                           |
 | 0.3.4   | 2023-05-15 | [29453](https://github.com/airbytehq/airbyte/pull/29453) | Update spec descriptions                                                        |