# # Copyright (c) 2023 Airbyte, Inc., all rights reserved. # import json import logging import pkgutil import time from abc import ABC from datetime import datetime from typing import Any, Dict, Iterable, List, Mapping, MutableMapping, Optional, Tuple, Union import jwt import pendulum import requests from airbyte_cdk.models import SyncMode from airbyte_cdk.sources import AbstractSource from airbyte_cdk.sources.streams import Stream from airbyte_cdk.sources.streams.availability_strategy import AvailabilityStrategy from airbyte_cdk.sources.streams.http import HttpStream from airbyte_cdk.sources.streams.http.auth import Oauth2Authenticator from .custom_reports_validator import CustomReportsValidator DATA_IS_NOT_GOLDEN_MSG = "Google Analytics data is not golden. Future requests may return different data." RESULT_IS_SAMPLED_MSG = ( "Google Analytics data is sampled. Consider using a smaller window_in_days parameter. " "For more info check https://developers.google.com/analytics/devguides/reporting/core/v4/basics#sampling" ) class GoogleAnalyticsV4TypesList(HttpStream): """ Provides functionality to fetch the valid (dimensions, metrics) for the Analytics Reporting API and their data types. """ primary_key = None # Link to query the metadata for available metrics and dimensions. # Those are not provided in the Analytics Reporting API V4. # Column id completely match for v3 and v4. url_base = "https://www.googleapis.com/analytics/v3/metadata/ga/columns" def path(self, **kwargs: Any) -> str: return "" def next_page_token(self, response: requests.Response) -> Optional[Mapping[str, Any]]: """Abstractmethod HTTPStream CDK dependency""" return None def parse_response(self, response: requests.Response, **kwargs: Any) -> Tuple[dict, dict]: """ Returns a map of (dimensions, metrics) hashes, example: ({"ga:userType": "STRING", "ga:sessionCount": "STRING"}, {"ga:pageviewsPerSession": "FLOAT", "ga:sessions": "INTEGER"}) Each available dimension can be found in dimensions with its data type as the value. e.g. dimensions['ga:userType'] == STRING Each available metric can be found in metrics with its data type as the value. e.g. metrics['ga:sessions'] == INTEGER """ metrics = {} dimensions = {} results = response.json() columns = results.get("items", []) for column in columns: column_attributes = column.get("attributes", []) column_name = column.get("id") column_type = column_attributes.get("type") column_data_type = column_attributes.get("dataType") if column_type == "METRIC": metrics[column_name] = column_data_type elif column_type == "DIMENSION": dimensions[column_name] = column_data_type else: raise Exception(f"Unsupported column type {column_type}.") return dimensions, metrics class GoogleAnalyticsV4Stream(HttpStream, ABC): primary_key = None http_method = "POST" # The Analytics Core Reporting API returns a maximum of 100,000 rows per request. # https://developers.google.com/analytics/devguides/reporting/core/v4/rest/v4/reports/batchGet?hl=en page_size = 100000 url_base = "https://analyticsreporting.googleapis.com/v4/" report_field = "reports" map_type = dict(INTEGER="integer", FLOAT="number", PERCENT="number", TIME="number") def __init__(self, config: MutableMapping): super().__init__(authenticator=config["authenticator"]) self.start_date = config["start_date"] self.window_in_days: int = config.get("window_in_days", 1) self.view_id = config["view_id"] self.metrics = config["metrics"] self.dimensions = config["dimensions"] self.segments = config.get("segments", list()) self.filtersExpression = config.get("filter", "") self._config = config self.dimensions_ref, self.metrics_ref = GoogleAnalyticsV4TypesList().read_records(sync_mode=None) self._raise_on_http_errors: bool = True @property def state_checkpoint_interval(self) -> int: return self.window_in_days @property def availability_strategy(self) -> Optional["AvailabilityStrategy"]: return None @staticmethod def to_datetime_str(date: datetime) -> str: """ Custom method. Returns the formated datetime string. :: Output example: '2021-07-15 07' FORMAT : "%Y-%m-%d" """ return date.strftime("%Y-%m-%d") @staticmethod def to_iso_datetime_str(date: str) -> str: return datetime.strptime(date, "%Y%m%d").strftime("%Y-%m-%d") def path(self, **kwargs: Any) -> str: # need add './' for correct urllib.parse.urljoin work due to path contains ':' return "./reports:batchGet" def next_page_token(self, response: requests.Response) -> Optional[Mapping[str, Any]]: reports = response.json().get(self.report_field, []) for report in reports: # since we're requesting just one report at a time, the first report in the response is enough next_page = report.get("nextPageToken") if next_page: return {"pageToken": next_page} def should_retry(self, response: requests.Response) -> bool: """ When the connector gets a custom report which has unknown metric(s) or dimension(s) and API returns an error with 400 code, the connector ignores an error with 400 code to finish successfully sync and inform the user about an error in logs with an error message. When the daily request limit reached, the connector ignores an error with 429 code and 'has exceeded the daily request limit' error massage to finish successfully sync and inform the user about an error in logs with an error message and link to google analytics docs. """ if response.status_code == 400: self.logger.info(f"{response.json()['error']['message']}") self._raise_on_http_errors = False return False elif response.status_code == 429 and "has exceeded the daily request limit" in response.json()["error"]["message"]: rate_limit_docs_url = "https://developers.google.com/analytics/devguides/reporting/core/v4/limits-quotas" self.logger.info(f"{response.json()['error']['message']}. More info: {rate_limit_docs_url}") self._raise_on_http_errors = False return False result: bool = HttpStream.should_retry(self, response) return result @property def raise_on_http_errors(self) -> bool: return self._raise_on_http_errors def request_body_json( self, stream_slice: Mapping[str, Any] = None, next_page_token: Mapping[str, Any] = None, **kwargs: Any ) -> Optional[Mapping]: metrics = [{"expression": metric} for metric in self.metrics] dimensions = [{"name": dimension} for dimension in self.dimensions] segments = [{"segmentId": segment} for segment in self.segments] filtersExpression = self.filtersExpression request_body = { "reportRequests": [ { "viewId": self.view_id, "dateRanges": [stream_slice], "pageSize": self.page_size, "metrics": metrics, "dimensions": dimensions, "segments": segments, "filtersExpression": filtersExpression, } ] } if next_page_token: request_body["reportRequests"][0].update(next_page_token) return request_body def get_json_schema(self) -> Mapping[str, Any]: """ Override get_json_schema CDK method to retrieve the schema information for GoogleAnalyticsV4 Object dynamically. """ schema: Dict[str, Any] = { "$schema": "http://json-schema.org/draft-07/schema#", "type": ["null", "object"], "additionalProperties": True, "properties": { "view_id": {"type": ["string"]}, }, } # Add the dimensions to the schema for dimension in self.dimensions: data_type = self.lookup_data_type("dimension", dimension) data_format = self.lookup_data_format(dimension) dimension = dimension.replace("ga:", "ga_") dimension_data: Dict[str, Any] = {"type": [data_type]} if data_format: dimension_data["format"] = data_format schema["properties"][dimension] = dimension_data # Add the metrics to the schema for metric in self.metrics: data_type = self.lookup_data_type("metric", metric) data_format = self.lookup_data_format(metric) metric = metric.replace("ga:", "ga_") # metrics are allowed to also have null values metric_data: Dict[str, Any] = {"type": ["null", data_type]} if data_format: metric_data["format"] = data_format schema["properties"][metric] = metric_data schema["properties"]["isDataGolden"] = {"type": "boolean"} return schema def stream_slices(self, stream_state: Mapping[str, Any] = None, **kwargs: Any) -> Iterable[Optional[Mapping[str, Any]]]: """ Override default stream_slices CDK method to provide date_slices as page chunks for data fetch. Returns list of dict, example: [{ "startDate": "2020-01-01", "endDate": "2021-01-02" }, { "startDate": "2020-01-03", "endDate": "2021-01-04" }, ...] """ end_date = pendulum.now().date() start_date = pendulum.parse(self.start_date).date() if stream_state: prev_end_date = pendulum.parse(stream_state.get(self.cursor_field)).date() start_date = prev_end_date.add(days=1) # do not include previous `end_date` # always resync 2 previous days to be sure data is golden # https://support.google.com/analytics/answer/1070983?hl=en#DataProcessingLatency&zippy=%2Cin-this-article # https://github.com/airbytehq/airbyte/issues/12013#issuecomment-1111255503 start_date = start_date.subtract(days=2) date_slices = [] slice_start_date = start_date while slice_start_date <= end_date: slice_end_date = slice_start_date.add(days=self.window_in_days) # limit the slice range with end_date slice_end_date = min(slice_end_date, end_date) date_slices.append({"startDate": self.to_datetime_str(slice_start_date), "endDate": self.to_datetime_str(slice_end_date)}) # start next slice 1 day after previous slice ended to prevent duplicate reads slice_start_date = slice_end_date.add(days=1) return date_slices or [None] @staticmethod def report_rows(report_body: MutableMapping[Any, Any]) -> List[MutableMapping[Any, Any]]: return report_body.get("data", {}).get("rows", []) def lookup_data_type(self, field_type: str, attribute: str) -> str: """ Get the data type of a metric or a dimension """ try: if field_type == "dimension": if attribute.startswith(("ga:dimension", "ga:customVarName", "ga:customVarValue", "ga:segment")): # Custom Google Analytics Dimensions that are not part of self.dimensions_ref. They are always # strings return "string" elif attribute.startswith("ga:dateHourMinute"): return "integer" attr_type = self.dimensions_ref[attribute] elif field_type == "metric": # Custom Google Analytics Metrics {ga:goalXXStarts, ga:metricXX, ... } # We always treat them as strings as we can not be sure of their data type if attribute.startswith("ga:goal") and attribute.endswith( ("Starts", "Completions", "Value", "ConversionRate", "Abandons", "AbandonRate") ): return "string" elif attribute.startswith("ga:searchGoal") and attribute.endswith("ConversionRate"): # Custom Google Analytics Metrics ga:searchGoalXXConversionRate return "string" elif attribute.startswith(("ga:metric", "ga:calcMetric")): return "string" attr_type = self.metrics_ref[attribute] else: attr_type = None self.logger.error(f"Unsupported GA type: {field_type}") except KeyError: attr_type = None self.logger.error(f"Unsupported GA {field_type}: {attribute}") return self.map_type.get(attr_type, "string") @staticmethod def lookup_data_format(attribute: str) -> Union[str, None]: if attribute == "ga:date": return "date" def convert_to_type(self, header: str, value: Any, data_type: str) -> Any: if data_type == "integer": return int(value) if data_type == "number": return float(value) if header == "ga:date": return self.to_iso_datetime_str(value) return value def parse_response(self, response: requests.Response, **kwargs: Any) -> Iterable[Mapping]: """ Default response: { "reports": [ { "columnHeader": { "metricHeader": { "metricHeaderEntries": [ { "name": "ga:users", "type": "INTEGER" } ] } }, "data": { "isDataGolden": true, "maximums": [ { "values": [ "98" ] } ], "minimums": [ { "values": [ "98" ] } ], "rowCount": 1, "rows": [ { "metrics": [ { "values": [ "98" ] } ] } ], "totals": [ { "values": [ "98" ] } ] } } ] } Return record which is a map of metric and dimension names and values, like: record = { "view_id":"1111111" "ga_date":"20210212", "ga_users":3, "ga_newUsers":2, "ga_sessions":7, "ga_sessionsPerUser":8.0, "ga_avgSessionDuration":201.0, "ga_pageviews":43, "ga_pageviewsPerSession":12.5, "ga_avgTimeOnPage":83.14035087719298, "ga_bounceRate":0.0, "ga_exitRate":6.523809523809524 } """ json_response = response.json() if response.status_code not in (400, 429) else None if not json_response: return [] reports = json_response.get(self.report_field, []) for report in reports: column_header = report.get("columnHeader", {}) dimension_headers = column_header.get("dimensions", []) metric_headers = column_header.get("metricHeader", {}).get("metricHeaderEntries", []) self.check_for_sampled_result(report.get("data", {})) for row in self.report_rows(report): record = {} dimensions = row.get("dimensions", []) metrics = row.get("metrics", []) for header, dimension in zip(dimension_headers, dimensions): data_type = self.lookup_data_type("dimension", header) value = self.convert_to_type(header, dimension, data_type) record[header.replace("ga:", "ga_")] = value for i, values in enumerate(metrics): for metric_header, value in zip(metric_headers, values.get("values")): metric_name = metric_header.get("name") metric_type = self.lookup_data_type("metric", metric_name) value = self.convert_to_type(metric_name, value, metric_type) record[metric_name.replace("ga:", "ga_")] = value record["view_id"] = self.view_id record["isDataGolden"] = report.get("data", {}).get("isDataGolden", False) yield record def check_for_sampled_result(self, data: Mapping) -> None: if not data.get("isDataGolden", False): self.logger.warning(DATA_IS_NOT_GOLDEN_MSG) if data.get("samplesReadCounts", False): self.logger.warning(RESULT_IS_SAMPLED_MSG) class GoogleAnalyticsV4IncrementalObjectsBase(GoogleAnalyticsV4Stream): cursor_field = "ga_date" def get_updated_state(self, current_stream_state: MutableMapping[str, Any], latest_record: Mapping[str, Any]) -> Mapping[str, Any]: return {self.cursor_field: max(latest_record.get(self.cursor_field, ""), current_stream_state.get(self.cursor_field, ""))} def read_records( self, sync_mode: SyncMode, cursor_field: List[str] = None, stream_slice: Mapping[str, Any] = None, stream_state: Mapping[str, Any] = None, ) -> Iterable[Mapping[str, Any]]: if not stream_slice: return [] return super().read_records(sync_mode, cursor_field, stream_slice, stream_state) class GoogleAnalyticsServiceOauth2Authenticator(Oauth2Authenticator): """Request example for API token extraction: curl --location --request POST https://oauth2.googleapis.com/token?grant_type=urn:ietf:params:oauth:grant-type:jwt-bearer&assertion=signed_JWT """ def __init__(self, config: Mapping): self.credentials_json = json.loads(config["credentials_json"]) self.client_email = self.credentials_json["client_email"] self.scope = "https://www.googleapis.com/auth/analytics.readonly" super().__init__( token_refresh_endpoint="https://oauth2.googleapis.com/token", client_secret=self.credentials_json["private_key"], client_id=self.credentials_json["private_key_id"], refresh_token=None, ) def refresh_access_token(self) -> Tuple[str, int]: """ Calling the Google OAuth 2.0 token endpoint. Used for authorizing signed JWT. Returns tuple with access token and token's time-to-live """ response_json = None try: response = requests.request(method="POST", url=self.token_refresh_endpoint, params=self.get_refresh_request_params()) response_json = response.json() response.raise_for_status() except requests.exceptions.RequestException as e: if response_json and "error" in response_json: raise Exception( "Error refreshing access token {}. Error: {}; Error details: {}; Exception: {}".format( response_json, response_json["error"], response_json["error_description"], e ) ) from e raise Exception(f"Error refreshing access token: {e}") from e else: return response_json["access_token"], response_json["expires_in"] def get_refresh_request_params(self) -> Mapping[str, Any]: """ Sign the JWT with RSA-256 using the private key found in service account JSON file. """ token_lifetime = 3600 # token lifetime is 1 hour issued_at = time.time() expiration_time = issued_at + token_lifetime payload = { "iss": self.client_email, "sub": self.client_email, "scope": self.scope, "aud": self.token_refresh_endpoint, "iat": issued_at, "exp": expiration_time, } headers = {"kid": self.client_id} signed_jwt = jwt.encode(payload, self.client_secret, headers=headers, algorithm="RS256") return {"grant_type": "urn:ietf:params:oauth:grant-type:jwt-bearer", "assertion": str(signed_jwt)} class TestStreamConnection(GoogleAnalyticsV4Stream): """ Test the connectivity and permissions to read the data from the stream. Because of the nature of the connector, the streams are created dynamicaly. We declare the static stream like this to be able to test out the prmissions to read the particular view_id.""" page_size = 1 def next_page_token(self, response: requests.Response) -> Optional[Mapping[str, Any]]: """For test reading pagination is not required""" return None def stream_slices(self, stream_state: Mapping[str, Any] = None, **kwargs: Any) -> Iterable[Optional[Mapping[str, Any]]]: """ Override this method to fetch records from start_date up to now for testing case """ start_date = pendulum.parse(self.start_date).date() end_date = pendulum.now().date() return [{"startDate": self.to_datetime_str(start_date), "endDate": self.to_datetime_str(end_date)}] def parse_response(self, response: requests.Response, **kwargs: Any) -> Iterable[Mapping]: res = response.json() return res.get("reports", {})[0].get("data") class SourceGoogleAnalyticsV4(AbstractSource): """Google Analytics lets you analyze data about customer engagement with your website or application.""" @staticmethod def get_authenticator(config: Mapping) -> Oauth2Authenticator: # backwards compatibility, credentials_json used to be in the top level of the connector if config.get("credentials_json"): return GoogleAnalyticsServiceOauth2Authenticator(config) auth_params = config["credentials"] if auth_params["auth_type"] == "Service" or auth_params.get("credentials_json"): return GoogleAnalyticsServiceOauth2Authenticator(auth_params) else: return Oauth2Authenticator( token_refresh_endpoint="https://oauth2.googleapis.com/token", client_secret=auth_params["client_secret"], client_id=auth_params["client_id"], refresh_token=auth_params["refresh_token"], scopes=["https://www.googleapis.com/auth/analytics.readonly"], ) def check_connection(self, logger: logging.Logger, config: MutableMapping) -> Tuple[bool, Any]: # declare additional variables authenticator = self.get_authenticator(config) config["authenticator"] = authenticator config["metrics"] = ["ga:hits"] config["dimensions"] = ["ga:date"] # load and verify the custom_reports try: # test the eligibility of custom_reports input custom_reports = config.get("custom_reports") if custom_reports: CustomReportsValidator(json.loads(custom_reports)).validate() # Read records to check the reading permissions read_check = list(TestStreamConnection(config).read_records(sync_mode=None)) if read_check: return True, None return ( False, f"Please check the permissions for the requested view_id: {config['view_id']}. Cannot retrieve data from that view ID.", ) except ValueError as e: return False, f"Invalid custom reports json structure. {e}" except requests.exceptions.RequestException as e: error_msg = e.response.json().get("error") if e.response.status_code == 403: return False, f"Please check the permissions for the requested view_id: {config['view_id']}. {error_msg}" else: return False, f"{error_msg}" def streams(self, config: MutableMapping[str, Any]) -> List[Stream]: streams: List[GoogleAnalyticsV4Stream] = [] authenticator = self.get_authenticator(config) config["authenticator"] = authenticator reports = json.loads(pkgutil.get_data("source_google_analytics_v4", "defaults/default_reports.json")) custom_reports = config.get("custom_reports") if custom_reports: custom_reports = json.loads(custom_reports) custom_reports = [custom_reports] if not isinstance(custom_reports, list) else custom_reports reports += custom_reports config["ga_streams"] = reports for stream in config["ga_streams"]: config["metrics"] = stream["metrics"] config["dimensions"] = stream["dimensions"] config["segments"] = stream.get("segments", list()) config["filter"] = stream.get("filter", "") # construct GAReadStreams sub-class for each stream stream_name = stream["name"] stream_bases = (GoogleAnalyticsV4Stream,) if "ga:date" in stream["dimensions"]: stream_bases = (GoogleAnalyticsV4IncrementalObjectsBase,) stream_class = type(stream_name, stream_bases, {}) # instantiate a stream with config stream_instance = stream_class(config) streams.append(stream_instance) return streams