#
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
#
import json
import logging
import pkgutil
import time
from abc import ABC
from datetime import datetime
from typing import Any, Dict, Iterable, List, Mapping, MutableMapping, Optional, Tuple, Union
import jwt
import pendulum
import requests
from airbyte_cdk.models import SyncMode
from airbyte_cdk.sources import AbstractSource
from airbyte_cdk.sources.streams import Stream
from airbyte_cdk.sources.streams.availability_strategy import AvailabilityStrategy
from airbyte_cdk.sources.streams.http import HttpStream
from airbyte_cdk.sources.streams.http.auth import Oauth2Authenticator
from .custom_reports_validator import CustomReportsValidator
DATA_IS_NOT_GOLDEN_MSG = "Google Analytics data is not golden. Future requests may return different data."
RESULT_IS_SAMPLED_MSG = (
"Google Analytics data is sampled. Consider using a smaller window_in_days parameter. "
"For more info check https://developers.google.com/analytics/devguides/reporting/core/v4/basics#sampling"
)
class GoogleAnalyticsV4TypesList(HttpStream):
"""
    Provides functionality to fetch the valid dimensions and metrics for the Analytics Reporting API, along with
    their data types.
"""
primary_key = None
    # Link to query the metadata for available metrics and dimensions.
    # Those are not provided in the Analytics Reporting API V4.
    # Column ids match exactly between v3 and v4.
url_base = "https://www.googleapis.com/analytics/v3/metadata/ga/columns"
def path(self, **kwargs: Any) -> str:
return ""
def next_page_token(self, response: requests.Response) -> Optional[Mapping[str, Any]]:
"""Abstractmethod HTTPStream CDK dependency"""
return None
def parse_response(self, response: requests.Response, **kwargs: Any) -> Tuple[dict, dict]:
"""
        Returns a (dimensions, metrics) tuple of dicts, for example:
({"ga:userType": "STRING", "ga:sessionCount": "STRING"}, {"ga:pageviewsPerSession": "FLOAT", "ga:sessions": "INTEGER"})
Each available dimension can be found in dimensions with its data type
as the value. e.g. dimensions['ga:userType'] == STRING
Each available metric can be found in metrics with its data type
as the value. e.g. metrics['ga:sessions'] == INTEGER
"""
metrics = {}
dimensions = {}
results = response.json()
columns = results.get("items", [])
for column in columns:
column_attributes = column.get("attributes", [])
column_name = column.get("id")
column_type = column_attributes.get("type")
column_data_type = column_attributes.get("dataType")
if column_type == "METRIC":
metrics[column_name] = column_data_type
elif column_type == "DIMENSION":
dimensions[column_name] = column_data_type
else:
raise Exception(f"Unsupported column type {column_type}.")
return dimensions, metrics
class GoogleAnalyticsV4Stream(HttpStream, ABC):
primary_key = None
http_method = "POST"
# The Analytics Core Reporting API returns a maximum of 100,000 rows per request.
# https://developers.google.com/analytics/devguides/reporting/core/v4/rest/v4/reports/batchGet?hl=en
page_size = 100000
url_base = "https://analyticsreporting.googleapis.com/v4/"
report_field = "reports"
map_type = dict(INTEGER="integer", FLOAT="number", PERCENT="number", TIME="number")
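    # Maps GA column data types to JSON schema types; any type not listed here (e.g. STRING) falls back to "string"
    # in lookup_data_type().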
def __init__(self, config: MutableMapping):
super().__init__(authenticator=config["authenticator"])
self.start_date = config["start_date"]
self.window_in_days: int = config.get("window_in_days", 1)
self.view_id = config["view_id"]
self.metrics = config["metrics"]
self.dimensions = config["dimensions"]
self.segments = config.get("segments", list())
self.filtersExpression = config.get("filter", "")
self._config = config
self.dimensions_ref, self.metrics_ref = GoogleAnalyticsV4TypesList().read_records(sync_mode=None)
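        # GoogleAnalyticsV4TypesList yields the dimensions and metrics reference maps, which are unpacked here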
self._raise_on_http_errors: bool = True
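        # flipped to False in should_retry() for the 400/429 errors this connector tolerates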
@property
def state_checkpoint_interval(self) -> int:
return self.window_in_days
@property
def availability_strategy(self) -> Optional["AvailabilityStrategy"]:
return None
@staticmethod
def to_datetime_str(date: datetime) -> str:
"""
        Custom method.
        Returns the formatted date string.
        :: Output example: '2021-07-15' FORMAT: "%Y-%m-%d"
"""
return date.strftime("%Y-%m-%d")
@staticmethod
def to_iso_datetime_str(date: str) -> str:
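        """Converts GA's compact date format, e.g. '20210715', to the ISO format '2021-07-15'."""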
return datetime.strptime(date, "%Y%m%d").strftime("%Y-%m-%d")
def path(self, **kwargs: Any) -> str:
        # need to add './' so urllib.parse.urljoin works correctly, because the path contains ':'
return "./reports:batchGet"
def next_page_token(self, response: requests.Response) -> Optional[Mapping[str, Any]]:
reports = response.json().get(self.report_field, [])
for report in reports:
# since we're requesting just one report at a time, the first report in the response is enough
next_page = report.get("nextPageToken")
if next_page:
return {"pageToken": next_page}
def should_retry(self, response: requests.Response) -> bool:
"""
        When the connector requests a custom report that contains unknown metric(s) or dimension(s)
        and the API returns a 400 error, the connector ignores the error so the sync can finish
        successfully and informs the user about the error in the logs.
        When the daily request limit is reached, the connector ignores the 429 error carrying the
        'has exceeded the daily request limit' message so the sync can finish successfully, and
        informs the user in the logs with an error message and a link to the Google Analytics docs.
"""
if response.status_code == 400:
self.logger.info(f"{response.json()['error']['message']}")
self._raise_on_http_errors = False
return False
elif response.status_code == 429 and "has exceeded the daily request limit" in response.json()["error"]["message"]:
rate_limit_docs_url = "https://developers.google.com/analytics/devguides/reporting/core/v4/limits-quotas"
self.logger.info(f"{response.json()['error']['message']}. More info: {rate_limit_docs_url}")
self._raise_on_http_errors = False
return False
result: bool = HttpStream.should_retry(self, response)
return result
@property
def raise_on_http_errors(self) -> bool:
return self._raise_on_http_errors
def request_body_json(
self, stream_slice: Mapping[str, Any] = None, next_page_token: Mapping[str, Any] = None, **kwargs: Any
) -> Optional[Mapping]:
metrics = [{"expression": metric} for metric in self.metrics]
dimensions = [{"name": dimension} for dimension in self.dimensions]
segments = [{"segmentId": segment} for segment in self.segments]
filtersExpression = self.filtersExpression
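        # Illustrative example of the resulting payload (values are examples only):
        # {"reportRequests": [{"viewId": "1111111",
        #                      "dateRanges": [{"startDate": "2021-01-01", "endDate": "2021-01-31"}],
        #                      "pageSize": 100000,
        #                      "metrics": [{"expression": "ga:sessions"}],
        #                      "dimensions": [{"name": "ga:date"}],
        #                      "segments": [],
        #                      "filtersExpression": ""}]}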
request_body = {
"reportRequests": [
{
"viewId": self.view_id,
"dateRanges": [stream_slice],
"pageSize": self.page_size,
"metrics": metrics,
"dimensions": dimensions,
"segments": segments,
"filtersExpression": filtersExpression,
}
]
}
if next_page_token:
request_body["reportRequests"][0].update(next_page_token)
return request_body
def get_json_schema(self) -> Mapping[str, Any]:
"""
        Override the get_json_schema CDK method to build the schema for the GoogleAnalyticsV4 stream dynamically.
"""
schema: Dict[str, Any] = {
"$schema": "http://json-schema.org/draft-07/schema#",
"type": ["null", "object"],
"additionalProperties": True,
"properties": {
"view_id": {"type": ["string"]},
},
}
# Add the dimensions to the schema
for dimension in self.dimensions:
data_type = self.lookup_data_type("dimension", dimension)
data_format = self.lookup_data_format(dimension)
dimension = dimension.replace("ga:", "ga_")
dimension_data: Dict[str, Any] = {"type": [data_type]}
if data_format:
dimension_data["format"] = data_format
schema["properties"][dimension] = dimension_data
# Add the metrics to the schema
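        # (e.g. "ga:sessions" becomes, illustratively: "ga_sessions": {"type": ["null", "integer"]})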
for metric in self.metrics:
data_type = self.lookup_data_type("metric", metric)
data_format = self.lookup_data_format(metric)
metric = metric.replace("ga:", "ga_")
# metrics are allowed to also have null values
metric_data: Dict[str, Any] = {"type": ["null", data_type]}
if data_format:
metric_data["format"] = data_format
schema["properties"][metric] = metric_data
schema["properties"]["isDataGolden"] = {"type": "boolean"}
return schema
def stream_slices(self, stream_state: Mapping[str, Any] = None, **kwargs: Any) -> Iterable[Optional[Mapping[str, Any]]]:
"""
        Override the default stream_slices CDK method to provide date slices as chunks for the data fetch.
        Returns a list of dicts, for example: [{
            "startDate": "2020-01-01",
            "endDate": "2020-01-02"
            },
            {
            "startDate": "2020-01-03",
            "endDate": "2020-01-04"
            },
            ...]
"""
end_date = pendulum.now().date()
start_date = pendulum.parse(self.start_date).date()
if stream_state:
prev_end_date = pendulum.parse(stream_state.get(self.cursor_field)).date()
start_date = prev_end_date.add(days=1) # do not include previous `end_date`
# always resync 2 previous days to be sure data is golden
# https://support.google.com/analytics/answer/1070983?hl=en#DataProcessingLatency&zippy=%2Cin-this-article
# https://github.com/airbytehq/airbyte/issues/12013#issuecomment-1111255503
start_date = start_date.subtract(days=2)
date_slices = []
slice_start_date = start_date
while slice_start_date <= end_date:
slice_end_date = slice_start_date.add(days=self.window_in_days)
# limit the slice range with end_date
slice_end_date = min(slice_end_date, end_date)
date_slices.append({"startDate": self.to_datetime_str(slice_start_date), "endDate": self.to_datetime_str(slice_end_date)})
# start next slice 1 day after previous slice ended to prevent duplicate reads
slice_start_date = slice_end_date.add(days=1)
return date_slices or [None]
@staticmethod
def report_rows(report_body: MutableMapping[Any, Any]) -> List[MutableMapping[Any, Any]]:
return report_body.get("data", {}).get("rows", [])
def lookup_data_type(self, field_type: str, attribute: str) -> str:
"""
Get the data type of a metric or a dimension
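        Example (illustrative): lookup_data_type("metric", "ga:sessions") -> "integer"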
"""
try:
if field_type == "dimension":
if attribute.startswith(("ga:dimension", "ga:customVarName", "ga:customVarValue", "ga:segment")):
# Custom Google Analytics Dimensions that are not part of self.dimensions_ref. They are always
# strings
return "string"
elif attribute.startswith("ga:dateHourMinute"):
return "integer"
attr_type = self.dimensions_ref[attribute]
elif field_type == "metric":
# Custom Google Analytics Metrics {ga:goalXXStarts, ga:metricXX, ... }
                # We always treat them as strings because we cannot be sure of their data type
if attribute.startswith("ga:goal") and attribute.endswith(
("Starts", "Completions", "Value", "ConversionRate", "Abandons", "AbandonRate")
):
return "string"
elif attribute.startswith("ga:searchGoal") and attribute.endswith("ConversionRate"):
# Custom Google Analytics Metrics ga:searchGoalXXConversionRate
return "string"
elif attribute.startswith(("ga:metric", "ga:calcMetric")):
return "string"
attr_type = self.metrics_ref[attribute]
else:
attr_type = None
self.logger.error(f"Unsupported GA type: {field_type}")
except KeyError:
attr_type = None
self.logger.error(f"Unsupported GA {field_type}: {attribute}")
return self.map_type.get(attr_type, "string")
@staticmethod
def lookup_data_format(attribute: str) -> Union[str, None]:
if attribute == "ga:date":
return "date"
def convert_to_type(self, header: str, value: Any, data_type: str) -> Any:
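        # Casts raw values (returned as strings by the API) to their declared types; ga:date values become ISO date strings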
if data_type == "integer":
return int(value)
if data_type == "number":
return float(value)
if header == "ga:date":
return self.to_iso_datetime_str(value)
return value
def parse_response(self, response: requests.Response, **kwargs: Any) -> Iterable[Mapping]:
"""
Default response:
{
"reports": [
{
"columnHeader": {
"metricHeader": {
"metricHeaderEntries": [
{
"name": "ga:users",
"type": "INTEGER"
}
]
}
},
"data": {
"isDataGolden": true,
"maximums": [
{
"values": [
"98"
]
}
],
"minimums": [
{
"values": [
"98"
]
}
],
"rowCount": 1,
"rows": [
{
"metrics": [
{
"values": [
"98"
]
}
]
}
],
"totals": [
{
"values": [
"98"
]
}
]
}
}
]
}
        Returns a record that maps metric and dimension names to values, like:
        record = {
          "view_id":"1111111",
          "ga_date":"2021-02-12",
"ga_users":3,
"ga_newUsers":2,
"ga_sessions":7,
"ga_sessionsPerUser":8.0,
"ga_avgSessionDuration":201.0,
"ga_pageviews":43,
"ga_pageviewsPerSession":12.5,
"ga_avgTimeOnPage":83.14035087719298,
"ga_bounceRate":0.0,
"ga_exitRate":6.523809523809524
}
"""
json_response = response.json() if response.status_code not in (400, 429) else None
if not json_response:
return []
reports = json_response.get(self.report_field, [])
for report in reports:
column_header = report.get("columnHeader", {})
dimension_headers = column_header.get("dimensions", [])
metric_headers = column_header.get("metricHeader", {}).get("metricHeaderEntries", [])
self.check_for_sampled_result(report.get("data", {}))
for row in self.report_rows(report):
record = {}
dimensions = row.get("dimensions", [])
metrics = row.get("metrics", [])
for header, dimension in zip(dimension_headers, dimensions):
data_type = self.lookup_data_type("dimension", header)
value = self.convert_to_type(header, dimension, data_type)
record[header.replace("ga:", "ga_")] = value
for i, values in enumerate(metrics):
for metric_header, value in zip(metric_headers, values.get("values")):
metric_name = metric_header.get("name")
metric_type = self.lookup_data_type("metric", metric_name)
value = self.convert_to_type(metric_name, value, metric_type)
record[metric_name.replace("ga:", "ga_")] = value
record["view_id"] = self.view_id
record["isDataGolden"] = report.get("data", {}).get("isDataGolden", False)
yield record
def check_for_sampled_result(self, data: Mapping) -> None:
if not data.get("isDataGolden", False):
self.logger.warning(DATA_IS_NOT_GOLDEN_MSG)
if data.get("samplesReadCounts", False):
self.logger.warning(RESULT_IS_SAMPLED_MSG)
class GoogleAnalyticsV4IncrementalObjectsBase(GoogleAnalyticsV4Stream):
cursor_field = "ga_date"
def get_updated_state(self, current_stream_state: MutableMapping[str, Any], latest_record: Mapping[str, Any]) -> Mapping[str, Any]:
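        # ISO-formatted "ga_date" values ("YYYY-MM-DD") compare correctly as strings, so max() picks the later date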
return {self.cursor_field: max(latest_record.get(self.cursor_field, ""), current_stream_state.get(self.cursor_field, ""))}
def read_records(
self,
sync_mode: SyncMode,
cursor_field: List[str] = None,
stream_slice: Mapping[str, Any] = None,
stream_state: Mapping[str, Any] = None,
) -> Iterable[Mapping[str, Any]]:
if not stream_slice:
return []
return super().read_records(sync_mode, cursor_field, stream_slice, stream_state)
class GoogleAnalyticsServiceOauth2Authenticator(Oauth2Authenticator):
"""Request example for API token extraction:
curl --location --request POST
https://oauth2.googleapis.com/token?grant_type=urn:ietf:params:oauth:grant-type:jwt-bearer&assertion=signed_JWT
"""
def __init__(self, config: Mapping):
self.credentials_json = json.loads(config["credentials_json"])
self.client_email = self.credentials_json["client_email"]
self.scope = "https://www.googleapis.com/auth/analytics.readonly"
super().__init__(
token_refresh_endpoint="https://oauth2.googleapis.com/token",
client_secret=self.credentials_json["private_key"],
client_id=self.credentials_json["private_key_id"],
refresh_token=None,
)
def refresh_access_token(self) -> Tuple[str, int]:
"""
        Calls the Google OAuth 2.0 token endpoint. Used to authorize the signed JWT.
        Returns a tuple of the access token and its time-to-live.
"""
response_json = None
try:
response = requests.request(method="POST", url=self.token_refresh_endpoint, params=self.get_refresh_request_params())
response_json = response.json()
response.raise_for_status()
except requests.exceptions.RequestException as e:
if response_json and "error" in response_json:
raise Exception(
"Error refreshing access token {}. Error: {}; Error details: {}; Exception: {}".format(
response_json, response_json["error"], response_json["error_description"], e
)
) from e
raise Exception(f"Error refreshing access token: {e}") from e
else:
return response_json["access_token"], response_json["expires_in"]
def get_refresh_request_params(self) -> Mapping[str, Any]:
"""
        Signs the JWT with RS256 using the private key from the service account JSON file.
"""
token_lifetime = 3600 # token lifetime is 1 hour
issued_at = time.time()
expiration_time = issued_at + token_lifetime
payload = {
"iss": self.client_email,
"sub": self.client_email,
"scope": self.scope,
"aud": self.token_refresh_endpoint,
"iat": issued_at,
"exp": expiration_time,
}
headers = {"kid": self.client_id}
signed_jwt = jwt.encode(payload, self.client_secret, headers=headers, algorithm="RS256")
return {"grant_type": "urn:ietf:params:oauth:grant-type:jwt-bearer", "assertion": str(signed_jwt)}
class TestStreamConnection(GoogleAnalyticsV4Stream):
"""
    Tests the connectivity and permissions to read data from the stream.
    Because of the nature of the connector, the streams are created dynamically.
    We declare this static stream to be able to test the permissions to read the particular view_id."""
page_size = 1
def next_page_token(self, response: requests.Response) -> Optional[Mapping[str, Any]]:
"""For test reading pagination is not required"""
return None
def stream_slices(self, stream_state: Mapping[str, Any] = None, **kwargs: Any) -> Iterable[Optional[Mapping[str, Any]]]:
"""
        Override this method to fetch records from start_date up to now for the test case.
"""
start_date = pendulum.parse(self.start_date).date()
end_date = pendulum.now().date()
return [{"startDate": self.to_datetime_str(start_date), "endDate": self.to_datetime_str(end_date)}]
def parse_response(self, response: requests.Response, **kwargs: Any) -> Iterable[Mapping]:
res = response.json()
return res.get("reports", {})[0].get("data")
class SourceGoogleAnalyticsV4(AbstractSource):
"""Google Analytics lets you analyze data about customer engagement with your website or application."""
@staticmethod
def get_authenticator(config: Mapping) -> Oauth2Authenticator:
        # backwards compatibility: credentials_json used to be at the top level of the connector config
if config.get("credentials_json"):
return GoogleAnalyticsServiceOauth2Authenticator(config)
auth_params = config["credentials"]
if auth_params["auth_type"] == "Service" or auth_params.get("credentials_json"):
return GoogleAnalyticsServiceOauth2Authenticator(auth_params)
else:
return Oauth2Authenticator(
token_refresh_endpoint="https://oauth2.googleapis.com/token",
client_secret=auth_params["client_secret"],
client_id=auth_params["client_id"],
refresh_token=auth_params["refresh_token"],
scopes=["https://www.googleapis.com/auth/analytics.readonly"],
)
def check_connection(self, logger: logging.Logger, config: MutableMapping) -> Tuple[bool, Any]:
        # set the config values required by the connection-test stream
authenticator = self.get_authenticator(config)
config["authenticator"] = authenticator
config["metrics"] = ["ga:hits"]
config["dimensions"] = ["ga:date"]
# load and verify the custom_reports
try:
            # validate the structure of the custom_reports input
custom_reports = config.get("custom_reports")
if custom_reports:
CustomReportsValidator(json.loads(custom_reports)).validate()
# Read records to check the reading permissions
read_check = list(TestStreamConnection(config).read_records(sync_mode=None))
if read_check:
return True, None
return (
False,
f"Please check the permissions for the requested view_id: {config['view_id']}. Cannot retrieve data from that view ID.",
)
except ValueError as e:
return False, f"Invalid custom reports json structure. {e}"
except requests.exceptions.RequestException as e:
error_msg = e.response.json().get("error")
if e.response.status_code == 403:
return False, f"Please check the permissions for the requested view_id: {config['view_id']}. {error_msg}"
else:
return False, f"{error_msg}"
def streams(self, config: MutableMapping[str, Any]) -> List[Stream]:
streams: List[GoogleAnalyticsV4Stream] = []
authenticator = self.get_authenticator(config)
config["authenticator"] = authenticator
reports = json.loads(pkgutil.get_data("source_google_analytics_v4", "defaults/default_reports.json"))
custom_reports = config.get("custom_reports")
if custom_reports:
custom_reports = json.loads(custom_reports)
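            # custom_reports may be a single report object or a list of reports; normalize to a list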
custom_reports = [custom_reports] if not isinstance(custom_reports, list) else custom_reports
reports += custom_reports
config["ga_streams"] = reports
for stream in config["ga_streams"]:
config["metrics"] = stream["metrics"]
config["dimensions"] = stream["dimensions"]
config["segments"] = stream.get("segments", list())
config["filter"] = stream.get("filter", "")
            # construct a GoogleAnalyticsV4Stream subclass for each configured report
stream_name = stream["name"]
stream_bases = (GoogleAnalyticsV4Stream,)
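            # reports that include ga:date support incremental sync keyed on that date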
if "ga:date" in stream["dimensions"]:
stream_bases = (GoogleAnalyticsV4IncrementalObjectsBase,)
stream_class = type(stream_name, stream_bases, {})
# instantiate a stream with config
stream_instance = stream_class(config)
streams.append(stream_instance)
return streams