#
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
#
import csv
import datetime
import io
import json
import pkgutil
from typing import Any, Iterable, List, Mapping, MutableMapping, Optional, Tuple
import pendulum
import requests
from airbyte_cdk.sources import AbstractSource
from airbyte_cdk.sources.streams import Stream
from airbyte_cdk.sources.streams.http import HttpStream, HttpSubStream
from airbyte_cdk.sources.streams.http.requests_native_auth import Oauth2Authenticator
from airbyte_cdk.sources.utils.transform import TransformConfig, TypeTransformer
class CustomBackoffMixin:
def daily_quota_exceeded(self, response: requests.Response) -> bool:
"""Response example:
{
"error": {
"code": 429,
"message": "Quota exceeded for quota metric 'Free requests' and limit 'Free requests per minute' of service 'youtubereporting.googleapis.com' for consumer 'project_number:863188056127'.",
"status": "RESOURCE_EXHAUSTED",
"details": [
{
"reason": "RATE_LIMIT_EXCEEDED",
"metadata": {
"consumer": "projects/863188056127",
"quota_limit": "FreeQuotaRequestsPerMinutePerProject",
"quota_limit_value": "60",
"quota_metric": "youtubereporting.googleapis.com/free_quota_requests",
"service": "youtubereporting.googleapis.com",
}
},
]
}
}
:param response: the errored HTTP response from the Reporting API
:return: True if the daily request quota is exhausted, False otherwise
"""
details = response.json().get("error", {}).get("details", [])
for detail in details:
if detail.get("reason") == "RATE_LIMIT_EXCEEDED":
if detail.get("metadata", {}).get("quota_limit") == "FreeQuotaRequestsPerDayPerProject":
self.logger.error(f"Exceeded daily quota: {detail.get('metadata', {}).get('quota_limit_value')} reqs/day")
return True
break  # only the first RATE_LIMIT_EXCEEDED detail is relevant
return False
def should_retry(self, response: requests.Response) -> bool:
"""
Override to set different conditions for backoff based on the response from the server.
By default, back off on the following HTTP response statuses:
- 500s to handle transient server errors
- 429 (Too Many Requests) indicating rate limiting:
Different behavior in case of 'RATE_LIMIT_EXCEEDED':
Requests Per Minute:
"message": "Quota exceeded for quota metric 'Free requests' and limit 'Free requests per minute' of service 'youtubereporting.googleapis.com' for consumer 'project_number:863188056127'."
"quota_limit": "FreeQuotaRequestsPerMinutePerProject",
"quota_limit_value": "60",
--> use increased retry_factor (30 seconds)
Requests Per Day:
"message": "Quota exceeded for quota metric 'Free requests' and limit 'Free requests per day' of service 'youtubereporting.googleapis.com' for consumer 'project_number:863188056127"
"quota_limit": "FreeQuotaRequestsPerDayPerProject
"quota_limit_value": "20000",
--> do not retry, just raise the error; the next sync can reasonably start only a day later.
"""
if 500 <= response.status_code < 600:
return True
if response.status_code == 429 and not self.daily_quota_exceeded(response):
return True
return False
@property
def retry_factor(self) -> float:
"""
The default FreeQuotaRequestsPerMinutePerProject quota is 60 reqs/min, so a 30-second delay between retries is reasonable
"""
return 30
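# Illustration (not part of the connector): assuming the CDK's default exponential
# backoff handler is in effect, a retry_factor of 30 translates into waits of
# roughly retry_factor * 2**attempt seconds between retries:
#
#   >>> [30 * 2**attempt for attempt in range(3)]
#   [30, 60, 120]
#
# Waiting at least 30s between retries keeps a single worker safely under the
# 60-requests-per-minute free quota.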
class JobsResource(CustomBackoffMixin, HttpStream):
"""
https://developers.google.com/youtube/reporting/v1/reference/rest/v1/jobs
All YouTube Analytics streams require a created reporting job.
This class allows listing all existing reporting jobs and creating a new reporting job for a specific stream. Each stream can have at most one reporting job.
By creating a reporting job, you instruct YouTube to generate the stream's data on a daily basis. If a reporting job is removed, YouTube removes all of the stream's data.
On every connector invocation, the connector fetches the list of all existing reporting jobs: if the currently processed stream already has a reporting job, the connector does nothing;
if it does not, the connector immediately creates one. The connector does not store reporting job IDs.
If a reporting job was created by the user separately, the connector simply uses that job. The connector never removes reporting jobs; it can only create them.
After a reporting job is created, the first data may take up to 48 hours to become available.
"""
name = None
primary_key = None
http_method = None
raise_on_http_errors = True
url_base = "https://youtubereporting.googleapis.com/v1/"
JOB_NAME = "Airbyte reporting job"
def next_page_token(self, response: requests.Response) -> Optional[Mapping[str, Any]]:
return None
def should_retry(self, response: requests.Response) -> bool:
# if the connected Google account is not bound to the target YouTube account,
# we receive `401: UNAUTHENTICATED`
if response.status_code == 401:
setattr(self, "raise_on_http_errors", False)
return False
else:
return super().should_retry(response)
def parse_response(self, response: requests.Response, **kwargs) -> Iterable[Mapping]:
return [response.json()]
def path(self, **kwargs) -> str:
return "jobs"
def request_body_json(self, **kwargs) -> Optional[Mapping]:
if self.name:
return {"name": self.JOB_NAME, "reportTypeId": self.name}
def list(self):
"https://developers.google.com/youtube/reporting/v1/reference/rest/v1/jobs/list"
self.name = None
self.http_method = "GET"
results = list(self.read_records(sync_mode=None))
result = results[0]
return result.get("jobs", {})
def create(self, name):
"https://developers.google.com/youtube/reporting/v1/reference/rest/v1/jobs/create"
self.name = name
self.http_method = "POST"
results = list(self.read_records(sync_mode=None))
result = results[0]
return result["id"]
class ReportResources(CustomBackoffMixin, HttpStream):
"https://developers.google.com/youtube/reporting/v1/reference/rest/v1/jobs.reports/list"
name = None
primary_key = "id"
url_base = "https://youtubereporting.googleapis.com/v1/"
def __init__(self, name: str, jobs_resource: JobsResource, job_id: str, start_time: str = None, **kwargs):
self.name = name
self.jobs_resource = jobs_resource
self.job_id = job_id
self.start_time = start_time
super().__init__(**kwargs)
def path(
self, stream_state: Mapping[str, Any] = None, stream_slice: Mapping[str, Any] = None, next_page_token: Mapping[str, Any] = None
) -> str:
if not self.job_id:
self.job_id = self.jobs_resource.create(self.name)
self.logger.info(f"YouTube reporting job is created: '{self.job_id}'")
return "jobs/{}/reports".format(self.job_id)
def request_params(
self,
stream_state: Mapping[str, Any],
stream_slice: Mapping[str, Any] = None,
next_page_token: Mapping[str, Any] = None,
) -> MutableMapping[str, Any]:
return {"startTimeAtOrAfter": self.start_time} if self.start_time else {}
def next_page_token(self, response: requests.Response) -> Optional[Mapping[str, Any]]:
return None
def parse_response(self, response: requests.Response, **kwargs) -> Iterable[Mapping]:
response_json = response.json()
reports = []
for report in response_json.get("reports", []):
report = {**report}
report["startTime"] = datetime.datetime.strptime(report["startTime"], "%Y-%m-%dT%H:%M:%S%z")
reports.append(report)
reports.sort(key=lambda x: x["startTime"])
date = kwargs["stream_state"].get("date")
if date:
reports = [r for r in reports if int(r["startTime"].date().strftime("%Y%m%d")) > date]
if not reports:
reports.append(None)
return reports
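# Each record yielded above mirrors a Reporting API `Report` resource, e.g.
# (abridged, values illustrative):
#
#   {"id": "...", "jobId": "...", "startTime": datetime(...), "downloadUrl": "https://..."}
#
# Note the sentinel: when the stream state filters out every report, a single
# `None` is yielded so the child stream still receives a slice and can log
# that there is no new data.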
class ChannelReports(CustomBackoffMixin, HttpSubStream):
"https://developers.google.com/youtube/reporting/v1/reports/channel_reports"
name = None
primary_key = None
cursor_field = "date"
url_base = ""
transformer = TypeTransformer(TransformConfig.DefaultSchemaNormalization)
def __init__(self, name: str, dimensions: List[str], **kwargs):
self.name = name
self.primary_key = dimensions
super().__init__(**kwargs)
def next_page_token(self, response: requests.Response) -> Optional[Mapping[str, Any]]:
return None
def parse_response(self, response: requests.Response, **kwargs) -> Iterable[Mapping]:
fp = io.StringIO(response.text)
reader = csv.DictReader(fp)
for record in reader:
yield record
def get_updated_state(self, current_stream_state: MutableMapping[str, Any], latest_record: Mapping[str, Any]) -> Mapping[str, Any]:
if not current_stream_state:
return {self.cursor_field: latest_record[self.cursor_field]}
return {self.cursor_field: max(current_stream_state[self.cursor_field], latest_record[self.cursor_field])}
def path(
self, stream_state: Mapping[str, Any] = None, stream_slice: Mapping[str, Any] = None, next_page_token: Mapping[str, Any] = None
) -> str:
return stream_slice["parent"]["downloadUrl"]
def read_records(self, *, stream_slice: Mapping[str, Any] = None, **kwargs) -> Iterable[Mapping[str, Any]]:
parent = stream_slice.get("parent")
if parent:
yield from super().read_records(stream_slice=stream_slice, **kwargs)
else:
self.logger.info("no data from parent stream")
yield from []
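# Data flow sketch: the parent slice carries a report's `downloadUrl`; the CSV
# behind it is read row by row, so a header of `date,channel_id,views` yields
# records like {"date": "20230101", "channel_id": "...", "views": "12"}
# (column names illustrative; the schema transformer then casts the types).
# The `date` column doubles as the cursor: get_updated_state keeps the maximum
# value seen, e.g. state {"date": 20230101} merged with a record dated
# 20230102 becomes {"date": 20230102}.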
class SourceYoutubeAnalytics(AbstractSource):
@staticmethod
def get_authenticator(config):
credentials = config["credentials"]
client_id = credentials["client_id"]
client_secret = credentials["client_secret"]
refresh_token = credentials["refresh_token"]
return Oauth2Authenticator(
token_refresh_endpoint="https://oauth2.googleapis.com/token",
client_id=client_id,
client_secret=client_secret,
refresh_token=refresh_token,
)
def check_connection(self, logger, config) -> Tuple[bool, any]:
authenticator = self.get_authenticator(config)
jobs_resource = JobsResource(authenticator=authenticator)
result = jobs_resource.list()
if result:
return True, None
else:
return (
False,
"The Youtube account is not valid. Please make sure you're trying to use the active Youtube Account connected to your Google Account.",
)
def streams(self, config: Mapping[str, Any]) -> List[Stream]:
authenticator = self.get_authenticator(config)
jobs_resource = JobsResource(authenticator=authenticator)
jobs = jobs_resource.list()
report_to_job_id = {j["reportTypeId"]: j["id"] for j in jobs}
# By default, the API returns reports for the last 60 days, and each day's report requires a separate request.
# A full scan of all 18 streams therefore takes ~1100 requests (18 + 18 * 60), so we can hit the 'default' API quota limits:
# - 60 reqs per minute
# - 20000 reqs per day
# For SAT (source acceptance tests): scan only the last N days (the 'testing_period' option) to reduce the number of requests and stay under the API limits
start_time = None
testing_period = config.get("testing_period")
if testing_period:
start_time = pendulum.today().add(days=-int(testing_period)).to_rfc3339_string()
channel_reports = json.loads(pkgutil.get_data("source_youtube_analytics", "defaults/channel_reports.json"))
streams = []
for channel_report in channel_reports:
stream_name = channel_report["id"]
dimensions = channel_report["dimensions"]
job_id = report_to_job_id.get(stream_name)
parent = ReportResources(
name=stream_name, jobs_resource=jobs_resource, job_id=job_id, start_time=start_time, authenticator=authenticator
)
streams.append(ChannelReports(name=stream_name, dimensions=dimensions, parent=parent, authenticator=authenticator))
return streams
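# Example config shape this source expects (values illustrative; field names
# taken from the code above -- "testing_period" is optional and only meant for
# acceptance tests):
#
#   {
#       "credentials": {
#           "client_id": "<oauth client id>",
#           "client_secret": "<oauth client secret>",
#           "refresh_token": "<oauth refresh token>"
#       },
#       "testing_period": "30"
#   }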