1
0
mirror of synced 2026-01-10 18:02:07 -05:00
Files
Daryna Ishchenko a08911e4ce Source Google Search Console: add new streams (#27831)
* Source Google Search Console: Add new streams

* added new streams

* updated changelog

* updated minor version

* updated SAT

---------

Co-authored-by: Artem Inzhyyants <artem.inzhyyants@gmail.com>
2023-06-30 14:55:21 +03:00

186 lines
7.5 KiB
Python
Executable File

#
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
#
import json
from typing import Any, List, Mapping, Optional, Tuple
from urllib.parse import urlparse
import jsonschema
import pendulum
import requests
from airbyte_cdk.logger import AirbyteLogger
from airbyte_cdk.models import SyncMode
from airbyte_cdk.sources import AbstractSource
from airbyte_cdk.sources.streams import Stream
from airbyte_cdk.sources.streams.http.auth import Oauth2Authenticator
from source_google_search_console.exceptions import InvalidSiteURLValidationError
from source_google_search_console.service_account_authenticator import ServiceAccountAuthenticator
from source_google_search_console.streams import (
SearchAnalyticsAllFields,
SearchAnalyticsByCountry,
SearchAnalyticsByCustomDimensions,
SearchAnalyticsByDate,
SearchAnalyticsByDevice,
SearchAnalyticsByPage,
SearchAnalyticsByQuery,
SearchAnalyticsKeywordPageReport,
SearchAnalyticsKeywordSiteReportByPage,
SearchAnalyticsPageReport,
SearchAnalyticsSiteReportByPage,
SearchAnalyticsSiteReportBySite,
Sitemaps,
Sites,
)
# JSON Schema applied to the decoded ``custom_reports`` config option:
# an array of report objects, each naming the report and listing its dimensions.
custom_reports_schema = {
    "type": "array",
    "items": {
        "type": "object",
        "properties": {
            # Report name; must be a non-empty string.
            "name": {
                "type": "string",
                "minLength": 1,
            },
            # Dimensions of the report; every entry must be a non-empty string.
            "dimensions": {
                "type": "array",
                "items": {
                    "type": "string",
                    "minLength": 1,
                },
            },
        },
        "required": ["name", "dimensions"],
    },
}
class SourceGoogleSearchConsole(AbstractSource):
    """Airbyte source for the Google Search Console API.

    Validates the user configuration, checks connectivity for the configured
    site URLs and builds the list of streams (built-in reports plus any
    user-defined custom reports).
    """

    @staticmethod
    def normalize_url(url):
        """Return ``url`` with an empty path replaced by ``/``.

        URLs that already have a path are returned as re-assembled by
        ``urlparse(...).geturl()``.
        """
        parse_result = urlparse(url)
        if parse_result.path == "":
            parse_result = parse_result._replace(path="/")
        return parse_result.geturl()

    @staticmethod
    def _load_json_if_serialized(value, field_name):
        """Decode ``value`` when it is still serialized JSON.

        Values that are not ``str``/``bytes``/``bytearray`` are returned unchanged,
        which keeps ``_validate_and_transform`` safe to call more than once on the
        same config object (it stores the decoded value back into the config).

        :raises Exception: ``"<field_name> is not valid JSON"`` when decoding fails.
        """
        if not isinstance(value, (str, bytes, bytearray)):
            return value
        try:
            return json.loads(value)
        except ValueError as error:
            raise Exception(f"{field_name} is not valid JSON") from error

    def _validate_and_transform(self, config: Mapping[str, Any]):
        """Validate ``config`` and normalize it in place.

        The following keys are rewritten on the passed object, which is also returned:

        * ``authorization.service_account_info`` - decoded from JSON for the "Service" auth type;
        * ``custom_reports`` - decoded from JSON and validated against ``custom_reports_schema``
          and the dimensions known to ``SearchAnalyticsByCustomDimensions``;
        * ``end_date`` - defaults to today's date when missing or empty;
        * ``site_urls`` - each URL passed through ``normalize_url``;
        * ``data_state`` - defaults to ``"final"``.

        :param config: user configuration; must be mutable despite the ``Mapping`` annotation.
        :return: the same ``config`` object after normalization.
        :raises Exception: on invalid JSON or an unknown custom-report dimension.
        :raises jsonschema.ValidationError: when ``custom_reports`` does not match the schema.
        """
        authorization = config["authorization"]
        if authorization["auth_type"] == "Service":
            authorization["service_account_info"] = self._load_json_if_serialized(
                authorization["service_account_info"], "authorization.service_account_info"
            )

        if "custom_reports" in config:
            config["custom_reports"] = self._load_json_if_serialized(config["custom_reports"], "custom_reports")
            jsonschema.validate(config["custom_reports"], custom_reports_schema)
            known_dimensions = SearchAnalyticsByCustomDimensions.dimension_to_property_schema_map
            for report in config["custom_reports"]:
                for dimension in report["dimensions"]:
                    if dimension not in known_dimensions:
                        raise Exception(f"dimension: '{dimension}' not found")

        # Parsing is done only for validation; the original strings are kept in the config.
        pendulum.parse(config["start_date"])
        end_date = config.get("end_date")
        if end_date:
            pendulum.parse(end_date)
        config["end_date"] = end_date or pendulum.now().to_date_string()

        config["site_urls"] = [self.normalize_url(url) for url in config["site_urls"]]
        config["data_state"] = config.get("data_state", "final")
        return config

    def check_connection(self, logger: AirbyteLogger, config: Mapping[str, Any]) -> Tuple[bool, Any]:
        """Check that the config is valid and that every configured site can be read.

        :param logger: unused by this implementation.
        :param config: user configuration as defined in the connector spec.
        :return: ``(True, None)`` on success, otherwise ``(False, <error description>)``.
        """
        try:
            config = self._validate_and_transform(config)
            stream_kwargs = self.get_stream_kwargs(config)
            self.validate_site_urls(config["site_urls"], stream_kwargs["authenticator"])
            sites = Sites(**stream_kwargs)
            stream_slice = sites.stream_slices(SyncMode.full_refresh)

            # stream_slice returns all site_urls and we need to make sure that
            # the connection is successful for all of them
            for _slice in stream_slice:
                sites_gen = sites.read_records(sync_mode=SyncMode.full_refresh, stream_slice=_slice)
                # A slice without records raises StopIteration here, which the generic
                # handler below turns into a failed check.
                next(sites_gen)
            return True, None

        except (InvalidSiteURLValidationError, jsonschema.ValidationError) as e:
            # Configuration problems are reported as-is, without the "credentials" wording.
            return False, repr(e)
        except Exception as error:
            return (
                False,
                f"Unable to connect to Google Search Console API with the provided credentials - {repr(error)}",
            )

    @staticmethod
    def validate_site_urls(site_urls, auth):
        """Verify that every URL in ``site_urls`` is listed by the API for these credentials.

        :param site_urls: normalized site URLs from the config.
        :param auth: a ``ServiceAccountAuthenticator`` (applied to the request object) or an
            authenticator exposing ``get_auth_header()``.
        :raises Exception: when the sites endpoint does not answer with HTTP 200.
        :raises InvalidSiteURLValidationError: when some URLs are not returned by the endpoint.
        """
        sites_endpoint = "https://www.googleapis.com/webmasters/v3/sites"
        if isinstance(auth, ServiceAccountAuthenticator):
            request = auth(requests.Request(method="GET", url=sites_endpoint))
            with requests.Session() as session:
                response = session.send(session.prepare_request(request))
        else:
            response = requests.get(sites_endpoint, headers=auth.get_auth_header())

        if response.status_code != 200:
            # Error responses are not guaranteed to carry a JSON body; fall back to the raw
            # text so the failure is still reported with this message.
            try:
                error_details = response.json()
            except ValueError:
                error_details = response.text
            raise Exception(f"Unable to connect to Google Search Console API - {error_details}")

        response_data = response.json()
        # "siteEntry" may be missing from the response; treat that as "no sites available".
        remote_site_urls = {site["siteUrl"] for site in response_data.get("siteEntry", [])}
        invalid_site_url = set(site_urls) - remote_site_urls
        if invalid_site_url:
            # Sorted so the message is deterministic.
            raise InvalidSiteURLValidationError(f'The following URLs are not permitted: {", ".join(sorted(invalid_site_url))}')

    def streams(self, config: Mapping[str, Any]) -> List[Stream]:
        """
        Build all streams of the source: the built-in ones followed by custom reports.

        :param config: A Mapping of the user input configuration as defined in the connector spec.
        """
        config = self._validate_and_transform(config)
        stream_config = self.get_stream_kwargs(config)

        streams = [
            Sites(**stream_config),
            Sitemaps(**stream_config),
            SearchAnalyticsByCountry(**stream_config),
            SearchAnalyticsByDevice(**stream_config),
            SearchAnalyticsByDate(**stream_config),
            SearchAnalyticsByQuery(**stream_config),
            SearchAnalyticsByPage(**stream_config),
            SearchAnalyticsAllFields(**stream_config),
            SearchAnalyticsKeywordPageReport(**stream_config),
            SearchAnalyticsPageReport(**stream_config),
            SearchAnalyticsSiteReportBySite(**stream_config),
            SearchAnalyticsSiteReportByPage(**stream_config),
            SearchAnalyticsKeywordSiteReportByPage(**stream_config),
        ]

        streams = streams + self.get_custom_reports(config=config, stream_config=stream_config)

        return streams

    def get_custom_reports(self, config: Mapping[str, Any], stream_config: Mapping[str, Any]) -> List[Optional[Stream]]:
        """Create one stream per entry of ``config["custom_reports"]``.

        Each report gets its own dynamically created subclass of
        ``SearchAnalyticsByCustomDimensions`` named after the report, instantiated with the
        report's dimensions. Returns an empty list when no custom reports are configured.
        """
        return [
            type(report["name"], (SearchAnalyticsByCustomDimensions,), {})(dimensions=report["dimensions"], **stream_config)
            for report in config.get("custom_reports", [])
        ]

    def get_stream_kwargs(self, config: Mapping[str, Any]) -> Mapping[str, Any]:
        """Return the keyword arguments shared by all stream constructors.

        Expects a config already processed by ``_validate_and_transform``
        (``end_date`` and ``data_state`` are read without defaults).
        """
        return {
            "site_urls": config["site_urls"],
            "start_date": config["start_date"],
            "end_date": config["end_date"],
            "authenticator": self.get_authenticator(config),
            "data_state": config["data_state"],
        }

    def get_authenticator(self, config):
        """Build the authenticator matching ``config["authorization"]["auth_type"]``.

        :return: an ``Oauth2Authenticator`` for ``"Client"``, a ``ServiceAccountAuthenticator``
            for ``"Service"``, and ``None`` for any other value.
        """
        authorization = config["authorization"]
        auth_type = authorization["auth_type"]

        if auth_type == "Client":
            return Oauth2Authenticator(
                token_refresh_endpoint="https://oauth2.googleapis.com/token",
                client_secret=authorization["client_secret"],
                client_id=authorization["client_id"],
                refresh_token=authorization["refresh_token"],
            )
        if auth_type == "Service":
            return ServiceAccountAuthenticator(
                service_account_info=authorization["service_account_info"],
                email=authorization["email"],
            )
        # Unknown auth types are not rejected here; callers receive None.
        return None