#
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
#


from abc import ABC
from enum import Enum
from typing import Any, Dict, Iterable, List, Mapping, MutableMapping, Optional, Union
from urllib.parse import quote_plus, unquote_plus

import pendulum
import requests
from airbyte_cdk.models import SyncMode
from airbyte_cdk.sources.streams.http import HttpStream
from airbyte_cdk.sources.streams.http.auth import HttpAuthenticator

BASE_URL = "https://www.googleapis.com/webmasters/v3/"
ROW_LIMIT = 25000


class QueryAggregationType(Enum):
    auto = "auto"
    by_page = "byPage"
    by_property = "byProperty"


class GoogleSearchConsole(HttpStream, ABC):
    url_base = BASE_URL
    primary_key = None
    data_field = ""

    def __init__(
        self,
        authenticator: Union[HttpAuthenticator, requests.auth.AuthBase],
        site_urls: list,
        start_date: str,
        end_date: str,
        data_state: str = "final",
    ):
        super().__init__(authenticator=authenticator)
        self._site_urls = self.sanitize_urls_list(site_urls)
        self._start_date = start_date
        self._end_date = end_date
        self._data_state = data_state

    @staticmethod
    def sanitize_urls_list(site_urls: list) -> List[str]:
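        # `quote_plus` percent-encodes each site URL so it can be embedded in a request path,
        # e.g. "https://example.com/" becomes "https%3A%2F%2Fexample.com%2F".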
        return list(map(quote_plus, site_urls))

    def next_page_token(self, response: requests.Response) -> Optional[Mapping[str, Any]]:
        return None

    def stream_slices(
        self, sync_mode: SyncMode, cursor_field: List[str] = None, stream_state: Mapping[str, Any] = None
    ) -> Iterable[Optional[Mapping[str, Any]]]:
        for site_url in self._site_urls:
            yield {"site_url": site_url}

    def parse_response(self, response: requests.Response, **kwargs) -> Iterable[Mapping]:
        if not self.data_field:
            yield response.json()

        else:
            records = response.json().get(self.data_field) or []
            for record in records:
                yield record


class Sites(GoogleSearchConsole):
    """
    API docs: https://developers.google.com/webmaster-tools/search-console-api-original/v3/sites
    """

    def path(
        self,
        stream_state: Mapping[str, Any] = None,
        stream_slice: Mapping[str, Any] = None,
        next_page_token: Mapping[str, Any] = None,
    ) -> str:
        return f"sites/{stream_slice.get('site_url')}"


class Sitemaps(GoogleSearchConsole):
    """
    API docs: https://developers.google.com/webmaster-tools/search-console-api-original/v3/sitemaps
    """

    data_field = "sitemap"

    def path(
        self,
        stream_state: Mapping[str, Any] = None,
        stream_slice: Mapping[str, Any] = None,
        next_page_token: Mapping[str, Any] = None,
    ) -> str:
        return f"sites/{stream_slice.get('site_url')}/sitemaps"


class SearchAnalytics(GoogleSearchConsole, ABC):
    """
    API docs: https://developers.google.com/webmaster-tools/search-console-api-original/v3/searchanalytics
    """

    data_field = "rows"
    aggregation_type = QueryAggregationType.auto
    start_row = 0
    dimensions = []
    search_types = ["web", "news", "image", "video"]
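    # Each slice spans at most `range_of_days` calendar days (inclusive); see `stream_slices` below.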
    range_of_days = 3

    def path(
        self,
        stream_state: Mapping[str, Any] = None,
        stream_slice: Mapping[str, Any] = None,
        next_page_token: Mapping[str, Any] = None,
    ) -> str:
        return f"sites/{stream_slice.get('site_url')}/searchAnalytics/query"

    @property
    def cursor_field(self) -> Union[str, List[str]]:
        return "date"

    @property
    def http_method(self) -> str:
        return "POST"

    def stream_slices(
        self, sync_mode: SyncMode, cursor_field: List[str] = None, stream_state: Mapping[str, Any] = None
    ) -> Iterable[Optional[Mapping[str, Any]]]:
        """
        `stream_slices` iterates over `site_urls` and `search_types`. The user can pass multiple `site_url` values,
        and we have to process all of them. We also pass the `searchType` parameter in the request body to fetch data
        for each `searchType` value from [`web`, `news`, `image`, `video`, `discover`, `googleNews`].
        Each (site_url, searchType) pair is further split into date windows of at most `range_of_days` days.
        It's just a double nested loop with a yield statement.
        """
        for site_url in self._site_urls:
            for search_type in self.search_types:
                start_date = self._get_start_date(stream_state, site_url, search_type)
                end_date = self._get_end_date()

                if start_date > end_date:
                    start_date = end_date

                next_start = start_date
                period = pendulum.Duration(days=self.range_of_days - 1)
                while next_start <= end_date:
                    next_end = min(next_start + period, end_date)
                    yield {
                        "site_url": site_url,
                        "search_type": search_type,
                        "start_date": next_start.to_date_string(),
                        "end_date": next_end.to_date_string(),
                        "data_state": self._data_state,
                    }
                    # Start the next slice one day later so it doesn't duplicate data from this slice's end date.
                    next_start = next_end + pendulum.Duration(days=1)

    def next_page_token(self, response: requests.Response) -> Optional[bool]:
        """
        `next_page_token` implements pagination. It compares the number of records in the response
        with the constant `ROW_LIMIT` (maximum value 25000). If they are equal, we have received a full
        page and need to go further, so we simply increase the `startRow` parameter in the request body
        by `ROW_LIMIT`.
        """
        if len(response.json().get(self.data_field, [])) == ROW_LIMIT:
            self.start_row += ROW_LIMIT
            return True

        self.start_row = 0

    def request_headers(self, **kwargs) -> Mapping[str, Any]:
        return {"Content-Type": "application/json"}

    def request_body_json(
        self,
        stream_state: Mapping[str, Any] = None,
        stream_slice: Mapping[str, Any] = None,
        next_page_token: Mapping[str, Any] = None,
    ) -> Optional[Union[Dict[str, Any], str]]:
        """
        Here is a description of the parameters in the request body:
        1. The `startDate` is retrieved from `_get_start_date`:
           if `SyncMode == full_refresh`, the `start_date` from the configuration is used;
           otherwise the value saved by `get_updated_state` is used.
        2. The `endDate` is retrieved from the `config.json`.
        3. The `dimensions` parameter is used to group the results.
           The following dimensions are available: `date`, `country`, `page`, `device`, `query`.
        4. For the `type` parameter, see the `stream_slices` method.
           It filters results to one of ["web", "news", "image", "video", "discover", "googleNews"].
        5. For `startRow` and `rowLimit`, see the `next_page_token` method.
        """
        data = {
            "startDate": stream_slice["start_date"],
            "endDate": stream_slice["end_date"],
            "dimensions": self.dimensions,
            "type": stream_slice.get("search_type"),
            "aggregationType": self.aggregation_type.value,
            "startRow": self.start_row,
            "rowLimit": ROW_LIMIT,
            "dataState": stream_slice.get("data_state"),
        }
        return data

    def _get_end_date(self) -> pendulum.date:
        end_date = pendulum.parse(self._end_date).date()
        # Cap `end_date` at the current date.
        return min(end_date, pendulum.now().date())

    def _get_start_date(self, stream_state: Mapping[str, Any] = None, site_url: str = None, search_type: str = None) -> pendulum.date:
        start_date = pendulum.parse(self._start_date)

        if start_date and stream_state:
            if stream_state.get(unquote_plus(site_url), {}).get(search_type):
                stream_state_value = stream_state.get(unquote_plus(site_url), {}).get(search_type)
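                # Take the later of the configured start date and the saved cursor so days that
                # were already synced are not fetched again.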
                start_date = max(
                    pendulum.parse(stream_state_value[self.cursor_field]),
                    start_date,
                )

        return start_date.date()

    def parse_response(
        self,
        response: requests.Response,
        stream_state: Mapping[str, Any],
        stream_slice: Mapping[str, Any] = None,
        next_page_token: Mapping[str, Any] = None,
    ) -> Iterable[Mapping]:
        records = response.json().get(self.data_field) or []

        for record in records:
            record["site_url"] = unquote_plus(stream_slice.get("site_url"))
            record["search_type"] = stream_slice.get("search_type")
            for dimension in self.dimensions:
                record[dimension] = record["keys"].pop(0)

            # remove the now-empty `keys` field
            record.pop("keys")

            yield record

    def get_updated_state(
        self,
        current_stream_state: MutableMapping[str, Any],
        latest_record: Mapping[str, Any],
    ) -> Mapping[str, Any]:
        """
        With the existing nested loop implementation, we have to store a `cursor_field` for each `site_url`
        and `searchType`. This functionality is placed in `get_updated_state`.

        {
          "stream": {
            "https://domain1.com": {
              "web": {"date": "2022-01-03"},
              "news": {"date": "2022-01-03"},
              "image": {"date": "2022-01-03"},
              "video": {"date": "2022-01-03"}
            },
            "https://domain2.com": {
              "web": {"date": "2022-01-03"},
              "news": {"date": "2022-01-03"},
              "image": {"date": "2022-01-03"},
              "video": {"date": "2022-01-03"}
            },
            "date": "2022-01-03",
          }
        }
        """

        latest_benchmark = latest_record[self.cursor_field]

        site_url = latest_record.get("site_url")
        search_type = latest_record.get("search_type")

        value = current_stream_state.get(site_url, {}).get(search_type, {}).get(self.cursor_field)
        if value:
            latest_benchmark = max(latest_benchmark, value)
        current_stream_state.setdefault(site_url, {}).setdefault(search_type, {})[self.cursor_field] = latest_benchmark

        # We need the max date over all searchTypes here, but the current acceptance test YAML format
        # doesn't support that.
        current_stream_state[self.cursor_field] = current_stream_state[site_url][search_type][self.cursor_field]

        return current_stream_state


class SearchAnalyticsByDate(SearchAnalytics):
    search_types = ["web", "news", "image", "video", "discover", "googleNews"]
    dimensions = ["date"]


class SearchAnalyticsByCountry(SearchAnalytics):
    search_types = ["web", "news", "image", "video", "discover", "googleNews"]
    dimensions = ["date", "country"]


class SearchAnalyticsByDevice(SearchAnalytics):
    search_types = ["web", "news", "image", "video", "googleNews"]
    dimensions = ["date", "device"]


class SearchAnalyticsByPage(SearchAnalytics):
    search_types = ["web", "news", "image", "video", "discover", "googleNews"]
    dimensions = ["date", "page"]


class SearchAnalyticsByQuery(SearchAnalytics):
    dimensions = ["date", "query"]


class SearchAnalyticsAllFields(SearchAnalytics):
    dimensions = ["date", "country", "device", "page", "query"]

class SearchAppearance(SearchAnalytics):
    """
    The `searchAppearance` dimension can't be combined with other dimensions:
    search appearance data (AMP, blue link, rich result, and so on) must be queried using a two-step process.
    https://developers.google.com/webmaster-tools/v1/how-tos/all-your-data#search-appearance-data
    """

    dimensions = ["searchAppearance"]


class SearchByKeyword(SearchAnalytics):
    """
    Step two of the two-step process: adds the `searchAppearance` values fetched by `SearchAppearance`
    to `dimensionFilterGroups` in the JSON request body.
    https://developers.google.com/webmaster-tools/v1/how-tos/all-your-data#search-appearance-data
    """

    def request_body_json(
        self,
        stream_state: Mapping[str, Any] = None,
        stream_slice: Mapping[str, Any] = None,
        next_page_token: Mapping[str, Any] = None,
    ) -> Optional[Union[Dict[str, Any], str]]:
        data = super().request_body_json(stream_state, stream_slice, next_page_token)

        stream = SearchAppearance(self.authenticator, self._site_urls, self._start_date, self._end_date)
        keywords_records = stream.read_records(sync_mode=SyncMode.full_refresh, stream_state=stream_state, stream_slice=stream_slice)
        keywords = {record["searchAppearance"] for record in keywords_records}

        filters = []
        for keyword in keywords:
            filters.append({"dimension": "searchAppearance", "operator": "equals", "expression": keyword})
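        # The resulting filter group looks like this (the expression value is illustrative):
        # {"dimensionFilterGroups": [{"filters": [
        #     {"dimension": "searchAppearance", "operator": "equals", "expression": "AMP_BLUE_LINK"}]}]}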
        data["dimensionFilterGroups"] = [{"filters": filters}]

        return data


class SearchAnalyticsKeywordPageReport(SearchByKeyword):
    dimensions = ["date", "country", "device", "query", "page"]


class SearchAnalyticsKeywordSiteReportByPage(SearchByKeyword):
    dimensions = ["date", "country", "device", "query"]
    aggregation_type = QueryAggregationType.by_page


class SearchAnalyticsSiteReportBySite(SearchAnalytics):
    dimensions = ["date", "country", "device"]
    aggregation_type = QueryAggregationType.by_property


class SearchAnalyticsSiteReportByPage(SearchAnalytics):
    search_types = ["web", "news", "image", "video", "googleNews"]
    dimensions = ["date", "country", "device"]
    aggregation_type = QueryAggregationType.by_page


class SearchAnalyticsPageReport(SearchAnalytics):
    search_types = ["web", "news", "image", "video", "googleNews"]
    dimensions = ["date", "country", "device", "page"]


class SearchAnalyticsByCustomDimensions(SearchAnalytics):
    dimension_to_property_schema_map = {
        "country": [{"country": {"type": ["null", "string"]}}],
        "date": [],
        "device": [{"device": {"type": ["null", "string"]}}],
        "page": [{"page": {"type": ["null", "string"]}}],
        "query": [{"query": {"type": ["null", "string"]}}],
    }

    def __init__(self, dimensions: List[str], *args, **kwargs):
        super(SearchAnalyticsByCustomDimensions, self).__init__(*args, **kwargs)
        self.dimensions = dimensions

    def get_json_schema(self) -> Mapping[str, Any]:
        try:
            return super(SearchAnalyticsByCustomDimensions, self).get_json_schema()
        except FileNotFoundError:
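            # No static JSON schema file exists for this dynamically-configured stream,
            # so fall back to building the schema from the selected dimensions.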
            schema: Mapping[str, Any] = {
                "$schema": "https://json-schema.org/draft-07/schema#",
                "type": ["null", "object"],
                "additionalProperties": True,
                "properties": {
                    "clicks": {"type": ["null", "integer"]},
                    "ctr": {"type": ["null", "number"], "multipleOf": 1e-25},
                    "date": {"type": ["null", "string"], "format": "date"},
                    "impressions": {"type": ["null", "integer"]},
                    "position": {"type": ["null", "number"], "multipleOf": 1e-25},
                    "search_type": {"type": ["null", "string"]},
                    "site_url": {"type": ["null", "string"]},
                },
            }

            dimension_properties = self.dimension_to_property_schema()
            schema["properties"].update(dimension_properties)

            return schema

    def dimension_to_property_schema(self) -> dict:
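        # Merge the per-dimension property snippets into one mapping, e.g. dimensions
        # ["date", "country"] contribute {"country": {"type": ["null", "string"]}}
        # ("date" maps to [] because it is already in the base schema).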
        properties = {}
        for dimension in sorted(self.dimensions):
            fields = self.dimension_to_property_schema_map[dimension]
            for field in fields:
                properties = {**properties, **field}
        return properties