airbyte/airbyte-integrations/connectors/source-sendgrid/source_sendgrid/streams.py
Marcos Marx a89fd05187 Source Sendgrid: add unsub groups stream (#26314)
* add unsubscribe groups stream

* add doc

* fix tests

* fix tests

* fix

* fix update version

* Automated Change

* Delete oss_registry.json

* add expected records

* add eof

* small fix

---------

Co-authored-by: Haithem Souala <haithem.souala@woopit.fr>
2023-05-23 10:03:06 -03:00


#
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
#
import math
import os
import time
import zlib
from abc import ABC, abstractmethod
from contextlib import closing
from typing import Any, Iterable, List, Mapping, MutableMapping, Optional, Tuple
from urllib.parse import urlparse

import pandas as pd
import pendulum
import requests
from airbyte_cdk.models import SyncMode
from airbyte_cdk.sources.streams.http import HttpStream
from airbyte_cdk.sources.streams.http.rate_limiting import default_backoff_handler
from numpy import nan
from pendulum import DateTime
from requests import codes, exceptions

class SendgridStream(HttpStream, ABC):
    url_base = "https://api.sendgrid.com/v3/"
    primary_key = "id"
    limit = 50
    data_field = None
    raise_on_http_errors = True
    permission_error_codes = {
        400: "authorization required",
        401: "authorization required",
    }

    def next_page_token(self, response: requests.Response) -> Optional[Mapping[str, Any]]:
        pass

    def parse_response(
        self,
        response: requests.Response,
        stream_state: Mapping[str, Any] = None,
        stream_slice: Mapping[str, Any] = None,
        next_page_token: Mapping[str, Any] = None,
    ) -> Iterable[Mapping]:
        json_response = response.json()
        records = json_response.get(self.data_field, []) if self.data_field is not None else json_response

        if records is not None:
            for record in records:
                yield record
        else:
            # TODO sendgrid's API is sending null responses at times. This seems like a bug on the API side, so we're adding
            # log statements to help reproduce and prevent the connector from failing.
            err_msg = (
                f"Response contained no valid JSON data.\n"
                f"Response status: {response.status_code}\n"
                f"Response body: {response.text}\n"
                f"Response headers: {response.headers}\n"
                f"Request URL: {response.request.url}\n"
                f"Request body: {response.request.body}\n"
            )
            # do NOT log request headers as they contain the auth token
            self.logger.info(err_msg)

    def should_retry(self, response: requests.Response) -> bool:
        """Override to allow skipping a stream the account cannot access."""
        status = response.status_code
        if status in self.permission_error_codes.keys():
            for message in response.json().get("errors", []):
                if message.get("message") == self.permission_error_codes.get(status):
                    self.logger.error(
                        f"Stream `{self.name}` is not available, due to subscription plan limitations or permission issues. Skipping."
                    )
                    setattr(self, "raise_on_http_errors", False)
                    return False
        return 500 <= response.status_code < 600
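
# Illustrative note: the skip logic in should_retry above matches error bodies of the shape
# Sendgrid returns for plan/permission failures, e.g. an HTTP 401 with
#   {"errors": [{"message": "authorization required"}]}
# Only when the message matches the entry in permission_error_codes is the stream skipped;
# other 4xx responses still raise because raise_on_http_errors stays True.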

class SendgridStreamOffsetPagination(SendgridStream):
    offset = 0

    def request_params(self, next_page_token: Mapping[str, Any] = None, **kwargs) -> MutableMapping[str, Any]:
        params = super().request_params(next_page_token=next_page_token, **kwargs)
        params["limit"] = self.limit
        if next_page_token:
            params.update(**next_page_token)
        return params

    def next_page_token(self, response: requests.Response) -> Optional[Mapping[str, Any]]:
        stream_data = response.json()
        if self.data_field:
            stream_data = stream_data[self.data_field]
        if len(stream_data) < self.limit:
            return
        self.offset += self.limit
        return {"offset": self.offset}

class SendgridStreamIncrementalMixin(HttpStream, ABC):
    cursor_field = "created"

    def __init__(self, start_time: Optional[str], **kwargs):
        super().__init__(**kwargs)
        self._start_time = start_time or 0
        if isinstance(self._start_time, str):
            self._start_time = int(pendulum.parse(self._start_time).timestamp())

    def get_updated_state(self, current_stream_state: MutableMapping[str, Any], latest_record: Mapping[str, Any]) -> Mapping[str, Any]:
        """
        Return the latest state by comparing the cursor value in the latest record with the stream's most recent state object
        and returning an updated state object.
        """
        latest_benchmark = latest_record[self.cursor_field]
        if current_stream_state.get(self.cursor_field):
            return {self.cursor_field: max(latest_benchmark, current_stream_state[self.cursor_field])}
        return {self.cursor_field: latest_benchmark}

    def request_params(self, stream_state: Mapping[str, Any], **kwargs) -> MutableMapping[str, Any]:
        params = super().request_params(stream_state=stream_state)
        start_time = self._start_time
        if stream_state.get(self.cursor_field):
            start_time = stream_state[self.cursor_field]
        params.update({"start_time": start_time, "end_time": pendulum.now().int_timestamp})
        return params
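
# Illustrative sketch of how the mixin above advances state (timestamps are made up):
#   first sync:   params = {"start_time": <configured start or 0>, "end_time": <now>}
#   latest record {"created": 1684800000, ...} -> state becomes {"created": 1684800000}
#   next sync:    params = {"start_time": 1684800000, "end_time": <now>}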

class SendgridStreamMetadataPagination(SendgridStream):
    def request_params(
        self,
        stream_state: Mapping[str, Any],
        stream_slice: Mapping[str, Any] = None,
        next_page_token: Mapping[str, Any] = None,
    ) -> MutableMapping[str, Any]:
        params = {}
        if not next_page_token:
            params = {"page_size": self.limit}
        return params

    def next_page_token(self, response: requests.Response) -> Optional[Mapping[str, Any]]:
        next_page_url = response.json()["_metadata"].get("next", False)
        if next_page_url:
            return {"next_page_url": next_page_url.replace(self.url_base, "")}

    @staticmethod
    @abstractmethod
    def initial_path() -> str:
        """
        :return: the initial path for the API endpoint, used when no next-page metadata URL has been found yet
        """

    def path(
        self,
        stream_state: Mapping[str, Any] = None,
        stream_slice: Mapping[str, Any] = None,
        next_page_token: Mapping[str, Any] = None,
    ) -> str:
        if next_page_token:
            return next_page_token["next_page_url"]
        return self.initial_path()
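
# Illustrative sketch of the metadata pagination above (the page_token value is hypothetical):
# a response like
#   {"result": [...], "_metadata": {"next": "https://api.sendgrid.com/v3/marketing/lists?page_token=abc"}}
# becomes next_page_token = {"next_page_url": "marketing/lists?page_token=abc"} once the
# url_base prefix is stripped, and path() returns that string verbatim for the next request.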

class Scopes(SendgridStream):
    def path(self, **kwargs) -> str:
        return "scopes"


class Lists(SendgridStreamMetadataPagination):
    data_field = "result"

    @staticmethod
    def initial_path() -> str:
        return "marketing/lists"


class Campaigns(SendgridStreamMetadataPagination):
    data_field = "result"

    @staticmethod
    def initial_path() -> str:
        return "marketing/campaigns"

class Contacts(SendgridStream):
    primary_key = "contact_id"
    MAX_RETRY_NUMBER = 3
    DEFAULT_WAIT_TIMEOUT_SECONDS = 60
    MAX_CHECK_INTERVAL_SECONDS = 2.0
    encoding = "utf-8"

    def path(self, **kwargs) -> str:
        return "marketing/contacts/exports"

    @default_backoff_handler(max_tries=5, factor=15)
    def _send_http_request(self, method: str, url: str, stream: bool = False, enable_auth: bool = True):
        headers = self.authenticator.get_auth_header() if enable_auth else None
        response = self._session.request(method, url=url, headers=headers, stream=stream)
        if response.status_code not in [200, 202]:
            self.logger.error(f"error body: {response.text}")
        response.raise_for_status()
        return response

    def read_records(
        self,
        sync_mode: SyncMode,
        cursor_field: List[str] = None,
        stream_slice: Mapping[str, Any] = None,
        stream_state: Mapping[str, Any] = None,
    ) -> Iterable[Mapping[str, Any]]:
        csv_urls, job_status = self.execute_export_job(url=f"{self.url_base}{self.path()}")
        if job_status == "failed":
            raise Exception(f"Export job failed {self.MAX_RETRY_NUMBER} times in a row, skipping reading stream {self.name}")
        for url in csv_urls:
            for record in self.read_with_chunks(*self.download_data(url=url)):
                yield record

    def execute_export_job(self, url: str) -> Tuple[Optional[List[str]], str]:
        job_status = "failed"
        for i in range(0, self.MAX_RETRY_NUMBER):
            job_id = self.create_export_job(url=url)
            if not job_id:
                return None, job_status
            job_full_url = f"{url}/{job_id}"
            urls, job_status = self.wait_for_job(url=job_full_url)
            if urls:
                break
            self.logger.error(f"Waiting for the export job failed. Retrying ({i + 1}/{self.MAX_RETRY_NUMBER})...")
        return urls, job_status
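
    # Illustrative note: the export flow above is create job (POST) -> poll status (GET) ->
    # download CSVs. A job that yields no download URLs is recreated up to MAX_RETRY_NUMBER
    # times before the last status is returned to read_records.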
    def create_export_job(self, url: str) -> Optional[str]:
        """
        docs: https://docs.sendgrid.com/api-reference/contacts/export-contacts
        """
        try:
            response = self._send_http_request("POST", url)
            job_id: str = response.json().get("id")
            return job_id
        except exceptions.HTTPError as error:
            if error.response.status_code in [codes.BAD_REQUEST, codes.UNAUTHORIZED, codes.FORBIDDEN, codes.NOT_FOUND, codes.SERVER_ERROR]:
                error_data = error.response.json().get("errors")[0]
                error_id = error_data.get("error_id")
                error_message = error_data.get("message")
                error_parameter = error_data.get("parameter")
                self.logger.error(f"Cannot receive data for stream '{self.name}', {error_message=}, {error_id=}, {error_parameter=}")
            else:
                raise error
        return None
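
    # Illustrative note: per the docs linked above, a successful POST responds with a body
    # like {"id": "<export job id>"}; wait_for_job below then polls
    # marketing/contacts/exports/{id} until the job is ready or the timeout elapses.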
    def wait_for_job(self, url: str) -> Tuple[List[str], str]:
        """
        docs: https://docs.sendgrid.com/api-reference/contacts/export-contacts-status
        """
        expiration_time: DateTime = pendulum.now().add(seconds=self.DEFAULT_WAIT_TIMEOUT_SECONDS)
        job_status = "pending"
        urls: List[str] = []
        delay_timeout = 0.0
        delay_cnt = 0
        job_info = None
        time.sleep(0.5)
        while pendulum.now() < expiration_time:
            job_info = self._send_http_request("GET", url=url).json()
            job_status = job_info.get("status")
            urls = job_info.get("urls", [])
            if job_status in ("ready", "failure"):
                if job_status != "ready":
                    self.logger.error(f"JobStatus: {job_status}, error message: '{job_info}'")
                return urls, job_status

            if delay_timeout < self.MAX_CHECK_INTERVAL_SECONDS:
                delay_timeout = 0.5 + math.exp(delay_cnt) / 1000.0
                delay_cnt += 1

            time.sleep(delay_timeout)
            job_id = job_info["id"]
            self.logger.info(
                f"Sleeping {delay_timeout} seconds while waiting for Job: {self.name}/{job_id} to complete. Current state: {job_status}"
            )

        self.logger.warning(f"Export job for stream {self.name} did not complete within {self.DEFAULT_WAIT_TIMEOUT_SECONDS} seconds, last response: {job_info}!")
        return urls, job_status
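
    # Illustrative note on the polling cadence above: the sleep grows as 0.5 + e**n / 1000
    # (~0.50, 0.50, 0.51, 0.52, 0.55, 0.65, 0.90, 1.60, 3.48 seconds) and stops growing
    # once it exceeds MAX_CHECK_INTERVAL_SECONDS, so that constant gates further growth
    # rather than strictly capping the sleep itself.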
    def download_data(self, url: str, chunk_size: int = 1024) -> Tuple[str, str]:
        """
        Retrieves the binary data of a successfully executed export job, in chunks, to avoid local memory limitations.
        The response is received in gzip binary format.
        @ url: string - the url of the executed job
        @ chunk_size: int - the buffer size for each chunk fetched from the stream, in bytes, default: 1024 bytes
        Returns a tuple containing the file path of the downloaded binary data (saved temporarily) and the file encoding.
        """
        # set filepath for binary data from response
        decompressor = zlib.decompressobj(zlib.MAX_WBITS | 32)
        url_parsed = urlparse(url)
        tmp_file = os.path.realpath(os.path.basename(url_parsed.path[1:-5]))
        with closing(self._send_http_request("GET", f"{url}", stream=True, enable_auth=False)) as response, open(
            tmp_file, "wb"
        ) as data_file:
            for chunk in response.iter_content(chunk_size=chunk_size):
                data_file.write(decompressor.decompress(chunk))
        # check the file exists
        if os.path.isfile(tmp_file):
            return tmp_file, self.encoding
        else:
            raise Exception(f"An I/O error occurred while verifying binary data. Stream: {self.name}, file {tmp_file} doesn't exist.")
    def read_with_chunks(self, path: str, file_encoding: str, chunk_size: int = 100) -> Iterable[Mapping[str, Any]]:
        """
        Reads the downloaded binary data in chunks of lines, set by `chunk_size`.
        @ path: string - the path to the temporarily downloaded binary data
        @ file_encoding: string - encoding for the binary data file, per the Standard Encodings of the codecs module
        @ chunk_size: int - the number of lines to read at a time, default: 100 lines
        """
        try:
            with open(path, "r", encoding=file_encoding) as data:
                chunks = pd.read_csv(data, chunksize=chunk_size, iterator=True, dialect="unix", dtype=str)
                for chunk in chunks:
                    chunk = ({k.lower(): v for k, v in x.items()} for x in chunk.replace({nan: None}).to_dict(orient="records"))
                    for row in chunk:
                        yield row
        except pd.errors.EmptyDataError as e:
            self.logger.info(f"Empty data received. {e}")
            yield from []
        except IOError as ioe:
            raise Exception(f"An I/O error occurred while reading tmp data. Called: {path}. Stream: {self.name}", ioe)
        finally:
            # remove the binary tmp file after the data is read
            os.remove(path)
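
# Illustrative note: each row yielded by read_with_chunks is a plain dict with lower-cased
# CSV headers and NaN cells replaced by None, e.g. {"contact_id": "...", "email": "...",
# "first_name": None}. The field names here are hypothetical; the actual columns come from
# the export CSV.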

class StatsAutomations(SendgridStreamMetadataPagination):
    data_field = "results"

    @staticmethod
    def initial_path() -> str:
        return "marketing/stats/automations"


class Segments(SendgridStream):
    data_field = "results"

    def path(self, **kwargs) -> str:
        return "marketing/segments"


class SingleSends(SendgridStreamMetadataPagination):
    """
    https://docs.sendgrid.com/api-reference/marketing-campaign-stats/get-all-single-sends-stats
    """

    data_field = "results"

    @staticmethod
    def initial_path() -> str:
        return "marketing/stats/singlesends"


class Templates(SendgridStreamMetadataPagination):
    data_field = "result"

    def request_params(self, next_page_token: Mapping[str, Any] = None, **kwargs) -> MutableMapping[str, Any]:
        params = super().request_params(next_page_token=next_page_token, **kwargs)
        params["generations"] = "legacy,dynamic"
        return params

    @staticmethod
    def initial_path() -> str:
        return "templates"


class GlobalSuppressions(SendgridStreamOffsetPagination, SendgridStreamIncrementalMixin):
    primary_key = "email"

    def path(self, **kwargs) -> str:
        return "suppression/unsubscribes"


class SuppressionGroups(SendgridStream):
    def path(self, **kwargs) -> str:
        return "asm/groups"


class SuppressionGroupMembers(SendgridStreamOffsetPagination):
    primary_key = "group_id"

    def path(self, **kwargs) -> str:
        return "asm/suppressions"


class Blocks(SendgridStreamOffsetPagination, SendgridStreamIncrementalMixin):
    primary_key = "email"

    def path(self, **kwargs) -> str:
        return "suppression/blocks"


class Bounces(SendgridStream, SendgridStreamIncrementalMixin):
    primary_key = "email"

    def path(self, **kwargs) -> str:
        return "suppression/bounces"


class InvalidEmails(SendgridStreamOffsetPagination, SendgridStreamIncrementalMixin):
    primary_key = "email"

    def path(self, **kwargs) -> str:
        return "suppression/invalid_emails"


class SpamReports(SendgridStreamOffsetPagination, SendgridStreamIncrementalMixin):
    primary_key = "email"

    def path(self, **kwargs) -> str:
        return "suppression/spam_reports"


class UnsubscribeGroups(SendgridStream):
    def path(self, **kwargs) -> str:
        return "asm/groups"