1
0
mirror of synced 2026-01-23 04:02:42 -05:00
Files
airbyte/airbyte-integrations/connectors/source-zuora/source_zuora/source.py
Cole Snodgrass 2e099acc52 update headers from 2022 -> 2023 (#22594)
* It's 2023!

* 2022 -> 2023

---------

Co-authored-by: evantahler <evan@airbyte.io>
2023-02-08 13:01:16 -08:00

535 lines
21 KiB
Python

#
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
#
import json
from abc import ABC
from datetime import datetime
from typing import Any, Dict, Iterable, List, Mapping, MutableMapping, Optional, Tuple
import pendulum
import requests
from airbyte_cdk.logger import AirbyteLogger
from airbyte_cdk.models import AirbyteStream, SyncMode
from airbyte_cdk.sources import AbstractSource
from airbyte_cdk.sources.streams.http import HttpStream
from .zuora_auth import ZuoraAuthenticator
from .zuora_errors import (
QueryWindowError,
ZOQLQueryCannotProcessObject,
ZOQLQueryFailed,
ZOQLQueryFieldCannotResolveAltCursor,
ZOQLQueryFieldCannotResolveCursor,
)
from .zuora_excluded_streams import ZUORA_EXCLUDED_STREAMS
class ZuoraStream(HttpStream, ABC):
    """
    Common ancestor of every stream class in this module (SourceZuora is not a stream).

    Keeps the connector config and exposes the values the streams read from it.
    """

    # Define primary key
    primary_key = "id"

    # Default cursor and the alternative one used when the default is unavailable
    cursor_field = "updateddate"
    alt_cursor_field = "createddate"

    def __init__(self, config: Dict):
        # The authenticator is expected to be already placed into the config by the caller.
        super().__init__(authenticator=config["authenticator"])
        self._config = config

    @property
    def url_base(self) -> str:
        """Base URL taken from the `url_base` config entry."""
        return self._config["url_base"]

    @property
    def window_in_days(self) -> float:
        """
        The `window_in_days` config entry converted to float.

        :raises QueryWindowError: when the configured value cannot be converted.
        """
        raw_window = self._config["window_in_days"]
        try:
            return float(raw_window)
        except ValueError:
            raise QueryWindowError(raw_window)

    def next_page_token(self, response: requests.Response) -> Optional[Mapping[str, Any]]:
        """Abstractmethod HTTPStream CDK dependency; always returns None."""
        return None

    def request_params(self, stream_state: Mapping[str, Any], **kwargs) -> MutableMapping[str, Any]:
        """Abstractmethod HTTPStream CDK dependency; no query-string parameters are sent."""
        return {}

    def base_query_params(self) -> MutableMapping[str, Any]:
        """
        Build the request-body parameters shared by all submitted query jobs.

        `sourceData` is added only when the `data_query` config entry equals "Unlimited".
        """
        body = {"compression": "NONE", "output": {"target": "S3"}, "outputFormat": "JSON"}
        if self._config["data_query"] != "Unlimited":
            return body
        body["sourceData"] = "DATAHUB"
        return body
class ZuoraBase(ZuoraStream):
    """
    Base child class with the shared 'Submit > Check > Get' job flow, used by:
    - ZuoraObjectsBase, ZuoraListObjects, ZuoraDescribeObject
    """

    def path(self, **kwargs) -> str:
        """Abstractmethod HTTPStream CDK dependency"""
        return ""

    def request_kwargs(self, stream_slice: Mapping[str, Any] = None, **kwargs) -> Mapping[str, Any]:
        """
        Override of the default CDK method: the stream slice (date slice) itself is
        returned as the request kwargs, or an empty dict when there is no slice.
        """
        return stream_slice or {}

    def get_zuora_data(self, date_slice: Dict, config: Dict, full_object: bool = False) -> Iterable[Mapping[str, Any]]:
        """
        Wrapper for the 'Submit > Check > Get' operation.

        1. ZuoraSubmitJob submits the query built by `self.query(...)`; its first record is the job id.
        2. ZuoraJobStatusCheck is read for that job id; its first record is the 'dataFile' URL.
        3. ZuoraGetJobResult reads that URL and its records are yielded from here.

        :param date_slice: slice passed to `self.query` when filtering by cursor
        :param config: connector config handed to the job streams
        :param full_object: when True the query is built without cursor/date filtering, default `False`
        """
        query_arguments = {"stream_name": self.name}
        if full_object:
            # If the cursor is not available, we fetch the whole object
            query_arguments["full_object"] = True
        else:
            # Default prepared job with cursor
            query_arguments["cursor_field"] = self.cursor_field
            query_arguments["date_slice"] = date_slice
        job_query = self.query(**query_arguments)

        submitted = ZuoraSubmitJob(job_query, config).read_records(sync_mode=None)
        job_id = list(submitted)[0]
        checked = ZuoraJobStatusCheck(job_id, config).read_records(sync_mode=None)
        data_file_url = list(checked)[0]
        yield from ZuoraGetJobResult(data_file_url).read_records(sync_mode=None)

    def _send_request(self, request: requests.PreparedRequest, request_kwargs: Mapping[str, Any]) -> requests.Response:
        """
        Override for _send_request CDK method: instead of sending `request`, the records are
        produced by `get_zuora_data`, with `request_kwargs` used as the date slice.

        Fallback order: default cursor -> alternative cursor -> whole object.
        """

        def fetch(**options):
            # one complete 'Submit > Check > Get' round for the current slice
            return self.get_zuora_data(date_slice=request_kwargs, config=self._config, **options)

        try:
            # first attempt uses the default cursor_field = UpdatedDate
            yield from fetch()
        except ZOQLQueryCannotProcessObject:
            # do nothing if we cannot resolve the object
            pass
        except ZOQLQueryFieldCannotResolveCursor:
            # The default cursor_field "updateddate" is not supported by certain streams.
            # Switch the cursor to the alternative one and submit a new job to the server;
            # the switch is kept on the instance so the state is saved under the same field.
            self.cursor_field = self.alt_cursor_field
            try:
                # retry the whole operation with the alternative cursor = CreatedDate
                yield from fetch()
            except ZOQLQueryFieldCannotResolveAltCursor:
                # "createddate" could also be unavailable for some custom objects:
                # fetch the whole object without any filtering
                yield from fetch(full_object=True)
            except ZOQLQueryCannotProcessObject:
                # do nothing if we cannot resolve the object
                pass

    def parse_response(self, response: requests.Response, **kwargs) -> str:
        """Pass through the items of `response` unchanged."""
        yield from response
class ZuoraObjectsBase(ZuoraBase):
    """
    Main class for all the Zuora data streams (Zuora Object names),
    provides functionality for dynamically created classes as streams of data.
    """

    @property
    def state_checkpoint_interval(self) -> float:
        """Checkpoint interval, taken from the configured query window."""
        return self.window_in_days

    @staticmethod
    def to_datetime_str(date: datetime) -> str:
        """
        Custom method.
        Returns the formatted datetime string used as a timestamp in the queries.
        :: Format: "%Y-%m-%d %H:%M:%S.%f %Z"
        """
        return date.strftime("%Y-%m-%d %H:%M:%S.%f %Z")

    def get_cursor_from_schema(self, schema: Dict) -> Optional[str]:
        """
        Get the cursor_field from the stream's schema rather than from the class attribute.

        :param schema: the `properties` mapping of the stream's JSON schema
        :return: `cursor_field` when it is in the schema, otherwise `alt_cursor_field`
                 when that one is in the schema, otherwise None (stream is `full_refresh` only).
        """
        for candidate in (self.cursor_field, self.alt_cursor_field):
            if candidate in schema:
                return candidate
        return None

    def get_json_schema(self) -> Mapping[str, Any]:
        """
        Override get_json_schema CDK method to retrieve the schema information for the Zuora Object dynamically.
        The records returned by ZuoraDescribeObject are merged into one `properties` mapping.
        """
        schema = list(ZuoraDescribeObject(self.name, config=self._config).read_records(sync_mode=None))
        return {"type": "object", "properties": {key: d[key] for d in schema for key in d}}

    def as_airbyte_stream(self) -> AirbyteStream:
        """
        Override as_airbyte_stream CDK method to replace default 'default_cursor_field' behaviour,
        :: We use the cursor_field defined inside schema instead of using class attribute by default.
        :: But we still need the default class attribute 'cursor_field' in order to CDK read_records works properly.
        """
        stream = super().as_airbyte_stream()
        stream_cursor = self.get_cursor_from_schema(stream.json_schema["properties"])
        if stream_cursor:
            stream.default_cursor_field = [stream_cursor]
        else:
            # When there is no cursor available in the stream, we do Full-Refresh only.
            stream.supported_sync_modes = [SyncMode.full_refresh]
            stream.source_defined_cursor = True  # default CDK for full-refresh
            stream.default_cursor_field = []  # default CDK for full-refresh
        return stream

    def get_updated_state(self, current_stream_state: MutableMapping[str, Any], latest_record: Mapping[str, Any]) -> Mapping[str, Any]:
        """
        Update the state value, default CDK method.

        :return: `{cursor_field: <greater of record value and saved value>}`, or an empty
                 dict when neither side has a value.
        """
        # `or ""` also covers a cursor key that is present but null, which would
        # otherwise make max() compare None with a string and raise TypeError.
        latest_value = latest_record.get(self.cursor_field) or ""
        saved_value = current_stream_state.get(self.cursor_field) or ""
        updated_state = max(latest_value, saved_value)
        return {self.cursor_field: updated_state} if updated_state else {}

    def query(self, stream_name: str, cursor_field: str = None, date_slice: Dict = None, full_object: bool = False) -> str:
        """
        Custom method. Returns the SQL-like query text that is submitted as a job.

        :param stream_name: object (table) name to select from
        :param cursor_field: column used for the date range filter and ordering
        :param date_slice: dict with `start_date` and `end_date` bounds (both inclusive in the query)
        :param full_object: when True, select the whole object without any filter
        """
        if full_object:
            return f"""select * from {stream_name}"""
        return f"""
select *
from {stream_name} where
{cursor_field} >= TIMESTAMP '{date_slice.get('start_date')}' and
{cursor_field} <= TIMESTAMP '{date_slice.get('end_date')}'
order by {cursor_field} asc
"""

    def stream_slices(self, stream_state: Mapping[str, Any] = None, **kwargs) -> Iterable[Optional[Mapping[str, Any]]]:
        """
        Override default stream_slices CDK method to provide date_slices as page chunks for data fetch.
        Yields dicts with `start_date` / `end_date` strings produced by `to_datetime_str`,
        each slice spanning `window_in_days` days, starting from the saved state
        (or the configured `start_date`) until the current time is covered.
        """
        start_date = pendulum.parse(self._config["start_date"]).astimezone()
        end_date = pendulum.now().astimezone()

        # Determine stream_state, if there is no usable state value we keep the configured start_date
        if stream_state:
            state = stream_state.get(self.cursor_field, stream_state.get(self.alt_cursor_field))
            if state:
                # Only override with a parsed value; falling back to the raw config string
                # here would make the min() below compare a str with a datetime.
                start_date = pendulum.parse(state)

        # use the lowest date between start_date and end_date, otherwise API fails if start_date is in future
        start_date = min(start_date, end_date)

        # NOTE(review): a `window_in_days` of 0 (or less) never advances start_date past end_date,
        # so this loop would not terminate — confirm the config is validated upstream.
        while start_date <= end_date:
            end_date_slice = start_date.add(days=self.window_in_days)
            yield {"start_date": self.to_datetime_str(start_date), "end_date": self.to_datetime_str(end_date_slice)}
            start_date = end_date_slice
class ZuoraListObjects(ZuoraBase):
    """
    Retrieves the list of Zuora Objects as a list of object names.
    """

    def query(self, **kwargs) -> str:
        """Query text for this stream; the keyword arguments are ignored."""
        return "SHOW TABLES"

    def parse_response(self, response: requests.Response, **kwargs) -> Iterable[Mapping]:
        """Collect the "Table" value of every row in `response`."""
        table_names = []
        for row in response:
            table_names.append(row["Table"])
        return table_names
class ZuoraDescribeObject(ZuoraBase):
    """
    Retrieves the Zuora Object (stream) schema dynamically from the endpoint and
    converts it into JSONSchema types for the connector's catalog.
    """

    def __init__(self, zuora_object_name: str, config: Dict):
        super().__init__(config)
        self.zuora_object_name = zuora_object_name

    def query(self, **kwargs) -> str:
        """Query text describing the configured object; the keyword arguments are ignored."""
        return f"DESCRIBE {self.zuora_object_name}"

    def parse_response(self, response: requests.Response, **kwargs) -> List[Dict]:
        """
        Translate the described columns into a single `{column: {"type": [...]}}` mapping.

        Response example:
        [
            {'Column': 'taxexempteffectivedate', 'Type': 'date', 'Extra': '', 'Comment': 'TaxExemptEffectiveDate'},
            {'Column': 'invoicetemplateid', 'Type': 'varchar', 'Extra': '', 'Comment': 'InvoiceTemplateId'}...
        ]

        Column types that are not listed below fall back to the string type.
        """
        type_string = ["string", "null"]
        # (JSONSchema type, Zuora column types mapped onto it)
        type_groups = (
            (
                ["number", "null"],
                ("decimal(22,9)", "decimal", "integer", "int", "bigint", "smallint", "double", "float", "timestamp"),
            ),
            (type_string, ("date", "datetime", "timestamp with time zone", "picklist", "text", "varchar")),
            (["object", "null"], ("zoql", "binary", "json", "xml", "blob")),
            (["array", "null"], ("list", "array")),
            (["boolean", "null"], ("boolean", "bool")),
        )
        type_mapping = {zuora_type: json_type for json_type, zuora_types in type_groups for zuora_type in zuora_types}

        json_schema = {
            column.get("Column"): {"type": type_mapping.get(column.get("Type"), type_string)} for column in response
        }
        return [json_schema]
class ZuoraSubmitJob(ZuoraStream):
    """
    Submits a ZOQL Data Query job on the server.
    Returns the job id as confirmation of the successfully submitted job.
    """

    http_method = "POST"

    def __init__(self, query: str, config: Dict):
        super().__init__(config)
        self.query = query

    def path(self, **kwargs) -> str:
        """Endpoint the job is posted to."""
        return "/query/jobs"

    def request_body_json(self, **kwargs) -> Optional[Mapping]:
        """
        Override of default CDK method: the JSON body is the base query parameters
        plus the query text under the "query" key.
        """
        return {**self.base_query_params(), "query": self.query}

    def parse_response(self, response: requests.Response, **kwargs) -> List[str]:
        """
        Extract the job id from the submit response and return it as a one-item list.

        Response example:
        {'data':
            {
                'id': 'c6f25f91-5357-4fec-a00d-9009cc1ae856',
                'query': 'DESCRIBE account', # This could be SELECT statement or DESCRIBE or SHOW {object}
                'useIndexJoin': False,
                'sourceData': 'LIVE',
                'queryStatus': 'accepted',
                'remainingRetries': 3,
                'retries': 3,
                'updatedOn': '2021-07-26T15:33:48.287Z',
                'createdBy': '84f78cea-8a5b-4332-933f-27439fe3b87b'
            }
        }
        """
        submitted_job = response.json()["data"]
        return [submitted_job["id"]]
class ZuoraJobStatusCheck(ZuoraStream):
    """
    Provides functionality to check the status of a submitted job on the server.
    :: There are ["completed", "in_progress", "failed", "canceled", "aborted"] statuses available in check response.
    The check operation returns either dataFile URL or error message describing the error.
    """

    def __init__(self, job_id: str, config: Dict):
        super().__init__(config)
        self.job_id = job_id

    def path(self, **kwargs) -> str:
        """Status endpoint of the job this instance was created for."""
        return f"/query/jobs/{self.job_id}"

    def parse_response(self, response: requests.Response, **kwargs) -> List[str]:
        """Return the 'dataFile' URL of the completed job as a one-item list."""
        return [response.json()["data"]["dataFile"]]

    def _send_request(self, request: requests.PreparedRequest, request_kwargs: Mapping[str, Any]) -> requests.Response:
        """
        Override of default CDK method _send_request to check the status of the submitted job iteratively,
        until it's either "completed" or "failed" / "canceled" / "aborted" for any reason.

        :return: the response whose 'queryStatus' is "completed"
        :raises ZOQLQueryFieldCannotResolveCursor: error message says the default cursor column cannot be resolved
        :raises ZOQLQueryCannotProcessObject: error message says the object failed to process
        :raises ZOQLQueryFieldCannotResolveAltCursor: error message says the alternative cursor column cannot be resolved
        :raises ZOQLQueryFailed: any other job in an error status

        Response example:
        {'data':
            {
                'id': 'c6f25f91-5357-4fec-a00d-9009cc1ae856',
                'query': 'DESCRIBE account',
                'useIndexJoin': False,
                'sourceData': 'LIVE',
                'queryStatus': 'completed',
                'dataFile': 'https://owl-auw2-sbx01-query-result.s3.us-west-2.amazonaws.com/c6f25f91-5357-4fec-a00d-9009cc1ae856_2779514650704989.jsonl?....',
                'outputRows': 53,
                'processingTime': 516,
                'remainingRetries': 3,
                'retries': 3,
                'updatedOn': '2021-07-26T15:33:48.803Z',
                'createdBy': '84f78cea-8a5b-4332-933f-27439fe3b87b'
            }
        }
        """
        # Define the job error statuses
        errors = ["failed", "canceled", "aborted"]
        success = "completed"
        # Error msg: the cursor_field cannot be resolved
        cursor_error = f"Column '{self.cursor_field}' cannot be resolved"
        alt_cursor_error = f"Column '{self.alt_cursor_field}' cannot be resolved"
        # Error msg: cannot process object
        obj_read_error = "failed to process object"

        # The original author's note: there is no opportunity for an infinite loop because the
        # operation performs on the server-side, with query run-time limitations; if the query time
        # is longer than 120 min, the server outputs an error, surfaced here as `ZOQLQueryFailed`.
        # NOTE(review): the status is re-requested immediately, without any delay between polls —
        # confirm this is acceptable for the API's rate limits.
        while True:
            response: requests.Response = self._session.send(request, **request_kwargs)
            job_check = response.json()["data"]
            status = job_check["queryStatus"]
            if status == success:
                return response
            if status in errors:
                # A failed job without an 'errorMessage' must still end up as ZOQLQueryFailed
                # instead of a KeyError, so a missing/null message is treated as empty text.
                error_message = job_check.get("errorMessage") or ""
                if cursor_error in error_message:
                    raise ZOQLQueryFieldCannotResolveCursor
                if obj_read_error in error_message:
                    raise ZOQLQueryCannotProcessObject
                if alt_cursor_error in error_message:
                    raise ZOQLQueryFieldCannotResolveAltCursor
                raise ZOQLQueryFailed(response)
class ZuoraGetJobResult(HttpStream):
    """
    Provides functionality to retrieve the records from the file formed by a submitted and successfully completed job
    DataFile URL example:
    {'data':
        {
            'id': 'c6f25f91-5357-4fec-a00d-9009cc1ae856',
            ...,
            ...,
            'dataFile': 'https://owl-auw2-sbx01-query-result.s3.us-west-2.amazonaws.com/c6f25f91-5357-4fec-a00d-9009cc1ae856_2779514650704989.jsonl?....',
            ...,
            ...,
            'createdBy': '84f78cea-8a5b-4332-933f-27439fe3b87b'
        }
    }
    """

    primary_key = None

    def __init__(self, url: str):
        super().__init__()  # initiate authenticator = NoAuth(), _session
        self.url = url  # accept incoming dataFile URL

    @property
    def url_base(self):
        """The dataFile URL is used as the whole request URL (the path is empty)."""
        return self.url

    def path(self, **kwargs) -> str:
        """Abstractmethod HTTPStream CDK dependency"""
        return ""

    def next_page_token(self, response: requests.Response) -> Optional[Mapping[str, Any]]:
        """Abstractmethod HTTPStream CDK dependency"""
        return None

    def parse_response(self, response: requests.Response, **kwargs) -> Iterable[Mapping[str, Any]]:
        """
        Yield one decoded record per non-empty line of the JSONLines file from the dataFile URL.
        """
        # Split on "\n" only: str.splitlines() also breaks on characters such as U+2028/U+2029,
        # U+0085 or \x1c-\x1e, which may appear unescaped inside a JSON string value and
        # would cut a record in half.
        for line in response.text.split("\n"):
            record_text = line.strip()  # drops the "\r" of CRLF line endings as well
            if record_text:
                yield json.loads(record_text)
class SourceZuora(AbstractSource):
    """Source entry point: connection check and dynamic construction of the streams."""

    def check_connection(self, logger: AirbyteLogger, config: Mapping[str, Any]) -> Tuple[bool, Any]:
        """
        Testing connection availability for the connector by granting the token.

        :return: `(True, None)` when the auth header could be obtained, otherwise `(False, <exception>)`.
        """
        try:
            # Building the authenticator is part of the check: a failure here must be
            # reported as a failed check as well, not escape as an unhandled exception.
            auth = ZuoraAuthenticator(config).get_auth()
            auth.get_auth_header()
        except Exception as e:
            return False, e
        return True, None

    def streams(self, config: Mapping[str, Any]) -> List[ZuoraStream]:
        """
        Mapping a input config of the user input configuration as defined in the connector spec.
        Defining streams to run by building stream classes dynamically:
        one ZuoraObjectsBase sub-class per listed object name that is not excluded.
        """
        auth = ZuoraAuthenticator(config)
        # Work on a shallow copy so the caller's config is not modified with the
        # authenticator object and the resolved base URL.
        stream_config = {**config, "authenticator": auth.get_auth(), "url_base": auth.url_base}

        # List available objects (streams) names from Zuora
        # Example: zuora_stream_names = ["account", "country", "user"]
        zuora_stream_names = ZuoraListObjects(stream_config).read_records(sync_mode=None)

        streams: List[ZuoraStream] = []
        for stream_name in zuora_stream_names:
            if stream_name in ZUORA_EXCLUDED_STREAMS:
                continue
            # construct a ZuoraObjectsBase sub-class named after the stream
            stream_class = type(stream_name, (ZuoraObjectsBase,), {})
            # instantiate the stream with the prepared config
            streams.append(stream_class(stream_config))
        return streams