airbyte/airbyte-integrations/connectors/source-microsoft-dataverse/source_microsoft_dataverse/streams.py

#
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
#

from abc import ABC
from datetime import datetime, timezone
from typing import Any, Iterable, Mapping, MutableMapping, Optional
from urllib import parse

import requests
from airbyte_cdk.sources.streams import IncrementalMixin
from airbyte_cdk.sources.streams.http import HttpStream


# Basic full refresh stream
class MicrosoftDataverseStream(HttpStream, ABC):
    """
    Full refresh stream that reads one entity collection.

    Name, path, JSON schema and primary key are all supplied at construction time, so a single class can serve
    any number of streams. Records are read from the "value" array of each response and pagination follows the
    "@odata.nextLink" key of the response body.
    """

    # Base url will be set by init(), using information provided by the user through config input
    url_base = ""
    primary_key = ""

    def __init__(self, url, stream_name, stream_path, schema, primary_key, odata_maxpagesize, **kwargs):
        """
        :param url: root URL of the environment; "/api/data/v9.2/" is appended to build the base URL
        :param stream_name: value returned by the `name` property
        :param stream_path: value returned by `path()`, relative to the base URL
        :param schema: JSON schema returned as-is by `get_json_schema()`
        :param primary_key: primary key of the stream
        :param odata_maxpagesize: page size requested through the "Prefer: odata.maxpagesize" header
        :param kwargs: forwarded untouched to the parent constructor
        """
        super().__init__(**kwargs)
        self.url_base = url + "/api/data/v9.2/"
        self.stream_name = stream_name
        self.stream_path = stream_path
        self.primary_key = primary_key
        self.schema = schema
        self.odata_maxpagesize = odata_maxpagesize

    @property
    def name(self) -> str:
        """Source name"""
        return self.stream_name

    def get_json_schema(self) -> Mapping[str, Any]:
        """
        :return the schema given to the constructor, unchanged
        """
        return self.schema

    def next_page_token(self, response: requests.Response) -> Optional[Mapping[str, Any]]:
        """
        :param response: the most recent response from the API
        :return If there is another page in the result, a mapping (e.g: dict) containing information needed to query the next page in the response.
                If there are no more pages in the result, return None.
        """
        response_json = response.json()
        if "@odata.nextLink" not in response_json:
            return None

        # The next link is a full URL; only its query string is needed, since the path is rebuilt from `path()`.
        next_link = response_json["@odata.nextLink"]
        return dict(parse.parse_qsl(parse.urlsplit(next_link).query))

    def request_params(
        self, stream_state: Mapping[str, Any], stream_slice: Mapping[str, Any] = None, next_page_token: Mapping[str, Any] = None
    ) -> MutableMapping[str, Any]:
        """
        :param stream_state: state of the stream; its entries are sent as query parameters on the first request
        :param stream_slice: forwarded to the parent implementation, otherwise unused
        :param next_page_token: query parameters extracted from the previous response's next link, if any
        :return a dict containing the parameters to be used in the request
        """
        request_params = super().request_params(stream_state, stream_slice=stream_slice, next_page_token=next_page_token)
        # Without a page token this is the first request of the read, so the state entries (e.g. the delta token
        # of an incremental stream) are sent. Follow-up pages use only the parameters taken from the next link.
        extra_params = stream_state if next_page_token is None else next_page_token
        # `or {}` guards against a None state, which dict.update() would reject with a TypeError.
        request_params.update(extra_params or {})
        return request_params

    def parse_response(self, response: requests.Response, **kwargs) -> Iterable[Mapping]:
        """
        :param response: the response to extract records from
        :return an iterable containing each record in the response
        """
        yield from response.json()["value"]

    def request_headers(
        self, stream_state: Mapping[str, Any], stream_slice: Mapping[str, Any] = None, next_page_token: Mapping[str, Any] = None
    ) -> Mapping[str, Any]:
        """
        :return the headers sent with every request; a new dict is built on each call, so callers may modify it
        """
        return {
            "Cache-Control": "no-cache",
            "OData-Version": "4.0",
            "Content-Type": "application/json",
            "Prefer": f"odata.maxpagesize={self.odata_maxpagesize}",
        }

    def path(
        self,
        *,
        stream_state: Mapping[str, Any] = None,
        stream_slice: Mapping[str, Any] = None,
        next_page_token: Mapping[str, Any] = None,
    ) -> str:
        """
        :return the path given to the constructor; it does not depend on state, slice or page token
        """
        return self.stream_path


# Basic incremental stream
class IncrementalMicrosoftDataverseStream(MicrosoftDataverseStream, IncrementalMixin, ABC):
    """
    Incremental stream driven by change tracking.

    Requests carry the "odata.track-changes" preference. When a response contains "@odata.deltaLink", the
    "$deltatoken" query parameter of that link is stored as the stream state.
    """

    delta_token_field = "$deltatoken"
    state_checkpoint_interval = None  # For now we just use the change tracking as state, and it is only emitted on last page

    def __init__(self, url, stream_name, stream_path, schema, primary_key, odata_maxpagesize, config_cursor_field, **kwargs):
        """
        :param config_cursor_field: cursor field configured for this stream, exposed through `cursor_field`
        All other parameters are forwarded unchanged to the parent constructor.
        """
        super().__init__(url, stream_name, stream_path, schema, primary_key, odata_maxpagesize, **kwargs)
        self._cursor_value = None
        self.config_cursor_field = config_cursor_field

    @property
    def state(self) -> Mapping[str, Any]:
        """
        :return the last delta token keyed by `delta_token_field`, or an empty mapping when no token is known yet
        """
        # Never serialize a missing token: str(None) would persist the literal "None", which a later sync
        # would then send as the $deltatoken query parameter.
        if self._cursor_value is None:
            return {}
        return {self.delta_token_field: str(self._cursor_value)}

    @property
    def cursor_field(self) -> str:
        """
        :return the cursor field given to the constructor
        """
        # NOTE(review): parse_response indexes this value with [0], so it is presumably a list of field names
        # rather than a plain str as annotated - confirm against the caller.
        return self.config_cursor_field

    # Sets the state got by state getter. "value" is the return of state getter -> dict
    @state.setter
    def state(self, value: Mapping[str, Any]) -> None:
        """
        :param value: a mapping shaped like the one returned by the getter; a missing token resets the cursor
        """
        token = value.get(self.delta_token_field) if value else None
        # A stored "None" string is the result of serializing a missing token with str(), not a usable token.
        self._cursor_value = None if token == "None" else token

    def request_headers(
        self, stream_state: Mapping[str, Any], stream_slice: Mapping[str, Any] = None, next_page_token: Mapping[str, Any] = None
    ) -> Mapping[str, Any]:
        """
        Override to return any non-auth headers. Authentication headers will overwrite any overlapping headers returned from this method.
        """
        request_headers = dict(
            super().request_headers(stream_state=stream_state, stream_slice=stream_slice, next_page_token=next_page_token)
        )
        # odata.track-changes -> Header that enables change tracking
        request_headers["Prefer"] = "odata.track-changes," + request_headers["Prefer"]
        return request_headers

    def parse_response(self, response: requests.Response, **kwargs) -> Iterable[Mapping]:
        """
        Yield the records of a response and remember the delta token if the response carries one.

        Items flagged as deleted (they have "@odata.context" and reason == "deleted") are reshaped: their "id" is
        moved under the primary key name, the bookkeeping keys are dropped, and both the cursor field and
        "_ab_cdc_deleted_at" are set to the current UTC time. Every other record gets "_ab_cdc_updated_at"
        copied from its cursor field.

        :param response: the response to extract records from
        :return an iterable containing each record in the response
        """
        response_json = response.json()
        if "@odata.deltaLink" in response_json:
            delta_link = response_json["@odata.deltaLink"]
            delta_link_params = dict(parse.parse_qsl(parse.urlsplit(delta_link).query))
            self._cursor_value = delta_link_params[self.delta_token_field]

        for result in response_json["value"]:
            # .get() so that an item with "@odata.context" but without "reason" is treated as a regular record
            # instead of raising a KeyError.
            if "@odata.context" in result and result.get("reason") == "deleted":
                # assumes primary_key is nested, e.g. [["<key name>"]] - TODO confirm against the caller
                result[self.primary_key[0][0]] = result["id"]
                result.pop("@odata.context", None)
                result.pop("id", None)
                result.pop("reason", None)
                # Timezone-aware UTC so the value does not depend on the timezone of the host running the sync.
                now = datetime.now(timezone.utc).isoformat()
                result[self.cursor_field[0]] = now
                result["_ab_cdc_deleted_at"] = now
            else:
                result["_ab_cdc_updated_at"] = result[self.cursor_field[0]]

            yield result