# # Copyright (c) 2023 Airbyte, Inc., all rights reserved. # import time from abc import ABC, abstractmethod from typing import Any, Iterable, List, Mapping, MutableMapping, Optional from urllib import parse import pendulum import requests from airbyte_cdk.models import SyncMode from airbyte_cdk.sources.streams.availability_strategy import AvailabilityStrategy from airbyte_cdk.sources.streams.http import HttpStream from airbyte_cdk.sources.streams.http.exceptions import DefaultBackoffException from requests.exceptions import HTTPError from . import constants from .graphql import ( CursorStorage, QueryReactions, get_query_issue_reactions, get_query_projectsV2, get_query_pull_requests, get_query_reviews, ) from .utils import getter class GithubStreamABC(HttpStream, ABC): primary_key = "id" # Detect streams with high API load large_stream = False stream_base_params = {} def __init__(self, api_url: str = "https://api.github.com", access_token_type: str = "", **kwargs): super().__init__(**kwargs) self.access_token_type = access_token_type self.api_url = api_url @property def url_base(self) -> str: return self.api_url @property def availability_strategy(self) -> Optional["AvailabilityStrategy"]: return None def next_page_token(self, response: requests.Response) -> Optional[Mapping[str, Any]]: links = response.links if "next" in links: next_link = links["next"]["url"] parsed_link = parse.urlparse(next_link) page = dict(parse.parse_qsl(parsed_link.query)).get("page") return {"page": page} def request_params( self, stream_state: Mapping[str, Any], stream_slice: Mapping[str, Any] = None, next_page_token: Mapping[str, Any] = None ) -> MutableMapping[str, Any]: params = {"per_page": self.page_size} if next_page_token: params.update(next_page_token) params.update(self.stream_base_params) return params def request_headers(self, **kwargs) -> Mapping[str, Any]: # Without sending `User-Agent` header we will be getting `403 Client Error: Forbidden for url` error. return {"User-Agent": "PostmanRuntime/7.28.0"} def parse_response( self, response: requests.Response, stream_state: Mapping[str, Any], stream_slice: Mapping[str, Any] = None, next_page_token: Mapping[str, Any] = None, ) -> Iterable[Mapping]: for record in response.json(): # GitHub puts records in an array. yield self.transform(record=record, stream_slice=stream_slice) def should_retry(self, response: requests.Response) -> bool: if super().should_retry(response): return True retry_flag = ( # The GitHub GraphQL API has limitations # https://docs.github.com/en/graphql/overview/resource-limitations (response.headers.get("X-RateLimit-Resource") == "graphql" and self.check_graphql_rate_limited(response.json())) # Rate limit HTTP headers # https://docs.github.com/en/rest/overview/resources-in-the-rest-api#rate-limit-http-headers or (response.status_code != 200 and response.headers.get("X-RateLimit-Remaining") == "0") # Secondary rate limits # https://docs.github.com/en/rest/overview/resources-in-the-rest-api#secondary-rate-limits or "Retry-After" in response.headers ) if retry_flag: headers = [ "X-RateLimit-Resource", "X-RateLimit-Remaining", "X-RateLimit-Reset", "X-RateLimit-Limit", "X-RateLimit-Used", "Retry-After", ] headers = ", ".join([f"{h}: {response.headers[h]}" for h in headers if h in response.headers]) if headers: headers = f"HTTP headers: {headers}," self.logger.info( f"Rate limit handling for stream `{self.name}` for the response with {response.status_code} status code, {headers} with message: {response.text}" ) return retry_flag def backoff_time(self, response: requests.Response) -> Optional[float]: # This method is called if we run into the rate limit. GitHub limits requests to 5000 per hour and provides # `X-RateLimit-Reset` header which contains time when this hour will be finished and limits will be reset so # we again could have 5000 per another hour. min_backoff_time = 60.0 retry_after = response.headers.get("Retry-After") if retry_after is not None: return max(float(retry_after), min_backoff_time) reset_time = response.headers.get("X-RateLimit-Reset") if reset_time: return max(float(reset_time) - time.time(), min_backoff_time) def check_graphql_rate_limited(self, response_json) -> bool: errors = response_json.get("errors") if errors: for error in errors: if error.get("type") == "RATE_LIMITED": return True return False def read_records(self, stream_slice: Mapping[str, Any] = None, **kwargs) -> Iterable[Mapping[str, Any]]: # get out the stream_slice parts for later use. organisation = stream_slice.get("organization", "") repository = stream_slice.get("repository", "") # Reading records while handling the errors try: yield from super().read_records(stream_slice=stream_slice, **kwargs) except HTTPError as e: # This whole try/except situation in `read_records()` isn't good but right now in `self._send_request()` # function we have `response.raise_for_status()` so we don't have much choice on how to handle errors. # Bocked on https://github.com/airbytehq/airbyte/issues/3514. if e.response.status_code == requests.codes.NOT_FOUND: # A lot of streams are not available for repositories owned by a user instead of an organization. if isinstance(self, Organizations): error_msg = f"Syncing `{self.__class__.__name__}` stream isn't available for organization `{organisation}`." elif isinstance(self, TeamMemberships): error_msg = f"Syncing `{self.__class__.__name__}` stream for organization `{organisation}`, team `{stream_slice.get('team_slug')}` and user `{stream_slice.get('username')}` isn't available: User has no team membership. Skipping..." else: error_msg = f"Syncing `{self.__class__.__name__}` stream isn't available for repository `{repository}`." elif e.response.status_code == requests.codes.FORBIDDEN: error_msg = str(e.response.json().get("message")) # When using the `check_connection` method, we should raise an error if we do not have access to the repository. if isinstance(self, Repositories): raise e # When `403` for the stream, that has no access to the organization's teams, based on OAuth Apps Restrictions: # https://docs.github.com/en/organizations/restricting-access-to-your-organizations-data/enabling-oauth-app-access-restrictions-for-your-organization # For all `Organisation` based streams elif isinstance(self, Organizations) or isinstance(self, Teams) or isinstance(self, Users): error_msg = ( f"Syncing `{self.name}` stream isn't available for organization `{organisation}`. Full error message: {error_msg}" ) # For all other `Repository` base streams else: error_msg = ( f"Syncing `{self.name}` stream isn't available for repository `{repository}`. Full error message: {error_msg}" ) elif e.response.status_code == requests.codes.UNAUTHORIZED: if self.access_token_type == constants.PERSONAL_ACCESS_TOKEN_TITLE: error_msg = str(e.response.json().get("message")) self.logger.error(f"{self.access_token_type} renewal is required: {error_msg}") raise e elif e.response.status_code == requests.codes.GONE and isinstance(self, Projects): # Some repos don't have projects enabled and we we get "410 Client Error: Gone for # url: https://api.github.com/repos/xyz/projects?per_page=100" error. error_msg = f"Syncing `Projects` stream isn't available for repository `{stream_slice['repository']}`." elif e.response.status_code == requests.codes.CONFLICT: error_msg = ( f"Syncing `{self.name}` stream isn't available for repository " f"`{stream_slice['repository']}`, it seems like this repository is empty." ) elif e.response.status_code == requests.codes.SERVER_ERROR and isinstance(self, WorkflowRuns): error_msg = f"Syncing `{self.name}` stream isn't available for repository `{stream_slice['repository']}`." elif e.response.status_code == requests.codes.BAD_GATEWAY: error_msg = f"Stream {self.name} temporary failed. Try to re-run sync later" else: # most probably here we're facing a 500 server error and a risk to get a non-json response, so lets output response.text self.logger.error(f"Undefined error while reading records: {e.response.text}") raise e self.logger.warning(error_msg) class GithubStream(GithubStreamABC): def __init__(self, repositories: List[str], page_size_for_large_streams: int, **kwargs): super().__init__(**kwargs) self.repositories = repositories # GitHub pagination could be from 1 to 100. # This parameter is deprecated and in future will be used sane default, page_size: 10 self.page_size = page_size_for_large_streams if self.large_stream else constants.DEFAULT_PAGE_SIZE def path(self, stream_slice: Mapping[str, Any] = None, **kwargs) -> str: return f"repos/{stream_slice['repository']}/{self.name}" def stream_slices(self, **kwargs) -> Iterable[Optional[Mapping[str, Any]]]: for repository in self.repositories: yield {"repository": repository} def get_error_display_message(self, exception: BaseException) -> Optional[str]: if ( isinstance(exception, DefaultBackoffException) and exception.response.status_code == requests.codes.BAD_GATEWAY and self.large_stream and self.page_size > 1 ): return f'Please try to decrease the "Page size for large streams" below {self.page_size}. The stream "{self.name}" is a large stream, such streams can fail with 502 for high "page_size" values.' return super().get_error_display_message(exception) def transform(self, record: MutableMapping[str, Any], stream_slice: Mapping[str, Any]) -> MutableMapping[str, Any]: record["repository"] = stream_slice["repository"] return record class SemiIncrementalMixin: """ Semi incremental streams are also incremental but with one difference, they: - read all records; - output only new records. This means that semi incremental streams read all records (like full_refresh streams) but do filtering directly in the code and output only latest records (like incremental streams). """ cursor_field = "updated_at" # This flag is used to indicate that current stream supports `sort` and `direction` request parameters and that # we should break processing records if possible. If `sort` is set to `updated` and `direction` is set to `desc` # this means that latest records will be at the beginning of the response and after we processed those latest # records we can just stop and not process other record. This will increase speed of each incremental stream # which supports those 2 request parameters. Currently only `IssueMilestones` and `PullRequests` streams are # supporting this. is_sorted = False def __init__(self, start_date: str = "", **kwargs): super().__init__(**kwargs) self._start_date = start_date self._starting_point_cache = {} @property def slice_keys(self): if hasattr(self, "repositories"): return ["repository"] return ["organization"] record_slice_key = slice_keys def convert_cursor_value(self, value): return value @property def state_checkpoint_interval(self) -> Optional[int]: if self.is_sorted == "asc": return self.page_size def get_updated_state(self, current_stream_state: MutableMapping[str, Any], latest_record: Mapping[str, Any]): """ Return the latest state by comparing the cursor value in the latest record with the stream's most recent state object and returning an updated state object. """ slice_value = getter(latest_record, self.record_slice_key) updated_state = self.convert_cursor_value(latest_record[self.cursor_field]) stream_state_value = current_stream_state.get(slice_value, {}).get(self.cursor_field) if stream_state_value: updated_state = max(updated_state, stream_state_value) current_stream_state.setdefault(slice_value, {})[self.cursor_field] = updated_state return current_stream_state def _get_starting_point(self, stream_state: Mapping[str, Any], stream_slice: Mapping[str, Any]) -> str: if stream_state: state_path = [stream_slice[k] for k in self.slice_keys] + [self.cursor_field] stream_state_value = getter(stream_state, state_path, strict=False) if stream_state_value: return max(self._start_date, stream_state_value) return self._start_date def get_starting_point(self, stream_state: Mapping[str, Any], stream_slice: Mapping[str, Any]) -> str: cache_key = tuple([stream_slice[k] for k in self.slice_keys]) if cache_key not in self._starting_point_cache: self._starting_point_cache[cache_key] = self._get_starting_point(stream_state, stream_slice) return self._starting_point_cache[cache_key] def read_records( self, sync_mode: SyncMode, cursor_field: List[str] = None, stream_slice: Mapping[str, Any] = None, stream_state: Mapping[str, Any] = None, ) -> Iterable[Mapping[str, Any]]: start_point = self.get_starting_point(stream_state=stream_state, stream_slice=stream_slice) for record in super().read_records( sync_mode=sync_mode, cursor_field=cursor_field, stream_slice=stream_slice, stream_state=stream_state ): cursor_value = self.convert_cursor_value(record[self.cursor_field]) if cursor_value > start_point: yield record elif self.is_sorted == "desc" and cursor_value < start_point: break def stream_slices(self, **kwargs) -> Iterable[Optional[Mapping[str, Any]]]: self._starting_point_cache.clear() yield from super().stream_slices(**kwargs) class IncrementalMixin(SemiIncrementalMixin): def request_params(self, stream_state: Mapping[str, Any], stream_slice: Mapping[str, Any] = None, **kwargs) -> MutableMapping[str, Any]: params = super().request_params(stream_state=stream_state, **kwargs) since_params = self.get_starting_point(stream_state=stream_state, stream_slice=stream_slice) if since_params: params["since"] = since_params return params # Below are full refresh streams class RepositoryStats(GithubStream): """ This stream is technical and not intended for the user, we use it for checking connection with the repository. API docs: https://docs.github.com/en/rest/reference/repos#get-a-repository """ def path(self, stream_slice: Mapping[str, Any] = None, **kwargs) -> str: return f"repos/{stream_slice['repository']}" def parse_response(self, response: requests.Response, stream_slice: Mapping[str, Any] = None, **kwargs) -> Iterable[Mapping]: yield response.json() class Assignees(GithubStream): """ API docs: https://docs.github.com/en/rest/reference/issues#list-assignees """ class Branches(GithubStream): """ API docs: https://docs.github.com/en/rest/reference/repos#list-branches """ primary_key = ["repository", "name"] def path(self, stream_slice: Mapping[str, Any] = None, **kwargs) -> str: return f"repos/{stream_slice['repository']}/branches" class Collaborators(GithubStream): """ API docs: https://docs.github.com/en/rest/reference/repos#list-repository-collaborators """ class IssueLabels(GithubStream): """ API docs: https://docs.github.com/en/rest/issues/labels#list-labels-for-a-repository """ def path(self, stream_slice: Mapping[str, Any] = None, **kwargs) -> str: return f"repos/{stream_slice['repository']}/labels" class Organizations(GithubStreamABC): """ API docs: https://docs.github.com/en/rest/reference/orgs#get-an-organization """ # GitHub pagination could be from 1 to 100. page_size = 100 def __init__(self, organizations: List[str], access_token_type: str = "", **kwargs): super().__init__(**kwargs) self.organizations = organizations self.access_token_type = access_token_type def stream_slices(self, **kwargs) -> Iterable[Optional[Mapping[str, Any]]]: for organization in self.organizations: yield {"organization": organization} def path(self, stream_slice: Mapping[str, Any] = None, **kwargs) -> str: return f"orgs/{stream_slice['organization']}" def parse_response(self, response: requests.Response, **kwargs) -> Iterable[Mapping]: yield response.json() def transform(self, record: MutableMapping[str, Any], stream_slice: Mapping[str, Any]) -> MutableMapping[str, Any]: record["organization"] = stream_slice["organization"] return record class Repositories(SemiIncrementalMixin, Organizations): """ API docs: https://docs.github.com/en/rest/reference/repos#list-organization-repositories """ is_sorted = "desc" stream_base_params = { "sort": "updated", "direction": "desc", } def path(self, stream_slice: Mapping[str, Any] = None, **kwargs) -> str: return f"orgs/{stream_slice['organization']}/repos" def parse_response(self, response: requests.Response, stream_slice: Mapping[str, Any] = None, **kwargs) -> Iterable[Mapping]: for record in response.json(): # GitHub puts records in an array. yield self.transform(record=record, stream_slice=stream_slice) class Tags(GithubStream): """ API docs: https://docs.github.com/en/rest/reference/repos#list-repository-tags """ primary_key = ["repository", "name"] def path(self, stream_slice: Mapping[str, Any] = None, **kwargs) -> str: return f"repos/{stream_slice['repository']}/tags" class Teams(Organizations): """ API docs: https://docs.github.com/en/rest/reference/teams#list-teams """ use_cache = True def path(self, stream_slice: Mapping[str, Any] = None, **kwargs) -> str: return f"orgs/{stream_slice['organization']}/teams" def parse_response(self, response: requests.Response, stream_slice: Mapping[str, Any] = None, **kwargs) -> Iterable[Mapping]: for record in response.json(): yield self.transform(record=record, stream_slice=stream_slice) class Users(Organizations): """ API docs: https://docs.github.com/en/rest/reference/orgs#list-organization-members """ def path(self, stream_slice: Mapping[str, Any] = None, **kwargs) -> str: return f"orgs/{stream_slice['organization']}/members" def parse_response(self, response: requests.Response, stream_slice: Mapping[str, Any] = None, **kwargs) -> Iterable[Mapping]: for record in response.json(): yield self.transform(record=record, stream_slice=stream_slice) # Below are semi incremental streams class Releases(SemiIncrementalMixin, GithubStream): """ API docs: https://docs.github.com/en/rest/reference/repos#list-releases """ cursor_field = "created_at" def transform(self, record: MutableMapping[str, Any], stream_slice: Mapping[str, Any]) -> MutableMapping[str, Any]: record = super().transform(record=record, stream_slice=stream_slice) assets = record.get("assets", []) for asset in assets: uploader = asset.pop("uploader", None) asset["uploader_id"] = uploader.get("id") if uploader else None return record class Events(SemiIncrementalMixin, GithubStream): """ API docs: https://docs.github.com/en/rest/reference/activity#list-repository-events """ cursor_field = "created_at" class PullRequests(SemiIncrementalMixin, GithubStream): """ API docs: https://docs.github.com/en/rest/pulls/pulls#list-pull-requests """ use_cache = True large_stream = True def __init__(self, **kwargs): super().__init__(**kwargs) self._first_read = True def read_records(self, stream_state: Mapping[str, Any] = None, **kwargs) -> Iterable[Mapping[str, Any]]: """ Decide if this a first read or not by the presence of the state object """ self._first_read = not bool(stream_state) yield from super().read_records(stream_state=stream_state, **kwargs) def path(self, stream_slice: Mapping[str, Any] = None, **kwargs) -> str: return f"repos/{stream_slice['repository']}/pulls" def transform(self, record: MutableMapping[str, Any], stream_slice: Mapping[str, Any]) -> MutableMapping[str, Any]: record = super().transform(record=record, stream_slice=stream_slice) for nested in ("head", "base"): entry = record.get(nested, {}) entry["repo_id"] = (record.get("head", {}).pop("repo", {}) or {}).get("id") return record def request_params(self, **kwargs) -> MutableMapping[str, Any]: base_params = super().request_params(**kwargs) # The very first time we read this stream we want to read ascending so we can save state in case of # a halfway failure. But if there is state, we read descending to allow incremental behavior. params = {"state": "all", "sort": "updated", "direction": self.is_sorted} return {**base_params, **params} @property def is_sorted(self) -> str: """ Depending if there any state we read stream in ascending or descending order. """ if self._first_read: return "asc" return "desc" class CommitComments(SemiIncrementalMixin, GithubStream): """ API docs: https://docs.github.com/en/rest/reference/repos#list-commit-comments-for-a-repository """ use_cache = True def path(self, stream_slice: Mapping[str, Any] = None, **kwargs) -> str: return f"repos/{stream_slice['repository']}/comments" class IssueMilestones(SemiIncrementalMixin, GithubStream): """ API docs: https://docs.github.com/en/rest/reference/issues#list-milestones """ is_sorted = "desc" stream_base_params = { "state": "all", "sort": "updated", "direction": "desc", } def path(self, stream_slice: Mapping[str, Any] = None, **kwargs) -> str: return f"repos/{stream_slice['repository']}/milestones" class Stargazers(SemiIncrementalMixin, GithubStream): """ API docs: https://docs.github.com/en/rest/reference/activity#list-stargazers """ primary_key = "user_id" cursor_field = "starred_at" def request_headers(self, **kwargs) -> Mapping[str, Any]: base_headers = super().request_headers(**kwargs) # We need to send below header if we want to get `starred_at` field. See docs (Alternative response with # star creation timestamps) - https://docs.github.com/en/rest/reference/activity#list-stargazers. headers = {"Accept": "application/vnd.github.v3.star+json"} return {**base_headers, **headers} def transform(self, record: MutableMapping[str, Any], stream_slice: Mapping[str, Any]) -> MutableMapping[str, Any]: """ We need to provide the "user_id" for the primary_key attribute and don't remove the whole "user" block from the record. """ record = super().transform(record=record, stream_slice=stream_slice) record["user_id"] = record.get("user").get("id") return record class Projects(SemiIncrementalMixin, GithubStream): """ API docs: https://docs.github.com/en/rest/reference/projects#list-repository-projects """ use_cache = True stream_base_params = { "state": "all", } def request_headers(self, **kwargs) -> Mapping[str, Any]: base_headers = super().request_headers(**kwargs) # Projects stream requires sending following `Accept` header. If we won't sent it # we'll get `415 Client Error: Unsupported Media Type` error. headers = {"Accept": "application/vnd.github.inertia-preview+json"} return {**base_headers, **headers} class IssueEvents(SemiIncrementalMixin, GithubStream): """ API docs: https://docs.github.com/en/rest/reference/issues#list-issue-events-for-a-repository """ cursor_field = "created_at" def path(self, stream_slice: Mapping[str, Any] = None, **kwargs) -> str: return f"repos/{stream_slice['repository']}/issues/events" # Below are incremental streams class Comments(IncrementalMixin, GithubStream): """ API docs: https://docs.github.com/en/rest/reference/issues#list-issue-comments-for-a-repository """ use_cache = True large_stream = True max_retries = 7 def path(self, stream_slice: Mapping[str, Any] = None, **kwargs) -> str: return f"repos/{stream_slice['repository']}/issues/comments" class Commits(IncrementalMixin, GithubStream): """ API docs: https://docs.github.com/en/rest/reference/repos#list-commits Pull commits from each branch of each repository, tracking state for each branch """ primary_key = "sha" cursor_field = "created_at" slice_keys = ["repository", "branch"] def __init__(self, branches_to_pull: Mapping[str, List[str]], default_branches: Mapping[str, str], **kwargs): super().__init__(**kwargs) self.branches_to_pull = branches_to_pull self.default_branches = default_branches def request_params(self, stream_state: Mapping[str, Any], stream_slice: Mapping[str, Any] = None, **kwargs) -> MutableMapping[str, Any]: params = super(IncrementalMixin, self).request_params(stream_state=stream_state, stream_slice=stream_slice, **kwargs) params["since"] = self.get_starting_point(stream_state=stream_state, stream_slice=stream_slice) params["sha"] = stream_slice["branch"] return params def stream_slices(self, **kwargs) -> Iterable[Optional[Mapping[str, Any]]]: for stream_slice in super().stream_slices(**kwargs): repository = stream_slice["repository"] for branch in self.branches_to_pull.get(repository, []): yield {"branch": branch, "repository": repository} def transform(self, record: MutableMapping[str, Any], stream_slice: Mapping[str, Any]) -> MutableMapping[str, Any]: record = super().transform(record=record, stream_slice=stream_slice) # Record of the `commits` stream doesn't have an updated_at/created_at field at the top level (so we could # just write `record["updated_at"]` or `record["created_at"]`). Instead each record has such value in # `commit.author.date`. So the easiest way is to just enrich the record returned from API with top level # field `created_at` and use it as cursor_field. # Include the branch in the record record["created_at"] = record["commit"]["author"]["date"] record["branch"] = stream_slice["branch"] return record def get_updated_state(self, current_stream_state: MutableMapping[str, Any], latest_record: Mapping[str, Any]): repository = latest_record["repository"] branch = latest_record["branch"] updated_state = latest_record[self.cursor_field] stream_state_value = current_stream_state.get(repository, {}).get(branch, {}).get(self.cursor_field) if stream_state_value: updated_state = max(updated_state, stream_state_value) current_stream_state.setdefault(repository, {}).setdefault(branch, {})[self.cursor_field] = updated_state return current_stream_state class Issues(IncrementalMixin, GithubStream): """ API docs: https://docs.github.com/en/rest/issues/issues#list-repository-issues """ use_cache = True large_stream = True is_sorted = "asc" stream_base_params = { "state": "all", "sort": "updated", "direction": "asc", } class ReviewComments(IncrementalMixin, GithubStream): """ API docs: https://docs.github.com/en/rest/reference/pulls#list-review-comments-in-a-repository """ use_cache = True large_stream = True def path(self, stream_slice: Mapping[str, Any] = None, **kwargs) -> str: return f"repos/{stream_slice['repository']}/pulls/comments" class GitHubGraphQLStream(GithubStream, ABC): http_method = "POST" def path( self, *, stream_state: Mapping[str, Any] = None, stream_slice: Mapping[str, Any] = None, next_page_token: Mapping[str, Any] = None ) -> str: return "graphql" def should_retry(self, response: requests.Response) -> bool: return True if response.json().get("errors") else super().should_retry(response) def _get_repository_name(self, repository: Mapping[str, Any]) -> str: return repository["owner"]["login"] + "/" + repository["name"] def request_params( self, stream_state: Mapping[str, Any], stream_slice: Mapping[str, Any] = None, next_page_token: Mapping[str, Any] = None ) -> MutableMapping[str, Any]: return {} class PullRequestStats(SemiIncrementalMixin, GitHubGraphQLStream): """ API docs: https://docs.github.com/en/graphql/reference/objects#pullrequest """ large_stream = True is_sorted = "asc" def parse_response(self, response: requests.Response, **kwargs) -> Iterable[Mapping]: repository = response.json()["data"]["repository"] if repository: nodes = repository["pullRequests"]["nodes"] for record in nodes: record["review_comments"] = sum([node["comments"]["totalCount"] for node in record["review_comments"]["nodes"]]) record["comments"] = record["comments"]["totalCount"] record["commits"] = record["commits"]["totalCount"] record["repository"] = self._get_repository_name(repository) if record["merged_by"]: record["merged_by"]["type"] = record["merged_by"].pop("__typename") yield record def next_page_token(self, response: requests.Response) -> Optional[Mapping[str, Any]]: repository = response.json()["data"]["repository"] if repository: pageInfo = repository["pullRequests"]["pageInfo"] if pageInfo["hasNextPage"]: return {"after": pageInfo["endCursor"]} def request_body_json( self, stream_state: Mapping[str, Any], stream_slice: Mapping[str, Any] = None, next_page_token: Mapping[str, Any] = None, ) -> Optional[Mapping]: organization, name = stream_slice["repository"].split("/") if next_page_token: next_page_token = next_page_token["after"] query = get_query_pull_requests( owner=organization, name=name, first=self.page_size, after=next_page_token, direction=self.is_sorted.upper() ) return {"query": query} def request_headers(self, **kwargs) -> Mapping[str, Any]: base_headers = super().request_headers(**kwargs) # https://docs.github.com/en/graphql/overview/schema-previews#merge-info-preview headers = {"Accept": "application/vnd.github.merge-info-preview+json"} return {**base_headers, **headers} class Reviews(SemiIncrementalMixin, GitHubGraphQLStream): """ API docs: https://docs.github.com/en/graphql/reference/objects#pullrequestreview """ is_sorted = False cursor_field = "updated_at" def __init__(self, **kwargs): super().__init__(**kwargs) self.pull_requests_cursor = {} self.reviews_cursors = {} def _get_records(self, pull_request, repository_name): "yield review records from pull_request" for record in pull_request["reviews"]["nodes"]: record["repository"] = repository_name record["pull_request_url"] = pull_request["url"] if record["commit"]: record["commit_id"] = record.pop("commit")["oid"] if record["user"]: record["user"]["type"] = record["user"].pop("__typename") # for backward compatibility with REST API response record["_links"] = { "html": {"href": record["html_url"]}, "pull_request": {"href": record["pull_request_url"]}, } yield record def parse_response(self, response: requests.Response, **kwargs) -> Iterable[Mapping]: repository = response.json()["data"]["repository"] if repository: repository_name = self._get_repository_name(repository) if "pullRequests" in repository: for pull_request in repository["pullRequests"]["nodes"]: yield from self._get_records(pull_request, repository_name) elif "pullRequest" in repository: yield from self._get_records(repository["pullRequest"], repository_name) def next_page_token(self, response: requests.Response) -> Optional[Mapping[str, Any]]: repository = response.json()["data"]["repository"] if repository: repository_name = self._get_repository_name(repository) reviews_cursors = self.reviews_cursors.setdefault(repository_name, {}) if "pullRequests" in repository: if repository["pullRequests"]["pageInfo"]["hasNextPage"]: self.pull_requests_cursor[repository_name] = repository["pullRequests"]["pageInfo"]["endCursor"] for pull_request in repository["pullRequests"]["nodes"]: if pull_request["reviews"]["pageInfo"]["hasNextPage"]: pull_request_number = pull_request["number"] reviews_cursors[pull_request_number] = pull_request["reviews"]["pageInfo"]["endCursor"] elif "pullRequest" in repository: if repository["pullRequest"]["reviews"]["pageInfo"]["hasNextPage"]: pull_request_number = repository["pullRequest"]["number"] reviews_cursors[pull_request_number] = repository["pullRequest"]["reviews"]["pageInfo"]["endCursor"] if reviews_cursors: number, after = reviews_cursors.popitem() return {"after": after, "number": number} if repository_name in self.pull_requests_cursor: return {"after": self.pull_requests_cursor.pop(repository_name)} def request_body_json( self, stream_state: Mapping[str, Any], stream_slice: Mapping[str, Any] = None, next_page_token: Mapping[str, Any] = None, ) -> Optional[Mapping]: organization, name = stream_slice["repository"].split("/") if not next_page_token: next_page_token = {"after": None} query = get_query_reviews(owner=organization, name=name, first=self.page_size, **next_page_token) return {"query": query} class PullRequestCommits(GithubStream): """ API docs: https://docs.github.com/en/rest/reference/pulls#list-commits-on-a-pull-request """ primary_key = "sha" def __init__(self, parent: HttpStream, **kwargs): super().__init__(**kwargs) self.parent = parent def path(self, stream_slice: Mapping[str, Any] = None, **kwargs) -> str: return f"repos/{stream_slice['repository']}/pulls/{stream_slice['pull_number']}/commits" def stream_slices( self, sync_mode: SyncMode, cursor_field: List[str] = None, stream_state: Mapping[str, Any] = None ) -> Iterable[Optional[Mapping[str, Any]]]: parent_stream_slices = self.parent.stream_slices( sync_mode=SyncMode.full_refresh, cursor_field=cursor_field, stream_state=stream_state ) for stream_slice in parent_stream_slices: parent_records = self.parent.read_records( sync_mode=SyncMode.full_refresh, cursor_field=cursor_field, stream_slice=stream_slice, stream_state=stream_state ) for record in parent_records: yield {"repository": record["repository"], "pull_number": record["number"]} def transform(self, record: MutableMapping[str, Any], stream_slice: Mapping[str, Any]) -> MutableMapping[str, Any]: record = super().transform(record=record, stream_slice=stream_slice) record["pull_number"] = stream_slice["pull_number"] return record class ProjectsV2(SemiIncrementalMixin, GitHubGraphQLStream): """ API docs: https://docs.github.com/en/graphql/reference/objects#pullrequest """ is_sorted = "asc" def parse_response(self, response: requests.Response, **kwargs) -> Iterable[Mapping]: repository = response.json()["data"]["repository"] if repository: nodes = repository["projectsV2"]["nodes"] for record in nodes: record["owner_id"] = record.pop("owner").get("id") record["repository"] = self._get_repository_name(repository) yield record def next_page_token(self, response: requests.Response) -> Optional[Mapping[str, Any]]: repository = response.json()["data"]["repository"] if repository: page_info = repository["projectsV2"]["pageInfo"] if page_info["hasNextPage"]: return {"after": page_info["endCursor"]} def request_body_json( self, stream_state: Mapping[str, Any], stream_slice: Mapping[str, Any] = None, next_page_token: Mapping[str, Any] = None, ) -> Optional[Mapping]: organization, name = stream_slice["repository"].split("/") if next_page_token: next_page_token = next_page_token["after"] query = get_query_projectsV2( owner=organization, name=name, first=self.page_size, after=next_page_token, direction=self.is_sorted.upper() ) return {"query": query} # Reactions streams class ReactionStream(GithubStream, ABC): parent_key = "id" copy_parent_key = "comment_id" cursor_field = "created_at" def __init__(self, start_date: str = "", **kwargs): super().__init__(**kwargs) kwargs["start_date"] = start_date self._parent_stream = self.parent_entity(**kwargs) self._start_date = start_date @property @abstractmethod def parent_entity(self): """ Specify the class of the parent stream for which receive reactions """ def path(self, stream_slice: Mapping[str, Any] = None, **kwargs) -> str: parent_path = self._parent_stream.path(stream_slice=stream_slice, **kwargs) return f"{parent_path}/{stream_slice[self.copy_parent_key]}/reactions" def stream_slices(self, **kwargs) -> Iterable[Optional[Mapping[str, Any]]]: for stream_slice in super().stream_slices(**kwargs): for parent_record in self._parent_stream.read_records(sync_mode=SyncMode.full_refresh, stream_slice=stream_slice): yield {self.copy_parent_key: parent_record[self.parent_key], "repository": stream_slice["repository"]} def get_updated_state(self, current_stream_state: MutableMapping[str, Any], latest_record: Mapping[str, Any]): repository = latest_record["repository"] parent_id = str(latest_record[self.copy_parent_key]) updated_state = latest_record[self.cursor_field] stream_state_value = current_stream_state.get(repository, {}).get(parent_id, {}).get(self.cursor_field) if stream_state_value: updated_state = max(updated_state, stream_state_value) current_stream_state.setdefault(repository, {}).setdefault(parent_id, {})[self.cursor_field] = updated_state return current_stream_state def get_starting_point(self, stream_state: Mapping[str, Any], stream_slice: Mapping[str, Any]) -> str: if stream_state: repository = stream_slice["repository"] parent_id = str(stream_slice[self.copy_parent_key]) stream_state_value = stream_state.get(repository, {}).get(parent_id, {}).get(self.cursor_field) if stream_state_value: return max(self._start_date, stream_state_value) return self._start_date def read_records( self, sync_mode: SyncMode, cursor_field: List[str] = None, stream_slice: Mapping[str, Any] = None, stream_state: Mapping[str, Any] = None, ) -> Iterable[Mapping[str, Any]]: starting_point = self.get_starting_point(stream_state=stream_state, stream_slice=stream_slice) for record in super().read_records( sync_mode=sync_mode, cursor_field=cursor_field, stream_slice=stream_slice, stream_state=stream_state ): if record[self.cursor_field] > starting_point: yield record def transform(self, record: MutableMapping[str, Any], stream_slice: Mapping[str, Any]) -> MutableMapping[str, Any]: record = super().transform(record, stream_slice) record[self.copy_parent_key] = stream_slice[self.copy_parent_key] return record class CommitCommentReactions(ReactionStream): """ API docs: https://docs.github.com/en/rest/reference/reactions#list-reactions-for-a-commit-comment """ parent_entity = CommitComments class IssueCommentReactions(ReactionStream): """ API docs: https://docs.github.com/en/rest/reference/reactions#list-reactions-for-an-issue-comment """ parent_entity = Comments class IssueReactions(SemiIncrementalMixin, GitHubGraphQLStream): """ https://docs.github.com/en/graphql/reference/objects#issue https://docs.github.com/en/graphql/reference/objects#reaction """ cursor_field = "created_at" def __init__(self, **kwargs): super().__init__(**kwargs) self.issues_cursor = {} self.reactions_cursors = {} def _get_reactions_from_issue(self, issue, repository_name): for reaction in issue["reactions"]["nodes"]: reaction["repository"] = repository_name reaction["issue_number"] = issue["number"] reaction["user"]["type"] = "User" yield reaction def parse_response(self, response: requests.Response, **kwargs) -> Iterable[Mapping]: repository = response.json()["data"]["repository"] if repository: repository_name = self._get_repository_name(repository) if "issues" in repository: for issue in repository["issues"]["nodes"]: yield from self._get_reactions_from_issue(issue, repository_name) elif "issue" in repository: yield from self._get_reactions_from_issue(repository["issue"], repository_name) def next_page_token(self, response: requests.Response) -> Optional[Mapping[str, Any]]: repository = response.json()["data"]["repository"] if repository: repository_name = self._get_repository_name(repository) reactions_cursors = self.reactions_cursors.setdefault(repository_name, {}) if "issues" in repository: if repository["issues"]["pageInfo"]["hasNextPage"]: self.issues_cursor[repository_name] = repository["issues"]["pageInfo"]["endCursor"] for issue in repository["issues"]["nodes"]: if issue["reactions"]["pageInfo"]["hasNextPage"]: issue_number = issue["number"] reactions_cursors[issue_number] = issue["reactions"]["pageInfo"]["endCursor"] elif "issue" in repository: if repository["issue"]["reactions"]["pageInfo"]["hasNextPage"]: issue_number = repository["issue"]["number"] reactions_cursors[issue_number] = repository["issue"]["reactions"]["pageInfo"]["endCursor"] if reactions_cursors: number, after = reactions_cursors.popitem() return {"after": after, "number": number} if repository_name in self.issues_cursor: return {"after": self.issues_cursor.pop(repository_name)} def request_body_json( self, stream_state: Mapping[str, Any], stream_slice: Mapping[str, Any] = None, next_page_token: Mapping[str, Any] = None, ) -> Optional[Mapping]: organization, name = stream_slice["repository"].split("/") if not next_page_token: next_page_token = {"after": None} query = get_query_issue_reactions(owner=organization, name=name, first=self.page_size, **next_page_token) return {"query": query} class PullRequestCommentReactions(SemiIncrementalMixin, GitHubGraphQLStream): """ API docs: https://docs.github.com/en/graphql/reference/objects#pullrequestreviewcomment https://docs.github.com/en/graphql/reference/objects#reaction """ cursor_field = "created_at" def __init__(self, **kwargs): super().__init__(**kwargs) self.cursor_storage = CursorStorage(["PullRequest", "PullRequestReview", "PullRequestReviewComment", "Reaction"]) self.query_reactions = QueryReactions() def _get_reactions_from_comment(self, comment, repository): for reaction in comment["reactions"]["nodes"]: reaction["repository"] = self._get_repository_name(repository) reaction["comment_id"] = comment["id"] if reaction["user"]: reaction["user"]["type"] = "User" yield reaction def _get_reactions_from_review(self, review, repository): for comment in review["comments"]["nodes"]: yield from self._get_reactions_from_comment(comment, repository) def _get_reactions_from_pull_request(self, pull_request, repository): for review in pull_request["reviews"]["nodes"]: yield from self._get_reactions_from_review(review, repository) def _get_reactions_from_repository(self, repository): for pull_request in repository["pullRequests"]["nodes"]: yield from self._get_reactions_from_pull_request(pull_request, repository) def parse_response(self, response: requests.Response, **kwargs) -> Iterable[Mapping]: data = response.json()["data"] repository = data.get("repository") if repository: yield from self._get_reactions_from_repository(repository) node = data.get("node") if node: if node["__typename"] == "PullRequest": yield from self._get_reactions_from_pull_request(node, node["repository"]) elif node["__typename"] == "PullRequestReview": yield from self._get_reactions_from_review(node, node["repository"]) elif node["__typename"] == "PullRequestReviewComment": yield from self._get_reactions_from_comment(node, node["repository"]) def next_page_token(self, response: requests.Response) -> Optional[Mapping[str, Any]]: data = response.json()["data"] repository = data.get("repository") if repository: self._add_cursor(repository, "pullRequests") for pull_request in repository["pullRequests"]["nodes"]: self._add_cursor(pull_request, "reviews") for review in pull_request["reviews"]["nodes"]: self._add_cursor(review, "comments") for comment in review["comments"]["nodes"]: self._add_cursor(comment, "reactions") node = data.get("node") if node: if node["__typename"] == "PullRequest": self._add_cursor(node, "reviews") for review in node["reviews"]["nodes"]: self._add_cursor(review, "comments") for comment in review["comments"]["nodes"]: self._add_cursor(comment, "reactions") elif node["__typename"] == "PullRequestReview": self._add_cursor(node, "comments") for comment in node["comments"]["nodes"]: self._add_cursor(comment, "reactions") elif node["__typename"] == "PullRequestReviewComment": self._add_cursor(node, "reactions") return self.cursor_storage.get_cursor() def _add_cursor(self, node, link): link_to_object = { "reactions": "Reaction", "comments": "PullRequestReviewComment", "reviews": "PullRequestReview", "pullRequests": "PullRequest", } pageInfo = node[link]["pageInfo"] if pageInfo["hasNextPage"]: self.cursor_storage.add_cursor( link_to_object[link], pageInfo["endCursor"], node[link]["totalCount"], parent_id=node.get("node_id") ) def request_body_json( self, stream_state: Mapping[str, Any], stream_slice: Mapping[str, Any] = None, next_page_token: Mapping[str, Any] = None, ) -> Optional[Mapping]: organization, name = stream_slice["repository"].split("/") if next_page_token: after = next_page_token["cursor"] page_size = min(self.page_size, next_page_token["total_count"]) if next_page_token["typename"] == "PullRequest": query = self.query_reactions.get_query_root_repository(owner=organization, name=name, first=page_size, after=after) elif next_page_token["typename"] == "PullRequestReview": query = self.query_reactions.get_query_root_pull_request(node_id=next_page_token["parent_id"], first=page_size, after=after) elif next_page_token["typename"] == "PullRequestReviewComment": query = self.query_reactions.get_query_root_review(node_id=next_page_token["parent_id"], first=page_size, after=after) elif next_page_token["typename"] == "Reaction": query = self.query_reactions.get_query_root_comment(node_id=next_page_token["parent_id"], first=page_size, after=after) else: query = self.query_reactions.get_query_root_repository(owner=organization, name=name, first=self.page_size) return {"query": query} class Deployments(SemiIncrementalMixin, GithubStream): """ API docs: https://docs.github.com/en/rest/deployments/deployments#list-deployments """ def path(self, stream_slice: Mapping[str, Any] = None, **kwargs) -> str: return f"repos/{stream_slice['repository']}/deployments" class ProjectColumns(GithubStream): """ API docs: https://docs.github.com/en/rest/reference/projects#list-project-columns """ use_cache = True cursor_field = "updated_at" def __init__(self, parent: HttpStream, start_date: str, **kwargs): super().__init__(**kwargs) self.parent = parent self._start_date = start_date def path(self, stream_slice: Mapping[str, Any] = None, **kwargs) -> str: return f"projects/{stream_slice['project_id']}/columns" def stream_slices( self, sync_mode: SyncMode, cursor_field: List[str] = None, stream_state: Mapping[str, Any] = None ) -> Iterable[Optional[Mapping[str, Any]]]: parent_stream_slices = self.parent.stream_slices( sync_mode=SyncMode.full_refresh, cursor_field=cursor_field, stream_state=stream_state ) for stream_slice in parent_stream_slices: parent_records = self.parent.read_records( sync_mode=SyncMode.full_refresh, cursor_field=cursor_field, stream_slice=stream_slice, stream_state=stream_state ) for record in parent_records: yield {"repository": record["repository"], "project_id": record["id"]} def read_records( self, sync_mode: SyncMode, cursor_field: List[str] = None, stream_slice: Mapping[str, Any] = None, stream_state: Mapping[str, Any] = None, ) -> Iterable[Mapping[str, Any]]: starting_point = self.get_starting_point(stream_state=stream_state, stream_slice=stream_slice) for record in super().read_records( sync_mode=sync_mode, cursor_field=cursor_field, stream_slice=stream_slice, stream_state=stream_state ): if record[self.cursor_field] > starting_point: yield record def get_starting_point(self, stream_state: Mapping[str, Any], stream_slice: Mapping[str, Any]) -> str: if stream_state: repository = stream_slice["repository"] project_id = str(stream_slice["project_id"]) stream_state_value = stream_state.get(repository, {}).get(project_id, {}).get(self.cursor_field) if stream_state_value: return max(self._start_date, stream_state_value) return self._start_date def get_updated_state(self, current_stream_state: MutableMapping[str, Any], latest_record: Mapping[str, Any]): repository = latest_record["repository"] project_id = str(latest_record["project_id"]) updated_state = latest_record[self.cursor_field] stream_state_value = current_stream_state.get(repository, {}).get(project_id, {}).get(self.cursor_field) if stream_state_value: updated_state = max(updated_state, stream_state_value) current_stream_state.setdefault(repository, {}).setdefault(project_id, {})[self.cursor_field] = updated_state return current_stream_state def transform(self, record: MutableMapping[str, Any], stream_slice: Mapping[str, Any]) -> MutableMapping[str, Any]: record = super().transform(record=record, stream_slice=stream_slice) record["project_id"] = stream_slice["project_id"] return record class ProjectCards(GithubStream): """ API docs: https://docs.github.com/en/rest/reference/projects#list-project-cards """ cursor_field = "updated_at" def __init__(self, parent: HttpStream, start_date: str, **kwargs): super().__init__(**kwargs) self.parent = parent self._start_date = start_date def path(self, stream_slice: Mapping[str, Any] = None, **kwargs) -> str: return f"projects/columns/{stream_slice['column_id']}/cards" def stream_slices( self, sync_mode: SyncMode, cursor_field: List[str] = None, stream_state: Mapping[str, Any] = None ) -> Iterable[Optional[Mapping[str, Any]]]: parent_stream_slices = self.parent.stream_slices( sync_mode=SyncMode.full_refresh, cursor_field=cursor_field, stream_state=stream_state ) for stream_slice in parent_stream_slices: parent_records = self.parent.read_records( sync_mode=SyncMode.full_refresh, cursor_field=cursor_field, stream_slice=stream_slice, stream_state=stream_state ) for record in parent_records: yield {"repository": record["repository"], "project_id": record["project_id"], "column_id": record["id"]} def read_records( self, sync_mode: SyncMode, cursor_field: List[str] = None, stream_slice: Mapping[str, Any] = None, stream_state: Mapping[str, Any] = None, ) -> Iterable[Mapping[str, Any]]: starting_point = self.get_starting_point(stream_state=stream_state, stream_slice=stream_slice) for record in super().read_records( sync_mode=sync_mode, cursor_field=cursor_field, stream_slice=stream_slice, stream_state=stream_state ): if record[self.cursor_field] > starting_point: yield record def get_starting_point(self, stream_state: Mapping[str, Any], stream_slice: Mapping[str, Any]) -> str: if stream_state: repository = stream_slice["repository"] project_id = str(stream_slice["project_id"]) column_id = str(stream_slice["column_id"]) stream_state_value = stream_state.get(repository, {}).get(project_id, {}).get(column_id, {}).get(self.cursor_field) if stream_state_value: return max(self._start_date, stream_state_value) return self._start_date def get_updated_state(self, current_stream_state: MutableMapping[str, Any], latest_record: Mapping[str, Any]): repository = latest_record["repository"] project_id = str(latest_record["project_id"]) column_id = str(latest_record["column_id"]) updated_state = latest_record[self.cursor_field] stream_state_value = current_stream_state.get(repository, {}).get(project_id, {}).get(column_id, {}).get(self.cursor_field) if stream_state_value: updated_state = max(updated_state, stream_state_value) current_stream_state.setdefault(repository, {}).setdefault(project_id, {}).setdefault(column_id, {})[ self.cursor_field ] = updated_state return current_stream_state def transform(self, record: MutableMapping[str, Any], stream_slice: Mapping[str, Any]) -> MutableMapping[str, Any]: record = super().transform(record=record, stream_slice=stream_slice) record["project_id"] = stream_slice["project_id"] record["column_id"] = stream_slice["column_id"] return record class Workflows(SemiIncrementalMixin, GithubStream): """ Get all workflows of a GitHub repository API documentation: https://docs.github.com/en/rest/actions/workflows#list-repository-workflows """ def path(self, stream_slice: Mapping[str, Any] = None, **kwargs) -> str: return f"repos/{stream_slice['repository']}/actions/workflows" def parse_response(self, response: requests.Response, stream_slice: Mapping[str, Any] = None, **kwargs) -> Iterable[Mapping]: response = response.json().get("workflows") for record in response: yield self.transform(record=record, stream_slice=stream_slice) def convert_cursor_value(self, value): return pendulum.parse(value).in_tz(tz="UTC").format("YYYY-MM-DDTHH:mm:ss[Z]") class WorkflowRuns(SemiIncrementalMixin, GithubStream): """ Get all workflow runs for a GitHub repository API documentation: https://docs.github.com/en/rest/actions/workflow-runs#list-workflow-runs-for-a-repository """ # key for accessing slice value from record record_slice_key = ["repository", "full_name"] # https://docs.github.com/en/actions/managing-workflow-runs/re-running-workflows-and-jobs re_run_period = 32 # days def path(self, stream_slice: Mapping[str, Any] = None, **kwargs) -> str: return f"repos/{stream_slice['repository']}/actions/runs" def parse_response(self, response: requests.Response, stream_slice: Mapping[str, Any] = None, **kwargs) -> Iterable[Mapping]: response = response.json().get("workflow_runs") for record in response: yield record def read_records( self, sync_mode: SyncMode, cursor_field: List[str] = None, stream_slice: Mapping[str, Any] = None, stream_state: Mapping[str, Any] = None, ) -> Iterable[Mapping[str, Any]]: # Records in the workflows_runs stream are naturally descending sorted by `created_at` field. # On first sight this is not big deal because cursor_field is `updated_at`. # But we still can use `created_at` as a breakpoint because after 30 days period # https://docs.github.com/en/actions/managing-workflow-runs/re-running-workflows-and-jobs # workflows_runs records cannot be updated. It means if we initially fully synced stream on subsequent incremental sync we need # only to look behind on 30 days to find all records which were updated. start_point = self.get_starting_point(stream_state=stream_state, stream_slice=stream_slice) break_point = (pendulum.parse(start_point) - pendulum.duration(days=self.re_run_period)).to_iso8601_string() for record in super(SemiIncrementalMixin, self).read_records( sync_mode=sync_mode, cursor_field=cursor_field, stream_slice=stream_slice, stream_state=stream_state ): cursor_value = record[self.cursor_field] created_at = record["created_at"] if cursor_value > start_point: yield record if created_at < break_point: break class WorkflowJobs(SemiIncrementalMixin, GithubStream): """ Get all workflow jobs for a workflow run API documentation: https://docs.github.com/pt/rest/actions/workflow-jobs#list-jobs-for-a-workflow-run """ cursor_field = "completed_at" def __init__(self, parent: WorkflowRuns, **kwargs): super().__init__(**kwargs) self.parent = parent def path(self, stream_slice: Mapping[str, Any] = None, **kwargs) -> str: return f"repos/{stream_slice['repository']}/actions/runs/{stream_slice['run_id']}/jobs" def read_records( self, sync_mode: SyncMode, cursor_field: List[str] = None, stream_slice: Mapping[str, Any] = None, stream_state: Mapping[str, Any] = None, ) -> Iterable[Mapping[str, Any]]: parent_stream_state = None if stream_state is not None: parent_stream_state = {repository: {self.parent.cursor_field: v[self.cursor_field]} for repository, v in stream_state.items()} parent_stream_slices = self.parent.stream_slices(sync_mode=sync_mode, cursor_field=cursor_field, stream_state=parent_stream_state) for stream_slice in parent_stream_slices: parent_records = self.parent.read_records( sync_mode=sync_mode, cursor_field=cursor_field, stream_slice=stream_slice, stream_state=parent_stream_state ) for record in parent_records: stream_slice["run_id"] = record["id"] yield from super().read_records( sync_mode=sync_mode, cursor_field=cursor_field, stream_slice=stream_slice, stream_state=stream_state ) def parse_response( self, response: requests.Response, stream_state: Mapping[str, Any], stream_slice: Mapping[str, Any] = None, next_page_token: Mapping[str, Any] = None, ) -> Iterable[Mapping]: for record in response.json()["jobs"]: if record.get(self.cursor_field): yield self.transform(record=record, stream_slice=stream_slice) def request_params( self, stream_state: Mapping[str, Any], stream_slice: Mapping[str, Any] = None, next_page_token: Mapping[str, Any] = None ) -> MutableMapping[str, Any]: params = super().request_params(stream_state=stream_state, stream_slice=stream_slice, next_page_token=next_page_token) params["filter"] = "all" return params class TeamMembers(GithubStream): """ API docs: https://docs.github.com/en/rest/reference/teams#list-team-members """ use_cache = True primary_key = ["id", "team_slug"] def __init__(self, parent: Teams, **kwargs): super().__init__(**kwargs) self.parent = parent def path(self, stream_slice: Mapping[str, Any] = None, **kwargs) -> str: return f"orgs/{stream_slice['organization']}/teams/{stream_slice['team_slug']}/members" def stream_slices( self, sync_mode: SyncMode, cursor_field: List[str] = None, stream_state: Mapping[str, Any] = None ) -> Iterable[Optional[Mapping[str, Any]]]: parent_stream_slices = self.parent.stream_slices( sync_mode=SyncMode.full_refresh, cursor_field=cursor_field, stream_state=stream_state ) for stream_slice in parent_stream_slices: parent_records = self.parent.read_records( sync_mode=SyncMode.full_refresh, cursor_field=cursor_field, stream_slice=stream_slice, stream_state=stream_state ) for record in parent_records: yield {"organization": record["organization"], "team_slug": record["slug"]} def transform(self, record: MutableMapping[str, Any], stream_slice: Mapping[str, Any]) -> MutableMapping[str, Any]: record["organization"] = stream_slice["organization"] record["team_slug"] = stream_slice["team_slug"] return record class TeamMemberships(GithubStream): """ API docs: https://docs.github.com/en/rest/reference/teams#get-team-membership-for-a-user """ primary_key = ["url"] def __init__(self, parent: TeamMembers, **kwargs): super().__init__(**kwargs) self.parent = parent def path(self, stream_slice: Mapping[str, Any] = None, **kwargs) -> str: return f"orgs/{stream_slice['organization']}/teams/{stream_slice['team_slug']}/memberships/{stream_slice['username']}" def stream_slices( self, sync_mode: SyncMode, cursor_field: List[str] = None, stream_state: Mapping[str, Any] = None ) -> Iterable[Optional[Mapping[str, Any]]]: parent_stream_slices = self.parent.stream_slices( sync_mode=SyncMode.full_refresh, cursor_field=cursor_field, stream_state=stream_state ) for stream_slice in parent_stream_slices: parent_records = self.parent.read_records( sync_mode=SyncMode.full_refresh, cursor_field=cursor_field, stream_slice=stream_slice, stream_state=stream_state ) for record in parent_records: yield {"organization": record["organization"], "team_slug": record["team_slug"], "username": record["login"]} def parse_response(self, response: requests.Response, stream_slice: Mapping[str, Any], **kwargs) -> Iterable[Mapping]: yield self.transform(response.json(), stream_slice=stream_slice) def transform(self, record: MutableMapping[str, Any], stream_slice: Mapping[str, Any]) -> MutableMapping[str, Any]: record["organization"] = stream_slice["organization"] record["team_slug"] = stream_slice["team_slug"] record["username"] = stream_slice["username"] return record class ContributorActivity(GithubStream): """ API docs: https://docs.github.com/en/rest/metrics/statistics?apiVersion=2022-11-28#get-all-contributor-commit-activity """ def path(self, stream_slice: Mapping[str, Any] = None, **kwargs) -> str: return f"repos/{stream_slice['repository']}/stats/contributors" def request_headers(self, **kwargs) -> Mapping[str, Any]: params = super().request_headers(**kwargs) params.update({"Accept": "application/vnd.github+json", "X-GitHub-Api-Version": "2022-11-28"}) return params def transform(self, record: MutableMapping[str, Any], stream_slice: Mapping[str, Any]) -> MutableMapping[str, Any]: record["repository"] = stream_slice["repository"] record.update(record.pop("author")) return record def should_retry(self, response: requests.Response) -> bool: """ If the data hasn't been cached when you query a repository's statistics, you'll receive a 202 response, need to retry to get results see for more info https://docs.github.com/en/rest/metrics/statistics?apiVersion=2022-11-28#a-word-about-caching """ if super().should_retry(response) or response.status_code == requests.codes.ACCEPTED: return True def backoff_time(self, response: requests.Response) -> Optional[float]: return 10 if response.status_code == requests.codes.ACCEPTED else super().backoff_time(response)