#
# MIT License
#
# Copyright (c) 2020 Airbyte
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#

import os
import time
from abc import ABC
from typing import Any, Iterable, List, Mapping, MutableMapping, Optional, Union
from urllib import parse

import requests
import vcr
from airbyte_cdk.models import SyncMode
from airbyte_cdk.sources.streams.http import HttpStream
from requests.exceptions import HTTPError
from vcr.cassette import Cassette


def request_cache() -> Cassette:
    """
    Builds VCR instance.
    It deletes file every time we create it, normally should be called only once.
    We can't use NamedTemporaryFile here because yaml serializer doesn't work well with empty files.
    """
    filename = "request_cache.yml"
    try:
        os.remove(filename)
    except FileNotFoundError:
        # Nothing to clean up: the cache file has not been created yet.
        pass

    return vcr.use_cassette(str(filename), record_mode="new_episodes", serializer="yaml")


class GithubStream(HttpStream, ABC):
    """
    Base class for all GitHub streams. Produces one slice per configured repository and, by default, reads
    `repos/{repository}/{stream name}` page by page.
    """

    # Shared cassette created once at class-definition time; streams opt in by entering it (`with self.cache:`).
    cache = request_cache()

    url_base = "https://api.github.com/"

    primary_key = "id"

    # GitHub pagination could be from 1 to 100.
    page_size = 100

    # Extra query parameters merged into every request of the stream (read-only, overridden in subclasses).
    stream_base_params = {}

    # Fields in below variable will be used for data clearing. Put there keys which represent:
    #   - objects `{}`, like `user`, `actor` etc.
    #   - lists `[]`, like `labels`, `assignees` etc.
    fields_to_minimize = ()

    def __init__(self, repositories: List[str], **kwargs):
        """
        :param repositories: repositories to read; each value is used verbatim in the `repos/{repository}/...` path.
        :param kwargs: forwarded unchanged to the parent constructor.
        """
        super().__init__(**kwargs)
        self.repositories = repositories

    def path(self, stream_slice: Mapping[str, Any] = None, **kwargs) -> str:
        """Return the request path for the repository of the given slice."""
        return f"repos/{stream_slice['repository']}/{self.name}"

    def stream_slices(self, **kwargs) -> Iterable[Optional[Mapping[str, Any]]]:
        """Yield one `{"repository": ...}` slice per configured repository."""
        for repository in self.repositories:
            yield {"repository": repository}

    def next_page_token(self, response: requests.Response) -> Optional[Mapping[str, Any]]:
        """
        Extract the `page` query parameter from the `next` link of the response.

        :return: `{"page": <value>}` when the response has a `next` link, otherwise `None`.
        """
        links = response.links
        if "next" in links:
            next_link = links["next"]["url"]
            parsed_link = parse.urlparse(next_link)
            page = dict(parse.parse_qsl(parsed_link.query)).get("page")
            return {"page": page}
        return None

    def should_retry(self, response: requests.Response) -> bool:
        """Retry when the rate limit is exhausted or the response status is 500 / 502."""
        # We don't call `super()` here because we have custom error handling and GitHub API sometimes returns strange
        # errors. So in `read_records()` we have custom error handling which don't require to call `super()` here.
        return response.headers.get("X-RateLimit-Remaining") == "0" or response.status_code in (
            requests.codes.SERVER_ERROR,
            requests.codes.BAD_GATEWAY,
        )

    def backoff_time(self, response: requests.Response) -> Union[int, float]:
        """
        Return the number of seconds to wait before retrying the request.

        :return: 0.5 for a 502 response; otherwise the time left until `X-RateLimit-Reset`, but never less than 60.
        """
        # This method is called if we run into the rate limit. GitHub limits requests to 5000 per hour and provides
        # `X-RateLimit-Reset` header which contains time when this hour will be finished and limits will be reset so
        # we again could have 5000 per another hour.

        if response.status_code == requests.codes.BAD_GATEWAY:
            return 0.5

        reset_time = response.headers.get("X-RateLimit-Reset")
        backoff_time = float(reset_time) - time.time() if reset_time else 60

        # Wait at least 60 seconds; this also guarantees that no negative value will be returned.
        return max(backoff_time, 60)

    def read_records(self, stream_slice: Mapping[str, Any] = None, **kwargs) -> Iterable[Mapping[str, Any]]:
        """
        Read records of the slice. A 403 response (except for the `Repositories` stream) and a 404 response for a
        `/teams?` URL are logged as warnings and end the slice; any other `HTTPError` is logged and re-raised.
        """
        try:
            yield from super().read_records(stream_slice=stream_slice, **kwargs)
        except HTTPError as e:
            error_msg = str(e)

            # This whole try/except situation in `read_records()` isn't good but right now in `self._send_request()`
            # function we have `response.raise_for_status()` so we don't have much choice on how to handle errors.
            # Blocked on https://github.com/airbytehq/airbyte/issues/3514.
            if e.response.status_code == requests.codes.FORBIDDEN:
                # When using the `check` method, we should raise an error if we do not have access to the repository.
                if isinstance(self, Repositories):
                    raise e
                error_msg = (
                    f"Syncing `{self.name}` stream isn't available for repository "
                    f"`{stream_slice['repository']}` and your `access_token`, seems like you don't have permissions for "
                    f"this stream."
                )
            elif e.response.status_code == requests.codes.NOT_FOUND and "/teams?" in error_msg:
                # For private repositories `Teams` stream is not available and we get "404 Client Error: Not Found for
                # url: https://api.github.com/orgs/sherifnada/teams?per_page=100" error.
                error_msg = f"Syncing `Team` stream isn't available for repository `{stream_slice['repository']}`."
            else:
                self.logger.error(f"Undefined error while reading records: {error_msg}")
                raise e

            self.logger.warn(error_msg)

    def request_params(
        self, stream_state: Mapping[str, Any], stream_slice: Mapping[str, Any] = None, next_page_token: Mapping[str, Any] = None
    ) -> MutableMapping[str, Any]:
        """Build query parameters: `per_page`, then the page token (if any), then `stream_base_params`."""
        params = {"per_page": self.page_size}

        if next_page_token:
            params.update(next_page_token)

        params.update(self.stream_base_params)

        return params

    def request_headers(self, **kwargs) -> Mapping[str, Any]:
        """Return the headers sent with every request."""
        # Without sending `User-Agent` header we will be getting `403 Client Error: Forbidden for url` error.
        return {
            "User-Agent": "PostmanRuntime/7.28.0",
        }

    def parse_response(
        self,
        response: requests.Response,
        stream_state: Mapping[str, Any],
        stream_slice: Mapping[str, Any] = None,
        next_page_token: Mapping[str, Any] = None,
    ) -> Iterable[Mapping]:
        """Yield every element of the JSON response after passing it through `transform()`."""
        for record in response.json():  # GitHub puts records in an array.
            yield self.transform(record=record, repository=stream_slice["repository"])

    def transform(self, record: MutableMapping[str, Any], repository: str) -> MutableMapping[str, Any]:
        """
        Use this method to:
            - remove excessive fields from record;
            - minify subelements in the record.

        For example, if you have `reviews` record which looks like this:
        {
          "id": 671782869,
          "node_id": "MDE3OlB1bGxSZXF1ZXN0UmV2aWV3NjcxNzgyODY5",
          "user": {
            "login": "keu",
            "id": 1619536,
            ... <other fields>
          },
          "body": "lgtm, just small comment",
          ... <other fields>
        }

        `user` subelement contains almost all possible fields of user and it's not optimal to store such data in
        `reviews` record. We may leave only `user.id` field and save it to `user_id` field in the record. So if you
        need to do something similar with your record you may use this method.

        The record is modified in place and also gets a `repository` key.
        """
        for field in self.fields_to_minimize:
            field_value = record.pop(field, None)
            if field_value is None:
                # Missing and null fields are both emitted as an explicit null under the original key.
                record[field] = field_value
            elif isinstance(field_value, dict):
                # Objects are replaced by `<field>_id`; an empty object yields null.
                record[f"{field}_id"] = field_value.get("id") if field_value else None
            elif isinstance(field_value, list):
                # Lists of objects are replaced by the list of their ids under the original key.
                record[field] = [value.get("id") for value in field_value]
            # Values of any other type were popped above and are dropped from the record.

        record["repository"] = repository

        return record


class SemiIncrementalGithubStream(GithubStream):
    """
    Semi incremental streams are also incremental but with one difference, they:
      - read all records;
      - output only new records.

    This means that semi incremental streams read all records (like full_refresh streams) but do filtering directly
    in the code and output only latest records (like incremental streams).
    """

    cursor_field = "updated_at"

    # This flag is used to indicate that current stream supports `sort` and `direction` request parameters and that
    # we should break processing records if possible. If `sort` is set to `updated` and `direction` is set to `desc`
    # this means that latest records will be at the beginning of the response and after we processed those latest
    # records we can just stop and not process other record. This will increase speed of each incremental stream
    # which supports those 2 request parameters. Currently only `IssueMilestones` and `PullRequests` streams are
    # supporting this.
    is_sorted_descending = False

    def __init__(self, start_date: str, **kwargs):
        """
        :param start_date: lower bound for the cursor value; compared as a string with record cursor values.
        :param kwargs: forwarded unchanged to `GithubStream.__init__`.
        """
        super().__init__(**kwargs)
        self._start_date = start_date

    @property
    def state_checkpoint_interval(self) -> Optional[int]:
        """Return `page_size` when records are not read in descending order, otherwise `None`."""
        if not self.is_sorted_descending:
            return self.page_size
        return None

    def get_updated_state(self, current_stream_state: MutableMapping[str, Any], latest_record: Mapping[str, Any]):
        """
        Return the latest state by comparing the cursor value in the latest record with the stream's most recent state
        object and returning an updated state object.

        State is kept per repository: `{<repository>: {<cursor_field>: <value>}}`. The passed state is updated in place.
        """
        state_value = latest_cursor_value = latest_record.get(self.cursor_field)
        current_repository = latest_record["repository"]

        if current_stream_state.get(current_repository, {}).get(self.cursor_field):
            state_value = max(latest_cursor_value, current_stream_state[current_repository][self.cursor_field])
        current_stream_state[current_repository] = {self.cursor_field: state_value}
        return current_stream_state

    def get_starting_point(self, stream_state: Mapping[str, Any], repository: str) -> str:
        """Return the greater of the configured start date and the cursor value saved for the repository, if any."""
        start_point = self._start_date

        if stream_state and stream_state.get(repository, {}).get(self.cursor_field):
            start_point = max(start_point, stream_state[repository][self.cursor_field])

        return start_point

    def read_records(
        self,
        sync_mode: SyncMode,
        cursor_field: List[str] = None,
        stream_slice: Mapping[str, Any] = None,
        stream_state: Mapping[str, Any] = None,
    ) -> Iterable[Mapping[str, Any]]:
        """
        Yield only records whose cursor value is greater than the starting point of the slice's repository. For
        streams sorted in descending order reading stops at the first record older than the starting point.
        """
        # The starting point depends only on the current slice, so compute it once instead of per record.
        start_point = self.get_starting_point(stream_state=stream_state, repository=stream_slice["repository"])
        for record in super().read_records(
            sync_mode=sync_mode, cursor_field=cursor_field, stream_slice=stream_slice, stream_state=stream_state
        ):
            cursor_value = record.get(self.cursor_field)
            if cursor_value > start_point:
                yield record
            elif self.is_sorted_descending and cursor_value < start_point:
                break


class IncrementalGithubStream(SemiIncrementalGithubStream):
    """Streams that additionally send the starting point to the API as the `since` request parameter."""

    def request_params(self, stream_state: Mapping[str, Any], stream_slice: Mapping[str, Any] = None, **kwargs) -> MutableMapping[str, Any]:
        """Extend the base parameters with `since` set to the starting point of the slice's repository."""
        params = super().request_params(stream_state=stream_state, **kwargs)
        params["since"] = self.get_starting_point(stream_state=stream_state, repository=stream_slice["repository"])
        return params


class Repositories(GithubStream):
    """
    This stream is technical and not intended for the user, it is used only to obtain repositories for organizations.
    API docs: https://docs.github.com/en/rest/reference/repos#list-organization-repositories
    """

    def __init__(self, organizations: List[str], **kwargs):
        """
        :param organizations: organizations whose repositories are listed; one slice is produced per organization.
        :param kwargs: forwarded unchanged to the constructor that follows `GithubStream` in the MRO.
        """
        # `GithubStream.__init__` is skipped on purpose: it requires `repositories`, which this stream does not have.
        super(GithubStream, self).__init__(**kwargs)
        self.organizations = organizations

    def stream_slices(self, **kwargs) -> Iterable[Optional[Mapping[str, Any]]]:
        """Yield one `{"organization": ...}` slice per configured organization."""
        for organization in self.organizations:
            yield {"organization": organization}

    def path(self, stream_slice: Mapping[str, Any] = None, **kwargs) -> str:
        """Return the request path listing repositories of the slice's organization."""
        return f"orgs/{stream_slice['organization']}/repos"

    def parse_response(self, response: requests.Response, **kwargs) -> Iterable[Mapping]:
        """Yield the elements of the JSON response unchanged (no `transform()` is applied)."""
        yield from response.json()


# Below are full refresh streams


class Assignees(GithubStream):
    """
    API docs: https://docs.github.com/en/rest/reference/issues#list-assignees
    """


class Reviews(GithubStream):
    """
    API docs: https://docs.github.com/en/rest/reference/pulls#list-reviews-for-a-pull-request
    """

    def path(
        self, stream_state: Mapping[str, Any] = None, stream_slice: Mapping[str, Any] = None, next_page_token: Mapping[str, Any] = None
    ) -> str:
        """Return the request path listing reviews of the slice's pull request."""
        return f"repos/{stream_slice['repository']}/pulls/{stream_slice['pull_request_number']}/reviews"

    def stream_slices(self, **kwargs) -> Iterable[Optional[Mapping[str, Any]]]:
        """Yield one slice per pull request of every configured repository, read through the `PullRequests` stream."""
        for stream_slice in super().stream_slices(**kwargs):
            # An empty `start_date` compares lower than any non-empty cursor value, so pull requests are not filtered out.
            pull_requests_stream = PullRequests(authenticator=self.authenticator, repositories=[stream_slice["repository"]], start_date="")
            for pull_request in pull_requests_stream.read_records(sync_mode=SyncMode.full_refresh, stream_slice=stream_slice):
                yield {"pull_request_number": pull_request["number"], "repository": stream_slice["repository"]}


class Collaborators(GithubStream):
    """
    API docs: https://docs.github.com/en/rest/reference/repos#list-repository-collaborators
    """


class IssueLabels(GithubStream):
    """
    API docs: https://docs.github.com/en/free-pro-team@latest/rest/reference/issues#list-labels-for-a-repository
    """

    def path(self, stream_slice: Mapping[str, Any] = None, **kwargs) -> str:
        """Return the request path listing labels of the slice's repository."""
        return f"repos/{stream_slice['repository']}/labels"


class Teams(GithubStream):
    """
    API docs: https://docs.github.com/en/rest/reference/teams#list-teams
    """

    def path(self, stream_slice: Mapping[str, Any] = None, **kwargs) -> str:
        """Return the request path listing teams of the organization that owns the slice's repository."""
        # Repository is expected in `<owner>/<name>` form; any other number of parts raises `ValueError` here.
        owner, _ = stream_slice["repository"].split("/")
        return f"orgs/{owner}/teams"


# Below are semi incremental streams


class Releases(SemiIncrementalGithubStream):
    """
    API docs: https://docs.github.com/en/rest/reference/repos#list-releases
    """

    cursor_field = "created_at"
    fields_to_minimize = ("author",)

    def transform(self, record: MutableMapping[str, Any], repository: str = None) -> MutableMapping[str, Any]:
        """Apply the base transformation and replace `uploader` of every asset with `uploader_id`."""
        record = super().transform(record=record, repository=repository)

        assets = record.get("assets", [])
        for asset in assets:
            uploader = asset.pop("uploader", None)
            asset["uploader_id"] = uploader.get("id") if uploader else None

        return record


class Events(SemiIncrementalGithubStream):
    """
    API docs: https://docs.github.com/en/rest/reference/activity#list-repository-events
    """

    cursor_field = "created_at"
    fields_to_minimize = (
        "actor",
        "repo",
        "org",
    )


class PullRequests(SemiIncrementalGithubStream):
    """
    API docs: https://docs.github.com/en/rest/reference/pulls#list-pull-requests
    """

    page_size = 50
    fields_to_minimize = (
        "milestone",
        "assignee",
        "labels",
        "assignees",
        "requested_reviewers",
        "requested_teams",
    )

    def __init__(self, **kwargs):
        """
        :param kwargs: forwarded unchanged to `SemiIncrementalGithubStream.__init__`.
        """
        super().__init__(**kwargs)
        self._first_read = True

    def read_records(self, stream_state: Mapping[str, Any] = None, **kwargs) -> Iterable[Mapping[str, Any]]:
        """
        Decide if this a first read or not by the presence of the state object
        """
        self._first_read = not bool(stream_state)
        # Requests are made inside the shared cassette context (see `GithubStream.cache`).
        with self.cache:
            yield from super().read_records(stream_state=stream_state, **kwargs)

    def path(self, stream_slice: Mapping[str, Any] = None, **kwargs) -> str:
        """Return the request path listing pull requests of the slice's repository."""
        return f"repos/{stream_slice['repository']}/pulls"

    def transform(self, record: MutableMapping[str, Any], repository: str) -> MutableMapping[str, Any]:
        """Apply the base transformation and replace `repo` inside `head` and `base` with `repo_id`."""
        record = super().transform(record=record, repository=repository)

        for nested in ("head", "base"):
            entry = record.get(nested, {})
            # Pop `repo` from the entry being processed (not always from `head`), so that `base` is minimized too.
            entry["repo_id"] = (entry.pop("repo", {}) or {}).get("id")

        return record

    def request_params(self, **kwargs) -> MutableMapping[str, Any]:
        """Extend the base parameters with `state`, `sort` and a `direction` that depends on the presence of state."""
        base_params = super().request_params(**kwargs)
        # The very first time we read this stream we want to read ascending so we can save state in case of
        # a halfway failure. But if there is state, we read descending to allow incremental behavior.
        params = {"state": "all", "sort": "updated", "direction": "desc" if self.is_sorted_descending else "asc"}

        return {**base_params, **params}

    @property
    def is_sorted_descending(self) -> bool:
        """
        Depending if there any state we read stream in ascending or descending order.
        First read (no state) is ascending, every read with state is descending.
        """
        return not self._first_read


class CommitComments(SemiIncrementalGithubStream):
    """
    API docs: https://docs.github.com/en/rest/reference/repos#list-commit-comments-for-a-repository
    """

    def path(self, stream_slice: Mapping[str, Any] = None, **kwargs) -> str:
        """Return the request path listing commit comments of the slice's repository."""
        return f"repos/{stream_slice['repository']}/comments"


class IssueMilestones(SemiIncrementalGithubStream):
    """
    API docs: https://docs.github.com/en/rest/reference/issues#list-milestones
    """

    is_sorted_descending = True
    fields_to_minimize = ("creator",)
    stream_base_params = {
        "state": "all",
        "sort": "updated",
        "direction": "desc",
    }

    def path(self, stream_slice: Mapping[str, Any] = None, **kwargs) -> str:
        """Return the request path listing milestones of the slice's repository."""
        return f"repos/{stream_slice['repository']}/milestones"


class Stargazers(SemiIncrementalGithubStream):
    """
    API docs: https://docs.github.com/en/rest/reference/activity#list-stargazers
    """

    primary_key = "user_id"
    cursor_field = "starred_at"

    def request_headers(self, **kwargs) -> Mapping[str, Any]:
        """Extend the base headers with the `Accept` header required for `starred_at`."""
        base_headers = super().request_headers(**kwargs)
        # We need to send below header if we want to get `starred_at` field. See docs (Alternative response with
        # star creation timestamps) - https://docs.github.com/en/rest/reference/activity#list-stargazers.
        headers = {"Accept": "application/vnd.github.v3.star+json"}

        return {**base_headers, **headers}

    def transform(self, record: MutableMapping[str, Any], repository: str) -> MutableMapping[str, Any]:
        """
        We need to provide the "user_id" for the primary_key attribute
        and don't remove the whole "user" block from the record.
        """
        record = super().transform(record=record, repository=repository)
        record["user_id"] = record.get("user").get("id")
        return record


class Projects(SemiIncrementalGithubStream):
    """
    API docs: https://docs.github.com/en/rest/reference/projects#list-repository-projects
    """

    fields_to_minimize = ("creator",)
    stream_base_params = {
        "state": "all",
    }

    def request_headers(self, **kwargs) -> Mapping[str, Any]:
        """Extend the base headers with the `Accept` header required by the projects API."""
        base_headers = super().request_headers(**kwargs)
        # Projects stream requires sending following `Accept` header. If we won't sent it
        # we'll get `415 Client Error: Unsupported Media Type` error.
        headers = {"Accept": "application/vnd.github.inertia-preview+json"}

        return {**base_headers, **headers}


class IssueEvents(SemiIncrementalGithubStream):
    """
    API docs: https://docs.github.com/en/rest/reference/issues#list-issue-events-for-a-repository
    """

    cursor_field = "created_at"
    fields_to_minimize = (
        "actor",
        "issue",
    )

    def path(self, stream_slice: Mapping[str, Any] = None, **kwargs) -> str:
        """Return the request path listing issue events of the slice's repository."""
        return f"repos/{stream_slice['repository']}/issues/events"


# Below are incremental streams


class Comments(IncrementalGithubStream):
    """
    API docs: https://docs.github.com/en/rest/reference/issues#list-issue-comments-for-a-repository
    """

    page_size = 30  # `comments` is a large stream so it's better to set smaller page size.

    def path(self, stream_slice: Mapping[str, Any] = None, **kwargs) -> str:
        """Return the request path listing issue comments of the slice's repository."""
        return f"repos/{stream_slice['repository']}/issues/comments"


class Commits(IncrementalGithubStream):
    """
    API docs: https://docs.github.com/en/rest/reference/repos#list-commits
    """

    primary_key = "sha"
    cursor_field = "created_at"
    fields_to_minimize = (
        "author",
        "committer",
    )

    def transform(self, record: MutableMapping[str, Any], repository: str = None) -> MutableMapping[str, Any]:
        """Apply the base transformation and add a top level `created_at` taken from `commit.author.date`."""
        record = super().transform(record=record, repository=repository)

        # Record of the `commits` stream doesn't have an updated_at/created_at field at the top level (so we could
        # just write `record["updated_at"]` or `record["created_at"]`). Instead each record has such value in
        # `commit.author.date`. So the easiest way is to just enrich the record returned from API with top level
        # field `created_at` and use it as cursor_field.
        record["created_at"] = record["commit"]["author"]["date"]

        return record


class Issues(IncrementalGithubStream):
    """
    API docs: https://docs.github.com/en/rest/reference/issues#list-repository-issues
    """

    page_size = 50  # `issues` is a large stream so it's better to set smaller page size.

    fields_to_minimize = (
        "assignee",
        "milestone",
        "labels",
        "assignees",
    )
    stream_base_params = {
        "state": "all",
        "sort": "updated",
        "direction": "asc",
    }