* Rough first implementation of AvailabilityStrategy s
* Basic unit tests for AvailabilityStrategy and ScopedAvailabilityStrategy
* Make availability_strategy a property, separate out tests
* Remove from DeclarativeSource, remove Source parameter from methods, make default no AvailabilityStrategy
* Add skip stream if not available to read()
* Changes to CDK to get source-github working using AvailabilityStrategy, flakecheck
* reorganize cdk class, add HTTPAvailabilityStrategy test
* cleanup, docstrings
* source-github working with availabilitystrategy
* reorganize source-github
* source-github: resolve deprecation warning
* use correct import for HTTPAvailabilitystrategy
* pull out error handling into separate method
* use handle_http_error method
* Pass source and logger to check_connection method
* Add documentation links, handle 403 specifically
* Fix circular import
* Add AvailabilityStrategy to Stream and HTTPStream classes
* Remove AS from abstract_source, add to Stream, HTTPStream, AvailabilityStrategy unit tests passing for per-stream strategies
* Modify MockHttpStream to set no AvailabilityStrategy since source test mocking doesn't support this
* Move AvailabilityStrategy class to sources.streams
* Move HTTPAvailabilityStrategy to http module
* Use pascal case for HttpAvailabilityStrategy
* Remove docs message method :( and default to True availability on unhandled HTTPErrors
* add check_availability method to stream class
* Add optional source parameter
* Add test for connector-specific documentation, small tests refactor
* Add test that performs the read() function for stream with default availability strategy
* Add test for read function behavior when stream is unavailable
* Add 403 info in logger message
* Don't return error for other HTTPErrors
* Split up error handling into methods 'unavailable_error_codes' and 'get_reason_for_error'
* rework overrideable list of status codes to be a dict with reasons, to enforce that users provide reasons for all listed errors
* Fix incorrect typing
* Move HttpAvailability to its own module, fix flake errors
* Fix ScopedAvailabilityStrategy, docstrings and types for streams/availability_strategy.py
* Docstrings and types for core.py and http/availability_strategy.py
* Move _get_stream_slices to a StreamHelper class
* Docstrings + types for stream_helpers.py, cleanup test_availability.py
* Clean up test_source.py
* Move logic of getting the initial record from a stream to StreamHelper class
* Add changelog and bump minor version
* change 'is True' and 'is False' behavior
* use mocker.MagicMock
* Remove ScopedAvailabilityStrategy
* Don't except non-403 errors, check_stream uses availability_strategy if possible
* Move AvailabilityStrategy to stream level, fix tests
* make get_stream_slice public
* Attempt to refactor error code handling into repository-based and organization-based
* split into repository-based availabilitystrategy and organization-based availabilitystrategy
* refactor organization-based availabilitystrategy
* refactor repository-based availabilitystrategy, create separate ones for workflow_runs and projects
* Fix workflow runs availability strategy
* move availability strategies to a different module
* CDK: pass error to reasons_for_error_codes
* make get_stream_slice public
* Revert "make get_stream_slice public"
This reverts commit 9170fe5f59.
* Add tests for raising unhandled errors and retries are handled
* Add tests for CheckStream via AvailabilityStrategy
* Remove moved file
* bump CDK dependency
* Cleanup: Address review comments
* One more fix
* Update changelog and dockerfile version
232 lines
10 KiB
Python
232 lines
10 KiB
Python
#
|
|
# Copyright (c) 2022 Airbyte, Inc., all rights reserved.
|
|
#
|
|
|
|
|
|
from typing import Any, Dict, List, Mapping, Optional, Tuple
|
|
|
|
from airbyte_cdk import AirbyteLogger
|
|
from airbyte_cdk.models import SyncMode
|
|
from airbyte_cdk.sources import AbstractSource
|
|
from airbyte_cdk.sources.streams import Stream
|
|
from airbyte_cdk.sources.streams.http.requests_native_auth.token import MultipleTokenAuthenticator
|
|
|
|
from .streams import (
|
|
Assignees,
|
|
Branches,
|
|
Collaborators,
|
|
Comments,
|
|
CommitCommentReactions,
|
|
CommitComments,
|
|
Commits,
|
|
Deployments,
|
|
Events,
|
|
IssueCommentReactions,
|
|
IssueEvents,
|
|
IssueLabels,
|
|
IssueMilestones,
|
|
IssueReactions,
|
|
Issues,
|
|
Organizations,
|
|
ProjectCards,
|
|
ProjectColumns,
|
|
Projects,
|
|
PullRequestCommentReactions,
|
|
PullRequestCommits,
|
|
PullRequests,
|
|
PullRequestStats,
|
|
Releases,
|
|
Repositories,
|
|
RepositoryStats,
|
|
ReviewComments,
|
|
Reviews,
|
|
Stargazers,
|
|
Tags,
|
|
TeamMembers,
|
|
TeamMemberships,
|
|
Teams,
|
|
Users,
|
|
WorkflowJobs,
|
|
WorkflowRuns,
|
|
Workflows,
|
|
)
|
|
from .utils import read_full_refresh
|
|
|
|
TOKEN_SEPARATOR = ","
|
|
DEFAULT_PAGE_SIZE_FOR_LARGE_STREAM = 10
|
|
|
|
|
|
class SourceGithub(AbstractSource):
    """Airbyte source connector for the GitHub API.

    Resolves the configured organizations/repositories, validates connectivity,
    and instantiates all supported streams.
    """

    @staticmethod
    def _get_org_repositories(config: Mapping[str, Any], authenticator: MultipleTokenAuthenticator) -> Tuple[List[str], List[str]]:
        """
        Parse config.repository and produce two lists: organizations, repositories.

        Entries of the form ``<org>/*`` select every repository of an organization
        (expanded via the ``Repositories`` stream); explicit ``<org>/<repo>`` entries
        are validated via the ``RepositoryStats`` stream.

        Args:
            config (dict): Dict representing connector's config
            authenticator (MultipleTokenAuthenticator): authenticator object

        Raises:
            Exception: if the `repository` field is empty.
        """
        config_repositories = set(filter(None, config["repository"].split(" ")))
        if not config_repositories:
            raise Exception("Field `repository` required to be provided for connect to Github API")

        repositories = set()
        organizations = set()
        unchecked_repos = set()
        unchecked_orgs = set()

        # Split config entries into whole-org wildcards and explicit repos.
        for org_repos in config_repositories:
            org, _, repos = org_repos.partition("/")
            if repos == "*":
                unchecked_orgs.add(org)
            else:
                unchecked_repos.add(org_repos)

        if unchecked_orgs:
            # Expand each `<org>/*` wildcard into the org's full repository list.
            stream = Repositories(authenticator=authenticator, organizations=unchecked_orgs)
            for record in read_full_refresh(stream):
                repositories.add(record["full_name"])
                organizations.add(record["organization"])

        # Explicit repos already discovered via an org wildcard need no re-check.
        unchecked_repos = unchecked_repos - repositories
        if unchecked_repos:
            stream = RepositoryStats(
                authenticator=authenticator,
                repositories=unchecked_repos,
                page_size_for_large_streams=config.get("page_size_for_large_streams", DEFAULT_PAGE_SIZE_FOR_LARGE_STREAM),
            )
            for record in read_full_refresh(stream):
                repositories.add(record["full_name"])
                # Personal (non-org) repositories have no "organization" field.
                organization = record.get("organization", {}).get("login")
                if organization:
                    organizations.add(organization)

        return list(organizations), list(repositories)

    @staticmethod
    def _get_authenticator(config: Dict[str, Any]) -> MultipleTokenAuthenticator:
        """Build a token authenticator from the connector config.

        Accepts either a legacy root-level `access_token` or the newer
        `credentials` object; multiple tokens may be comma-separated.

        Raises:
            Exception: if no token can be found in the config.
        """
        # Before we supported oauth, personal_access_token was called `access_token` and it lived at the
        # config root. So we first check to make sure any backwards compatibility is handled.
        token = config.get("access_token")
        if not token:
            creds = config.get("credentials")
            if not creds:
                # Fail with an actionable message instead of an AttributeError on `None.get(...)`.
                raise Exception("Field `access_token` or `credentials` must be provided to connect to the Github API")
            token = creds.get("access_token") or creds.get("personal_access_token")
        tokens = [t.strip() for t in token.split(TOKEN_SEPARATOR)]
        return MultipleTokenAuthenticator(tokens=tokens, auth_method="token")

    @staticmethod
    def _get_branches_data(
        selected_branches: str, full_refresh_args: Optional[Dict[str, Any]] = None
    ) -> Tuple[Dict[str, str], Dict[str, List[str]]]:
        """Compute the branch configuration for the Commits stream.

        Args:
            selected_branches: space-separated `<org>/<repo>/<branch>` entries from config.
            full_refresh_args: kwargs used to construct the helper streams; must
                include a "repositories" key.

        Returns:
            Tuple of (default branch per repo, branches to pull commits from per repo).
        """
        selected_branches = set(filter(None, selected_branches.split(" ")))

        # Get the default branch for each repository
        default_branches = {}
        repository_stats_stream = RepositoryStats(**full_refresh_args)
        for stream_slice in repository_stats_stream.stream_slices(sync_mode=SyncMode.full_refresh):
            default_branches.update(
                {
                    repo_stats["full_name"]: repo_stats["default_branch"]
                    for repo_stats in repository_stats_stream.read_records(sync_mode=SyncMode.full_refresh, stream_slice=stream_slice)
                }
            )

        # Collect every existing branch so user selections can be validated.
        all_branches = []
        branches_stream = Branches(**full_refresh_args)
        for stream_slice in branches_stream.stream_slices(sync_mode=SyncMode.full_refresh):
            for branch in branches_stream.read_records(sync_mode=SyncMode.full_refresh, stream_slice=stream_slice):
                all_branches.append(f"{branch['repository']}/{branch['name']}")

        # Create mapping of repository to list of branches to pull commits for
        # If no branches are specified for a repo, use its default branch
        branches_to_pull: Dict[str, List[str]] = {}
        for repo in full_refresh_args["repositories"]:
            repo_branches = []
            for branch in selected_branches:
                # A selection looks like `<org>/<repo>/<branch>`; split off the branch name.
                branch_parts = branch.split("/", 2)
                if "/".join(branch_parts[:2]) == repo and branch in all_branches:
                    repo_branches.append(branch_parts[-1])
            if not repo_branches:
                repo_branches = [default_branches[repo]]

            branches_to_pull[repo] = repo_branches

        return default_branches, branches_to_pull

    def check_connection(self, logger: AirbyteLogger, config: Mapping[str, Any]) -> Tuple[bool, Any]:
        """Verify the config can reach the GitHub API and resolves at least one repository.

        Returns:
            (True, None) on success, otherwise (False, <human-readable reason>).
        """
        try:
            authenticator = self._get_authenticator(config)
            _, repositories = self._get_org_repositories(config=config, authenticator=authenticator)
            if not repositories:
                return False, "no valid repositories found"
            return True, None

        except Exception as e:
            # Rewrite the most common 404 errors into actionable messages by
            # extracting the offending repo/org name from the error URL.
            message = repr(e)
            if "404 Client Error: Not Found for url: https://api.github.com/repos/" in message:
                # HTTPError('404 Client Error: Not Found for url: https://api.github.com/repos/airbytehq/airbyte3?per_page=100')"
                full_repo_name = message.split("https://api.github.com/repos/")[1]
                full_repo_name = full_repo_name.split("?")[0]
                message = f'Unknown repo name: "{full_repo_name}", use existing full repo name <organization>/<repository>'
            elif "404 Client Error: Not Found for url: https://api.github.com/orgs/" in message:
                # HTTPError('404 Client Error: Not Found for url: https://api.github.com/orgs/airbytehqBLA/repos?per_page=100')"
                org_name = message.split("https://api.github.com/orgs/")[1]
                org_name = org_name.split("/")[0]
                message = f'Unknown organization name: "{org_name}"'

            return False, message

    def streams(self, config: Mapping[str, Any]) -> List[Stream]:
        """Instantiate and return every stream supported by this connector."""
        authenticator = self._get_authenticator(config)
        organizations, repositories = self._get_org_repositories(config=config, authenticator=authenticator)
        page_size = config.get("page_size_for_large_streams", DEFAULT_PAGE_SIZE_FOR_LARGE_STREAM)

        organization_args = {"authenticator": authenticator, "organizations": organizations}
        organization_args_with_start_date = {**organization_args, "start_date": config["start_date"]}
        repository_args = {"authenticator": authenticator, "repositories": repositories, "page_size_for_large_streams": page_size}
        repository_args_with_start_date = {**repository_args, "start_date": config["start_date"]}

        default_branches, branches_to_pull = self._get_branches_data(config.get("branch", ""), repository_args)
        # Streams referenced as parents by substreams are created once and reused.
        pull_requests_stream = PullRequests(**repository_args_with_start_date)
        projects_stream = Projects(**repository_args_with_start_date)
        project_columns_stream = ProjectColumns(projects_stream, **repository_args_with_start_date)
        teams_stream = Teams(**organization_args)
        team_members_stream = TeamMembers(parent=teams_stream, **repository_args)
        workflow_runs_stream = WorkflowRuns(**repository_args_with_start_date)

        return [
            Assignees(**repository_args),
            Branches(**repository_args),
            Collaborators(**repository_args),
            Comments(**repository_args_with_start_date),
            CommitCommentReactions(**repository_args_with_start_date),
            CommitComments(**repository_args_with_start_date),
            Commits(**repository_args_with_start_date, branches_to_pull=branches_to_pull, default_branches=default_branches),
            Deployments(**repository_args_with_start_date),
            Events(**repository_args_with_start_date),
            IssueCommentReactions(**repository_args_with_start_date),
            IssueEvents(**repository_args_with_start_date),
            IssueLabels(**repository_args),
            IssueMilestones(**repository_args_with_start_date),
            IssueReactions(**repository_args_with_start_date),
            Issues(**repository_args_with_start_date),
            Organizations(**organization_args),
            ProjectCards(project_columns_stream, **repository_args_with_start_date),
            project_columns_stream,
            projects_stream,
            PullRequestCommentReactions(**repository_args_with_start_date),
            PullRequestCommits(parent=pull_requests_stream, **repository_args),
            PullRequestStats(**repository_args_with_start_date),
            pull_requests_stream,
            Releases(**repository_args_with_start_date),
            Repositories(**organization_args_with_start_date),
            ReviewComments(**repository_args_with_start_date),
            Reviews(**repository_args_with_start_date),
            Stargazers(**repository_args_with_start_date),
            Tags(**repository_args),
            teams_stream,
            team_members_stream,
            Users(**organization_args),
            Workflows(**repository_args_with_start_date),
            workflow_runs_stream,
            WorkflowJobs(parent=workflow_runs_stream, **repository_args_with_start_date),
            TeamMemberships(parent=team_members_stream, **repository_args),
        ]
|