🎉 Source GitHub: Use CDK caching and convert PR-related streams to incremental (#7250)
* Source GitHub: Use CDK caching and convert PR-related streams to incremental * Remove extra change * Consolidate * Address comments * Fix integration test config * Fix merge * Update sample state * Bump release version * Bump version * Address feedback * Bump version * Fix formatting
This commit is contained in:
@@ -217,7 +217,7 @@
|
||||
- name: GitHub
|
||||
sourceDefinitionId: ef69ef6e-aa7f-4af1-a01d-ef775033524e
|
||||
dockerRepository: airbyte/source-github
|
||||
dockerImageTag: 0.2.9
|
||||
dockerImageTag: 0.2.10
|
||||
documentationUrl: https://docs.airbyte.io/integrations/sources/github
|
||||
icon: github.svg
|
||||
sourceType: api
|
||||
|
||||
@@ -12,5 +12,5 @@ RUN pip install .
|
||||
ENV AIRBYTE_ENTRYPOINT "python /airbyte/integration_code/main.py"
|
||||
ENTRYPOINT ["python", "/airbyte/integration_code/main.py"]
|
||||
|
||||
LABEL io.airbyte.version=0.2.9
|
||||
LABEL io.airbyte.version=0.2.10
|
||||
LABEL io.airbyte.name=airbyte/source-github
|
||||
|
||||
@@ -25,9 +25,11 @@ tests:
|
||||
issue_milestones: ["airbytehq/integration-test", "updated_at"]
|
||||
issues: ["airbytehq/integration-test", "updated_at"]
|
||||
projects: ["airbytehq/integration-test", "updated_at"]
|
||||
pull_request_stats: ["airbytehq/integration-test", "updated_at"]
|
||||
pull_requests: ["airbytehq/integration-test", "updated_at"]
|
||||
releases: ["airbytehq/integration-test", "created_at"]
|
||||
review_comments: ["airbytehq/integration-test", "updated_at"]
|
||||
reviews: ["airbytehq/integration-test", "submitted_at"]
|
||||
stargazers: ["airbytehq/integration-test", "starred_at"]
|
||||
full_refresh:
|
||||
- config_path: "secrets/config.json"
|
||||
|
||||
@@ -39,6 +39,11 @@
|
||||
"updated_at": "2121-06-28T17:24:51Z"
|
||||
}
|
||||
},
|
||||
"pull_request_stats": {
|
||||
"airbytehq/integration-test": {
|
||||
"updated_at": "2121-06-29T02:04:57Z"
|
||||
}
|
||||
},
|
||||
"pull_requests": {
|
||||
"airbytehq/integration-test": {
|
||||
"updated_at": "2121-06-28T23:36:35Z"
|
||||
@@ -54,6 +59,11 @@
|
||||
"updated_at": "2121-06-23T23:57:07Z"
|
||||
}
|
||||
},
|
||||
"reviews": {
|
||||
"airbytehq/integration-test": {
|
||||
"submitted_at": "2121-06-29T02:04:57Z"
|
||||
}
|
||||
},
|
||||
"stargazers": {
|
||||
"airbytehq/integration-test": {
|
||||
"starred_at": "2121-06-29T02:04:57Z"
|
||||
|
||||
@@ -198,11 +198,14 @@
|
||||
"stream": {
|
||||
"name": "pull_request_stats",
|
||||
"json_schema": {},
|
||||
"supported_sync_modes": ["full_refresh"],
|
||||
"supported_sync_modes": ["full_refresh", "incremental"],
|
||||
"source_defined_cursor": true,
|
||||
"default_cursor_field": ["updated_at"],
|
||||
"source_defined_primary_key": [["id"]]
|
||||
},
|
||||
"sync_mode": "full_refresh",
|
||||
"destination_sync_mode": "overwrite"
|
||||
"sync_mode": "incremental",
|
||||
"destination_sync_mode": "append",
|
||||
"cursor_field": ["updated_at"]
|
||||
},
|
||||
{
|
||||
"stream": {
|
||||
@@ -257,11 +260,14 @@
|
||||
"stream": {
|
||||
"name": "reviews",
|
||||
"json_schema": {},
|
||||
"supported_sync_modes": ["full_refresh"],
|
||||
"supported_sync_modes": ["full_refresh", "incremental"],
|
||||
"source_defined_cursor": true,
|
||||
"default_cursor_field": ["submitted_at"],
|
||||
"source_defined_primary_key": [["id"]]
|
||||
},
|
||||
"sync_mode": "full_refresh",
|
||||
"destination_sync_mode": "overwrite"
|
||||
"sync_mode": "incremental",
|
||||
"destination_sync_mode": "append",
|
||||
"cursor_field": ["submitted_at"]
|
||||
},
|
||||
{
|
||||
"stream": {
|
||||
|
||||
@@ -29,6 +29,11 @@
|
||||
"created_at": "2021-06-23T23:57:07Z"
|
||||
}
|
||||
},
|
||||
"pull_request_stats": {
|
||||
"airbytehq/integration-test": {
|
||||
"updated_at": "2021-08-30T12:01:15Z"
|
||||
}
|
||||
},
|
||||
"pull_requests": {
|
||||
"airbytehq/integration-test": {
|
||||
"updated_at": "2021-06-28T23:36:35Z"
|
||||
@@ -53,5 +58,10 @@
|
||||
"airbytehq/integration-test": {
|
||||
"created_at": "2021-06-30T10:04:41Z"
|
||||
}
|
||||
},
|
||||
"reviews": {
|
||||
"airbytehq/integration-test": {
|
||||
"submitted_at": "2021-08-30T12:01:15Z"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -49,6 +49,10 @@
|
||||
},
|
||||
"changed_files": {
|
||||
"type": ["null", "integer"]
|
||||
},
|
||||
"updated_at": {
|
||||
"type": ["null", "string"],
|
||||
"format": "date-time"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -179,12 +179,12 @@ class SourceGithub(AbstractSource):
|
||||
Organizations(**organization_args),
|
||||
Projects(**repository_args_with_start_date),
|
||||
PullRequestCommentReactions(**repository_args_with_start_date),
|
||||
PullRequestStats(parent=pull_requests_stream, **repository_args),
|
||||
PullRequestStats(parent=pull_requests_stream, **repository_args_with_start_date),
|
||||
PullRequests(**repository_args_with_start_date),
|
||||
Releases(**repository_args_with_start_date),
|
||||
Repositories(**organization_args),
|
||||
ReviewComments(**repository_args_with_start_date),
|
||||
Reviews(parent=pull_requests_stream, **repository_args),
|
||||
Reviews(parent=pull_requests_stream, **repository_args_with_start_date),
|
||||
Stargazers(**repository_args_with_start_date),
|
||||
Tags(**repository_args),
|
||||
Teams(**organization_args),
|
||||
|
||||
@@ -2,7 +2,6 @@
|
||||
# Copyright (c) 2021 Airbyte, Inc., all rights reserved.
|
||||
#
|
||||
|
||||
import os
|
||||
import time
|
||||
from abc import ABC, abstractmethod
|
||||
from copy import deepcopy
|
||||
@@ -10,43 +9,16 @@ from typing import Any, Iterable, List, Mapping, MutableMapping, Optional, Union
|
||||
from urllib import parse
|
||||
|
||||
import requests
|
||||
import vcr
|
||||
from airbyte_cdk.models import SyncMode
|
||||
from airbyte_cdk.sources.streams.http import HttpStream, HttpSubStream
|
||||
from requests.exceptions import HTTPError
|
||||
from vcr.cassette import Cassette
|
||||
|
||||
|
||||
def request_cache() -> Cassette:
|
||||
"""
|
||||
Builds VCR instance.
|
||||
It deletes the file every time we create it; normally this should be called only once.
|
||||
We can't use NamedTemporaryFile here because yaml serializer doesn't work well with empty files.
|
||||
"""
|
||||
filename = "request_cache.yml"
|
||||
try:
|
||||
os.remove(filename)
|
||||
except FileNotFoundError:
|
||||
pass
|
||||
|
||||
return vcr.use_cassette(str(filename), record_mode="new_episodes", serializer="yaml")
|
||||
|
||||
|
||||
class GithubStream(HttpStream, ABC):
|
||||
cache = request_cache()
|
||||
url_base = "https://api.github.com/"
|
||||
|
||||
# To prevent dangerous behavior, the `vcr` library prohibits the use of nested caching.
|
||||
# Here's an example of dangerous behavior:
|
||||
# cache = Cassette.use('whatever')
|
||||
# with cache:
|
||||
# with cache:
|
||||
# pass
|
||||
#
|
||||
# Therefore, we will only use `cache` for the top-level stream, so as not to cause possible difficulties.
|
||||
top_level_stream = True
|
||||
|
||||
primary_key = "id"
|
||||
use_cache = True
|
||||
|
||||
# GitHub pagination could be from 1 to 100.
|
||||
page_size = 100
|
||||
@@ -100,11 +72,7 @@ class GithubStream(HttpStream, ABC):
|
||||
|
||||
def read_records(self, stream_slice: Mapping[str, any] = None, **kwargs) -> Iterable[Mapping[str, Any]]:
|
||||
try:
|
||||
if self.top_level_stream:
|
||||
with self.cache:
|
||||
yield from super().read_records(stream_slice=stream_slice, **kwargs)
|
||||
else:
|
||||
yield from super().read_records(stream_slice=stream_slice, **kwargs)
|
||||
yield from super().read_records(stream_slice=stream_slice, **kwargs)
|
||||
except HTTPError as e:
|
||||
error_msg = str(e)
|
||||
|
||||
@@ -422,6 +390,7 @@ class PullRequests(SemiIncrementalGithubStream):
|
||||
"""
|
||||
|
||||
page_size = 50
|
||||
first_read_override_key = "first_read_override"
|
||||
|
||||
def __init__(self, **kwargs):
|
||||
super().__init__(**kwargs)
|
||||
@@ -431,7 +400,7 @@ class PullRequests(SemiIncrementalGithubStream):
|
||||
"""
|
||||
Decide if this a first read or not by the presence of the state object
|
||||
"""
|
||||
self._first_read = not bool(stream_state)
|
||||
self._first_read = not bool(stream_state) or stream_state.get(self.first_read_override_key, False)
|
||||
yield from super().read_records(stream_state=stream_state, **kwargs)
|
||||
|
||||
def path(self, stream_slice: Mapping[str, Any] = None, **kwargs) -> str:
|
||||
@@ -459,7 +428,7 @@ class PullRequests(SemiIncrementalGithubStream):
|
||||
"""
|
||||
Depending if there any state we read stream in ascending or descending order.
|
||||
"""
|
||||
return self._first_read
|
||||
return not self._first_read
|
||||
|
||||
|
||||
class CommitComments(SemiIncrementalGithubStream):
|
||||
@@ -686,8 +655,8 @@ class ReviewComments(IncrementalGithubStream):
|
||||
# Pull request substreams
|
||||
|
||||
|
||||
class PullRequestSubstream(HttpSubStream, GithubStream, ABC):
|
||||
top_level_stream = False
|
||||
class PullRequestSubstream(HttpSubStream, SemiIncrementalGithubStream, ABC):
|
||||
use_cache = False
|
||||
|
||||
def __init__(self, parent: PullRequests, **kwargs):
|
||||
super().__init__(parent=parent, **kwargs)
|
||||
@@ -695,14 +664,33 @@ class PullRequestSubstream(HttpSubStream, GithubStream, ABC):
|
||||
def stream_slices(
|
||||
self, sync_mode: SyncMode, cursor_field: List[str] = None, stream_state: Mapping[str, Any] = None
|
||||
) -> Iterable[Optional[Mapping[str, Any]]]:
|
||||
parent_stream_slices = super().stream_slices(sync_mode=sync_mode, cursor_field=cursor_field, stream_state=stream_state)
|
||||
|
||||
"""
|
||||
Override the parent PullRequests stream configuration to always fetch records in ascending order
|
||||
"""
|
||||
parent_state = deepcopy(stream_state) or {}
|
||||
parent_state[PullRequests.first_read_override_key] = True
|
||||
parent_stream_slices = super().stream_slices(sync_mode=sync_mode, cursor_field=cursor_field, stream_state=parent_state)
|
||||
for parent_stream_slice in parent_stream_slices:
|
||||
yield {
|
||||
"pull_request_number": parent_stream_slice["parent"]["number"],
|
||||
"repository": parent_stream_slice["parent"]["repository"],
|
||||
}
|
||||
|
||||
def read_records(
|
||||
self,
|
||||
sync_mode: SyncMode,
|
||||
cursor_field: List[str] = None,
|
||||
stream_slice: Mapping[str, Any] = None,
|
||||
stream_state: Mapping[str, Any] = None,
|
||||
) -> Iterable[Mapping[str, Any]]:
|
||||
"""
|
||||
We've already determined the list of pull requests to run the stream against.
|
||||
Skip the start_point_map and cursor_field logic in SemiIncrementalGithubStream.read_records.
|
||||
"""
|
||||
yield from super(SemiIncrementalGithubStream, self).read_records(
|
||||
sync_mode=sync_mode, cursor_field=cursor_field, stream_slice=stream_slice, stream_state=stream_state
|
||||
)
|
||||
|
||||
|
||||
class PullRequestStats(PullRequestSubstream):
|
||||
"""
|
||||
@@ -731,11 +719,21 @@ class Reviews(PullRequestSubstream):
|
||||
API docs: https://docs.github.com/en/rest/reference/pulls#list-reviews-for-a-pull-request
|
||||
"""
|
||||
|
||||
cursor_field = "submitted_at"
|
||||
|
||||
def path(
|
||||
self, stream_state: Mapping[str, Any] = None, stream_slice: Mapping[str, Any] = None, next_page_token: Mapping[str, Any] = None
|
||||
) -> str:
|
||||
return f"repos/{stream_slice['repository']}/pulls/{stream_slice['pull_request_number']}/reviews"
|
||||
|
||||
# Set the parent stream state's cursor field before fetching its records
|
||||
def stream_slices(self, stream_state: Mapping[str, Any] = None, **kwargs) -> Iterable[Optional[Mapping[str, Any]]]:
|
||||
parent_state = deepcopy(stream_state) or {}
|
||||
for repository in self.repositories:
|
||||
if repository in parent_state and self.cursor_field in parent_state[repository]:
|
||||
parent_state[repository][self.parent.cursor_field] = parent_state[repository][self.cursor_field]
|
||||
yield from super().stream_slices(stream_state=parent_state, **kwargs)
|
||||
|
||||
|
||||
# Reactions streams
|
||||
|
||||
@@ -743,7 +741,7 @@ class Reviews(PullRequestSubstream):
|
||||
class ReactionStream(GithubStream, ABC):
|
||||
|
||||
parent_key = "id"
|
||||
top_level_stream = False
|
||||
use_cache = False
|
||||
|
||||
def __init__(self, **kwargs):
|
||||
self._stream_kwargs = deepcopy(kwargs)
|
||||
|
||||
@@ -92,6 +92,7 @@ Your token should have at least the `repo` scope. Depending on which streams you
|
||||
|
||||
| Version | Date | Pull Request | Subject |
|
||||
| :--- | :--- | :--- | :--- |
|
||||
| 0.2.10 | 2022-01-03 | [7250](https://github.com/airbytehq/airbyte/pull/7250) | Use CDK caching and convert PR-related streams to incremental |
|
||||
| 0.2.9 | 2021-12-29 | [9179](https://github.com/airbytehq/airbyte/pull/9179) | Use default retry delays on server error responses |
|
||||
| 0.2.8 | 2021-12-07 | [8524](https://github.com/airbytehq/airbyte/pull/8524) | Update connector fields title/description |
|
||||
| 0.2.7 | 2021-12-06 | [8518](https://github.com/airbytehq/airbyte/pull/8518) | Add connection retry with Github |
|
||||
|
||||
Reference in New Issue
Block a user