diff --git a/pyproject.toml b/pyproject.toml
index 683dc95b..b2a35cc6 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -78,7 +78,13 @@ select = [
     "E",    # pycodestyle (errors)
     "W",    # pycodestyle (warnings)
     "I",    # isort
+    "N",    # pep8-naming
     "UP",   # pyupgrade
+    "YTT",  # flake8-2020
+    "ANN",  # flake8-annotations
+    "B",    # flake8-bugbear
+    "A",    # flake8-builtins
+    "C4",   # flake8-comprehensions
     "DTZ",  # flake8-datetimez
     "FA",   # flake8-future-annotations
     "SIM",  # flake8-simplify
@@ -87,3 +93,6 @@ select = [
     "FURB", # refurb
     "RUF",  # Ruff-specific rules
 ]
+
+[tool.ruff.lint.per-file-ignores]
+"tap_github/tests/*" = ["ANN"]
diff --git a/tap_github/authenticator.py b/tap_github/authenticator.py
index 0ca79b14..4e5f512e 100644
--- a/tap_github/authenticator.py
+++ b/tap_github/authenticator.py
@@ -33,8 +33,8 @@ def __init__(
         self,
         token: str | None,
         rate_limit_buffer: int | None = None,
-        logger: Any | None = None,
-    ):
+        logger: Any | None = None,  # noqa: ANN401
+    ) -> None:
         """Init TokenManager info."""
         self.token = token
         self.logger = logger
@@ -48,7 +48,7 @@ def __init__(
             else self.DEFAULT_RATE_LIMIT_BUFFER
         )

-    def update_rate_limit(self, response_headers: Any) -> None:
+    def update_rate_limit(self, response_headers: Any) -> None:  # noqa: ANN401
         self.rate_limit = int(response_headers["X-RateLimit-Limit"])
         self.rate_limit_remaining = int(response_headers["X-RateLimit-Remaining"])
         self.rate_limit_reset = datetime.fromtimestamp(
@@ -97,7 +97,12 @@ def has_calls_remaining(self) -> bool:
 class PersonalTokenManager(TokenManager):
     """A class to store token rate limiting information."""

-    def __init__(self, token: str, rate_limit_buffer: int | None = None, **kwargs):
+    def __init__(
+        self,
+        token: str,
+        rate_limit_buffer: int | None = None,
+        **kwargs,  # noqa: ANN003
+    ) -> None:
         """Init PersonalTokenRateLimit info."""
         super().__init__(token, rate_limit_buffer=rate_limit_buffer, **kwargs)

@@ -166,8 +171,8 @@ def __init__(
         env_key: str,
         rate_limit_buffer: int | None = None,
         expiry_time_buffer: int | None = None,
-        **kwargs,
-    ):
+        **kwargs,  # noqa: ANN003
+    ) -> None:
         if rate_limit_buffer is None:
             rate_limit_buffer = self.DEFAULT_RATE_LIMIT_BUFFER
         super().__init__(None, rate_limit_buffer=rate_limit_buffer, **kwargs)
@@ -184,7 +189,7 @@ def __init__(
         self.token_expires_at: datetime | None = None
         self.claim_token()

-    def claim_token(self):
+    def claim_token(self) -> None:
         """Updates the TokenManager's token and token_expires_at attributes.

         The outcome will be _either_ that self.token is updated to a newly claimed valid token and
@@ -242,7 +247,7 @@ class GitHubTokenAuthenticator(APIAuthenticatorBase):
     """Base class for offloading API auth."""

     @staticmethod
-    def get_env():
+    def get_env():  # noqa: ANN205
         return dict(environ)

     def prepare_tokens(self) -> list[TokenManager]:
diff --git a/tap_github/client.py b/tap_github/client.py
index 2d106413..b4ff6a03 100644
--- a/tap_github/client.py
+++ b/tap_github/client.py
@@ -65,8 +65,10 @@ def http_headers(self) -> dict[str, str]:
         return headers

     def get_next_page_token(
-        self, response: requests.Response, previous_token: Any | None
-    ) -> Any | None:
+        self,
+        response: requests.Response,
+        previous_token: Any | None,  # noqa: ANN401
+    ) -> Any | None:  # noqa: ANN401
         """Return a token for identifying next page or None if no more pages."""
         if (
             previous_token
@@ -136,7 +138,9 @@ def get_next_page_token(
         return (previous_token or 1) + 1

     def get_url_params(
-        self, context: dict | None, next_page_token: Any | None
+        self,
+        context: dict | None,
+        next_page_token: Any | None,  # noqa: ANN401
     ) -> dict[str, Any]:
         """Return a dictionary of values to be used in URL parameterization."""
         params: dict = {"per_page": self.MAX_PER_PAGE}
@@ -328,8 +332,10 @@ def parse_response(self, response: requests.Response) -> Iterable[dict]:
         yield from extract_jsonpath(self.query_jsonpath, input=resp_json)

     def get_next_page_token(
-        self, response: requests.Response, previous_token: Any | None
-    ) -> Any | None:
+        self,
+        response: requests.Response,
+        previous_token: Any | None,  # noqa: ANN401
+    ) -> Any | None:  # noqa: ANN401
         """
         Return a dict of cursors for identifying next page or None if no more pages.

@@ -370,7 +376,7 @@ def get_next_page_token(

         # We leverage previous_token to remember the pagination cursors
         # for indices below max_pagination_index.
-        next_page_cursors: dict[str, str] = dict()
+        next_page_cursors: dict[str, str] = {}
         for key, value in (previous_token or {}).items():
             # Only keep pagination info for indices below max_pagination_index.
             pagination_index = int(str(key).split("_")[1])
@@ -392,10 +398,12 @@ def get_next_page_token(
         return next_page_cursors

     def get_url_params(
-        self, context: dict | None, next_page_token: Any | None
+        self,
+        context: dict | None,
+        next_page_token: Any | None,  # noqa: ANN401
     ) -> dict[str, Any]:
         """Return a dictionary of values to be used in URL parameterization."""
-        params = context.copy() if context else dict()
+        params = context.copy() if context else {}
         params["per_page"] = self.MAX_PER_PAGE
         if next_page_token:
             params.update(next_page_token)
diff --git a/tap_github/repository_streams.py b/tap_github/repository_streams.py
index ddbff242..5d31b37c 100644
--- a/tap_github/repository_streams.py
+++ b/tap_github/repository_streams.py
@@ -35,7 +35,9 @@ class RepositoryStream(GitHubRestStream):
     replication_key = "updated_at"

     def get_url_params(
-        self, context: dict | None, next_page_token: Any | None
+        self,
+        context: dict | None,
+        next_page_token: Any | None,  # noqa: ANN401
     ) -> dict[str, Any]:
         """Return a dictionary of values to be used in URL parameterization."""
         assert context is not None, f"Context cannot be empty for '{self.name}' stream."
@@ -84,13 +86,13 @@ class TempStream(GitHubGraphqlStream):
         th.Property("databaseId", th.IntegerType),
     ).to_dict()

-    def __init__(self, tap, repo_list) -> None:
+    def __init__(self, tap, repo_list) -> None:  # noqa: ANN001
         super().__init__(tap)
         self.repo_list = repo_list

     @property
     def query(self) -> str:
-        chunks = list()
+        chunks = []
         for i, repo in enumerate(self.repo_list):
             chunks.append(
                 f'repo{i}: repository(name: "{repo[1]}", owner: "{repo[0]}") '
@@ -114,7 +116,7 @@ def validate_response(self, response: requests.Response) -> None:
         if len(repo_list) < 1:
             return []

-        repos_with_ids: list = list()
+        repos_with_ids: list = []
         temp_stream = TempStream(self._tap, list(repo_list))
         # replace manually provided org/repo values by the ones obtained
         # from github api. This guarantees that case is correct in the output data.
@@ -166,10 +168,8 @@ def partitions(self) -> list[dict[str, str]] | None:
             ]

         if "repositories" in self.config:
-            split_repo_names = list(
-                map(lambda s: s.split("/"), self.config["repositories"])
-            )
-            augmented_repo_list = list()
+            split_repo_names = [s.split("/") for s in self.config["repositories"]]
+            augmented_repo_list = []
             # chunk requests to the graphql endpoint to avoid timeouts and other
             # obscure errors that the api doesn't say much about. The actual limit
             # seems closer to 1000, use half that to stay safe.
@@ -640,7 +640,9 @@ class MilestonesStream(GitHubRestStream):
     ignore_parent_replication_key = True

     def get_url_params(
-        self, context: dict | None, next_page_token: Any | None
+        self,
+        context: dict | None,
+        next_page_token: Any | None,  # noqa: ANN401
     ) -> dict[str, Any]:
         """Return a dictionary of values to be used in URL parameterization."""
         assert context is not None, f"Context cannot be empty for '{self.name}' stream."
@@ -840,7 +842,9 @@ class IssuesStream(GitHubRestStream):
     state_partitioning_keys: ClassVar[list[str]] = ["repo", "org"]

     def get_url_params(
-        self, context: dict | None, next_page_token: Any | None
+        self,
+        context: dict | None,
+        next_page_token: Any | None,  # noqa: ANN401
     ) -> dict[str, Any]:
         """Return a dictionary of values to be used in URL parameterization."""
         assert context is not None, f"Context cannot be empty for '{self.name}' stream."
@@ -1198,7 +1202,9 @@ class PullRequestsStream(GitHubRestStream):
     use_fake_since_parameter = True

     def get_url_params(
-        self, context: dict | None, next_page_token: Any | None
+        self,
+        context: dict | None,
+        next_page_token: Any | None,  # noqa: ANN401
     ) -> dict[str, Any]:
         """Return a dictionary of values to be used in URL parameterization."""
         assert context is not None, f"Context cannot be empty for '{self.name}' stream."
@@ -1587,7 +1593,9 @@ class AnonymousContributorsStream(GitHubRestStream):
     tolerated_http_errors: ClassVar[list[int]] = [204]

     def get_url_params(
-        self, context: dict | None, next_page_token: Any | None
+        self,
+        context: dict | None,
+        next_page_token: Any | None,  # noqa: ANN401
     ) -> dict[str, Any]:
         """Return a dictionary of values to be used in URL parameterization."""
         assert context is not None, f"Context cannot be empty for '{self.name}' stream."
@@ -1626,7 +1634,7 @@ class StargazersStream(GitHubRestStream):
     # GitHub is missing the "since" parameter on this endpoint.
     use_fake_since_parameter = True

-    def __init__(self, *args, **kwargs):
+    def __init__(self, *args, **kwargs) -> None:  # noqa: ANN002, ANN003
         super().__init__(*args, **kwargs)
         # TODO - remove warning with next release.
         self.logger.warning(
@@ -1676,7 +1684,7 @@ class StargazersGraphqlStream(GitHubGraphqlStream):
     # The parent repository object changes if the number of stargazers changes.
     ignore_parent_replication_key = False

-    def __init__(self, *args, **kwargs):
+    def __init__(self, *args, **kwargs) -> None:  # noqa: ANN002, ANN003
         super().__init__(*args, **kwargs)
         # TODO - remove warning with next release.
         self.logger.warning(
@@ -1693,8 +1701,10 @@ def post_process(self, row: dict, context: dict | None = None) -> dict:
         return row

     def get_next_page_token(
-        self, response: requests.Response, previous_token: Any | None
-    ) -> Any | None:
+        self,
+        response: requests.Response,
+        previous_token: Any | None,  # noqa: ANN401
+    ) -> Any | None:  # noqa: ANN401
         """
         Exit early if a since parameter is provided.
         """
@@ -2092,7 +2102,7 @@ class WorkflowRunJobsStream(GitHubRestStream):
         th.Property("runner_group_name", th.StringType),
     ).to_dict()

-    def __init__(self, *args, **kwargs):
+    def __init__(self, *args, **kwargs) -> None:  # noqa: ANN002, ANN003
         super().__init__(*args, **kwargs)
         self._schema_emitted = False

@@ -2101,13 +2111,15 @@ def parse_response(self, response: requests.Response) -> Iterable[dict]:
         yield from extract_jsonpath(self.records_jsonpath, input=response.json())

     def get_url_params(
-        self, context: dict | None, next_page_token: Any | None
+        self,
+        context: dict | None,
+        next_page_token: Any | None,  # noqa: ANN401
     ) -> dict[str, Any]:
         params = super().get_url_params(context, next_page_token)
         params["filter"] = "all"
         return params

-    def _write_schema_message(self):
+    def _write_schema_message(self) -> None:
         """Write out a SCHEMA message with the stream schema."""
         if not self._schema_emitted:
             super()._write_schema_message()
diff --git a/tap_github/scraping.py b/tap_github/scraping.py
index 9e736129..72208583 100644
--- a/tap_github/scraping.py
+++ b/tap_github/scraping.py
@@ -112,10 +112,10 @@ def parse_counter(tag: Tag | NavigableString | None) -> int:
         else:
             title_string = cast(str, title[0])
         return int(title_string.strip().replace(",", "").replace("+", ""))
-    except (KeyError, ValueError):
+    except (KeyError, ValueError) as e:
         raise IndexError(
             f"Could not parse counter {tag}. Maybe the GitHub page format has changed?"
-        )
+        ) from e


 def scrape_metrics(
@@ -130,11 +130,11 @@ def scrape_metrics(
     try:
         issues = parse_counter(soup.find("span", id="issues-repo-tab-count"))
         prs = parse_counter(soup.find("span", id="pull-requests-repo-tab-count"))
-    except IndexError:
+    except IndexError as e:
         # These two items should exist. We raise an error if we could not find them.
         raise IndexError(
             "Could not find issues or prs info. Maybe the GitHub page format has changed?"  # noqa: E501
-        )
+        ) from e

     dependents_node = soup.find(string=used_by_regex)
     # verify that we didn't hit some random text in the page.
diff --git a/tap_github/streams.py b/tap_github/streams.py
index c8316e0a..46b40613 100644
--- a/tap_github/streams.py
+++ b/tap_github/streams.py
@@ -63,7 +63,7 @@ class Streams(Enum):
     valid_queries: set[str]
     streams: list[type[Stream]]

-    def __init__(self, valid_queries: set[str], streams: list[type[Stream]]):
+    def __init__(self, valid_queries: set[str], streams: list[type[Stream]]) -> None:
         self.valid_queries = valid_queries
         self.streams = streams

@@ -124,5 +124,5 @@ def __init__(self, valid_queries: set[str], streams: list[type[Stream]]):
     )

     @classmethod
-    def all_valid_queries(cls):
+    def all_valid_queries(cls) -> set[str]:
         return set.union(*[stream.valid_queries for stream in Streams])
diff --git a/tap_github/tap.py b/tap_github/tap.py
index c32af396..97f7c601 100644
--- a/tap_github/tap.py
+++ b/tap_github/tap.py
@@ -13,20 +13,20 @@


 class TapGitHub(Tap):
-    """GitHub tap class."""
+    """Singer tap for the GitHub API."""

     name = "tap-github"
     package_name = "meltanolabs-tap-github"

     @classproperty
-    def logger(cls) -> logging.Logger:
+    def logger(cls: type[TapGitHub]) -> logging.Logger:  # noqa: N805
         """Get logger.

         Returns:
             Logger with local LOGLEVEL. LOGLEVEL from env takes priority.
         """
-        LOGLEVEL = os.environ.get("LOGLEVEL", "INFO").upper()
+        LOGLEVEL = os.environ.get("LOGLEVEL", "INFO").upper()  # noqa: N806
         assert (
             LOGLEVEL in logging._levelToName.values()
         ), f"Invalid LOGLEVEL configuration: {LOGLEVEL}"
diff --git a/tap_github/tests/fixtures.py b/tap_github/tests/fixtures.py
index 8cd75874..bcc88784 100644
--- a/tap_github/tests/fixtures.py
+++ b/tap_github/tests/fixtures.py
@@ -105,7 +105,7 @@ def alternative_sync_chidren(self, child_context: dict, no_sync: bool = True) ->
     for child_stream in self.child_streams:
         # Use org:write access level credentials for collaborators stream
         if child_stream.name in ["collaborators"]:
-            ORG_LEVEL_TOKEN = os.environ.get("ORG_LEVEL_TOKEN")
+            ORG_LEVEL_TOKEN = os.environ.get("ORG_LEVEL_TOKEN")  # noqa: N806
             # TODO - Fix collaborators tests, likely by mocking API responses directly.
             # Currently we have to bypass them as they are failing frequently.
             if not ORG_LEVEL_TOKEN or no_sync:
@@ -113,7 +113,7 @@ def alternative_sync_chidren(self, child_context: dict, no_sync: bool = True) ->
                     'No "ORG_LEVEL_TOKEN" found. Skipping collaborators stream sync.'
                 )
                 continue
-            SAVED_GTHUB_TOKEN = os.environ.get("GITHUB_TOKEN")
+            SAVED_GTHUB_TOKEN = os.environ.get("GITHUB_TOKEN")  # noqa: N806
             os.environ["GITHUB_TOKEN"] = ORG_LEVEL_TOKEN
             child_stream.sync(context=child_context)
             os.environ["GITHUB_TOKEN"] = SAVED_GTHUB_TOKEN or ""
diff --git a/tap_github/tests/test_tap.py b/tap_github/tests/test_tap.py
index cbd26c23..fb7fa303 100644
--- a/tap_github/tests/test_tap.py
+++ b/tap_github/tests/test_tap.py
@@ -155,10 +155,8 @@ def test_last_state_message_is_valid(capsys, repo_list_config):  # noqa: F811
         ]
     )
     latest_updated_at = max(
-        map(
-            lambda record: isoparse(json.loads(record)["record"]["updated_at"]),
-            issue_comments_records,
-        )
+        isoparse(json.loads(record)["record"]["updated_at"])
+        for record in issue_comments_records
     )

     assert last_state_updated_at == latest_updated_at
diff --git a/tap_github/user_streams.py b/tap_github/user_streams.py
index 14df91b3..4433663e 100644
--- a/tap_github/user_streams.py
+++ b/tap_github/user_streams.py
@@ -14,6 +14,8 @@
 if TYPE_CHECKING:
     from collections.abc import Iterable

+    from singer_sdk.tap_base import Tap
+

 class UserStream(GitHubRestStream):
     """Defines 'User' stream."""
@@ -35,7 +37,7 @@ def partitions(self) -> list[dict] | None:
         if "user_usernames" in self.config:
             input_user_list = self.config["user_usernames"]

-            augmented_user_list = list()
+            augmented_user_list = []
             # chunk requests to the graphql endpoint to avoid timeouts and other
             # obscure errors that the api doesn't say much about. The actual limit
             # seems closer to 1000, use half that to stay safe.
@@ -50,7 +52,7 @@ def partitions(self) -> list[dict] | None:
             return augmented_user_list

         elif "user_ids" in self.config:
-            return [{"id": id} for id in self.config["user_ids"]]
+            return [{"id": user_id} for user_id in self.config["user_ids"]]
         return None

     def get_child_context(self, record: dict, context: dict | None) -> dict:
@@ -76,13 +78,13 @@ class TempStream(GitHubGraphqlStream):
         th.Property("databaseId", th.IntegerType),
     ).to_dict()

-    def __init__(self, tap, user_list) -> None:
+    def __init__(self, tap: Tap, user_list: list[str]) -> None:
         super().__init__(tap)
         self.user_list = user_list

     @property
     def query(self) -> str:
-        chunks = list()
+        chunks = []
         for i, user in enumerate(self.user_list):
             # we use the `repositoryOwner` query which is the only one that
             # works on both users and orgs with graphql. REST is less picky
@@ -96,10 +98,10 @@ def query(self) -> str:
         if len(user_list) < 1:
             return []

-        users_with_ids: list = list()
+        users_with_ids: list = []
         temp_stream = TempStream(self._tap, list(user_list))

-        databaseIdPattern: re.Pattern = re.compile(
+        database_id_pattern: re.Pattern = re.compile(
             r"https://avatars.githubusercontent.com/u/(\d+)?.*"
         )
         # replace manually provided org/repo values by the ones obtained
@@ -124,10 +126,10 @@ def query(self) -> str:
                 continue
             # the databaseId (in graphql language) is not available on
             # repositoryOwner, so we parse the avatarUrl to get it :/
-            m = databaseIdPattern.match(record[item]["avatarUrl"])
+            m = database_id_pattern.match(record[item]["avatarUrl"])
             if m is not None:
-                dbId = m.group(1)
-                users_with_ids.append({"username": username, "user_id": dbId})
+                db_id = m.group(1)
+                users_with_ids.append({"username": username, "user_id": db_id})
             else:
                 # If we get here, github's API is not returning what
                 # we expected, so it's most likely a breaking change on
diff --git a/tap_github/utils/filter_stdout.py b/tap_github/utils/filter_stdout.py
index e4fd6a66..e677ee85 100644
--- a/tap_github/utils/filter_stdout.py
+++ b/tap_github/utils/filter_stdout.py
@@ -11,17 +11,17 @@
 class FilterStdOutput:
     """Filter out stdout/sterr given a regex pattern."""

-    def __init__(self, stream: TextIO, re_pattern: str | Pattern):
+    def __init__(self, stream: TextIO, re_pattern: str | Pattern) -> None:
         self.stream = stream
         self.pattern = (
             re.compile(re_pattern) if isinstance(re_pattern, str) else re_pattern
         )
         self.triggered = False

-    def __getattr__(self, attr_name: str):
+    def __getattr__(self, attr_name: str) -> object:
         return getattr(self.stream, attr_name)

-    def write(self, data):
+    def write(self, data: str) -> None:
         if data == "\n" and self.triggered:
             self.triggered = False
         else:
@@ -32,12 +32,12 @@ def write(self, data):
             # caught bad pattern
             self.triggered = True

-    def flush(self):
+    def flush(self) -> None:
         self.stream.flush()


 @contextlib.contextmanager
-def nostdout():
+def nostdout() -> None:
     save_stdout = sys.stdout
     sys.stdout = io.StringIO()
     yield