Skip to content

Commit

Permalink
Fix issues surfaced in meltano demo project (#77)
Browse files Browse the repository at this point in the history
* use numeric project id, add parent relationships
* handle base url with or without version path
* add request_cache logging
* gitignore api_caches
* log full url paths
* other misc. bug fixes
  • Loading branch information
aaronsteers authored May 17, 2022
1 parent 2e0e01f commit 7d285b8
Show file tree
Hide file tree
Showing 10 changed files with 239 additions and 107 deletions.
6 changes: 0 additions & 6 deletions .github/workflows/ci_workflow.yml
Original file line number Diff line number Diff line change
Expand Up @@ -38,12 +38,6 @@ jobs:
TAP_GITLAB_PRIVATE_TOKEN: ${{secrets.GITLAB_PRIVATE_TOKEN}}
TAP_GITLAB_GROUPS: meltano/infra
TAP_GITLAB_PROJECTS: meltano/demo-project
TAP_GITLAB_ULTIMATE_LICENSE: false
TAP_GITLAB_FETCH_MERGE_REQUEST_COMMITS: false
TAP_GITLAB_FETCH_PIPELINES_EXTENDED: false
TAP_GITLAB_FETCH_GROUP_VARIABLES: false
TAP_GITLAB_FETCH_SITE_USERS: false
TAP_GITLAB_FETCH_PROJECT_VARIABLES: false
strategy:
matrix:
python-version: [3.7, 3.8, 3.9]
Expand Down
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
# Secrets and internal config files
**/.secrets/*

# API cache
api_caches

# Ignore meltano internal cache and sqlite systemdb
.meltano/

Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ Built with the [Meltano SDK](https://sdk.meltano.com) for Singer Taps and Target

| Setting | Required | Default | Description |
|:---------------------------|:--------:|:-------:|:------------|
| api_url | False | None | Optionally overrides the default base URL for the Gitlab API. |
| api_url | False | None | Optionally overrides the default base URL for the Gitlab API. If no path is provided, `/api/v4` will be appended to the base URL. E.g. 'https://gitlab.com' becomes 'https://gitlab.com/api/v4'. |
| private_token | True | None | An access token to use when calling to the Gitlab API. |
| groups | False | None | A space delimited list of group ids, e.g. 'orgname1 orgname2 orgname3' |
| projects | False | None | A space delimited list of project ids, e.g. 'orgname/projectname1 orgname/projectname2' |
Expand Down
20 changes: 14 additions & 6 deletions meltano.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
version: 1
send_anonymous_usage_stats: true
project_id: tap-gitlab
project_id: tap-gitlab--meltanolabs
plugins:
extractors:
- name: tap-gitlab
Expand All @@ -12,11 +12,7 @@ plugins:
- discover
- about
- stream-maps
- schema-flattening
config:
start_date: '2010-01-01T00:00:00Z'
settings:
# TODO: To configure using Meltano, declare settings and their types here:
- name: api_url
kind: string
- name: private_token
Expand Down Expand Up @@ -45,8 +41,20 @@ plugins:
kind: boolean
- name: flattening_max_depth
kind: integer

- name: requests_cache_path
kind: string
config:
projects: meltano/demo-project meltano/meltano
start_date: '2022-03-01T00:00:00Z'
requests_cache_path: ./api_caches
select:
- '*.*'
- '!jobs' # Very slow
- '!pipelines_extended' # Very slow
loaders:
- name: target-jsonl
variant: andyh1203
pip_url: target-jsonl
- name: target-sqlite
variant: meltanolabs
pip_url: git+https://github.com/MeltanoLabs/target-sqlite.git
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "tap-gitlab"
version = "0.0.1"
version = "2.0.0-alpha3"
description = "`tap-gitlab` is a Singer tap for GitLab, built with the Meltano SDK for Singer Taps."
authors = ["Meltano and Meltano Community"]
keywords = [
Expand Down
17 changes: 13 additions & 4 deletions tap_gitlab/caching.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,27 @@
"""Requests caching setup for tap-gitlab."""

import logging
import os

import requests_cache

from tap_gitlab.client import API_TOKEN_KEY


def setup_requests_cache(tap_config: dict) -> None:
def setup_requests_cache(tap_config: dict, logger: logging.Logger) -> None:
"""Install the caching mechanism for requests."""
cache_path_root = tap_config.get("requests_cache_path", None)
if not cache_path_root:
return None
return

num_files = 0
if os.path.exists(cache_path_root):
num_files = len(os.listdir(cache_path_root))

# recording = tap_config.get("requests_recording_enabled", False)
# TODO: leverage `recording` to enable/disable the below
logger.info(
f"Request caching is enabled at '{cache_path_root}'. "
f"Found {num_files:,} cache resources during setup."
)

requests_cache.install_cache(
cache_path_root,
Expand Down
90 changes: 64 additions & 26 deletions tap_gitlab/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,12 @@

from __future__ import annotations

import copy
import urllib
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List, Optional, cast
from typing import Any, Dict, List, Optional, Union, cast
from urllib.parse import urlparse

import requests
from singer_sdk.authenticators import APIKeyAuthenticator
Expand All @@ -22,13 +26,30 @@ class GitLabStream(RESTStream):

records_jsonpath = "$[*]"
next_page_token_jsonpath = "$.X-Next-Page"
extra_url_params: dict = {}
extra_url_params: Optional[dict] = None
bookmark_param_name = "since"
_LOG_REQUEST_METRIC_URLS = True # Okay to print in logs
# sensitive_request_path = False # TODO: Update SDK to accept this instead.

@property
def url_base(self) -> str:
"""Return the API URL root, configurable via tap settings."""
return self.config.get("api_url", DEFAULT_API_URL)
"""Return the API URL root, configurable via tap settings.
If no path is provided, the base URL will be appended with `/api/v4`.
E.g. 'https://gitlab.com' would become 'https://gitlab.com/api/v4'
Note: trailing slashes ('/') are scrubbed prior to comparison, so that
'https://gitlab.com' is equivalent to 'https://gitlab.com/' and
'https://gitlab.com/api/v4' is equivalent to 'https://gitlab.com/api/v4/'.
"""
# Remove trailing '/' from url base.
result = self.config.get("api_url", DEFAULT_API_URL).rstrip("/")

# If path part is not provided, append the v4 endpoint as default:
# For example 'https://gitlab.com' => 'https://gitlab.com/api/v4'
if not urlparse(result).path:
result += "/api/v4"
return result

@property
def schema_filename(self) -> str:
Expand Down Expand Up @@ -63,7 +84,8 @@ def get_url_params(
) -> Dict[str, Any]:
"""Return a dictionary of values to be used in URL parameterization."""
# If the class has extra default params, start with those:
params: dict = self.extra_url_params
# TODO: SDK Bug: without copy(), this will leak params across classes/objects.
params: dict = copy.copy(self.extra_url_params or {})

if next_page_token:
params["page"] = next_page_token
Expand All @@ -81,32 +103,48 @@ def get_next_page_token(
"""Return token for identifying next page or None if not applicable."""
return response.headers.get("X-Next-Page", None)

@staticmethod
def _url_encode(val: Union[str, datetime, bool, int, List[str]]) -> str:
"""Encode the val argument as url-compatible string."""
return urllib.parse.quote_plus(str(val))

class ProjectBasedStream(GitLabStream):
"""Base class for streams that are keyed based on project ID."""
def get_url(self, context: Optional[dict]) -> str:
"""Get stream entity URL."""
url = "".join([self.url_base, self.path or ""])
vals = copy.copy(dict(self.config))
vals.update(context or {})
for key, val in vals.items():
search_text = "".join(["{", key, "}"])
if search_text in url:
url = url.replace(search_text, self._url_encode(val))
if "{project_path}" in search_text:
self.logger.info(
f"DEBUG: Found project arg. URL is {url} after parsing "
f"input val '{val}' to '{self._url_encode(val)}'."
)

state_partitioning_keys = ["project_path"]
return url

@property
def partitions(self) -> List[dict]:
"""Return a list of partition key dicts (if applicable), otherwise None."""
if "{project_path}" in self.path:
if "projects" not in self.config:
raise ValueError(
f"Missing `projects` setting which is required for the "
f"'{self.name}' stream."
)
def post_process(self, row: dict, context: Optional[dict] = None) -> Optional[dict]:
"""Post process records."""
result = super().post_process(row, context)
del row
if result is None:
return None

return [
{"project_path": id}
for id in cast(list, self.config["projects"].split(" "))
]
assert context is not None # Tell linter that context is non-null

raise ValueError(
"Could not detect partition type for Gitlab stream "
f"'{self.name}' ({self.path}). "
"Expected a URL path containing '{project_path}' or '{group_path}'. "
)
for key, val in context.items():
if key in self.schema.get("properties", {}) and key not in result:
result[key] = val

return result


class ProjectBasedStream(GitLabStream):
"""Base class for streams that are keyed based on project ID."""

state_partitioning_keys = ["project_path"]


class GroupBasedStream(GitLabStream):
Expand Down
Loading

0 comments on commit 7d285b8

Please sign in to comment.