Skip to content

Commit

Permalink
Add configuration for MATATIKA_WORKSPACES_HOME and MATATIKA_PLUGINS_HOME
Browse files Browse the repository at this point in the history
  • Loading branch information
aphethean1 committed Dec 20, 2022
1 parent 54b1c21 commit 30c17c8
Show file tree
Hide file tree
Showing 6 changed files with 356 additions and 6 deletions.
21 changes: 15 additions & 6 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,10 @@ services:
- PERSISTENCE_CATALOG_PASS=warehouse
- PERSISTENCE_CATALOG_DRIVER_CLASS_NAME=org.postgresql.Driver
- MATATIKA_DATAFLOW_NETWORK=matatika
- MATATIKA_DATAFLOW_VOLUME_MOUNTS=project:/project
- MATATIKA_DATAFLOW_VOLUME_MOUNTS=workspaces:/workspaces
- ELASTICSEARCH_HOST=search
- MATATIKA_WORKSPACE_DIRS=/project
- MATATIKA_WORKSPACES_HOME=/workspaces
- MATATIKA_PLUGINS_HOME=file:/plugins
# Generate a 256-bit (32-char) secret key and set it here. This is used to generate access tokens for the Matatika App and API.
- MATATIKA_AUTH_LOCAL_SECRET=matatikaauthlocalsecret_changeme
depends_on: {"search-setup": {"condition": "service_completed_successfully"}, "search": {"condition": "service_healthy"}, "db": {"condition": "service_healthy"}}
Expand All @@ -41,7 +42,8 @@ services:
/docker-entrypoint.sh server"
volumes:
- /var/run/docker.sock:/var/run/docker.sock
- project:/project
- workspaces:/workspaces
- plugins:/plugins

app:
user: root
Expand Down Expand Up @@ -178,12 +180,19 @@ volumes:
es-data:
driver: local
es-certs:
project:
name: project
workspaces:
name: workspaces
driver: local
driver_opts:
type: none
device: ${PWD}
device: ${PWD}/workspaces
o: bind
plugins:
name: plugins
driver: local
driver_opts:
type: none
device: ${PWD}/plugins
o: bind

networks:
Expand Down
132 changes: 132 additions & 0 deletions plugins/extractors/tap-github--meltanolabs.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
# Matatika/Meltano plugin definition for the MeltanoLabs variant of the
# tap-github extractor.
# NOTE(review): indentation reconstructed to valid YAML (the capture stripped
# leading whitespace) — verify against the file in the repository.
name: tap-github
namespace: tap_github
label: GitHub
logo_url: /assets/images/datasource/tap-github.png
description: |-
  GitHub is an online software development platform. It's used for storing, tracking, and collaborating on software projects.
  ## Prerequisites
  You will need a Github `auth_token` scoped with everything you need access to. For help with this you can follow the Github [Creating a Personal Access Token Docs](https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/creating-a-personal-access-token)
  You will also need to provide one of the following settings:
  - `Repositories`
  - `Organizations`
  - `Searches`
  - `User Usernames`
  - `User IDs`
  ## Auth Token
  Github token to authenticate with.
  ## Additional Auth Tokens
  List of GitHub tokens to authenticate with. Streams will loop through them when hitting rate limits.
  ## Rate Limit Buffer
  Add a buffer to avoid consuming all query points for the token at hand.
  ## Searches
  An array of search descriptor objects with the following properties. "name" - a human readable name for the search query. "query" - a github search string (generally the same as would come after ?q= in the URL).
  ## Organizations
  An array of strings containing the github organizations to be included.
  Example: `["Matatika", "MatatikaBytes"]`
  ## Repositories
  An array of strings containing the github repos to be included.
  Example: `["Matatika/tap-spotify", "Matatika/tap-auth0"]`
  ## User Usernames
  An array of strings containing GitHub usernames.
  Example: `["user", "usertwo"]`
  ## User IDs
  A list of GitHub user ids.
  Example: `["12345", "54321"]`
  ## Start Date
  When to start syncing data from.
  ## Stream Maps
  Optional stream maps.
  ## Stream Map Config
  Configuration for your stream maps.
  ## Skip Parent Streams
  Set to true to skip API calls for the parent streams (such as repositories) if it is not selected but children are.
  ## Learn more
  [Matatika Github Data Source Documentation](${docsBaseUrl}/data-sources/tap-github)
  [Meltano Hub GitHub Documentation](https://hub.meltano.com/extractors/tap-github/)
variant: meltanolabs
docs: ${docsBaseUrl}/data-sources/tap-github/
repo: https://github.com/MeltanoLabs/tap-github
pip_url: git+https://github.com/MeltanoLabs/tap-github.git
capabilities:
  - catalog
  - state
  - discover
  - about
settings:
  - label: User Agent
    name: user_agent
  - description: The log level of the API response metrics.
    label: Metrics Log Level
    name: metrics_log_level
  # Token is the only required setting; everything else scopes what is synced.
  - kind: password
    label: Auth Token
    name: auth_token
    required: true
  - kind: array
    label: Additional Auth Tokens
    name: additional_auth_tokens
  - kind: integer
    label: Rate Limit Buffer
    name: rate_limit_buffer
  - kind: array
    label: Searches
    name: searches
  - kind: array
    label: Organizations
    name: organizations
  - kind: array
    label: Repositories
    name: repositories
  - kind: array
    label: User Usernames
    name: user_usernames
  - kind: array
    label: User IDs
    name: user_ids
  - kind: date_iso8601
    label: Start Date
    name: start_date
  - kind: object
    label: Stream Maps
    name: stream_maps
  - kind: object
    label: Stream Map Config
    name: stream_map_config
  - kind: boolean
    label: Skip Parent Streams
    name: skip_parent_streams
15 changes: 15 additions & 0 deletions plugins/files/analyze-github--matatika.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Matatika file-bundle plugin: packaged analysis/dashboard files for GitHub
# data; depends on the MeltanoLabs tap-github extractor.
# NOTE(review): indentation reconstructed to valid YAML (the capture stripped
# leading whitespace) — verify against the file in the repository.
name: analyze-github
namespace: tap_github
label: Github Insights
logo_url: /assets/images/datasource/tap-github.png
description: |-
  Instant insights on commits, issues and PRs from Github.
update:
  # Glob is quoted deliberately — an unquoted leading `*` is a YAML alias sigil.
  "*.yml": true
variant: matatika
repo: https://github.com/Matatika/analyze-github
pip_url: git+https://github.com/Matatika/analyze-github
requires:
  extractors:
    - name: tap-github
      variant: meltanolabs
143 changes: 143 additions & 0 deletions plugins/loaders/target-postgres--transferwise.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@
# Matatika/Meltano plugin definition for the transferwise variant of the
# target-postgres loader (Matatika fork, pinned to v2.1.0).
# NOTE(review): indentation reconstructed to valid YAML (the capture stripped
# leading whitespace) — verify against the file in the repository.
name: target-postgres
label: Postgres Warehouse
description: |-
  Loader for Postgres Data Warehouse
  The World's Most Advanced Open Source Relational Database
  ## Settings
  ### User
  Postgres user
  ### Password
  Postgres password
  ### Host
  Postgres host
  Example: `test-host.db.elephantsql.com`
  ### Port
  Postgres port
  Example: `5432`
  ### Database Name
  Postgres database name
  ### Default Target Schema
  Name of the schema where tables will be created (no database prefix)
namespace: postgres_transferwise
dialect: postgres
target_schema: $TARGET_POSTGRES_DEFAULT_TARGET_SCHEMA
logo_url: /assets/logos/loaders/postgres.png
variant: transferwise
# NOTE(review): `repo` uses a pip-style `git+...@tag` URL; convention elsewhere
# in this changeset is a plain repository URL for `repo` (the `git+` form
# belongs in `pip_url`) — confirm this is intentional.
repo: git+https://github.com/Matatika/[email protected]
pip_url: git+https://github.com/Matatika/[email protected]
settings:
  # Connection settings — all required.
  - name: user
    aliases:
      - username
    description: PostgreSQL user
    label: User
    required: true
  - name: password
    kind: password
    description: PostgreSQL password
    label: Password
    required: true
  - name: host
    aliases:
      - address
    description: PostgreSQL host
    label: Host
    required: true
  - name: port
    kind: integer
    value: 5432
    description: PostgreSQL port
    label: Port
    required: true
  - name: dbname
    aliases:
      - database
    description: PostgreSQL database name
    label: Database Name
    required: true
  - name: default_target_schema
    value: analytics
    description: Name of the schema where the tables will be created.
    label: Default Target Schema
    required: true
  # Hidden/advanced settings — fixed defaults not exposed in the UI.
  - name: ssl
    kind: hidden
    value: false
    value_post_processor: stringify
    label: SSL
  - name: batch_size_rows
    kind: integer
    value: 100000
    description: Maximum number of rows in each batch. At the end of each batch, the rows in the batch are loaded into Postgres.
    label: Batch Size Rows
  - name: underscore_camel_case_fields
    kind: hidden
    value: true
    description: Enable underscoring camel case field names, make the resulting postgres column names the same as from the Meltano variant of target-postgres.
    label: Underscore Camel Case Fields
  - name: flush_all_streams
    kind: hidden
    value: false
    description: "Flush and load every stream into Postgres when one batch is full. Warning: This may trigger the COPY command to use files with low number of records."
    label: Flush All Streams
  - name: parallelism
    kind: hidden
    value: 0
    description: The number of threads used to flush tables. 0 will create a thread for each stream, up to parallelism_max. -1 will create a thread for each CPU core. Any other positive number will create that number of threads, up to parallelism_max.
    label: Parallelism
  - name: parallelism_max
    kind: hidden
    value: 16
    description: Max number of parallel threads to use when flushing tables.
    label: Max Parallelism
  - name: default_target_schema_select_permission
    kind: hidden
    description: Grant USAGE privilege on newly created schemas and grant SELECT privilege on newly created tables to a specific role or a list of roles. If `schema_mapping` is not defined then every stream sent by the tap is granted accordingly.
    label: Default Target Schema Select Permission
  - name: schema_mapping
    kind: hidden
    description: |
      Useful if you want to load multiple streams from one tap to multiple Postgres schemas.
      If the tap sends the `stream_id` in `<schema_name>-<table_name>` format then this option overwrites the `default_target_schema` value.
      Note, that using `schema_mapping` you can overwrite the `default_target_schema_select_permission` value to grant SELECT permissions to different groups per schemas or optionally you can create indices automatically for the replicated tables.
    label: Schema Mapping
  - name: add_metadata_columns
    kind: hidden
    value: true
    description: Metadata columns add extra row level information about data ingestions, (i.e. when was the row read in source, when was inserted or deleted in postgres etc.) Metadata columns are creating automatically by adding extra columns to the tables with a column prefix `_SDC_`. The column names are following the stitch naming conventions documented at https://www.stitchdata.com/docs/data-structure/integration-schemas#sdc-columns. Enabling metadata columns will flag the deleted rows by setting the `_SDC_DELETED_AT` metadata column. Without the `add_metadata_columns` option the deleted rows from singer taps will not be recongisable in Postgres.
    label: Add Metadata Columns
  - name: hard_delete
    kind: hidden
    value: false
    description: When `hard_delete` option is true then DELETE SQL commands will be performed in Postgres to delete rows in tables. It's achieved by continuously checking the `_SDC_DELETED_AT` metadata column sent by the singer tap. Due to deleting rows requires metadata columns, `hard_delete` option automatically enables the `add_metadata_columns` option as well.
    label: Hard Delete
  - name: data_flattening_max_level
    kind: hidden
    value: 10
    description: Object type RECORD items from taps can be transformed to flattened columns by creating columns automatically. When value is 0 (default) then flattening functionality is turned off.
    label: Data Flattening Max Level
  - name: primary_key_required
    kind: boolean
    value: false
    description: Log based and Incremental replications on tables with no Primary Key cause duplicates when merging UPDATE events. When set to true, stop loading data if no Primary Key is defined.
    label: Primary Key Required
  - name: validate_records
    kind: boolean
    value: false
    description: Validate every single record message to the corresponding JSON schema. This option is disabled by default and invalid RECORD messages will fail only at load time by Postgres. Enabling this option will detect invalid records earlier but could cause performance degradation.
    label: Validate Records
  - name: temp_dir
    kind: hidden
    # NOTE(review): both `kind: hidden` and `hidden: true` are set — likely
    # redundant; kept as-is to preserve the committed behavior.
    hidden: true
    description: "(Default: platform-dependent) Directory of temporary CSV files with RECORD messages."
    label: Temporary Directory
51 changes: 51 additions & 0 deletions plugins/transformers/dbt--dbt-labs.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
# Matatika/Meltano plugin definition for the dbt-labs transformer (dbt-core 1.3
# with postgres and snowflake adapters), bundled with the Matatika files-dbt
# file bundle.
# NOTE(review): indentation reconstructed to valid YAML (the capture stripped
# leading whitespace) — verify against the file in the repository.
name: dbt
label: dbt
namespace: dbt
variant: dbt-labs
logo_url: /assets/images/transformer/dbt.png
docs: https://meltano.com/docs/transforms.html
repo: https://github.com/dbt-labs/dbt-core
requires:
  files:
    - name: files-dbt
      variant: matatika
# Folded scalar: pip installs all three packages from one requirement string.
pip_url: >
  dbt-core~=1.3.0
  dbt-postgres~=1.3.0
  dbt-snowflake~=1.3.0
settings:
  - name: project_dir
    value: $MELTANO_PROJECT_ROOT/transform
  - name: profiles_dir
    env: DBT_PROFILES_DIR
    value: $MELTANO_PROJECT_ROOT/transform/profile
  # Target/schema values are wired from the active loader via Meltano's
  # pipeline environment variables.
  - name: target
    value: $MELTANO_LOAD__DIALECT
  - name: source_schema
    value: $MELTANO_LOAD__TARGET_SCHEMA
  - name: target_schema
    value: analytics
  - name: models
    value: $MELTANO_TRANSFORM__PACKAGE_NAME $MELTANO_EXTRACTOR_NAMESPACE my_meltano_project
commands:
  clean:
    args: clean
    description: Delete all folders in the clean-targets list (usually the dbt_modules and target directories.)
  compile:
    args: compile
    description: Generates executable SQL from source model, test, and analysis files. Compiled SQL files are written to the target/ directory.
  deps:
    args: deps
    description: Pull the most recent version of the dependencies listed in packages.yml
  run:
    args: run
    description: Compile SQL and execute against the current target database.
  seed:
    args: seed
    description: Load data from csv files into your data warehouse.
  snapshot:
    args: snapshot
    description: Execute snapshots defined in your project.
  test:
    args: test
    description: Runs tests on data in deployed models.
Empty file added workspaces/.gitignore
Empty file.

0 comments on commit 30c17c8

Please sign in to comment.