Skip to content

Commit

Permalink
Add configuration for MATATIKA_WORKSPACES_HOME and MATATIKA_PLUGINS_HOME
Browse files Browse the repository at this point in the history
  • Loading branch information
aphethean1 committed Dec 20, 2022
1 parent 54b1c21 commit 30c17c8
Show file tree
Hide file tree
Showing 6 changed files with 356 additions and 6 deletions.
21 changes: 15 additions & 6 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,10 @@ services:
- PERSISTENCE_CATALOG_PASS=warehouse
- PERSISTENCE_CATALOG_DRIVER_CLASS_NAME=org.postgresql.Driver
- MATATIKA_DATAFLOW_NETWORK=matatika
- MATATIKA_DATAFLOW_VOLUME_MOUNTS=project:/project
- MATATIKA_DATAFLOW_VOLUME_MOUNTS=workspaces:/workspaces
- ELASTICSEARCH_HOST=search
- MATATIKA_WORKSPACE_DIRS=/project
- MATATIKA_WORKSPACES_HOME=/workspaces
- MATATIKA_PLUGINS_HOME=file:/plugins
# Generate a 256-bit (32-char) secret key and set it here. This is used to generate access tokens for the Matatika App and API.
- MATATIKA_AUTH_LOCAL_SECRET=matatikaauthlocalsecret_changeme
depends_on: {"search-setup": {"condition": "service_completed_successfully"}, "search": {"condition": "service_healthy"}, "db": {"condition": "service_healthy"}}
Expand All @@ -41,7 +42,8 @@ services:
/docker-entrypoint.sh server"
volumes:
- /var/run/docker.sock:/var/run/docker.sock
- project:/project
- workspaces:/workspaces
- plugins:/plugins

app:
user: root
Expand Down Expand Up @@ -178,12 +180,19 @@ volumes:
es-data:
driver: local
es-certs:
project:
name: project
workspaces:
name: workspaces
driver: local
driver_opts:
type: none
device: ${PWD}
device: ${PWD}/workspaces
o: bind
plugins:
name: plugins
driver: local
driver_opts:
type: none
device: ${PWD}/plugins
o: bind

networks:
Expand Down
132 changes: 132 additions & 0 deletions plugins/extractors/tap-github--meltanolabs.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
# Matatika/Meltano plugin definition for the MeltanoLabs variant of the
# tap-github extractor.
# NOTE(review): indentation reconstructed to valid YAML (the capture stripped
# leading whitespace) — verify against the file in the repository.
name: tap-github
namespace: tap_github
label: GitHub
logo_url: /assets/images/datasource/tap-github.png
description: |-
  GitHub is an online software development platform. It's used for storing, tracking, and collaborating on software projects.
  ## Prerequisites
  You will need a Github `auth_token` scoped with everything you need access to. For help with this you can follow the Github [Creating a Personal Access Token Docs](https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/creating-a-personal-access-token)
  You will also need to provide one of the following settings:
  - `Repositories`
  - `Organizations`
  - `Searches`
  - `User Usernames`
  - `User IDs`
  ## Auth Token
  Github token to authenticate with.
  ## Additional Auth Tokens
  List of GitHub tokens to authenticate with. Streams will loop through them when hitting rate limits.
  ## Rate Limit Buffer
  Add a buffer to avoid consuming all query points for the token at hand.
  ## Searches
  An array of search descriptor objects with the following properties. "name" - a human readable name for the search query. "query" - a github search string (generally the same as would come after ?q= in the URL).
  ## Organizations
  An array of strings containing the github organizations to be included.
  Example: `["Matatika", "MatatikaBytes"]`
  ## Repositories
  An array of strings containing the github repos to be included.
  Example: `["Matatika/tap-spotify", "Matatika/tap-auth0"]`
  ## User Usernames
  An array of strings containing GitHub usernames.
  Example: `["user", "usertwo"]`
  ## User IDs
  A list of GitHub user ids.
  Example: `["12345", "54321"]`
  ## Start Date
  When to start syncing data from.
  ## Stream Maps
  Optional stream maps.
  ## Stream Map Config
  Configuration for your stream maps.
  ## Skip Parent Streams
  Set to true to skip API calls for the parent streams (such as repositories) if it is not selected but children are.
  ## Learn more
  [Matatika Github Data Source Documentation](${docsBaseUrl}/data-sources/tap-github)
  [Meltano Hub GitHub Documentation](https://hub.meltano.com/extractors/tap-github/)
variant: meltanolabs
docs: ${docsBaseUrl}/data-sources/tap-github/
repo: https://github.com/MeltanoLabs/tap-github
pip_url: git+https://github.com/MeltanoLabs/tap-github.git
capabilities:
  - catalog
  - state
  - discover
  - about
settings:
  - label: User Agent
    name: user_agent
  - description: The log level of the API response metrics.
    label: Metrics Log Level
    name: metrics_log_level
  # Token is the only required setting; everything else scopes what is synced.
  - kind: password
    label: Auth Token
    name: auth_token
    required: true
  - kind: array
    label: Additional Auth Tokens
    name: additional_auth_tokens
  - kind: integer
    label: Rate Limit Buffer
    name: rate_limit_buffer
  - kind: array
    label: Searches
    name: searches
  - kind: array
    label: Organizations
    name: organizations
  - kind: array
    label: Repositories
    name: repositories
  - kind: array
    label: User Usernames
    name: user_usernames
  - kind: array
    label: User IDs
    name: user_ids
  - kind: date_iso8601
    label: Start Date
    name: start_date
  - kind: object
    label: Stream Maps
    name: stream_maps
  - kind: object
    label: Stream Map Config
    name: stream_map_config
  - kind: boolean
    label: Skip Parent Streams
    name: skip_parent_streams
15 changes: 15 additions & 0 deletions plugins/files/analyze-github--matatika.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Matatika file-bundle plugin: packaged analysis/dashboard files for GitHub
# data; depends on the MeltanoLabs tap-github extractor.
# NOTE(review): indentation reconstructed to valid YAML (the capture stripped
# leading whitespace) — verify against the file in the repository.
name: analyze-github
namespace: tap_github
label: Github Insights
logo_url: /assets/images/datasource/tap-github.png
description: |-
  Instant insights on commits, issues and PRs from Github.
update:
  # Glob is quoted deliberately — an unquoted leading `*` is a YAML alias sigil.
  "*.yml": true
variant: matatika
repo: https://github.com/Matatika/analyze-github
pip_url: git+https://github.com/Matatika/analyze-github
requires:
  extractors:
    - name: tap-github
      variant: meltanolabs
143 changes: 143 additions & 0 deletions plugins/loaders/target-postgres--transferwise.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@
# Matatika/Meltano plugin definition for the transferwise variant of the
# target-postgres loader (Matatika fork, pinned to v2.1.0).
# NOTE(review): indentation reconstructed to valid YAML (the capture stripped
# leading whitespace) — verify against the file in the repository.
name: target-postgres
label: Postgres Warehouse
description: |-
  Loader for Postgres Data Warehouse
  The World's Most Advanced Open Source Relational Database
  ## Settings
  ### User
  Postgres user
  ### Password
  Postgres password
  ### Host
  Postgres host
  Example: `test-host.db.elephantsql.com`
  ### Port
  Postgres port
  Example: `5432`
  ### Database Name
  Postgres database name
  ### Default Target Schema
  Name of the schema where tables will be created (no database prefix)
namespace: postgres_transferwise
dialect: postgres
target_schema: $TARGET_POSTGRES_DEFAULT_TARGET_SCHEMA
logo_url: /assets/logos/loaders/postgres.png
variant: transferwise
# NOTE(review): `repo` uses a pip-style `git+...@tag` URL; convention elsewhere
# in this changeset is a plain repository URL for `repo` (the `git+` form
# belongs in `pip_url`) — confirm this is intentional.
repo: git+https://github.com/Matatika/[email protected]
pip_url: git+https://github.com/Matatika/[email protected]
settings:
  # Connection settings — all required.
  - name: user
    aliases:
      - username
    description: PostgreSQL user
    label: User
    required: true
  - name: password
    kind: password
    description: PostgreSQL password
    label: Password
    required: true
  - name: host
    aliases:
      - address
    description: PostgreSQL host
    label: Host
    required: true
  - name: port
    kind: integer
    value: 5432
    description: PostgreSQL port
    label: Port
    required: true
  - name: dbname
    aliases:
      - database
    description: PostgreSQL database name
    label: Database Name
    required: true
  - name: default_target_schema
    value: analytics
    description: Name of the schema where the tables will be created.
    label: Default Target Schema
    required: true
  # Hidden/advanced settings — fixed defaults not exposed in the UI.
  - name: ssl
    kind: hidden
    value: false
    value_post_processor: stringify
    label: SSL
  - name: batch_size_rows
    kind: integer
    value: 100000
    description: Maximum number of rows in each batch. At the end of each batch, the rows in the batch are loaded into Postgres.
    label: Batch Size Rows
  - name: underscore_camel_case_fields
    kind: hidden
    value: true
    description: Enable underscoring camel case field names, make the resulting postgres column names the same as from the Meltano variant of target-postgres.
    label: Underscore Camel Case Fields
  - name: flush_all_streams
    kind: hidden
    value: false
    description: "Flush and load every stream into Postgres when one batch is full. Warning: This may trigger the COPY command to use files with low number of records."
    label: Flush All Streams
  - name: parallelism
    kind: hidden
    value: 0
    description: The number of threads used to flush tables. 0 will create a thread for each stream, up to parallelism_max. -1 will create a thread for each CPU core. Any other positive number will create that number of threads, up to parallelism_max.
    label: Parallelism
  - name: parallelism_max
    kind: hidden
    value: 16
    description: Max number of parallel threads to use when flushing tables.
    label: Max Parallelism
  - name: default_target_schema_select_permission
    kind: hidden
    description: Grant USAGE privilege on newly created schemas and grant SELECT privilege on newly created tables to a specific role or a list of roles. If `schema_mapping` is not defined then every stream sent by the tap is granted accordingly.
    label: Default Target Schema Select Permission
  - name: schema_mapping
    kind: hidden
    description: |
      Useful if you want to load multiple streams from one tap to multiple Postgres schemas.
      If the tap sends the `stream_id` in `<schema_name>-<table_name>` format then this option overwrites the `default_target_schema` value.
      Note, that using `schema_mapping` you can overwrite the `default_target_schema_select_permission` value to grant SELECT permissions to different groups per schemas or optionally you can create indices automatically for the replicated tables.
    label: Schema Mapping
  - name: add_metadata_columns
    kind: hidden
    value: true
    description: Metadata columns add extra row level information about data ingestions, (i.e. when was the row read in source, when was inserted or deleted in postgres etc.) Metadata columns are creating automatically by adding extra columns to the tables with a column prefix `_SDC_`. The column names are following the stitch naming conventions documented at https://www.stitchdata.com/docs/data-structure/integration-schemas#sdc-columns. Enabling metadata columns will flag the deleted rows by setting the `_SDC_DELETED_AT` metadata column. Without the `add_metadata_columns` option the deleted rows from singer taps will not be recongisable in Postgres.
    label: Add Metadata Columns
  - name: hard_delete
    kind: hidden
    value: false
    description: When `hard_delete` option is true then DELETE SQL commands will be performed in Postgres to delete rows in tables. It's achieved by continuously checking the `_SDC_DELETED_AT` metadata column sent by the singer tap. Due to deleting rows requires metadata columns, `hard_delete` option automatically enables the `add_metadata_columns` option as well.
    label: Hard Delete
  - name: data_flattening_max_level
    kind: hidden
    value: 10
    description: Object type RECORD items from taps can be transformed to flattened columns by creating columns automatically. When value is 0 (default) then flattening functionality is turned off.
    label: Data Flattening Max Level
  - name: primary_key_required
    kind: boolean
    value: false
    description: Log based and Incremental replications on tables with no Primary Key cause duplicates when merging UPDATE events. When set to true, stop loading data if no Primary Key is defined.
    label: Primary Key Required
  - name: validate_records
    kind: boolean
    value: false
    description: Validate every single record message to the corresponding JSON schema. This option is disabled by default and invalid RECORD messages will fail only at load time by Postgres. Enabling this option will detect invalid records earlier but could cause performance degradation.
    label: Validate Records
  - name: temp_dir
    kind: hidden
    # NOTE(review): both `kind: hidden` and `hidden: true` are set — likely
    # redundant; kept as-is to preserve the committed behavior.
    hidden: true
    description: "(Default: platform-dependent) Directory of temporary CSV files with RECORD messages."
    label: Temporary Directory
51 changes: 51 additions & 0 deletions plugins/transformers/dbt--dbt-labs.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
# Matatika/Meltano plugin definition for the dbt-labs transformer (dbt-core 1.3
# with postgres and snowflake adapters), bundled with the Matatika files-dbt
# file bundle.
# NOTE(review): indentation reconstructed to valid YAML (the capture stripped
# leading whitespace) — verify against the file in the repository.
name: dbt
label: dbt
namespace: dbt
variant: dbt-labs
logo_url: /assets/images/transformer/dbt.png
docs: https://meltano.com/docs/transforms.html
repo: https://github.com/dbt-labs/dbt-core
requires:
  files:
    - name: files-dbt
      variant: matatika
# Folded scalar: pip installs all three packages from one requirement string.
pip_url: >
  dbt-core~=1.3.0
  dbt-postgres~=1.3.0
  dbt-snowflake~=1.3.0
settings:
  - name: project_dir
    value: $MELTANO_PROJECT_ROOT/transform
  - name: profiles_dir
    env: DBT_PROFILES_DIR
    value: $MELTANO_PROJECT_ROOT/transform/profile
  # Target/schema values are wired from the active loader via Meltano's
  # pipeline environment variables.
  - name: target
    value: $MELTANO_LOAD__DIALECT
  - name: source_schema
    value: $MELTANO_LOAD__TARGET_SCHEMA
  - name: target_schema
    value: analytics
  - name: models
    value: $MELTANO_TRANSFORM__PACKAGE_NAME $MELTANO_EXTRACTOR_NAMESPACE my_meltano_project
commands:
  clean:
    args: clean
    description: Delete all folders in the clean-targets list (usually the dbt_modules and target directories.)
  compile:
    args: compile
    description: Generates executable SQL from source model, test, and analysis files. Compiled SQL files are written to the target/ directory.
  deps:
    args: deps
    description: Pull the most recent version of the dependencies listed in packages.yml
  run:
    args: run
    description: Compile SQL and execute against the current target database.
  seed:
    args: seed
    description: Load data from csv files into your data warehouse.
  snapshot:
    args: snapshot
    description: Execute snapshots defined in your project.
  test:
    args: test
    description: Runs tests on data in deployed models.
Empty file added workspaces/.gitignore
Empty file.

0 comments on commit 30c17c8

Please sign in to comment.