# PyPI Data Pipeline — workflow file as displayed for run #305.

# Workflow header: display name, token permissions and triggers.
# The pipeline can be started manually (with optional overrides) or runs on a
# daily schedule.
name: PyPI Data Pipeline

permissions:
  id-token: write
  # Declaring any permission sets every unlisted scope to `none`. The jobs in
  # this workflow run actions/checkout, which needs read access to the
  # repository contents.
  contents: read

on:
  workflow_dispatch:
    inputs:
      database_name:
        description: 'Database name'
        required: false
        default: 'duckdb_stats'
        type: string
      job_type:
        description: 'Job to run'
        required: true
        default: 'all'
        type: choice
        options:
          - ingest
          - transform
          - all
          - update
      # start_date / end_date only take effect when BOTH are provided;
      # otherwise the setup job computes a default window (formatted
      # YYYY-MM-DD).
      start_date:
        description: 'Start Date'
        required: false
        default: ''
        type: string
      end_date:
        description: 'End Date'
        required: false
        default: ''
        type: string
  schedule:
    # Once a day at midnight (GitHub evaluates cron expressions in UTC).
    - cron: '0 0 * * *'
jobs:
  # Resolves the date window and decides which downstream jobs should run.
  # Every other job reads its inputs from this job's outputs.
  setup:
    runs-on: ubuntu-latest
    outputs:
      start_date: ${{ steps.dates.outputs.START_DATE }}
      end_date: ${{ steps.dates.outputs.END_DATE }}
      run_ingest: ${{ steps.set-flags.outputs.run_ingest }}
      run_transform: ${{ steps.set-flags.outputs.run_transform }}
      run_update: ${{ steps.set-flags.outputs.run_update }}
    steps:
      - name: Calculate Dates
        id: dates
        # User-supplied inputs are handed over through env instead of being
        # interpolated into the script text, so their content can never be
        # parsed as shell code.
        env:
          REQUESTED_START_DATE: ${{ github.event.inputs.start_date }}
          REQUESTED_END_DATE: ${{ github.event.inputs.end_date }}
        run: |
          # The override applies only when BOTH dates are given. Scheduled
          # runs (no inputs) and partial overrides fall back to the window
          # [2 days ago, 1 day ago], computed explicitly in UTC.
          if [ -n "$REQUESTED_START_DATE" ] && [ -n "$REQUESTED_END_DATE" ]; then
            START_DATE="$REQUESTED_START_DATE"
            END_DATE="$REQUESTED_END_DATE"
          else
            START_DATE=$(date -u -d '2 days ago' +%Y-%m-%d)
            END_DATE=$(date -u -d '1 day ago' +%Y-%m-%d)
          fi
          echo "START_DATE=${START_DATE}" >> "$GITHUB_OUTPUT"
          echo "END_DATE=${END_DATE}" >> "$GITHUB_OUTPUT"
      - name: Set job flags
        id: set-flags
        env:
          EVENT_NAME: ${{ github.event_name }}
          JOB_TYPE: ${{ github.event.inputs.job_type }}
        run: |
          # Scheduled runs carry no inputs and always execute every stage.
          if [[ "$EVENT_NAME" == "schedule" ]]; then
            JOB_TYPE=all
          fi

          run_ingest=false
          run_transform=false
          run_update=false
          case "$JOB_TYPE" in
            all)
              run_ingest=true
              run_transform=true
              run_update=true
              ;;
            ingest) run_ingest=true ;;
            transform) run_transform=true ;;
            update) run_update=true ;;
            *)
              # Fail loudly instead of leaving the flags unset, which would
              # silently skip every downstream job.
              echo "::error::Unsupported job_type '${JOB_TYPE}'"
              exit 1
              ;;
          esac

          {
            echo "run_ingest=${run_ingest}"
            echo "run_transform=${run_transform}"
            echo "run_update=${run_update}"
          } >> "$GITHUB_OUTPUT"

  # Loads PyPI data for the resolved date window via `make pypi-ingest`.
  ingest-pypi-data:
    needs: setup
    runs-on: ubuntu-latest
    if: needs.setup.outputs.run_ingest == 'true'
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          # Only the Makefile is checked out; with DOCKER=true the make target
          # presumably runs everything else from DOCKER_IMAGE — confirm in
          # the Makefile.
          sparse-checkout: |
            Makefile
          sparse-checkout-cone-mode: false
      - id: 'auth'
        name: 'Authenticate to GCP'
        uses: 'google-github-actions/auth@v2'
        with:
          credentials_json: '${{ secrets.GCP_SECRETS }}'
      - name: Pull latest Docker image
        run: docker pull ghcr.io/${{ github.repository }}:latest
      - name: Ingest pypi data
        env:
          START_DATE: ${{ needs.setup.outputs.start_date }}
          END_DATE: ${{ needs.setup.outputs.end_date }}
          PYPI_PROJECT: duckdb
          # On scheduled runs there are no inputs, hence the fallback.
          DATABASE_NAME: ${{ github.event.inputs.database_name || 'duckdb_stats' }}
          GCP_PROJECT: ${{ secrets.GCP_PROJECT_ID }}
          TIMESTAMP_COLUMN: timestamp
          DESTINATION: md
          DOCKER_IMAGE: ghcr.io/${{ github.repository }}:latest
          # Literal string "None" (not a YAML null); quoted to make that
          # explicit.
          AWS_PROFILE: 'None'
          S3_PATH: 'None'
          motherduck_token: ${{ secrets.MOTHERDUCK_TOKEN }}
          GOOGLE_APPLICATION_CREDENTIALS: ${{ steps.auth.outputs.credentials_file_path }}
        run: |
          make pypi-ingest DOCKER=true

  # Runs the dbt transformation via `make pypi-transform`.
  transform-pypi-data:
    needs: [setup, ingest-pypi-data]
    runs-on: ubuntu-latest
    # always() disables the implicit success() check so this job can still run
    # when the ingest job was skipped (job_type=transform). A failed or
    # cancelled ingest still blocks it through the explicit result test.
    if: |
      always() &&
      needs.setup.outputs.run_transform == 'true' &&
      (needs.ingest-pypi-data.result == 'success' || needs.ingest-pypi-data.result == 'skipped')
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          sparse-checkout: |
            Makefile
          sparse-checkout-cone-mode: false
      - id: 'auth'
        name: 'Authenticate to GCP'
        uses: 'google-github-actions/auth@v2'
        with:
          credentials_json: '${{ secrets.GCP_SECRETS }}'
      - name: Pull latest Docker image
        run: docker pull ghcr.io/${{ github.repository }}:latest
      - name: Transform pypi data using dbt
        env:
          START_DATE: ${{ needs.setup.outputs.start_date }}
          END_DATE: ${{ needs.setup.outputs.end_date }}
          DATABASE_NAME: ${{ github.event.inputs.database_name || 'duckdb_stats' }}
          DBT_TARGET: prod
          DOCKER_IMAGE: ghcr.io/${{ github.repository }}:latest
          motherduck_token: ${{ secrets.MOTHERDUCK_TOKEN }}
          GOOGLE_APPLICATION_CREDENTIALS: ${{ steps.auth.outputs.credentials_file_path }}
        run: |
          make pypi-transform DOCKER=true DBT_TARGET="${DBT_TARGET}"

  # Publishes the current database state to the MotherDuck share.
  update-motherduck-share:
    # All three upstream jobs are listed so that their results and the
    # run_update flag are available to the condition below.
    needs: [setup, ingest-pypi-data, transform-pypi-data]
    runs-on: ubuntu-latest
    # Without a status function GitHub applies an implicit success(), which
    # skips this job whenever an upstream job was skipped; job_type=update
    # would then never run. always() lifts that, and the explicit checks
    # keep the job from running after a failed or cancelled upstream job or
    # when the run_update flag is off.
    if: |
      always() &&
      needs.setup.outputs.run_update == 'true' &&
      !contains(needs.*.result, 'failure') &&
      !contains(needs.*.result, 'cancelled')
    steps:
      - name: Setup DuckDB
        # NOTE(review): the action reference was obfuscated in the captured
        # source ("opt-nc/[email protected]"); the repository name is
        # reconstructed and the tag below must be confirmed against the
        # original workflow file.
        uses: opt-nc/setup-duckdb-action@v1.0.8
        with:
          version: v1.1.0
      - name: Update MotherDuck share
        env:
          motherduck_token: ${{ secrets.MOTHERDUCK_TOKEN }}
          DATABASE_NAME: ${{ github.event.inputs.database_name || 'duckdb_stats' }}
        run: |
          # Register the MotherDuck attach statement in the CLI startup file,
          # then refresh the share for the selected database.
          echo "ATTACH 'md:';" >> ~/.duckdbrc
          duckdb -c "UPDATE SHARE $DATABASE_NAME;"