Skip to content

Populate sqlite-vec index #1410

Populate sqlite-vec index

Populate sqlite-vec index #1410

Workflow file for this run

name: Build README and deploy Datasette
on:
push:
branches:
- main
jobs:
build:
runs-on: ubuntu-latest
steps:
- name: Check out repo
uses: actions/checkout@v3
# We need full history to introspect created/updated:
with:
fetch-depth: 0
path: main
- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: 3.11
- uses: actions/cache@v3
name: Configure pip caching
with:
path: ~/.cache/pip
key: ${{ runner.os }}-pip-${{ hashFiles('**/requirements.txt') }}
restore-keys: |
${{ runner.os }}-pip-
- name: Cache Playwright browsers
uses: actions/cache@v3
with:
path: ~/.cache/ms-playwright/
key: ${{ runner.os }}-browsers
- name: Install Python dependencies
run: |
python -m pip install --upgrade pip
pip install -r main/requirements.txt
- name: Install Playwright dependencies
run: |
shot-scraper install
- name: Download previous database unless REBUILD in commit message
if: |-
!contains(github.event.head_commit.message, 'REBUILD')
run: curl --fail -o main/tils.db https://s3.amazonaws.com/til.simonwillison.net/tils.db
continue-on-error: true
- name: Build database
env:
MARKDOWN_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: |-
cd main
python build_database.py
- name: Soundness check
run: |-
cd main
datasette . --get / | grep "Simon Willison: TIL"
- name: Generate missing screenshots
env:
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
run: |-
cd main
python generate_screenshots.py
sqlite-utils vacuum tils.db
- name: Calculate embeddings and document similarity
env:
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
run: |-
cd main
# Fetch embeddings for documents that need them
openai-to-sqlite embeddings tils.db \
--sql 'select path, title, topic, body from til'
# Now calculate and save similarities
if sqlite-utils rows tils.db similarities --limit 1; then
# Table exists already, so only calculate new similarities
openai-to-sqlite similar tils.db \
$(git diff --name-only HEAD~10 HEAD | grep '/.*\.md$' | sed 's/\//_/g') \
--save --recalculate-for-matches --print
else
# Table does not exist, calculate for everything
openai-to-sqlite similar tils.db --all --save
fi
- name: Create sqlite-vec index for embeddings
run: |-
cd main
sqlite-utils tils.db 'create virtual table vec_tils using vec0(
embedding float[1536]
);'
sqlite-utils tils.db 'insert into vec_tils(rowid, embedding)
select rowid, embedding from embeddings;
'
- name: Update README
run: |-
cd main
python update_readme.py --rewrite
cat README.md
- name: Commit and push if README changed
run: |-
cd main
git diff
git config --global user.email "[email protected]"
git config --global user.name "README-bot"
git diff --quiet || (git add README.md && git commit -m "Updated README")
git push
- name: Upload latest tils.db to the S3 bucket
env:
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
run: |-
s3-credentials put-object til.simonwillison.net tils.db main/tils.db
- name: Install Fly
run: |
curl -L https://fly.io/install.sh | sh
- name: Deploy Datasette using Fly
env:
FLY_API_TOKEN: ${{ secrets.FLY_TOKEN }}
run: |-
cd main
PATH=$PATH:/home/runner/.fly/bin/ datasette publish fly tils.db \
--app simonw-tils \
--metadata metadata.yaml \
--static static:static \
--install datasette-template-sql \
--install "datasette-sitemap>=1.0" \
--install "datasette-atom>=0.7" \
--install datasette-json-html \
--install beautifulsoup4 \
--install "datasette-debug-asgi>=1.1" \
--install "datasette-graphql>=2.2" \
--install datasette-block-robots \
--install datasette-llm-embed \
--install datasette-sqlite-vec \
--plugins-dir plugins \
--template-dir templates