diff --git a/sql/util/README.md b/sql/util/README.md index 66814dd7060..bec82afec05 100644 --- a/sql/util/README.md +++ b/sql/util/README.md @@ -24,10 +24,14 @@ This query generates a list of candidate URLs for manifest and service worker fi The `almanac.manifests` and `almanac.service_workers` tables depend on the `pwa_candidates` table. Running these queries will generate the latest data that can be appended to their respective tables. -## green_web_foundation +## [green_web_foundation.sql](./green_web_foundation.sql) 1. Go to https://admin.thegreenwebfoundation.org/admin/green-urls 2. Scroll to the bottom for the latest database dump 3. Convert to a BQ-compatible format, ie CSV 4. Import into a temporary BQ table 5. Join with the date-partitioned `green_web_foundation` table + +## [bq_to_sheets.ipynb](./bq_to_sheets.ipynb) + +This Jupyter notebook runs BigQuery SQL queries for a chapter and saves the results to a Google Sheet. It uses the `gspread` library to interact with Google Sheets.
diff --git a/sql/util/bq_to_sheets.ipynb b/sql/util/bq_to_sheets.ipynb
new file mode 100644
index 00000000000..72896a9080a
--- /dev/null
+++ b/sql/util/bq_to_sheets.ipynb
@@ -0,0 +1,233 @@
+{
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "view-in-github",
+        "colab_type": "text"
+      },
+      "source": [
+        "\"Open"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Notebook configuration: edit this cell, then run the cells below in order.\n",
+        "\n",
+        "# Almanac chapter and edition whose queries are exported.\n",
+        "CHAPTER = \"privacy\"\n",
+        "YEAR = \"2024\"\n",
+        "\n",
+        "# Default project for the BigQuery client.\n",
+        "GCP_PROJECT = \"httparchive\"\n",
+        "\n",
+        "# Git branch that holds the chapter's SQL files.\n",
+        "BRANCH_NAME = f\"{CHAPTER}-sql-{YEAR}\"\n",
+        "\n",
+        "# Glob for the chapter's queries inside the cloned repository.\n",
+        "folder = f\"almanac.httparchive.org/sql/{YEAR}/{CHAPTER}/*.sql\"\n",
+        "\n",
+        "# Name used only when a new spreadsheet has to be created.\n",
+        "spreadsheet_name = f\"{CHAPTER.capitalize()} (Web Almanac {YEAR})\"\n",
+        "\n",
+        "# Set to `None` to create a new spreadsheet, or to the URL of an existing one.\n",
+        "existing_spreadsheet_url = 'https://docs.google.com/spreadsheets/d/1U6DTYxxhDWf-39Fr0o1Jq2r1RUVa4EbyxIZu-wqrso0/edit'"
+      ],
+      "metadata": {
+        "id": "U37785Bxt5tE"
+      },
+      "execution_count": 1,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 2,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "OVkCxlRQH6Yt",
+        "outputId": "9fb31f97-8541-461a-991f-e7932da56101"
+      },
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "Cloning into 'almanac.httparchive.org'...\n",
+            "remote: Enumerating objects: 43942, done.\u001b[K\n",
+            "remote: Counting objects: 100% (5935/5935), done.\u001b[K\n",
+            "remote: Compressing objects: 100% (1535/1535), done.\u001b[K\n",
+            "remote: Total 43942 (delta 4709), reused 4950 (delta 4391), pack-reused 38007\u001b[K\n",
+            "Receiving objects: 100% (43942/43942), 384.14 MiB | 29.81 MiB/s, done.\n",
+            "Resolving deltas: 100% (29622/29622), done.\n",
+            "Updating files: 100% (5472/5472), done.\n"
+          ]
+        }
+      ],
+      "source": [
+        "# Download repo\n",
+        "!git clone -b $BRANCH_NAME https://github.com/HTTPArchive/almanac.httparchive.org.git"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 3,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "UzhgG5xvbQ1E",
+        "outputId": "4dfc6202-2034-49bd-a77c-5a6e00e01bea"
+      },
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "Already on 'privacy-sql-2024'\n",
+            "Your branch is up to date with 'origin/privacy-sql-2024'.\n",
+            "Already up to date.\n"
+          ]
+        }
+      ],
+      "source": [
+        "# Update local branch\n",
+        "!cd almanac.httparchive.org/ && git checkout $BRANCH_NAME && git pull"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 4,
+      "metadata": {
+        "id": "45dBifFPJAtO"
+      },
+      "outputs": [],
+      "source": [
+        "# Authenticate\n",
+        "import os\n",
+        "\n",
+        "import google.auth\n",
+        "import gspread\n",
+        "from google.cloud import bigquery\n",
+        "from google.colab import auth\n",
+        "from gspread_dataframe import set_with_dataframe\n",
+        "\n",
+        "# Set the project before the credentials and clients below are created.\n",
+        "os.environ[\"GOOGLE_CLOUD_PROJECT\"] = GCP_PROJECT\n",
+        "auth.authenticate_user()\n",
+        "credentials, project = google.auth.default()\n",
+        "client = bigquery.Client()\n",
+        "gc = gspread.authorize(credentials)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 5,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "nblNil985Tjt",
+        "outputId": "ccde5268-430c-4ecc-b99c-fce20d061ec8"
+      },
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "Using existing spreadsheet: https://docs.google.com/spreadsheets/d/1U6DTYxxhDWf-39Fr0o1Jq2r1RUVa4EbyxIZu-wqrso0\n"
+          ]
+        }
+      ],
+      "source": [
+        "import glob\n",
+        "import os\n",
+        "import re\n",
+        "\n",
+        "# Open the configured spreadsheet, or create one when no URL is configured.\n",
+        "# Errors from opening a configured URL are left to propagate: silently\n",
+        "# creating a new spreadsheet would hide a wrong URL or missing permission.\n",
+        "if existing_spreadsheet_url:\n",
+        "    ss = gc.open_by_url(existing_spreadsheet_url)\n",
+        "    print('Using existing spreadsheet:', ss.url)\n",
+        "else:\n",
+        "    ss = gc.create(spreadsheet_name)\n",
+        "    print('Created a new spreadsheet:', spreadsheet_name, ss.url)\n",
+        "existing_sheets = {s.title for s in ss.worksheets()}\n",
+        "\n",
+        "# Regexes matched against each file name with `re.search`.\n",
+        "file_match_include = (\n",
+        "    r\"number_of_websites_with_features_based_on_string_search\\.sql\"\n",
+        "    r\"|number_of_websites_with_origin_trial_from_token\\.sql\"\n",
+        ")\n",
+        "file_match_exclude = r\"^$\"  # matches only an empty name, so nothing is excluded\n",
+        "\n",
+        "overwrite = False  # replace worksheets that already exist\n",
+        "dry_run = True  # only print the estimated data volume; write nothing\n",
+        "tb_processed_limit = 0.1  # skip queries estimated above this many TiB\n",
+        "\n",
+        "# Find matching .sql queries in folder and save to google sheet.\n",
+        "for filepath in glob.iglob(folder):\n",
+        "    filename = os.path.basename(filepath)\n",
+        "    sheet_title = re.sub(r\"(\\.sql|[^a-zA-Z0-9]+)\", \" \", filename).strip().title()\n",
+        "\n",
+        "    if not re.search(file_match_include, filename) or re.search(file_match_exclude, filename):\n",
+        "        print('Not Matched. Skipping:', sheet_title)\n",
+        "        continue\n",
+        "\n",
+        "    print('Processing:', sheet_title)\n",
+        "    with open(filepath, encoding=\"utf-8\") as f:\n",
+        "        query = f.read()\n",
+        "\n",
+        "    # A dry run executes nothing; it only reports how many bytes the query would process.\n",
+        "    response = client.query(\n",
+        "        query,\n",
+        "        job_config=bigquery.QueryJobConfig(dry_run=True, use_query_cache=False)\n",
+        "    )\n",
+        "\n",
+        "    tb_processed = response.total_bytes_processed / 1024 ** 4\n",
+        "    print(f\"Total Tb billed:{tb_processed:9.3f}\")\n",
+        "\n",
+        "    if dry_run:\n",
+        "        continue\n",
+        "\n",
+        "    if tb_processed > tb_processed_limit:\n",
+        "        print('Data volume hit the limit. Skipping:', sheet_title)\n",
+        "        continue\n",
+        "\n",
+        "    if sheet_title in existing_sheets and not overwrite:\n",
+        "        print('Overwrite is False. Skipping:', sheet_title)\n",
+        "        continue\n",
+        "\n",
+        "    # Run the query before touching the spreadsheet, so a failing query cannot\n",
+        "    # leave an existing worksheet deleted with nothing to replace it.\n",
+        "    df = client.query(query).to_dataframe()\n",
+        "    rows, cols = df.shape\n",
+        "\n",
+        "    if sheet_title in existing_sheets:\n",
+        "        ss.del_worksheet(ss.worksheet(sheet_title))\n",
+        "\n",
+        "    # Size the grid for the header row plus the data; never request a zero-sized grid.\n",
+        "    st = ss.add_worksheet(title=sheet_title, rows=rows + 1, cols=max(cols, 1))\n",
+        "    set_with_dataframe(st, df)\n",
+        "    existing_sheets.add(sheet_title)"
+      ]
+    }
+  ],
+  "metadata": {
+    "colab": {
+      "provenance": []
+    },
+    "kernelspec": {
+      "display_name": "Python 3",
+      "name": "python3"
+    },
+    "language_info": {
+      "name": "python"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+}