Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Notebook to assist with bringing data from BigQuery to Sheets #3652

Merged
merged 1 commit into from
May 7, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion sql/util/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,14 @@ This query generates a list of candidate URLs for manifest and service worker fi

The `almanac.manifests` and `almanac.service_workers` tables depend on the `pwa_candidates` table. Running these queries will generate the latest data that can be appended to their respective tables.

## green_web_foundation
## [green_web_foundation.sql](./green_web_foundation.sql)

1. Go to https://admin.thegreenwebfoundation.org/admin/green-urls
2. Scroll to the bottom for the latest database dump
3. Convert to a BQ-compatible format, i.e., CSV
4. Import into a temporary BQ table
5. Join with the date-partitioned `green_web_foundation` table

## [bq_to_sheets.ipynb](./bq_to_sheets.ipynb)

This Jupyter notebook runs BigQuery SQL queries for a chapter and saves the results to a Google Sheet. It uses the `gspread` library to interact with Google Sheets.
232 changes: 232 additions & 0 deletions sql/util/bq_to_sheets.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,232 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/github/HTTPArchive/almanac.httparchive.org/blob/fellow-vicuna/sql/util/bq_to_sheets.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "code",
"source": [
"# Notebook configuration: edit the values in this cell before running the\n",
"# cells below, which read these names.\n",
"\n",
"# Almanac\n",
"CHAPTER = \"privacy\"\n",
"YEAR = \"2024\"\n",
"\n",
"# BigQuery\n",
"GCP_PROJECT = \"httparchive\"\n",
"\n",
"# Git: chapter branches are named `<chapter>-sql-<year>`.\n",
"BRANCH_NAME = f\"{CHAPTER}-sql-{YEAR}\"\n",
"\n",
"# SQL folder: glob pattern selecting the chapter's .sql files inside the\n",
"# cloned `almanac.httparchive.org` directory.\n",
"folder = f\"almanac.httparchive.org/sql/{YEAR}/{CHAPTER}/*.sql\"\n",
"\n",
"# Google Sheets: name used when a new spreadsheet is created.\n",
"spreadsheet_name = f\"{CHAPTER.capitalize()} (Web Almanac {YEAR})\"\n",
"\n",
"# Set to `None` to create new one or an existing spreadsheet URL.\n",
"existing_spreadsheet_url = 'https://docs.google.com/spreadsheets/d/1U6DTYxxhDWf-39Fr0o1Jq2r1RUVa4EbyxIZu-wqrso0/edit'"
],
"metadata": {
"id": "U37785Bxt5tE"
},
"execution_count": 1,
"outputs": []
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "OVkCxlRQH6Yt",
"outputId": "9fb31f97-8541-461a-991f-e7932da56101"
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Cloning into 'almanac.httparchive.org'...\n",
"remote: Enumerating objects: 43942, done.\u001b[K\n",
"remote: Counting objects: 100% (5935/5935), done.\u001b[K\n",
"remote: Compressing objects: 100% (1535/1535), done.\u001b[K\n",
"remote: Total 43942 (delta 4709), reused 4950 (delta 4391), pack-reused 38007\u001b[K\n",
"Receiving objects: 100% (43942/43942), 384.14 MiB | 29.81 MiB/s, done.\n",
"Resolving deltas: 100% (29622/29622), done.\n",
"Updating files: 100% (5472/5472), done.\n"
]
}
],
"source": [
"# Download repo\n",
"!git clone -b $BRANCH_NAME https://github.com/HTTPArchive/almanac.httparchive.org.git"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "UzhgG5xvbQ1E",
"outputId": "4dfc6202-2034-49bd-a77c-5a6e00e01bea"
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Already on 'privacy-sql-2024'\n",
"Your branch is up to date with 'origin/privacy-sql-2024'.\n",
"Already up to date.\n"
]
}
],
"source": [
"# Update local branch\n",
"!cd almanac.httparchive.org/ && git checkout $BRANCH_NAME && git pull"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"id": "45dBifFPJAtO"
},
"outputs": [],
"source": [
"# Authenticate\n",
"import google.auth\n",
"import os\n",
"from google.colab import auth\n",
"from google.cloud import bigquery\n",
"\n",
"import gspread\n",
"from gspread_dataframe import set_with_dataframe\n",
"\n",
"os.environ[\"GOOGLE_CLOUD_PROJECT\"] = GCP_PROJECT\n",
"auth.authenticate_user()\n",
"credentials, project = google.auth.default()\n",
"client = bigquery.Client()\n",
"gc = gspread.authorize(credentials)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "nblNil985Tjt",
"outputId": "ccde5268-430c-4ecc-b99c-fce20d061ec8"
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Using existing spreadsheet: https://docs.google.com/spreadsheets/d/1U6DTYxxhDWf-39Fr0o1Jq2r1RUVa4EbyxIZu-wqrso0\n"
]
}
],
"source": [
"import glob\n",
"import re\n",
"\n",
"# Build Sheets\n",
"# Reuse the configured spreadsheet when a URL is set; when no URL is set, or\n",
"# the spreadsheet cannot be opened, fall back to creating a new one.\n",
"ss = None\n",
"if existing_spreadsheet_url:\n",
"    try:\n",
"        ss = gc.open_by_url(existing_spreadsheet_url)\n",
"        print('Using existing spreadsheet:', ss.url)\n",
"    except Exception as err:\n",
"        # Catch `Exception` rather than using a bare `except`, so interrupting\n",
"        # the kernel is not treated as a request to create a spreadsheet, and\n",
"        # report why the existing spreadsheet could not be used.\n",
"        print('Could not open existing spreadsheet:', repr(err))\n",
"if ss is None:\n",
"    ss = gc.create(spreadsheet_name)\n",
"    print('Created a new spreadsheet:', spreadsheet_name, ss.url)\n",
"existing_sheets = [s.title for s in ss.worksheets()]\n",
"\n",
"# Regexes searched in each file name: a query is processed only when its name\n",
"# matches `file_match_include` and does not match `file_match_exclude`.\n",
"# Dots are escaped so that they match a literal `.` only.\n",
"file_match_include = (\n",
"    r\"number_of_websites_with_features_based_on_string_search\\.sql\"\n",
"    r\"|number_of_websites_with_origin_trial_from_token\\.sql\"\n",
")\n",
"\n",
"file_match_exclude = r\"^$\"\n",
"\n",
"# overwrite: replace a sheet that already exists instead of skipping it.\n",
"overwrite = False\n",
"# dry_run: only print the estimated data volume; run no query, write no sheet.\n",
"dry_run = True\n",
"# tb_processed_limit: skip queries whose estimate exceeds this value.\n",
"tb_processed_limit = 0.1\n",
"\n",
"# Find matching .sql queries in folder and save to google sheet.\n",
"for filepath in glob.iglob(folder):\n",
"    filename = filepath.split('/')[-1]\n",
"    sheet_title = re.sub(r\"(\\.sql|[^a-zA-Z0-9]+)\", \" \", filename).strip().title()\n",
"\n",
"    if not re.search(file_match_include, filename) or re.search(file_match_exclude, filename):\n",
"        print('Not Matched. Skipping:', sheet_title)\n",
"        continue\n",
"\n",
"    print('Processing:', sheet_title)\n",
"    with open(filepath, encoding='utf-8') as f:\n",
"        query = f.read()\n",
"\n",
"    # Submit the query as a dry run first to read its estimated data volume.\n",
"    response = client.query(\n",
"        query,\n",
"        job_config=bigquery.QueryJobConfig(dry_run=True)\n",
"    )\n",
"\n",
"    tb_processed = response.total_bytes_processed/1024/1024/1024/1024\n",
"    print(f\"Total Tb billed:{tb_processed:9.3f}\")\n",
"\n",
"    if dry_run:\n",
"        continue\n",
"\n",
"    if tb_processed > tb_processed_limit:\n",
"        print('Data volume hit the limit. Skipping:', sheet_title)\n",
"        continue\n",
"\n",
"    sheet_exists = sheet_title in existing_sheets\n",
"    if sheet_exists and not overwrite:\n",
"        print('Overwrite is False. Skipping:', sheet_title)\n",
"        continue\n",
"\n",
"    # Run the query before touching the spreadsheet, so that a failed query\n",
"    # does not leave a previously existing sheet deleted.\n",
"    df = client.query(query).to_dataframe()\n",
"    rows, cols = df.shape\n",
"\n",
"    if sheet_exists:\n",
"        ss.del_worksheet(ss.worksheet(sheet_title))\n",
"\n",
"    # One extra row for the column header written above the data; this also\n",
"    # avoids requesting a zero-row sheet for an empty result.\n",
"    st = ss.add_worksheet(title=sheet_title, rows=rows + 1, cols=cols)\n",
"    set_with_dataframe(st, df)"
]
}
],
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"display_name": "Python 3",
"name": "python3"
},
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 0
}