-
-
Notifications
You must be signed in to change notification settings - Fork 183
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
2d080ed
commit 994a058
Showing
2 changed files
with
237 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,232 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": { | ||
"id": "view-in-github", | ||
"colab_type": "text" | ||
}, | ||
"source": [ | ||
"<a href=\"https://colab.research.google.com/github/HTTPArchive/almanac.httparchive.org/blob/fellow-vicuna/sql/util/bq_to_sheets.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"source": [ | ||
"# Almanac\n", | ||
"CHAPTER = \"privacy\"\n", | ||
"YEAR = \"2024\"\n", | ||
"\n", | ||
"# BigQuery\n", | ||
"GCP_PROJECT = \"httparchive\"\n", | ||
"\n", | ||
"# Git\n", | ||
"BRANCH_NAME = \"{chapter}-sql-{year}\".format(\n", | ||
" chapter=CHAPTER,\n", | ||
" year=YEAR\n", | ||
")\n", | ||
"\n", | ||
"# SQL folder\n", | ||
"folder = r'almanac.httparchive.org/sql/{year}/{chapter}/*.sql'.format(\n", | ||
" year=YEAR,\n", | ||
" chapter=CHAPTER\n", | ||
")\n", | ||
"\n", | ||
"# Google Sheets\n", | ||
"spreadsheet_name = \"{chapter} (Web Almanac {year})\".format(\n", | ||
" chapter=CHAPTER.capitalize(),\n", | ||
" year=YEAR\n", | ||
")\n", | ||
"\n", | ||
"# Set to `None` to create new one or an existing spreadsheet URL.\n", | ||
"existing_spreadsheet_url = 'https://docs.google.com/spreadsheets/d/1U6DTYxxhDWf-39Fr0o1Jq2r1RUVa4EbyxIZu-wqrso0/edit'" | ||
], | ||
"metadata": { | ||
"id": "U37785Bxt5tE" | ||
}, | ||
"execution_count": 1, | ||
"outputs": [] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 2, | ||
"metadata": { | ||
"colab": { | ||
"base_uri": "https://localhost:8080/" | ||
}, | ||
"id": "OVkCxlRQH6Yt", | ||
"outputId": "9fb31f97-8541-461a-991f-e7932da56101" | ||
}, | ||
"outputs": [ | ||
{ | ||
"output_type": "stream", | ||
"name": "stdout", | ||
"text": [ | ||
"Cloning into 'almanac.httparchive.org'...\n", | ||
"remote: Enumerating objects: 43942, done.\u001b[K\n", | ||
"remote: Counting objects: 100% (5935/5935), done.\u001b[K\n", | ||
"remote: Compressing objects: 100% (1535/1535), done.\u001b[K\n", | ||
"remote: Total 43942 (delta 4709), reused 4950 (delta 4391), pack-reused 38007\u001b[K\n", | ||
"Receiving objects: 100% (43942/43942), 384.14 MiB | 29.81 MiB/s, done.\n", | ||
"Resolving deltas: 100% (29622/29622), done.\n", | ||
"Updating files: 100% (5472/5472), done.\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"# Download repo\n", | ||
"!git clone -b $BRANCH_NAME https://github.com/HTTPArchive/almanac.httparchive.org.git" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 3, | ||
"metadata": { | ||
"colab": { | ||
"base_uri": "https://localhost:8080/" | ||
}, | ||
"id": "UzhgG5xvbQ1E", | ||
"outputId": "4dfc6202-2034-49bd-a77c-5a6e00e01bea" | ||
}, | ||
"outputs": [ | ||
{ | ||
"output_type": "stream", | ||
"name": "stdout", | ||
"text": [ | ||
"Already on 'privacy-sql-2024'\n", | ||
"Your branch is up to date with 'origin/privacy-sql-2024'.\n", | ||
"Already up to date.\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"# Update local branch\n", | ||
"!cd almanac.httparchive.org/ && git checkout $BRANCH_NAME && git pull" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 4, | ||
"metadata": { | ||
"id": "45dBifFPJAtO" | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"# Authenticate\n", | ||
"import google.auth\n", | ||
"import os\n", | ||
"from google.colab import auth\n", | ||
"from google.cloud import bigquery\n", | ||
"\n", | ||
"import gspread\n", | ||
"from gspread_dataframe import set_with_dataframe\n", | ||
"\n", | ||
"os.environ[\"GOOGLE_CLOUD_PROJECT\"] = GCP_PROJECT\n", | ||
"auth.authenticate_user()\n", | ||
"credentials, project = google.auth.default()\n", | ||
"client = bigquery.Client()\n", | ||
"gc = gspread.authorize(credentials)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 5, | ||
"metadata": { | ||
"colab": { | ||
"base_uri": "https://localhost:8080/" | ||
}, | ||
"id": "nblNil985Tjt", | ||
"outputId": "ccde5268-430c-4ecc-b99c-fce20d061ec8" | ||
}, | ||
"outputs": [ | ||
{ | ||
"output_type": "stream", | ||
"name": "stdout", | ||
"text": [ | ||
"Using existing spreadsheet: https://docs.google.com/spreadsheets/d/1U6DTYxxhDWf-39Fr0o1Jq2r1RUVa4EbyxIZu-wqrso0\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"import glob\n", | ||
"import re\n", | ||
"\n", | ||
"# Build Sheets\n", | ||
"try:\n", | ||
" ss = gc.open_by_url(existing_spreadsheet_url)\n", | ||
" print('Using existing spreadsheet:', ss.url)\n", | ||
"except:\n", | ||
" ss = gc.create(spreadsheet_name)\n", | ||
" print('Created a new spreadsheet:', spreadsheet_name, ss.url)\n", | ||
"existing_sheets = [s.title for s in ss.worksheets()]\n", | ||
"\n", | ||
"file_match_include = r\"number_of_websites_with_features_based_on_string_search.sql\"+\"|\"+ \\\n", | ||
" \"number_of_websites_with_origin_trial_from_token.sql\"\n", | ||
"\n", | ||
"file_match_exclude = r\"^$\"\n", | ||
"\n", | ||
"overwrite = False\n", | ||
"dry_run = True\n", | ||
"tb_processed_limit = 0.1\n", | ||
"\n", | ||
"# Find matching .sql queries in folder and save to google sheet.\n", | ||
"for filepath in glob.iglob(folder):\n", | ||
" filename = filepath.split('/')[-1]\n", | ||
" sheet_title = re.sub(r\"(\\.sql|[^a-zA-Z0-9]+)\", \" \", filename).strip().title()\n", | ||
"\n", | ||
" if re.search(file_match_include, filename) and not re.search(file_match_exclude, filename):\n", | ||
"\n", | ||
" print('Processing:', sheet_title)\n", | ||
" with open(filepath) as f:\n", | ||
" query = f.read()\n", | ||
"\n", | ||
" response = client.query(\n", | ||
" query,\n", | ||
" job_config = bigquery.QueryJobConfig(dry_run = True)\n", | ||
" )\n", | ||
"\n", | ||
" tb_processed = response.total_bytes_processed/1024/1024/1024/1024\n", | ||
" print(f\"Total Tb billed:{tb_processed:9.3f}\")\n", | ||
"\n", | ||
" if dry_run:\n", | ||
" continue\n", | ||
"\n", | ||
" if tb_processed > tb_processed_limit:\n", | ||
" print('Data volume hit the limit. Skipping:', sheet_title)\n", | ||
" continue\n", | ||
"\n", | ||
" if sheet_title in existing_sheets:\n", | ||
" if not overwrite:\n", | ||
" print('Overwrite is False. Skipping:', sheet_title)\n", | ||
" continue\n", | ||
"\n", | ||
" else:\n", | ||
" st = ss.worksheet(sheet_title)\n", | ||
" ss.del_worksheet(st)\n", | ||
"\n", | ||
" df = client.query(query).to_dataframe()\n", | ||
" rows, cols = df.shape\n", | ||
"\n", | ||
" st = ss.add_worksheet(title = sheet_title, rows = rows, cols = cols)\n", | ||
" set_with_dataframe(st, df)\n", | ||
"\n", | ||
" else:\n", | ||
" print('Not Matched. Skipping:', sheet_title)" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"colab": { | ||
"provenance": [] | ||
}, | ||
"kernelspec": { | ||
"display_name": "Python 3", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"name": "python" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 0 | ||
} |